author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:17:33 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:17:33 +0000
commit    5e45211a64149b3c659b90ff2de6fa982a5a93ed (patch)
tree      739caf8c461053357daa9f162bef34516c7bf452 /src/backend/access/transam
parent    Initial commit. (diff)
Adding upstream version 15.5. (upstream/15.5)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/access/transam')
-rw-r--r--  src/backend/access/transam/Makefile             43
-rw-r--r--  src/backend/access/transam/README              896
-rw-r--r--  src/backend/access/transam/README.parallel     237
-rw-r--r--  src/backend/access/transam/clog.c             1030
-rw-r--r--  src/backend/access/transam/commit_ts.c        1035
-rw-r--r--  src/backend/access/transam/generic_xlog.c      540
-rw-r--r--  src/backend/access/transam/multixact.c        3428
-rw-r--r--  src/backend/access/transam/parallel.c         1597
-rw-r--r--  src/backend/access/transam/rmgr.c              161
-rw-r--r--  src/backend/access/transam/slru.c             1615
-rw-r--r--  src/backend/access/transam/subtrans.c          374
-rw-r--r--  src/backend/access/transam/timeline.c          600
-rw-r--r--  src/backend/access/transam/transam.c           398
-rw-r--r--  src/backend/access/transam/twophase.c         2662
-rw-r--r--  src/backend/access/transam/twophase_rmgr.c      58
-rw-r--r--  src/backend/access/transam/varsup.c            678
-rw-r--r--  src/backend/access/transam/xact.c             6249
-rw-r--r--  src/backend/access/transam/xlog.c             8906
-rw-r--r--  src/backend/access/transam/xlogarchive.c       762
-rw-r--r--  src/backend/access/transam/xlogfuncs.c         648
-rw-r--r--  src/backend/access/transam/xloginsert.c       1318
-rw-r--r--  src/backend/access/transam/xlogprefetcher.c   1105
-rw-r--r--  src/backend/access/transam/xlogreader.c       2165
-rw-r--r--  src/backend/access/transam/xlogrecovery.c     4699
-rw-r--r--  src/backend/access/transam/xlogstats.c          96
-rw-r--r--  src/backend/access/transam/xlogutils.c        1064
26 files changed, 42364 insertions, 0 deletions
diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile
new file mode 100644
index 0000000..3e5444a
--- /dev/null
+++ b/src/backend/access/transam/Makefile
@@ -0,0 +1,43 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for access/transam
+#
+# IDENTIFICATION
+# src/backend/access/transam/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/access/transam
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ clog.o \
+ commit_ts.o \
+ generic_xlog.o \
+ multixact.o \
+ parallel.o \
+ rmgr.o \
+ slru.o \
+ subtrans.o \
+ timeline.o \
+ transam.o \
+ twophase.o \
+ twophase_rmgr.o \
+ varsup.o \
+ xact.o \
+ xlog.o \
+ xlogarchive.o \
+ xlogfuncs.o \
+ xloginsert.o \
+ xlogprefetcher.o \
+ xlogreader.o \
+ xlogrecovery.o \
+ xlogstats.o \
+ xlogutils.o
+
+include $(top_srcdir)/src/backend/common.mk
+
+# ensure that version checks in xlog.c get recompiled when catversion.h changes
+xlog.o: xlog.c $(top_srcdir)/src/include/catalog/catversion.h
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README
new file mode 100644
index 0000000..26fd77b
--- /dev/null
+++ b/src/backend/access/transam/README
@@ -0,0 +1,896 @@
+src/backend/access/transam/README
+
+The Transaction System
+======================
+
+PostgreSQL's transaction system is a three-layer system. The bottom layer
+implements low-level transactions and subtransactions, on top of which rests
+the mainloop's control code, which in turn implements user-visible
+transactions and savepoints.
+
+The middle layer of code is called by postgres.c before and after the
+processing of each query, or after detecting an error:
+
+ StartTransactionCommand
+ CommitTransactionCommand
+ AbortCurrentTransaction
+
+Meanwhile, the user can alter the system's state by issuing the SQL commands
+BEGIN, COMMIT, ROLLBACK, SAVEPOINT, ROLLBACK TO or RELEASE. The traffic cop
+redirects these calls to the toplevel routines
+
+ BeginTransactionBlock
+ EndTransactionBlock
+ UserAbortTransactionBlock
+ DefineSavepoint
+ RollbackToSavepoint
+ ReleaseSavepoint
+
+respectively. Depending on the current state of the system, these functions
+call low level functions to activate the real transaction system:
+
+ StartTransaction
+ CommitTransaction
+ AbortTransaction
+ CleanupTransaction
+ StartSubTransaction
+ CommitSubTransaction
+ AbortSubTransaction
+ CleanupSubTransaction
+
+Additionally, within a transaction, CommandCounterIncrement is called to
+increment the command counter, which allows future commands to "see" the
+effects of previous commands within the same transaction. Note that this is
+done automatically by CommitTransactionCommand after each query inside a
+transaction block, but some utility functions also do it internally to allow
+some operations (usually in the system catalogs) to be seen by future
+operations in the same utility command. (For example, in DefineRelation it is
+done after creating the heap so the pg_class row is visible, to be able to
+lock it.)
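+
+As a hedged sketch of that DefineRelation case (these functions exist in
+the backend, but the sequence is condensed from the real code):
+
+    /* create the relation; this inserts a new pg_class row */
+    relationId = heap_create_with_catalog(...);
+
+    /*
+     * Make the new pg_class row visible to later operations in this same
+     * command, so that we can open and lock the relation below.
+     */
+    CommandCounterIncrement();
+
+    rel = relation_open(relationId, AccessExclusiveLock);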
+
+
+For example, consider the following sequence of user commands:
+
+1) BEGIN
+2) SELECT * FROM foo
+3) INSERT INTO foo VALUES (...)
+4) COMMIT
+
+In the main processing loop, this results in the following function call
+sequence:
+
+ / StartTransactionCommand;
+ / StartTransaction;
+1) < ProcessUtility; << BEGIN
+ \ BeginTransactionBlock;
+ \ CommitTransactionCommand;
+
+ / StartTransactionCommand;
+2) / PortalRunSelect; << SELECT ...
+ \ CommitTransactionCommand;
+ \ CommandCounterIncrement;
+
+ / StartTransactionCommand;
+3) / ProcessQuery; << INSERT ...
+ \ CommitTransactionCommand;
+ \ CommandCounterIncrement;
+
+ / StartTransactionCommand;
+ / ProcessUtility; << COMMIT
+4) < EndTransactionBlock;
+ \ CommitTransactionCommand;
+ \ CommitTransaction;
+
+The point of this example is to demonstrate the need for
+StartTransactionCommand and CommitTransactionCommand to be state smart -- they
+should call CommandCounterIncrement between the calls to BeginTransactionBlock
+and EndTransactionBlock and outside these calls they need to do normal start,
+commit or abort processing.
+
+Furthermore, suppose the "SELECT * FROM foo" caused an abort condition. In
+this case AbortCurrentTransaction is called, and the transaction is put in
+aborted state. In this state, any user input is ignored except for
+transaction-termination statements, or ROLLBACK TO <savepoint> commands.
+
+Transaction aborts can occur in two ways:
+
+1) system dies from some internal cause (syntax error, etc)
+2) user types ROLLBACK
+
+The reason we have to distinguish them is illustrated by the following two
+situations:
+
+ case 1 case 2
+ ------ ------
+1) user types BEGIN 1) user types BEGIN
+2) user does something 2) user does something
+3) user does not like what 3) system aborts for some reason
+ she sees and types ABORT (syntax error, etc)
+
+In case 1, we want to abort the transaction and return to the default state.
+In case 2, there may be more commands coming our way which are part of the
+same transaction block; we have to ignore these commands until we see a COMMIT
+or ROLLBACK.
+
+Internal aborts are handled by AbortCurrentTransaction, while user aborts are
+handled by UserAbortTransactionBlock. Both of them rely on AbortTransaction
+to do all the real work. The only difference is what state we enter after
+AbortTransaction does its work:
+
+* AbortCurrentTransaction leaves us in TBLOCK_ABORT,
+* UserAbortTransactionBlock leaves us in TBLOCK_ABORT_END
+
+Low-level transaction abort handling is divided in two phases:
+* AbortTransaction executes as soon as we realize the transaction has
+ failed. It should release all shared resources (locks etc) so that we do
+ not delay other backends unnecessarily.
+* CleanupTransaction executes when we finally see a user COMMIT
+ or ROLLBACK command; it cleans things up and gets us out of the transaction
+ completely. In particular, we mustn't destroy TopTransactionContext until
+ this point.
+
+Also, note that when a transaction is committed, we don't close it right away.
+Rather it's put in TBLOCK_END state, which means that when
+CommitTransactionCommand is called after the query has finished processing,
+the transaction has to be closed. The distinction is subtle but important,
+because it means that control will leave the xact.c code with the transaction
+open, and the main loop will be able to keep processing inside the same
+transaction. So, in a sense, transaction commit is also handled in two
+phases, the first at EndTransactionBlock and the second at
+CommitTransactionCommand (which is where CommitTransaction is actually
+called).
+
+The rest of the code in xact.c consists of routines to support the creation and
+finishing of transactions and subtransactions. For example, AtStart_Memory
+takes care of initializing the memory subsystem at main transaction start.
+
+
+Subtransaction Handling
+-----------------------
+
+Subtransactions are implemented using a stack of TransactionState structures,
+each of which has a pointer to its parent transaction's struct. When a new
+subtransaction is to be opened, PushTransaction is called, which creates a new
+TransactionState, with its parent link pointing to the current transaction.
+StartSubTransaction is in charge of initializing the new TransactionState to
+sane values, and properly initializing other subsystems (AtSubStart routines).
+
+When closing a subtransaction, either CommitSubTransaction has to be called
+(if the subtransaction is committing), or AbortSubTransaction and
+CleanupSubTransaction (if it's aborting). In either case, PopTransaction is
+called so the system returns to the parent transaction.
+
+One important point regarding subtransaction handling is that several may need
+to be closed in response to a single user command. That's because savepoints
+have names, and we allow committing or rolling back a savepoint by name, which
+is
+not necessarily the one that was last opened. Also a COMMIT or ROLLBACK
+command must be able to close out the entire stack. We handle this by having
+the utility command subroutine mark all the state stack entries as commit-
+pending or abort-pending, and then when the main loop reaches
+CommitTransactionCommand, the real work is done. The main point of doing
+things this way is that if we get an error while popping state stack entries,
+the remaining stack entries still show what we need to do to finish up.
+
+In the case of ROLLBACK TO <savepoint>, we abort all the subtransactions up
+through the one identified by the savepoint name, and then re-create that
+subtransaction level with the same name. So it's a completely new
+subtransaction as far as the internals are concerned.
+
+Other subsystems are allowed to start "internal" subtransactions, which are
+handled by BeginInternalSubTransaction. This is to allow implementing
+exception handling, e.g. in PL/pgSQL. ReleaseCurrentSubTransaction and
+RollbackAndReleaseCurrentSubTransaction allow the subsystem to close said
+subtransactions. The main difference between this and the savepoint/release
+path is that we execute the complete state transition immediately in each
+subroutine, rather than deferring some work until CommitTransactionCommand.
+Another difference is that BeginInternalSubTransaction is allowed when no
+explicit transaction block has been established, while DefineSavepoint is not.
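+
+As a hedged sketch of the internal-subtransaction pattern (modeled on what
+PL/pgSQL's exception blocks do; resource handling is simplified):
+
+    MemoryContext oldcontext = CurrentMemoryContext;
+    ResourceOwner oldowner = CurrentResourceOwner;
+
+    BeginInternalSubTransaction(NULL);
+    PG_TRY();
+    {
+        /* ... run code that might throw an error ... */
+        ReleaseCurrentSubTransaction();
+    }
+    PG_CATCH();
+    {
+        /* abort the subtransaction and discard the error state */
+        MemoryContextSwitchTo(oldcontext);
+        RollbackAndReleaseCurrentSubTransaction();
+        FlushErrorState();
+    }
+    PG_END_TRY();
+
+    MemoryContextSwitchTo(oldcontext);
+    CurrentResourceOwner = oldowner;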
+
+
+Transaction and Subtransaction Numbering
+----------------------------------------
+
+Transactions and subtransactions are assigned permanent XIDs only when/if
+they first do something that requires one --- typically, insert/update/delete
+a tuple, though there are a few other places that need an XID assigned.
+If a subtransaction requires an XID, we always first assign one to its
+parent. This maintains the invariant that child transactions have XIDs later
+than their parents, which is assumed in a number of places.
+
+The subsidiary actions of obtaining a lock on the XID and entering it into
+pg_subtrans and PG_PROC are done at the time it is assigned.
+
+A transaction that has no XID still needs to be identified for various
+purposes, notably holding locks. For this purpose we assign a "virtual
+transaction ID" or VXID to each top-level transaction. VXIDs are formed from
+two fields, the backendID and a backend-local counter; this arrangement allows
+assignment of a new VXID at transaction start without any contention for
+shared memory. To ensure that a VXID isn't re-used too soon after backend
+exit, we store the last local counter value into shared memory at backend
+exit, and initialize it from the previous value for the same backendID slot
+at backend start. All these counters go back to zero at shared memory
+re-initialization, but that's OK because VXIDs never appear anywhere on-disk.
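+
+The VXID representation itself is just a two-field struct, roughly as
+declared in storage/lock.h:
+
+    typedef struct
+    {
+        BackendId   backendId;          /* determined at backend startup */
+        LocalTransactionId localTransactionId;  /* backend-local counter */
+    } VirtualTransactionId;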
+
+Internally, a backend needs a way to identify subtransactions whether or not
+they have XIDs; but this need only lasts as long as the parent top transaction
+endures. Therefore, we have SubTransactionId, which is somewhat like
+CommandId in that it's generated from a counter that we reset at the start of
+each top transaction. The top-level transaction itself has SubTransactionId 1,
+and subtransactions have IDs 2 and up. (Zero is reserved for
+InvalidSubTransactionId.) Note that subtransactions do not have their
+own VXIDs; they use the parent top transaction's VXID.
+
+
+Interlocking Transaction Begin, Transaction End, and Snapshots
+--------------------------------------------------------------
+
+We try hard to minimize the amount of overhead and lock contention involved
+in the frequent activities of beginning/ending a transaction and taking a
+snapshot. Unfortunately, we must have some interlocking for this, because
+we must ensure consistency about the commit order of transactions.
+For example, suppose an UPDATE in xact A is blocked by xact B's prior
+update of the same row, and xact B is doing commit while xact C gets a
+snapshot. Xact A can complete and commit as soon as B releases its locks.
+If xact C's GetSnapshotData sees xact B as still running, then it had
+better see xact A as still running as well, or it will be able to see two
+tuple versions - one deleted by xact B and one inserted by xact A. Another
+reason why this would be bad is that C would see (in the row inserted by A)
+earlier changes by B, and it would be inconsistent for C not to see any
+of B's changes elsewhere in the database.
+
+Formally, the correctness requirement is "if a snapshot A considers
+transaction X as committed, and any of transaction X's snapshots considered
+transaction Y as committed, then snapshot A must consider transaction Y as
+committed".
+
+What we actually enforce is strict serialization of commits and rollbacks
+with snapshot-taking: we do not allow any transaction to exit the set of
+running transactions while a snapshot is being taken. (This rule is
+stronger than necessary for consistency, but is relatively simple to
+enforce, and it assists with some other issues as explained below.) The
+implementation of this is that GetSnapshotData takes the ProcArrayLock in
+shared mode (so that multiple backends can take snapshots in parallel),
+but ProcArrayEndTransaction must take the ProcArrayLock in exclusive mode
+while clearing the ProcGlobal->xids[] entry at transaction end (either
+commit or abort). (To reduce context switching, when multiple transactions
+commit nearly simultaneously, we have one backend take ProcArrayLock and
+clear the XIDs of multiple processes at once.)
+
+ProcArrayEndTransaction also holds the lock while advancing the shared
+latestCompletedXid variable. This allows GetSnapshotData to use
+latestCompletedXid + 1 as xmax for its snapshot: there can be no
+transaction >= this xid value that the snapshot needs to consider as
+completed.
+
+In short, then, the rule is that no transaction may exit the set of
+currently-running transactions between the time we fetch latestCompletedXid
+and the time we finish building our snapshot. However, this restriction
+only applies to transactions that have an XID --- read-only transactions
+can end without acquiring ProcArrayLock, since they don't affect anyone
+else's snapshot nor latestCompletedXid.
+
+Transaction start, per se, doesn't have any interlocking with these
+considerations, since we no longer assign an XID immediately at transaction
+start. But when we do decide to allocate an XID, GetNewTransactionId must
+store the new XID into the shared ProcArray before releasing XidGenLock.
+This ensures that all top-level XIDs <= latestCompletedXid are either
+present in the ProcArray, or not running anymore. (This guarantee doesn't
+apply to subtransaction XIDs, because of the possibility that there's not
+room for them in the subxid array; instead we guarantee that they are
+present or the overflow flag is set.) If a backend released XidGenLock
+before storing its XID into ProcGlobal->xids[], then it would be possible for
+another backend to allocate and commit a later XID, causing latestCompletedXid
+to pass the first backend's XID, before that value became visible in the
+ProcArray. That would break ComputeXidHorizons, as discussed below.
+
+We allow GetNewTransactionId to store the XID into ProcGlobal->xids[] (or the
+subxid array) without taking ProcArrayLock. This was once necessary to
+avoid deadlock; while that is no longer the case, it's still beneficial for
+performance. We are thereby relying on fetch/store of an XID to be atomic,
+else other backends might see a partially-set XID. This also means that
+readers of the ProcArray xid fields must be careful to fetch a value only
+once, rather than assume they can read it multiple times and get the same
+answer each time. (Use volatile-qualified pointers when doing this, to
+ensure that the C compiler does exactly what you tell it to.)
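+
+A hedged sketch of this single-fetch discipline (procarray.c uses a macro
+much like this; the surrounding loop is illustrative):
+
+    #define UINT32_ACCESS_ONCE(var) ((uint32) (*((volatile uint32 *) &(var))))
+
+    /* read the shared xid exactly once, then use only the local copy */
+    xid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]);
+
+    if (!TransactionIdIsValid(xid))
+        continue;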
+
+Another important activity that uses the shared ProcArray is
+ComputeXidHorizons, which must determine a lower bound for the oldest xmin
+of any active MVCC snapshot, system-wide. Each individual backend
+advertises the smallest xmin of its own snapshots in MyProc->xmin, or zero
+if it currently has no live snapshots (eg, if it's between transactions or
+hasn't yet set a snapshot for a new transaction). ComputeXidHorizons takes
+the MIN() of the valid xmin fields. It does this with only shared lock on
+ProcArrayLock, which means there is a potential race condition against other
+backends doing GetSnapshotData concurrently: we must be certain that a
+concurrent backend that is about to set its xmin does not compute an xmin
+less than what ComputeXidHorizons determines. We ensure that by including
+all the active XIDs into the MIN() calculation, along with the valid xmins.
+The rule that transactions can't exit without taking exclusive ProcArrayLock
+ensures that concurrent holders of shared ProcArrayLock will compute the
+same minimum of currently-active XIDs: no xact, in particular not the
+oldest, can exit while we hold shared ProcArrayLock. So
+ComputeXidHorizons's view of the minimum active XID will be the same as that
+of any concurrent GetSnapshotData, and so it can't produce an overestimate.
+If there is no active transaction at all, ComputeXidHorizons uses
+latestCompletedXid + 1, which is a lower bound for the xmin that might
+be computed by concurrent or later GetSnapshotData calls. (We know that no
+XID less than this could be about to appear in the ProcArray, because of the
+XidGenLock interlock discussed above.)
+
+As GetSnapshotData is performance critical, it does not perform an accurate
+oldest-xmin calculation (it used to, until v14). The contents of a snapshot
+only depend on the xids of other backends, not their xmin. As a backend's
+xmin changes much more often than its xid, having GetSnapshotData look at
+xmins can lead to a lot of unnecessary cacheline ping-pong. Instead,
+GetSnapshotData updates approximate thresholds (one that guarantees that all
+deleted rows older than it can be removed, another determining that deleted
+rows newer than it cannot be removed). GlobalVisTest* uses those thresholds
+to make invisibility decisions, falling back to ComputeXidHorizons if
+necessary.
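+
+A hedged sketch of how a caller consults those thresholds (the functions
+exist in procarray.c; the vacuum-like context is illustrative):
+
+    GlobalVisState *vistest = GlobalVisTestFor(relation);
+
+    if (GlobalVisTestIsRemovableXid(vistest, tuple_xmax))
+    {
+        /* the dead row version is invisible to everyone; safe to remove */
+    }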
+
+Note that while it is certain that two concurrent executions of
+GetSnapshotData will compute the same xmin for their own snapshots, there is
+no such guarantee for the horizons computed by ComputeXidHorizons. This is
+because we allow XID-less transactions to clear their MyProc->xmin
+asynchronously (without taking ProcArrayLock), so one execution might see
+what had been the oldest xmin, and another not. This is OK since the
+thresholds need only be a valid lower bound. As noted above, we are already
+assuming that fetch/store of the xid fields is atomic, so assuming it for
+xmin as well is no extra risk.
+
+
+pg_xact and pg_subtrans
+-----------------------
+
+pg_xact and pg_subtrans are permanent (on-disk) storage of transaction related
+information. There is a limited number of pages of each kept in memory, so
+in many cases there is no need to actually read from disk. However, if
+there's a long running transaction or a backend sitting idle with an open
+transaction, it may be necessary to be able to read and write this information
+from disk. They also allow information to be permanent across server restarts.
+
+pg_xact records the commit status for each transaction that has been assigned
+an XID. A transaction can be in progress, committed, aborted, or
+"sub-committed". This last state means that it's a subtransaction that's no
+longer running, but its parent has not updated its state yet. It is not
+necessary to update a subtransaction's transaction status to subcommit, so we
+can just defer it until main transaction commit. The main role of marking
+transactions as sub-committed is to provide an atomic commit protocol when
+transaction status is spread across multiple clog pages. As a result, whenever
+transaction status spreads across multiple pages we must use a two-phase commit
+protocol: the first phase is to mark the subtransactions as sub-committed, then
+we mark the top level transaction and all its subtransactions committed (in
+that order). Thus, subtransactions that have not aborted appear as in-progress
+even when they have already finished, and the subcommit status appears as a
+very short transitory state during main transaction commit. Subtransaction
+abort is always marked in clog as soon as it occurs. When the transaction
+statuses all fit in a single CLOG page, we atomically mark them all as committed
+without bothering with the intermediate sub-commit state.
+
+Savepoints are implemented using subtransactions. A subtransaction is a
+transaction inside a transaction; its commit or abort status is not only
+dependent on whether it committed itself, but also whether its parent
+transaction committed. To implement multiple savepoints in a transaction we
+allow unlimited transaction nesting depth, so any particular subtransaction's
+commit state is dependent on the commit status of each and every ancestor
+transaction.
+
+The "subtransaction parent" (pg_subtrans) mechanism records, for each
+transaction with an XID, the TransactionId of its parent transaction. This
+information is stored as soon as the subtransaction is assigned an XID.
+Top-level transactions do not have a parent, so they leave their pg_subtrans
+entries set to the default value of zero (InvalidTransactionId).
+
+pg_subtrans is used to check whether the transaction in question is still
+running --- the main Xid of a transaction is recorded in ProcGlobal->xids[],
+with a copy in PGPROC->xid, but since we allow arbitrary nesting of
+subtransactions, we can't fit all Xids in shared memory, so we have to store
+them on disk. Note, however, that for each transaction we keep a "cache" of
+Xids that are known to be part of the transaction tree, so we can skip looking
+at pg_subtrans unless we know the cache has been overflowed. See
+storage/ipc/procarray.c for the gory details.
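+
+For illustration, resolving an XID to its top-level ancestor is roughly
+what subtrans.c's SubTransGetTopmostTransaction does (a hedged sketch,
+omitting the TransactionXmin stopping condition):
+
+    TransactionId parentXid = xid,
+                  previousXid = xid;
+
+    /* climb the parent links until we run off the top of the tree */
+    while (TransactionIdIsValid(parentXid))
+    {
+        previousXid = parentXid;
+        parentXid = SubTransGetParent(parentXid);
+    }
+    /* previousXid is now the topmost transaction of the tree */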
+
+slru.c is the supporting mechanism for both pg_xact and pg_subtrans. It
+implements the LRU policy for in-memory buffer pages. The high-level routines
+for pg_xact are implemented in transam.c, while the low-level functions are in
+clog.c. pg_subtrans is contained completely in subtrans.c.
+
+
+Write-Ahead Log Coding
+----------------------
+
+The WAL subsystem (also called XLOG in the code) exists to guarantee crash
+recovery. It can also be used to provide point-in-time recovery, as well as
+hot-standby replication via log shipping. Here are some notes about
+non-obvious aspects of its design.
+
+A basic assumption of a write AHEAD log is that log entries must reach stable
+storage before the data-page changes they describe. This ensures that
+replaying the log to its end will bring us to a consistent state where there
+are no partially-performed transactions. To guarantee this, each data page
+(either heap or index) is marked with the LSN (log sequence number --- in
+practice, a WAL file location) of the latest XLOG record affecting the page.
+Before the bufmgr can write out a dirty page, it must ensure that xlog has
+been flushed to disk at least up to the page's LSN. This low-level
+interaction improves performance by not waiting for XLOG I/O until necessary.
+The LSN check exists only in the shared-buffer manager, not in the local
+buffer manager used for temp tables; hence operations on temp tables must not
+be WAL-logged.
+
+During WAL replay, we can check the LSN of a page to detect whether the change
+recorded by the current log entry is already applied (it has been, if the page
+LSN is >= the log entry's WAL location).
+
+Usually, log entries contain just enough information to redo a single
+incremental update on a page (or small group of pages). This will work only
+if the filesystem and hardware implement data page writes as atomic actions,
+so that a page is never left in a corrupt partly-written state. Since that's
+often an untenable assumption in practice, we log additional information to
+allow complete reconstruction of modified pages. The first WAL record
+affecting a given page after a checkpoint is made to contain a copy of the
+entire page, and we implement replay by restoring that page copy instead of
+redoing the update. (This is more reliable than the data storage itself would
+be because we can check the validity of the WAL record's CRC.) We can detect
+the "first change after checkpoint" by noting whether the page's old LSN
+precedes the end of WAL as of the last checkpoint (the RedoRecPtr).
+
+The general schema for executing a WAL-logged action is
+
+1. Pin and exclusive-lock the shared buffer(s) containing the data page(s)
+to be modified.
+
+2. START_CRIT_SECTION() (Any error during the next three steps must cause a
+PANIC because the shared buffers will contain unlogged changes, which we
+have to ensure don't get to disk. Obviously, you should check conditions
+such as whether there's enough free space on the page before you start the
+critical section.)
+
+3. Apply the required changes to the shared buffer(s).
+
+4. Mark the shared buffer(s) as dirty with MarkBufferDirty(). (This must
+happen before the WAL record is inserted; see notes in SyncOneBuffer().)
+Note that marking a buffer dirty with MarkBufferDirty() should happen if
+and only if you write a WAL record; see Writing Hints below.
+
+5. If the relation requires WAL-logging, build a WAL record using
+XLogBeginInsert and XLogRegister* functions, and insert it. (See
+"Constructing a WAL record" below). Then update the page's LSN using the
+returned XLOG location. For instance,
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(...)
+ XLogRegisterData(...)
+ recptr = XLogInsert(rmgr_id, info);
+
+ PageSetLSN(dp, recptr);
+
+6. END_CRIT_SECTION()
+
+7. Unlock and unpin the buffer(s).
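+
+Putting the steps together, a hedged sketch for a single-buffer change (the
+rmgr ID, record struct, and info code are the same placeholders used below):
+
+    buffer = ReadBuffer(relation, blkno);
+    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+    START_CRIT_SECTION();
+
+    page = BufferGetPage(buffer);
+    /* ... apply the change to the page ... */
+    MarkBufferDirty(buffer);
+
+    if (RelationNeedsWAL(relation))
+    {
+        XLogRecPtr  recptr;
+
+        XLogBeginInsert();
+        XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+        XLogRegisterData((char *) &xlrec, SizeOfFictionalAction);
+        recptr = XLogInsert(RM_FOO_ID, XLOG_FOOBAR_DO_STUFF);
+        PageSetLSN(page, recptr);
+    }
+
+    END_CRIT_SECTION();
+
+    UnlockReleaseBuffer(buffer);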
+
+Complex changes (such as a multilevel index insertion) normally need to be
+described by a series of atomic-action WAL records. The intermediate states
+must be self-consistent, so that if the replay is interrupted between any
+two actions, the system is fully functional. In btree indexes, for example,
+a page split requires a new page to be allocated, and an insertion of a new
+key in the parent btree level, but for locking reasons this has to be
+reflected by two separate WAL records. Replaying the first record, to
+allocate the new page and move tuples to it, sets a flag on the page to
+indicate that the key has not been inserted to the parent yet. Replaying the
+second record clears the flag. This intermediate state is never seen by
+other backends during normal operation, because the lock on the child page
+is held across the two actions, but will be seen if the operation is
+interrupted before writing the second WAL record. The search algorithm works
+with the intermediate state as normal, but if an insertion encounters a page
+with the incomplete-split flag set, it will finish the interrupted split by
+inserting the key to the parent, before proceeding.
+
+
+Constructing a WAL record
+-------------------------
+
+A WAL record consists of a header common to all WAL record types,
+record-specific data, and information about the data blocks modified. Each
+modified data block is identified by an ID number, and can optionally have
+more record-specific data associated with the block. If XLogInsert decides
+that a full-page image of a block needs to be taken, the data associated
+with that block is not included.
+
+The API for constructing a WAL record consists of five functions:
+XLogBeginInsert, XLogRegisterBuffer, XLogRegisterData, XLogRegisterBufData,
+and XLogInsert. First, call XLogBeginInsert(). Then register all the buffers
+modified, and data needed to replay the changes, using XLogRegister*
+functions. Finally, insert the constructed record to the WAL by calling
+XLogInsert().
+
+ XLogBeginInsert();
+
+ /* register buffers modified as part of this WAL-logged action */
+ XLogRegisterBuffer(0, lbuffer, REGBUF_STANDARD);
+ XLogRegisterBuffer(1, rbuffer, REGBUF_STANDARD);
+
+ /* register data that is always included in the WAL record */
+ XLogRegisterData(&xlrec, SizeOfFictionalAction);
+
+ /*
+ * register data associated with a buffer. This will not be included
+ * in the record if a full-page image is taken.
+ */
+ XLogRegisterBufData(0, tuple->data, tuple->len);
+
+ /* more data associated with the buffer */
+ XLogRegisterBufData(0, data2, len2);
+
+ /*
+ * Ok, all the data and buffers to include in the WAL record have
+ * been registered. Insert the record.
+ */
+ recptr = XLogInsert(RM_FOO_ID, XLOG_FOOBAR_DO_STUFF);
+
+Details of the API functions:
+
+void XLogBeginInsert(void)
+
+ Must be called before XLogRegisterBuffer and XLogRegisterData.
+
+void XLogResetInsertion(void)
+
+ Clear any currently registered data and buffers from the WAL record
+ construction workspace. This is only needed if you have already called
+ XLogBeginInsert(), but decide to not insert the record after all.
+
+void XLogEnsureRecordSpace(int max_block_id, int ndatas)
+
+ Normally, the WAL record construction buffers have the following limits:
+
+ * highest block ID that can be used is 4 (allowing five block references)
+ * Max 20 chunks of registered data
+
+ These default limits are enough for most record types that change some
+ on-disk structures. For the odd case that requires more data, or needs to
+ modify more buffers, these limits can be raised by calling
+ XLogEnsureRecordSpace(). XLogEnsureRecordSpace() must be called before
+ XLogBeginInsert(), and outside a critical section.
+
+void XLogRegisterBuffer(uint8 block_id, Buffer buf, uint8 flags);
+
+ XLogRegisterBuffer adds information about a data block to the WAL record.
+ block_id is an arbitrary number used to identify this page reference in
+ the redo routine. The information needed to re-find the page at redo -
+relfilenode, fork, and block number - is included in the WAL record.
+
+ XLogInsert will automatically include a full copy of the page contents, if
+ this is the first modification of the buffer since the last checkpoint.
+ It is important to register every buffer modified by the action with
+ XLogRegisterBuffer, to avoid torn-page hazards.
+
+ The flags control when and how the buffer contents are included in the
+ WAL record. Normally, a full-page image is taken only if the page has not
+ been modified since the last checkpoint, and only if full_page_writes=on
+ or an online backup is in progress. The REGBUF_FORCE_IMAGE flag can be
+ used to force a full-page image to always be included; that is useful
+ e.g. for an operation that rewrites most of the page, so that tracking the
+ details is not worth it. For the rare case where it is not necessary to
+protect from torn pages, the REGBUF_NO_IMAGE flag can be used to prevent
+a full-page image from being taken. REGBUF_WILL_INIT also suppresses a full
+ page image, but the redo routine must re-generate the page from scratch,
+ without looking at the old page contents. Re-initializing the page
+ protects from torn page hazards like a full page image does.
+
+ The REGBUF_STANDARD flag can be specified together with the other flags to
+ indicate that the page follows the standard page layout. It causes the
+ area between pd_lower and pd_upper to be left out from the image, reducing
+ WAL volume.
+
+ If the REGBUF_KEEP_DATA flag is given, any per-buffer data registered with
+ XLogRegisterBufData() is included in the WAL record even if a full-page
+ image is taken.
+
+void XLogRegisterData(char *data, int len);
+
+ XLogRegisterData is used to include arbitrary data in the WAL record. If
+ XLogRegisterData() is called multiple times, the data are appended, and
+ will be made available to the redo routine as one contiguous chunk.
+
+void XLogRegisterBufData(uint8 block_id, char *data, int len);
+
+ XLogRegisterBufData is used to include data associated with a particular
+ buffer that was registered earlier with XLogRegisterBuffer(). If
+ XLogRegisterBufData() is called multiple times with the same block ID, the
+ data are appended, and will be made available to the redo routine as one
+ contiguous chunk.
+
+ If a full-page image of the buffer is taken at insertion, the data is not
+ included in the WAL record, unless the REGBUF_KEEP_DATA flag is used.
+
+
+Writing a REDO routine
+----------------------
+
+A REDO routine uses the data and page references included in the WAL record
+to reconstruct the new state of the page. The record decoding functions
+and macros in xlogreader.c/h can be used to extract the data from the record.
+
+When replaying a WAL record that describes changes on multiple pages, you
+must be careful to lock the pages properly to prevent concurrent Hot Standby
+queries from seeing an inconsistent state. If this requires that two
+or more buffer locks be held concurrently, you must lock the pages in
+appropriate order, and not release the locks until all the changes are done.
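+
+A hedged skeleton of such a redo routine (XLogReadBufferForRedo performs
+the page-LSN check and restores any full-page image for you; the record
+name is a placeholder):
+
+    void
+    foobar_redo(XLogReaderState *record)
+    {
+        Buffer      buffer;
+
+        if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+        {
+            Page        page = BufferGetPage(buffer);
+            Size        datalen;
+            char       *data = XLogRecGetBlockData(record, 0, &datalen);
+
+            /* ... reapply the change described by data ... */
+
+            PageSetLSN(page, record->EndRecPtr);
+            MarkBufferDirty(buffer);
+        }
+        if (BufferIsValid(buffer))
+            UnlockReleaseBuffer(buffer);
+    }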
+
+Note that we must only use PageSetLSN/PageGetLSN() when we know the action
+is serialised. Only the Startup process may modify data blocks during
+recovery, so it may execute PageGetLSN() without fear of serialisation
+problems. All other processes must only call PageSet/GetLSN when holding
+either an exclusive buffer lock or a shared lock plus buffer header lock,
+or be writing the data block directly rather than through shared buffers
+while holding AccessExclusiveLock on the relation.
+
+
+Writing Hints
+-------------
+
+In some cases, we write additional information to data blocks without
+writing a preceding WAL record. This should happen only when the data can
+be reconstructed later following a crash and the action is simply a way
+of optimising for performance. When a hint is written we use
+MarkBufferDirtyHint() to mark the block dirty.
+
+If the buffer is clean and checksums are in use then MarkBufferDirtyHint()
+inserts an XLOG_FPI_FOR_HINT record to ensure that we take a full page image
+that includes the hint. We do this to avoid a partial page write, when we
+write the dirtied page. WAL is not written during recovery, so we simply skip
+dirtying blocks because of hints when in recovery.
+
+If you do decide to optimise away a WAL record, then any calls to
+MarkBufferDirty() must be replaced by MarkBufferDirtyHint(),
+otherwise you will expose the risk of partial page writes.
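+
+A hedged sketch, condensed from heapam's SetHintBits() (the LSN interlock
+on the buffer itself is omitted here):
+
+    XLogRecPtr  commitLSN = TransactionIdGetCommitLSN(xid);
+
+    if (BufferIsPermanent(buffer) && XLogNeedsFlush(commitLSN))
+        return;                 /* commit record not yet on disk; defer */
+
+    tuple->t_infomask |= infomask;
+    MarkBufferDirtyHint(buffer, true);  /* true: standard page layout */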
+
+
+Write-Ahead Logging for Filesystem Actions
+------------------------------------------
+
+The previous section described how to WAL-log actions that only change page
+contents within shared buffers. For that type of action it is generally
+possible to check all likely error cases (such as insufficient space on the
+page) before beginning to make the actual change. Therefore we can make
+the change and the creation of the associated WAL log record "atomic" by
+wrapping them into a critical section --- the odds of failure partway
+through are low enough that PANIC is acceptable if it does happen.
+
+Clearly, that approach doesn't work for cases where there's a significant
+probability of failure within the action to be logged, such as creation
+of a new file or database. We don't want to PANIC, and we especially don't
+want to PANIC after having already written a WAL record that says we did
+the action --- if we did, replay of the record would probably fail again
+and PANIC again, making the failure unrecoverable. This means that the
+ordinary WAL rule of "write WAL before the changes it describes" doesn't
+work, and we need a different design for such cases.
+
+There are several basic types of filesystem actions that have this
+issue. Here is how we deal with each:
+
+1. Adding a disk page to an existing table.
+
+This action isn't WAL-logged at all. We extend a table by writing a page
+of zeroes at its end. We must actually do this write so that we are sure
+the filesystem has allocated the space. If the write fails we can just
+error out normally. Once the space is known allocated, we can initialize
+and fill the page via one or more normal WAL-logged actions. Because it's
+possible that we crash between extending the file and writing out the WAL
+entries, we have to treat discovery of an all-zeroes page in a table or
+index as being a non-error condition. In such cases we can just reclaim
+the space for re-use.
+
+2. Creating a new table, which requires a new file in the filesystem.
+
+We try to create the file, and if successful we make a WAL record saying
+we did it. If not successful, we can just throw an error. Notice that
+there is a window where we have created the file but not yet written any
+WAL about it to disk. If we crash during this window, the file remains
+on disk as an "orphan". It would be possible to clean up such orphans
+by having database restart search for files that don't have any committed
+entry in pg_class, but that currently isn't done because of the possibility
+of deleting data that is useful for forensic analysis of the crash.
+Orphan files are harmless --- at worst they waste a bit of disk space ---
+because we check for on-disk collisions when allocating new relfilenode
+OIDs. So cleaning up isn't really necessary.
+
+3. Deleting a table, which requires an unlink() that could fail.
+
+Our approach here is to WAL-log the operation first, but to treat failure
+of the actual unlink() call as a warning rather than error condition.
+Again, this can leave an orphan file behind, but that's cheap compared to
+the alternatives. Since we can't actually do the unlink() until after
+we've committed the DROP TABLE transaction, throwing an error would be out
+of the question anyway. (It may be worth noting that the WAL entry about
+the file deletion is actually part of the commit record for the dropping
+transaction.)
+
+4. Creating and deleting databases and tablespaces, which requires creating
+and deleting directories and entire directory trees.
+
+These cases are handled similarly to creating individual files, ie, we
+try to do the action first and then write a WAL entry if it succeeded.
+The potential amount of wasted disk space is rather larger, of course.
+In the creation case we try to delete the directory tree again if creation
+fails, so as to reduce the risk of wasted space. Failure partway through
+a deletion operation results in a corrupt database: the DROP failed, but
+some of the data is gone anyway. There is little we can do about that,
+though, and in any case it was presumably data the user no longer wants.
+
+In all of these cases, if WAL replay fails to redo the original action
+we must panic and abort recovery. The DBA will have to manually clean up
+(for instance, free up some disk space or fix directory permissions) and
+then restart recovery. This is part of the reason for not writing a WAL
+entry until we've successfully done the original action.
+
+
+Skipping WAL for New RelFileNode
+--------------------------------
+
+Under wal_level=minimal, if a change modifies a relfilenode that ROLLBACK
+would unlink, in-tree access methods write no WAL for that change. Code that
+writes WAL without calling RelationNeedsWAL() must check for this case. This
+skipping is mandatory. If a WAL-writing change preceded a WAL-skipping change
+for the same block, REDO could overwrite the WAL-skipping change. If a
+WAL-writing change followed a WAL-skipping change for the same block, a
+related problem would arise. When a WAL record contains no full-page image,
+REDO expects the page to match its contents from just before record insertion.
+A WAL-skipping change may not reach disk at all, violating REDO's expectation
+under full_page_writes=off. For any access method, CommitTransaction() writes
+and fsyncs affected blocks before recording the commit.
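+
+For reference, the test used by in-tree access methods is the
+RelationNeedsWAL() macro, roughly as defined in utils/rel.h:
+
+    #define RelationNeedsWAL(relation) \
+        (RelationIsPermanent(relation) && (XLogIsNeeded() || \
+          (relation->rd_createSubid == InvalidSubTransactionId && \
+           relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId)))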
+
+Prefer to do the same in future access methods. However, two other approaches
+can work. First, an access method can irreversibly transition a given fork
+from WAL-skipping to WAL-writing by calling FlushRelationBuffers() and
+smgrimmedsync(). Second, an access method can opt to write WAL
+unconditionally for permanent relations. Under these approaches, the access
+method callbacks must not call functions that react to RelationNeedsWAL().
+
+This applies only to WAL records whose replay would modify bytes stored in the
+new relfilenode. It does not apply to other records about the relfilenode,
+such as XLOG_SMGR_CREATE. Because it operates at the level of individual
+relfilenodes, RelationNeedsWAL() can differ for tightly-coupled relations.
+Consider "CREATE TABLE t (); BEGIN; ALTER TABLE t ADD c text; ..." in which
+ALTER TABLE adds a TOAST relation. The TOAST relation will skip WAL, while
+the table owning it will not. ALTER TABLE SET TABLESPACE will cause a table
+to skip WAL, but that won't affect its indexes.
+
+
+Asynchronous Commit
+-------------------
+
+As of PostgreSQL 8.3 it is possible to perform asynchronous commits - i.e.,
+we don't wait while the WAL record for the commit is fsync'ed.
+We perform an asynchronous commit when synchronous_commit = off. Instead
+of performing an XLogFlush() up to the LSN of the commit, we merely note
+the LSN in shared memory. The backend then continues with other work.
+We record the LSN only for an asynchronous commit, not an abort; there's
+never any need to flush an abort record, since the presumption after a
+crash would be that the transaction aborted anyway.
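+
+A hedged sketch of the choice made at commit time (condensed from
+RecordTransactionCommit in xact.c):
+
+    if (synchronous_commit > SYNCHRONOUS_COMMIT_OFF)
+        XLogFlush(XactLastRecEnd);           /* wait for WAL to hit disk */
+    else
+        XLogSetAsyncXactLSN(XactLastRecEnd); /* just note the LSN, no wait */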
+
+We always force synchronous commit when the transaction is deleting
+relations, to ensure the commit record is down to disk before the relations
+are removed from the filesystem. Also, certain utility commands that have
+non-roll-backable side effects (such as filesystem changes) force sync
+commit to minimize the window in which the filesystem change has been made
+but the transaction isn't guaranteed committed.
+
+The walwriter regularly wakes up (via wal_writer_delay) or is woken up
+(via its latch, which is set by backends committing asynchronously) and
+performs an XLogBackgroundFlush(). This checks the location of the last
+completely filled WAL page. If that has moved forwards, then we write all
+the changed buffers up to that point, so that under full load we write
+only whole buffers. If there has been a break in activity and the current
+WAL page is the same as before, then we find out the LSN of the most
+recent asynchronous commit, and write up to that point, if required (i.e.
+if it's in the current WAL page). If more than wal_writer_delay has
+passed, or more than wal_writer_flush_after blocks have been written, since
+the last flush, WAL is also flushed up to the current location. This
+arrangement in itself would guarantee that an async commit record reaches
+disk after at most two times wal_writer_delay after the transaction
+completes. However, we also allow XLogFlush to write/flush full buffers
+"flexibly" (ie, not wrapping around at the end of the circular WAL buffer
+area), so as to minimize the number of writes issued under high load when
+multiple WAL pages are filled per walwriter cycle. This makes the worst-case
+delay three wal_writer_delay cycles.
+
+There are some other subtle points to consider with asynchronous commits.
+First, for each page of CLOG we must remember the LSN of the latest commit
+affecting the page, so that we can enforce the same flush-WAL-before-write
+rule that we do for ordinary relation pages. Otherwise the record of the
+commit might reach disk before the WAL record does. Again, abort records
+need not factor into this consideration.
+
+In fact, we store more than one LSN for each clog page. This relates to
+the way we set transaction status hint bits during visibility tests.
+We must not set a transaction-committed hint bit on a relation page and
+have that record make it to disk prior to the WAL record of the commit.
+Since visibility tests are normally made while holding buffer share locks,
+we do not have the option of changing the page's LSN to guarantee WAL
+synchronization. Instead, we defer the setting of the hint bit if we have
+not yet flushed WAL as far as the LSN associated with the transaction.
+This requires tracking the LSN of each unflushed async commit. It is
+convenient to associate this data with clog buffers: because we will flush
+WAL before writing a clog page, we know that we do not need to remember a
+transaction's LSN longer than the clog page holding its commit status
+remains in memory. However, the naive approach of storing an LSN for each
+clog position is unattractive: the LSNs are 32x bigger than the two-bit
+commit status fields, and so we'd need 256K of additional shared memory for
+each 8K clog buffer page. We choose instead to store a smaller number of
+LSNs per page, where each LSN is the highest LSN associated with any
+transaction commit in a contiguous range of transaction IDs on that page.
+This saves storage at the price of some possibly-unnecessary delay in
+setting transaction hint bits.
+
+How many transactions should share the same cached LSN (N)? If the
+system's workload consists only of small async-commit transactions, then
+it's reasonable to have N similar to the number of transactions per
+walwriter cycle, since that is the granularity with which transactions will
+become truly committed (and thus hintable) anyway. The worst case is where
+a sync-commit xact shares a cached LSN with an async-commit xact that
+commits a bit later; even though we paid to sync the first xact to disk,
+we won't be able to hint its outputs until the second xact is sync'd, up to
+three walwriter cycles later. This argues for keeping N (the group size)
+as small as possible. For the moment we are setting the group size to 32,
+which makes the LSN cache space the same size as the actual clog buffer
+space (independently of BLCKSZ).
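+
+The grouping arithmetic, essentially as clog.c defines it (a hedged
+excerpt):
+
+    #define CLOG_XACTS_PER_BYTE       4
+    #define CLOG_XACTS_PER_PAGE       (BLCKSZ * CLOG_XACTS_PER_BYTE)
+    #define CLOG_XACTS_PER_LSN_GROUP  32
+    #define CLOG_LSNS_PER_PAGE \
+        (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP)
+
+    /* which cached LSN slot covers a given xid on a given buffer page */
+    #define GetLSNIndex(slotno, xid) \
+        ((slotno) * CLOG_LSNS_PER_PAGE + \
+         ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP)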
+
+It is useful that we can run both synchronous and asynchronous commit
+transactions concurrently, but the safety of this is perhaps not
+immediately obvious. Assume we have two transactions, T1 and T2. The Log
+Sequence Number (LSN) is the point in the WAL sequence where a transaction
+commit is recorded, so LSN1 and LSN2 are the commit records of those
+transactions. If T2 can see changes made by T1 then when T2 commits it
+must be true that LSN2 follows LSN1. Thus when T2 commits it is certain
+that all of the changes made by T1 are also now recorded in the WAL. This
+is true whether T1 was asynchronous or synchronous. As a result, it is
+safe for asynchronous commits and synchronous commits to work concurrently
+without endangering data written by synchronous commits. Sub-transactions
+are not important here since the final write to disk only occurs at the
+commit of the top level transaction.
+
+Changes to data blocks cannot reach disk unless WAL is flushed up to the
+point of the LSN of the data blocks. Any attempt to write unsafe data to
+disk will trigger a write which ensures the safety of all data written by
+that and prior transactions. Data blocks and clog pages are both protected
+by LSNs.
+
+Changes to a temp table are not WAL-logged, hence could reach disk in
+advance of T1's commit, but we don't care since temp table contents don't
+survive crashes anyway.
+
+Database writes that skip WAL for new relfilenodes are also safe. In these
+cases it's entirely possible for the data to reach disk before T1's commit,
+because T1 will fsync it down to disk without any sort of interlock. However,
+all these paths are designed to write data that no other transaction can see
+until after T1 commits. The situation is thus not different from ordinary
+WAL-logged updates.
+
+Transaction Emulation during Recovery
+-------------------------------------
+
+During Recovery we replay transaction changes in the order they occurred.
+As part of this replay we emulate some transactional behaviour, so that
+read only backends can take MVCC snapshots. We do this by maintaining a
+list of XIDs belonging to transactions that are being replayed, so that
+each transaction that has recorded WAL records for database writes exists
+in the array until it commits. Further details are given in comments in
+procarray.c.
+
+Many actions write no WAL records at all, for example read only transactions.
+These have no effect on MVCC in recovery and we can pretend they never
+occurred at all. Subtransaction commit does not write a WAL record either
+and has very little effect, since lock waiters need to wait for the
+parent transaction to complete.
+
+Not all transactional behaviour is emulated, for example we do not insert
+a transaction entry into the lock table, nor do we maintain the transaction
+stack in memory. Clog, multixact and commit_ts entries are made normally.
+Subtrans is maintained during recovery but the details of the transaction
+tree are ignored and all subtransactions reference the top-level TransactionId
+directly. Since commit is atomic this provides correct lock wait behaviour
+yet simplifies emulation of subtransactions considerably.
+
+Further details on locking mechanics in recovery are given in comments
+with the Lock rmgr code.
diff --git a/src/backend/access/transam/README.parallel b/src/backend/access/transam/README.parallel
new file mode 100644
index 0000000..99c588d
--- /dev/null
+++ b/src/backend/access/transam/README.parallel
@@ -0,0 +1,237 @@
+Overview
+========
+
+PostgreSQL provides some simple facilities to make writing parallel algorithms
+easier. Using a data structure called a ParallelContext, you can arrange to
+launch background worker processes, initialize their state to match that of
+the backend which initiated parallelism, communicate with them via dynamic
+shared memory, and write reasonably complex code that can run either in the
+user backend or in one of the parallel workers without needing to be aware of
+where it's running.
+
+The backend which starts a parallel operation (hereafter, the initiating
+backend) starts by creating a dynamic shared memory segment which will last
+for the lifetime of the parallel operation. This dynamic shared memory segment
+will contain (1) a shm_mq that can be used to transport errors (and other
+messages reported via elog/ereport) from the worker back to the initiating
+backend; (2) serialized representations of the initiating backend's private
+state, so that the worker can synchronize its state with that of the initiating
+backend; and (3) any other data structures which a particular user of the
+ParallelContext data structure may wish to add for its own purposes. Once
+the initiating backend has initialized the dynamic shared memory segment, it
+asks the postmaster to launch the appropriate number of parallel workers.
+These workers then connect to the dynamic shared memory segment, initialize
+their state, and then invoke the appropriate entrypoint, as further detailed
+below.
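+
+A hedged sketch of the lifecycle from the initiating backend's side (the
+key and size values are placeholders):
+
+    EnterParallelMode();        /* prohibit unsafe state changes */
+
+    pcxt = CreateParallelContext("library_name", "function_name", nworkers);
+
+    /* reserve space for application-specific shared data */
+    shm_toc_estimate_chunk(&pcxt->estimator, size);
+    shm_toc_estimate_keys(&pcxt->estimator, 1);
+
+    InitializeParallelDSM(pcxt);    /* create DSM and copy state to it */
+
+    /* store the data for which we reserved space */
+    space = shm_toc_allocate(pcxt->toc, size);
+    shm_toc_insert(pcxt->toc, key, space);
+
+    LaunchParallelWorkers(pcxt);
+    /* ... do parallel stuff ... */
+    WaitForParallelWorkersToFinish(pcxt);
+    /* ... read any final results from dynamic shared memory ... */
+    DestroyParallelContext(pcxt);
+
+    ExitParallelMode();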
+
+Error Reporting
+===============
+
+When started, each parallel worker begins by attaching the dynamic shared
+memory segment and locating the shm_mq to be used for error reporting; it
+redirects all of its protocol messages to this shm_mq. Prior to this point,
+any failure of the background worker will not be reported to the initiating
+backend; from the point of view of the initiating backend, the worker simply
+failed to start. The initiating backend must anyway be prepared to cope
+with fewer parallel workers than it originally requested, so catering to
+this case imposes no additional burden.
+
+Whenever a new message (or partial message; very large messages may wrap) is
+sent to the error-reporting queue, PROCSIG_PARALLEL_MESSAGE is sent to the
+initiating backend. This causes the next CHECK_FOR_INTERRUPTS() in the
+initiating backend to read and rethrow the message. For the most part, this
+makes error reporting in parallel mode "just work". Of course, to work
+properly, it is important that the code the initiating backend is executing
+CHECK_FOR_INTERRUPTS() regularly and avoid blocking interrupt processing for
+long periods of time, but those are good things to do anyway.
+
+(A currently-unsolved problem is that some messages may get written to the
+system log twice, once in the backend where the report was originally
+generated, and again when the initiating backend rethrows the message. If
+we decide to suppress one of these reports, it should probably be the second
+one;
+otherwise, if the worker is for some reason unable to propagate the message
+back to the initiating backend, the message will be lost altogether.)
+
+State Sharing
+=============
+
+It's possible to write C code which works correctly without parallelism, but
+which fails when parallelism is used. No parallel infrastructure can
+completely eliminate this problem, because any global variable is a risk.
+There's no general mechanism for ensuring that every global variable in the
+worker will have the same value that it does in the initiating backend; even
+if we could ensure that, some function we're calling could update the variable
+after each call, and only the backend where that update is performed will see
+the new value. Similar problems can arise with any more-complex data
+structure we might choose to use. For example, a pseudo-random number
+generator should, given a particular seed value, produce the same predictable
+series of values every time. But it does this by relying on some private
+state which won't automatically be shared between cooperating backends. A
+parallel-safe PRNG would need to store its state in dynamic shared memory, and
+would require locking. The parallelism infrastructure has no way of knowing
+whether the user intends to call code that has this sort of problem, and can't
+do anything about it anyway.
+
+Instead, we take a more pragmatic approach. First, we try to make as many of
+the operations that are safe outside of parallel mode work correctly in
+parallel mode as well. Second, we try to prohibit common unsafe operations
+via suitable error checks. These checks are intended to catch 100% of
+unsafe things that a user might do from the SQL interface, but code written
+in C can do unsafe things that won't trigger these checks. The error checks
+are engaged via EnterParallelMode(), which should be called before creating
+a parallel context, and disarmed via ExitParallelMode(), which should be
+called after all parallel contexts have been destroyed. The most
+significant restriction imposed by parallel mode is that all operations must
+be strictly read-only; we allow no writes to the database and no DDL. We
+might try to relax these restrictions in the future.
+
+To make as many operations as possible safe in parallel mode, we try to copy
+the most important pieces of state from the initiating backend to each parallel
+worker. This includes:
+
+ - The set of libraries dynamically loaded by dfmgr.c.
+
+ - The authenticated user ID and current database. Each parallel worker
+ will connect to the same database as the initiating backend, using the
+ same user ID.
+
+ - The values of all GUCs. Accordingly, permanent changes to the value of
+ any GUC are forbidden while in parallel mode; but temporary changes,
+ such as entering a function with non-NULL proconfig, are OK.
+
+ - The current subtransaction's XID, the top-level transaction's XID, and
+ the list of XIDs considered current (that is, they are in-progress or
+ subcommitted). This information is needed to ensure that tuple visibility
+ checks return the same results in the worker as they do in the
+ initiating backend. See also the section Transaction Integration, below.
+
+ - The combo CID mappings. This is needed to ensure consistent answers to
+ tuple visibility checks. The need to synchronize this data structure is
+ a major reason why we can't support writes in parallel mode: such writes
+ might create new combo CIDs, and we have no way to let other workers
+ (or the initiating backend) know about them.
+
+ - The transaction snapshot.
+
+ - The active snapshot, which might be different from the transaction
+ snapshot.
+
+ - The currently active user ID and security context. Note that this is
+ the fourth user ID we restore: the initial step of binding to the correct
+ database also involves restoring the authenticated user ID. When GUC
+ values are restored, this incidentally sets SessionUserId and OuterUserId
+ to the correct values. This final step restores CurrentUserId.
+
+ - State related to pending REINDEX operations, which prevents access to
+ an index that is currently being rebuilt.
+
+ - Active relmapper.c mapping state. This is needed to allow consistent
+ answers when fetching the current relfilenode for relation oids of
+ mapped relations.
+
+To prevent unprincipled deadlocks when running in parallel mode, this code
+also arranges for the leader and all workers to participate in group
+locking. See src/backend/storage/lmgr/README for more details.
+
+Transaction Integration
+=======================
+
+Regardless of what the TransactionState stack looks like in the parallel
+leader, each parallel worker ends up with a stack of depth 1. This stack
+entry is marked with the special transaction block state
+TBLOCK_PARALLEL_INPROGRESS so that it's not confused with an ordinary
+toplevel transaction. The XID of this TransactionState is set to the XID of
+the innermost currently-active subtransaction in the initiating backend. The
+initiating backend's toplevel XID, and the XIDs of all current (in-progress
+or subcommitted) XIDs are stored separately from the TransactionState stack,
+but in such a way that GetTopTransactionId(), GetTopTransactionIdIfAny(), and
+TransactionIdIsCurrentTransactionId() return the same values that they would
+in the initiating backend. We could copy the entire transaction state stack,
+but most of it would be useless: for example, you can't roll back to a
+savepoint from within a parallel worker, and there are no resources
+associated with the memory contexts or resource owners of intermediate
+subtransactions.
+
+No meaningful change to the transaction state can be made while in parallel
+mode. No XIDs can be assigned, and no subtransactions can start or end,
+because we have no way of communicating these state changes to cooperating
+backends, or of synchronizing them. It's clearly unworkable for the initiating
+backend to exit any transaction or subtransaction that was in progress when
+parallelism was started before all parallel workers have exited; and it's even
+more clearly crazy for a parallel worker to try to subcommit or subabort the
+current subtransaction and execute in some other transaction context than was
+present in the initiating backend. It might be practical to allow internal
+subtransactions (e.g. to implement a PL/pgSQL EXCEPTION block) to be used in
+parallel mode, provided that they are XID-less, because other backends
+wouldn't really need to know about those transactions or do anything
+differently because of them. Right now, we don't even allow that.
+
+At the end of a parallel operation, which can happen either because it
+completed successfully or because it was interrupted by an error, parallel
+workers associated with that operation exit. In the error case, transaction
+abort processing in the parallel leader kills off any remaining workers, and
+the parallel leader then waits for them to die. In the case of a successful
+parallel operation, the parallel leader does not send any signals, but must
+wait for workers to complete and exit of their own volition. In either
+case, it is very important that all workers actually exit before the
+parallel leader cleans up the (sub)transaction in which they were created;
+otherwise, chaos can ensue. For example, if the leader is rolling back the
+transaction that created the relation being scanned by a worker, the
+relation could disappear while the worker is still busy scanning it. That's
+not safe.
+
+Generally, the cleanup performed by each worker at this point is similar to
+top-level commit or abort. Each backend has its own resource owners: buffer
+pins, catcache or relcache reference counts, tuple descriptors, and so on
+are managed separately by each backend, which must free them before exiting.
+There are, however, some important differences between parallel worker
+commit or abort and a real top-level transaction commit or abort. Most
+importantly:
+
+ - No commit or abort record is written; the initiating backend is
+ responsible for this.
+
+ - Cleanup of pg_temp namespaces is not done. Parallel workers cannot
+ safely access the initiating backend's pg_temp namespace, and should
+ not create one of their own.
+
+Coding Conventions
+==================
+
+Before beginning any parallel operation, call EnterParallelMode(); after all
+parallel operations are completed, call ExitParallelMode(). To actually
+parallelize a particular operation, use a ParallelContext. The basic coding
+pattern looks like this:
+
+ EnterParallelMode(); /* prohibit unsafe state changes */
+
+ pcxt = CreateParallelContext("library_name", "function_name", nworkers);
+
+ /* Allow space for application-specific data here. */
+ shm_toc_estimate_chunk(&pcxt->estimator, size);
+ shm_toc_estimate_keys(&pcxt->estimator, keys);
+
+ InitializeParallelDSM(pcxt); /* create DSM and copy state to it */
+
+ /* Store the data for which we reserved space. */
+ space = shm_toc_allocate(pcxt->toc, size);
+ shm_toc_insert(pcxt->toc, key, space);
+
+ LaunchParallelWorkers(pcxt);
+
+ /* do parallel stuff */
+
+ WaitForParallelWorkersToFinish(pcxt);
+
+ /* read any final results from dynamic shared memory */
+
+ DestroyParallelContext(pcxt);
+
+ ExitParallelMode();
+
+If desired, after WaitForParallelWorkersToFinish() has been called, the
+context can be reset so that workers can be launched anew using the same
+parallel context. To do this, first call ReinitializeParallelDSM() to
+reinitialize state managed by the parallel context machinery itself; then,
+perform any other necessary resetting of state; after that, you can again
+call LaunchParallelWorkers.
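+
+For example, a second pass that reuses the parallel context from the sketch
+above might look like this (hypothetical; the application-specific reset step
+depends entirely on what was stored in the DSM):
+
+	ReinitializeParallelDSM(pcxt);	/* reset parallel-machinery state */
+
+	/* ... reset application-specific DSM state here ... */
+
+	LaunchParallelWorkers(pcxt);
+
+	/* do parallel stuff */
+
+	WaitForParallelWorkersToFinish(pcxt);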
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
new file mode 100644
index 0000000..3d9088a
--- /dev/null
+++ b/src/backend/access/transam/clog.c
@@ -0,0 +1,1030 @@
+/*-------------------------------------------------------------------------
+ *
+ * clog.c
+ * PostgreSQL transaction-commit-log manager
+ *
+ * This module replaces the old "pg_log" access code, which treated pg_log
+ * essentially like a relation, in that it went through the regular buffer
+ * manager. The problem with that was that there wasn't any good way to
+ * recycle storage space for transactions so old that they'll never be
+ * looked up again. Now we use specialized access code so that the commit
+ * log can be broken into relatively small, independent segments.
+ *
+ * XLOG interactions: this module generates an XLOG record whenever a new
+ * CLOG page is initialized to zeroes. Other writes of CLOG come from
+ * recording of transaction commit or abort in xact.c, which generates its
+ * own XLOG records for these events and will re-perform the status update
+ * on redo; so we need make no additional XLOG entry here. For synchronous
+ * transaction commits, the XLOG is guaranteed flushed through the XLOG commit
+ * record before we are called to log a commit, so the WAL rule "write xlog
+ * before data" is satisfied automatically. However, for async commits we
+ * must track the latest LSN affecting each CLOG page, so that we can flush
+ * XLOG that far and satisfy the WAL rule. We don't have to worry about this
+ * for aborts (whether sync or async), since the post-crash assumption would
+ * be that such transactions failed anyway.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/clog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/clog.h"
+#include "access/slru.h"
+#include "access/transam.h"
+#include "access/xlog.h"
+#include "access/xloginsert.h"
+#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "storage/proc.h"
+#include "storage/sync.h"
+
+/*
+ * Defines for CLOG page sizes. A page is the same BLCKSZ as is used
+ * everywhere else in Postgres.
+ *
+ * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
+ * CLOG page numbering also wraps around at 0xFFFFFFFF/CLOG_XACTS_PER_PAGE,
+ * and CLOG segment numbering at
+ * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no
+ * explicit notice of that fact in this module, except when comparing segment
+ * and page numbers in TruncateCLOG (see CLOGPagePrecedes).
+ */
+
+/* We need two bits per xact, so four xacts fit in a byte */
+#define CLOG_BITS_PER_XACT 2
+#define CLOG_XACTS_PER_BYTE 4
+#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
+#define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1)
+
+#define TransactionIdToPage(xid) ((xid) / (TransactionId) CLOG_XACTS_PER_PAGE)
+#define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE)
+#define TransactionIdToByte(xid) (TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE)
+#define TransactionIdToBIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_BYTE)
+
+/* We store the latest async LSN for each group of transactions */
+#define CLOG_XACTS_PER_LSN_GROUP 32 /* keep this a power of 2 */
+#define CLOG_LSNS_PER_PAGE (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP)
+
+#define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \
+ ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP)
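+
+/*
+ * As a worked example (assuming the default BLCKSZ of 8192):
+ * CLOG_XACTS_PER_PAGE is 8192 * 4 = 32768, so xid 100000 maps to page
+ * 100000 / 32768 = 3; within that page TransactionIdToPgIndex() yields
+ * 100000 % 32768 = 1696, which falls in byte 1696 / 4 = 424 at bit offset
+ * (100000 % 4) * 2 = 0.
+ */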
+
+/*
+ * The number of subtransactions at or below which we consider applying the
+ * clog group update optimization. Testing reveals that numbers higher than
+ * this can hurt performance.
+ */
+#define THRESHOLD_SUBTRANS_CLOG_OPT 5
+
+/*
+ * Link to shared-memory data structures for CLOG control
+ */
+static SlruCtlData XactCtlData;
+
+#define XactCtl (&XactCtlData)
+
+
+static int ZeroCLOGPage(int pageno, bool writeXlog);
+static bool CLOGPagePrecedes(int page1, int page2);
+static void WriteZeroPageXlogRec(int pageno);
+static void WriteTruncateXlogRec(int pageno, TransactionId oldestXact,
+ Oid oldestXactDb);
+static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
+ TransactionId *subxids, XidStatus status,
+ XLogRecPtr lsn, int pageno,
+ bool all_xact_same_page);
+static void TransactionIdSetStatusBit(TransactionId xid, XidStatus status,
+ XLogRecPtr lsn, int slotno);
+static void set_status_by_pages(int nsubxids, TransactionId *subxids,
+ XidStatus status, XLogRecPtr lsn);
+static bool TransactionGroupUpdateXidStatus(TransactionId xid,
+ XidStatus status, XLogRecPtr lsn, int pageno);
+static void TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids,
+ TransactionId *subxids, XidStatus status,
+ XLogRecPtr lsn, int pageno);
+
+
+/*
+ * TransactionIdSetTreeStatus
+ *
+ * Record the final state of transaction entries in the commit log for
+ * a transaction and its subtransaction tree. Take care to ensure this is
+ * efficient, and as atomic as possible.
+ *
+ * xid is a single xid to set status for. This will typically be
+ * the top level transactionid for a top level commit or abort. It can
+ * also be a subtransaction when we record transaction aborts.
+ *
+ * subxids is an array of xids of length nsubxids, representing subtransactions
+ * in the tree of xid. In various cases nsubxids may be zero.
+ *
+ * lsn must be the WAL location of the commit record when recording an async
+ * commit. For a synchronous commit it can be InvalidXLogRecPtr, since the
+ * caller guarantees the commit record is already flushed in that case. It
+ * should be InvalidXLogRecPtr for abort cases, too.
+ *
+ * In the commit case, atomicity is limited by whether all the subxids are in
+ * the same CLOG page as xid. If they all are, then the lock will be grabbed
+ * only once, and the status will be set to committed directly. Otherwise
+ * we must
+ * 1. set sub-committed all subxids that are not on the same page as the
+ * main xid
+ * 2. atomically set committed the main xid and the subxids on the same page
+ * 3. go over the first bunch again and set them committed
+ * Note that as far as concurrent checkers are concerned, main transaction
+ * commit as a whole is still atomic.
+ *
+ * Example:
+ * TransactionId t commits and has subxids t1, t2, t3, t4
+ * t is on page p1, t1 is also on p1, t2 and t3 are on p2, t4 is on p3
+ * 1. update pages 2-3:
+ * page2: set t2,t3 as sub-committed
+ * page3: set t4 as sub-committed
+ * 2. update page1:
+ * set t1 as sub-committed,
+ * then set t as committed,
+ * then set t1 as committed
+ * 3. update pages 2-3:
+ * page2: set t2,t3 as committed
+ * page3: set t4 as committed
+ *
+ * NB: this is a low-level routine and is NOT the preferred entry point
+ * for most uses; functions in transam.c are the intended callers.
+ *
+ * XXX Think about issuing POSIX_FADV_WILLNEED on pages that we will need,
+ * but aren't yet in cache, as well as hinting pages not to fall out of
+ * cache yet.
+ */
+void
+TransactionIdSetTreeStatus(TransactionId xid, int nsubxids,
+ TransactionId *subxids, XidStatus status, XLogRecPtr lsn)
+{
+ int pageno = TransactionIdToPage(xid); /* get page of parent */
+ int i;
+
+ Assert(status == TRANSACTION_STATUS_COMMITTED ||
+ status == TRANSACTION_STATUS_ABORTED);
+
+ /*
+	 * See how many subxids, if any, are on the same page as the parent.
+ */
+ for (i = 0; i < nsubxids; i++)
+ {
+ if (TransactionIdToPage(subxids[i]) != pageno)
+ break;
+ }
+
+ /*
+ * Do all items fit on a single page?
+ */
+ if (i == nsubxids)
+ {
+ /*
+ * Set the parent and all subtransactions in a single call
+ */
+ TransactionIdSetPageStatus(xid, nsubxids, subxids, status, lsn,
+ pageno, true);
+ }
+ else
+ {
+ int nsubxids_on_first_page = i;
+
+ /*
+ * If this is a commit then we care about doing this correctly (i.e.
+ * using the subcommitted intermediate status). By here, we know
+ * we're updating more than one page of clog, so we must mark entries
+ * that are *not* on the first page so that they show as subcommitted
+ * before we then return to update the status to fully committed.
+ *
+ * To avoid touching the first page twice, skip marking subcommitted
+ * for the subxids on that first page.
+ */
+ if (status == TRANSACTION_STATUS_COMMITTED)
+ set_status_by_pages(nsubxids - nsubxids_on_first_page,
+ subxids + nsubxids_on_first_page,
+ TRANSACTION_STATUS_SUB_COMMITTED, lsn);
+
+ /*
+ * Now set the parent and subtransactions on same page as the parent,
+ * if any
+ */
+ pageno = TransactionIdToPage(xid);
+ TransactionIdSetPageStatus(xid, nsubxids_on_first_page, subxids, status,
+ lsn, pageno, false);
+
+ /*
+ * Now work through the rest of the subxids one clog page at a time,
+ * starting from the second page onwards, like we did above.
+ */
+ set_status_by_pages(nsubxids - nsubxids_on_first_page,
+ subxids + nsubxids_on_first_page,
+ status, lsn);
+ }
+}
+
+/*
+ * Helper for TransactionIdSetTreeStatus: set the status for a bunch of
+ * transactions, chunking in the separate CLOG pages involved. We never
+ * pass the whole transaction tree to this function, only subtransactions
+ * that are on different pages to the top level transaction id.
+ */
+static void
+set_status_by_pages(int nsubxids, TransactionId *subxids,
+ XidStatus status, XLogRecPtr lsn)
+{
+ int pageno = TransactionIdToPage(subxids[0]);
+ int offset = 0;
+ int i = 0;
+
+ Assert(nsubxids > 0); /* else the pageno fetch above is unsafe */
+
+ while (i < nsubxids)
+ {
+ int num_on_page = 0;
+ int nextpageno;
+
+ do
+ {
+ nextpageno = TransactionIdToPage(subxids[i]);
+ if (nextpageno != pageno)
+ break;
+ num_on_page++;
+ i++;
+ } while (i < nsubxids);
+
+ TransactionIdSetPageStatus(InvalidTransactionId,
+ num_on_page, subxids + offset,
+ status, lsn, pageno, false);
+ offset = i;
+ pageno = nextpageno;
+ }
+}
+
+/*
+ * Record the final state of transaction entries in the commit log for all
+ * entries on a single page. Atomic only on this page.
+ */
+static void
+TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
+ TransactionId *subxids, XidStatus status,
+ XLogRecPtr lsn, int pageno,
+ bool all_xact_same_page)
+{
+ /* Can't use group update when PGPROC overflows. */
+ StaticAssertStmt(THRESHOLD_SUBTRANS_CLOG_OPT <= PGPROC_MAX_CACHED_SUBXIDS,
+ "group clog threshold less than PGPROC cached subxids");
+
+ /*
+ * When there is contention on XactSLRULock, we try to group multiple
+ * updates; a single leader process will perform transaction status
+ * updates for multiple backends so that the number of times XactSLRULock
+ * needs to be acquired is reduced.
+ *
+ * For this optimization to be safe, the XID and subxids in MyProc must be
+ * the same as the ones for which we're setting the status. Check that
+ * this is the case.
+ *
+ * For this optimization to be efficient, we shouldn't have too many
+ * sub-XIDs and all of the XIDs for which we're adjusting clog should be
+ * on the same page. Check those conditions, too.
+ */
+ if (all_xact_same_page && xid == MyProc->xid &&
+ nsubxids <= THRESHOLD_SUBTRANS_CLOG_OPT &&
+ nsubxids == MyProc->subxidStatus.count &&
+ (nsubxids == 0 ||
+ memcmp(subxids, MyProc->subxids.xids,
+ nsubxids * sizeof(TransactionId)) == 0))
+ {
+ /*
+ * If we can immediately acquire XactSLRULock, we update the status of
+	 * our own XID and release the lock. If not, try to use the group XID
+	 * update mechanism. If that doesn't work out, fall back to waiting for the
+ * lock to perform an update for this transaction only.
+ */
+ if (LWLockConditionalAcquire(XactSLRULock, LW_EXCLUSIVE))
+ {
+ /* Got the lock without waiting! Do the update. */
+ TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status,
+ lsn, pageno);
+ LWLockRelease(XactSLRULock);
+ return;
+ }
+ else if (TransactionGroupUpdateXidStatus(xid, status, lsn, pageno))
+ {
+ /* Group update mechanism has done the work. */
+ return;
+ }
+
+ /* Fall through only if update isn't done yet. */
+ }
+
+ /* Group update not applicable, or couldn't accept this page number. */
+ LWLockAcquire(XactSLRULock, LW_EXCLUSIVE);
+ TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status,
+ lsn, pageno);
+ LWLockRelease(XactSLRULock);
+}
+
+/*
+ * Record the final state of transaction entry in the commit log
+ *
+ * We don't do any locking here; caller must handle that.
+ */
+static void
+TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids,
+ TransactionId *subxids, XidStatus status,
+ XLogRecPtr lsn, int pageno)
+{
+ int slotno;
+ int i;
+
+ Assert(status == TRANSACTION_STATUS_COMMITTED ||
+ status == TRANSACTION_STATUS_ABORTED ||
+ (status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid(xid)));
+ Assert(LWLockHeldByMeInMode(XactSLRULock, LW_EXCLUSIVE));
+
+ /*
+ * If we're doing an async commit (ie, lsn is valid), then we must wait
+ * for any active write on the page slot to complete. Otherwise our
+ * update could reach disk in that write, which will not do since we
+ * mustn't let it reach disk until we've done the appropriate WAL flush.
+ * But when lsn is invalid, it's OK to scribble on a page while it is
+ * write-busy, since we don't care if the update reaches disk sooner than
+ * we think.
+ */
+ slotno = SimpleLruReadPage(XactCtl, pageno, XLogRecPtrIsInvalid(lsn), xid);
+
+ /*
+ * Set the main transaction id, if any.
+ *
+ * If we update more than one xid on this page while it is being written
+ * out, we might find that some of the bits go to disk and others don't.
+ * If we are updating commits on the page with the top-level xid that
+ * could break atomicity, so we subcommit the subxids first before we mark
+ * the top-level commit.
+ */
+ if (TransactionIdIsValid(xid))
+ {
+ /* Subtransactions first, if needed ... */
+ if (status == TRANSACTION_STATUS_COMMITTED)
+ {
+ for (i = 0; i < nsubxids; i++)
+ {
+ Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
+ TransactionIdSetStatusBit(subxids[i],
+ TRANSACTION_STATUS_SUB_COMMITTED,
+ lsn, slotno);
+ }
+ }
+
+ /* ... then the main transaction */
+ TransactionIdSetStatusBit(xid, status, lsn, slotno);
+ }
+
+ /* Set the subtransactions */
+ for (i = 0; i < nsubxids; i++)
+ {
+ Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
+ TransactionIdSetStatusBit(subxids[i], status, lsn, slotno);
+ }
+
+ XactCtl->shared->page_dirty[slotno] = true;
+}
+
+/*
+ * When we cannot immediately acquire XactSLRULock in exclusive mode at
+ * commit time, add ourselves to a list of processes that need their XIDs
+ * status update. The first process to add itself to the list will acquire
+ * XactSLRULock in exclusive mode and set transaction status as required
+ * on behalf of all group members. This avoids a great deal of contention
+ * around XactSLRULock when many processes are trying to commit at once,
+ * since the lock need not be repeatedly handed off from one committing
+ * process to the next.
+ *
+ * Returns true when transaction status has been updated in clog; returns
+ * false if we decided against applying the optimization because the page
+ * number we need to update differs from those processes already waiting.
+ */
+static bool
+TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status,
+ XLogRecPtr lsn, int pageno)
+{
+ volatile PROC_HDR *procglobal = ProcGlobal;
+ PGPROC *proc = MyProc;
+ uint32 nextidx;
+ uint32 wakeidx;
+
+ /* We should definitely have an XID whose status needs to be updated. */
+ Assert(TransactionIdIsValid(xid));
+
+ /*
+ * Add ourselves to the list of processes needing a group XID status
+ * update.
+ */
+ proc->clogGroupMember = true;
+ proc->clogGroupMemberXid = xid;
+ proc->clogGroupMemberXidStatus = status;
+ proc->clogGroupMemberPage = pageno;
+ proc->clogGroupMemberLsn = lsn;
+
+ nextidx = pg_atomic_read_u32(&procglobal->clogGroupFirst);
+
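+	/*
+	 * What follows is a lock-free push onto the head of a singly-linked
+	 * list of waiters: we publish our next pointer, then compare-and-swap
+	 * the list head to point at ourselves; on failure the CAS refreshes
+	 * nextidx with the current head and we retry.
+	 */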
+ while (true)
+ {
+ /*
+		 * Add the proc to the list, if the clog page where we need to update
+		 * the current transaction status is the same as the group leader's
+		 * clog page.
+ *
+ * There is a race condition here, which is that after doing the below
+ * check and before adding this proc's clog update to a group, the
+ * group leader might have already finished the group update for this
+		 * page and become the group leader of another group. This can lead to
+		 * a situation where a single group has updates for different clog
+		 * pages. That isn't likely and still works correctly, just maybe a
+		 * bit less efficiently.
+ */
+ if (nextidx != INVALID_PGPROCNO &&
+ ProcGlobal->allProcs[nextidx].clogGroupMemberPage != proc->clogGroupMemberPage)
+ {
+ /*
+ * Ensure that this proc is not a member of any clog group that
+ * needs an XID status update.
+ */
+ proc->clogGroupMember = false;
+ pg_atomic_write_u32(&proc->clogGroupNext, INVALID_PGPROCNO);
+ return false;
+ }
+
+ pg_atomic_write_u32(&proc->clogGroupNext, nextidx);
+
+ if (pg_atomic_compare_exchange_u32(&procglobal->clogGroupFirst,
+ &nextidx,
+ (uint32) proc->pgprocno))
+ break;
+ }
+
+ /*
+ * If the list was not empty, the leader will update the status of our
+ * XID. It is impossible to have followers without a leader because the
+ * first process that has added itself to the list will always have
+ * nextidx as INVALID_PGPROCNO.
+ */
+ if (nextidx != INVALID_PGPROCNO)
+ {
+ int extraWaits = 0;
+
+ /* Sleep until the leader updates our XID status. */
+ pgstat_report_wait_start(WAIT_EVENT_XACT_GROUP_UPDATE);
+ for (;;)
+ {
+ /* acts as a read barrier */
+ PGSemaphoreLock(proc->sem);
+ if (!proc->clogGroupMember)
+ break;
+ extraWaits++;
+ }
+ pgstat_report_wait_end();
+
+ Assert(pg_atomic_read_u32(&proc->clogGroupNext) == INVALID_PGPROCNO);
+
+ /* Fix semaphore count for any absorbed wakeups */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(proc->sem);
+ return true;
+ }
+
+ /* We are the leader. Acquire the lock on behalf of everyone. */
+ LWLockAcquire(XactSLRULock, LW_EXCLUSIVE);
+
+ /*
+ * Now that we've got the lock, clear the list of processes waiting for
+ * group XID status update, saving a pointer to the head of the list.
+ * Trying to pop elements one at a time could lead to an ABA problem.
+ */
+ nextidx = pg_atomic_exchange_u32(&procglobal->clogGroupFirst,
+ INVALID_PGPROCNO);
+
+ /* Remember head of list so we can perform wakeups after dropping lock. */
+ wakeidx = nextidx;
+
+ /* Walk the list and update the status of all XIDs. */
+ while (nextidx != INVALID_PGPROCNO)
+ {
+ PGPROC *proc = &ProcGlobal->allProcs[nextidx];
+
+ /*
+ * Transactions with more than THRESHOLD_SUBTRANS_CLOG_OPT sub-XIDs
+ * should not use group XID status update mechanism.
+ */
+ Assert(proc->subxidStatus.count <= THRESHOLD_SUBTRANS_CLOG_OPT);
+
+ TransactionIdSetPageStatusInternal(proc->clogGroupMemberXid,
+ proc->subxidStatus.count,
+ proc->subxids.xids,
+ proc->clogGroupMemberXidStatus,
+ proc->clogGroupMemberLsn,
+ proc->clogGroupMemberPage);
+
+ /* Move to next proc in list. */
+ nextidx = pg_atomic_read_u32(&proc->clogGroupNext);
+ }
+
+ /* We're done with the lock now. */
+ LWLockRelease(XactSLRULock);
+
+ /*
+ * Now that we've released the lock, go back and wake everybody up. We
+ * don't do this under the lock so as to keep lock hold times to a
+ * minimum.
+ */
+ while (wakeidx != INVALID_PGPROCNO)
+ {
+ PGPROC *proc = &ProcGlobal->allProcs[wakeidx];
+
+ wakeidx = pg_atomic_read_u32(&proc->clogGroupNext);
+ pg_atomic_write_u32(&proc->clogGroupNext, INVALID_PGPROCNO);
+
+ /* ensure all previous writes are visible before follower continues. */
+ pg_write_barrier();
+
+ proc->clogGroupMember = false;
+
+ if (proc != MyProc)
+ PGSemaphoreUnlock(proc->sem);
+ }
+
+ return true;
+}
+
+/*
+ * Sets the commit status of a single transaction.
+ *
+ * Must be called with XactSLRULock held
+ */
+static void
+TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, int slotno)
+{
+ int byteno = TransactionIdToByte(xid);
+ int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
+ char *byteptr;
+ char byteval;
+ char curval;
+
+ byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
+
+ /*
+ * When replaying transactions during recovery we still need to perform
+ * the two phases of subcommit and then commit. However, some transactions
+ * are already correctly marked, so we just treat those as a no-op which
+ * allows us to keep the following Assert as restrictive as possible.
+ */
+ if (InRecovery && status == TRANSACTION_STATUS_SUB_COMMITTED &&
+ curval == TRANSACTION_STATUS_COMMITTED)
+ return;
+
+ /*
+ * Current state change should be from 0 or subcommitted to target state
+ * or we should already be there when replaying changes during recovery.
+ */
+ Assert(curval == 0 ||
+ (curval == TRANSACTION_STATUS_SUB_COMMITTED &&
+ status != TRANSACTION_STATUS_IN_PROGRESS) ||
+ curval == status);
+
+ /* note this assumes exclusive access to the clog page */
+ byteval = *byteptr;
+ byteval &= ~(((1 << CLOG_BITS_PER_XACT) - 1) << bshift);
+ byteval |= (status << bshift);
+ *byteptr = byteval;
+
+ /*
+ * Update the group LSN if the transaction completion LSN is higher.
+ *
+ * Note: lsn will be invalid when supplied during InRecovery processing,
+ * so we don't need to do anything special to avoid LSN updates during
+ * recovery. After recovery completes the next clog change will set the
+ * LSN correctly.
+ */
+ if (!XLogRecPtrIsInvalid(lsn))
+ {
+ int lsnindex = GetLSNIndex(slotno, xid);
+
+ if (XactCtl->shared->group_lsn[lsnindex] < lsn)
+ XactCtl->shared->group_lsn[lsnindex] = lsn;
+ }
+}
+
+/*
+ * Interrogate the state of a transaction in the commit log.
+ *
+ * Aside from the actual commit status, this function returns (into *lsn)
+ * an LSN that is late enough to be able to guarantee that if we flush up to
+ * that LSN then we will have flushed the transaction's commit record to disk.
+ * The result is not necessarily the exact LSN of the transaction's commit
+ * record! For example, for long-past transactions (those whose clog pages
+ * already migrated to disk), we'll return InvalidXLogRecPtr. Also, because
+ * we group transactions on the same clog page to conserve storage, we might
+ * return the LSN of a later transaction that falls into the same group.
+ *
+ * NB: this is a low-level routine and is NOT the preferred entry point
+ * for most uses; TransactionLogFetch() in transam.c is the intended caller.
+ */
+XidStatus
+TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
+{
+ int pageno = TransactionIdToPage(xid);
+ int byteno = TransactionIdToByte(xid);
+ int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
+ int slotno;
+ int lsnindex;
+ char *byteptr;
+ XidStatus status;
+
+ /* lock is acquired by SimpleLruReadPage_ReadOnly */
+
+ slotno = SimpleLruReadPage_ReadOnly(XactCtl, pageno, xid);
+ byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+
+ status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
+
+ lsnindex = GetLSNIndex(slotno, xid);
+ *lsn = XactCtl->shared->group_lsn[lsnindex];
+
+ LWLockRelease(XactSLRULock);
+
+ return status;
+}
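+
+/*
+ * A caller that must not trust an asynchronous commit until it is durable
+ * could flush WAL through the returned LSN first; a sketch only, not the
+ * actual logic used in transam.c:
+ *
+ *		XLogRecPtr	lsn;
+ *		XidStatus	st = TransactionIdGetStatus(xid, &lsn);
+ *
+ *		if (st == TRANSACTION_STATUS_COMMITTED && !XLogRecPtrIsInvalid(lsn))
+ *			XLogFlush(lsn);		/* ensure the commit record is on disk */
+ */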
+
+/*
+ * Number of shared CLOG buffers.
+ *
+ * On larger multi-processor systems, it is possible to have many CLOG page
+ * requests in flight at one time, which can lead to disk access for a CLOG
+ * page if the required page is not found in memory. Testing revealed that we
+ * get the best performance with 128 CLOG buffers; having more than that does
+ * not improve performance further.
+ *
+ * Unconditionally fixing the number of CLOG buffers at 128 did not seem like
+ * a good idea, because it would increase the minimum amount of shared memory
+ * required to start, which could be a problem for people running very small
+ * configurations. The following formula seems to represent a reasonable
+ * compromise: people with very low values for shared_buffers will get fewer
+ * CLOG buffers as well, and everyone else will get 128.
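+ *
+ * For example, with the default 8kB block size and shared_buffers = 128MB,
+ * NBuffers is 16384 and the formula below yields 16384 / 512 = 32 buffers;
+ * once shared_buffers reaches 512MB, the cap of 128 applies.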
+ */
+Size
+CLOGShmemBuffers(void)
+{
+ return Min(128, Max(4, NBuffers / 512));
+}
+
+/*
+ * Initialization of shared memory for CLOG
+ */
+Size
+CLOGShmemSize(void)
+{
+ return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE);
+}
+
+void
+CLOGShmemInit(void)
+{
+ XactCtl->PagePrecedes = CLOGPagePrecedes;
+ SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
+ XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER,
+ SYNC_HANDLER_CLOG);
+ SlruPagePrecedesUnitTests(XactCtl, CLOG_XACTS_PER_PAGE);
+}
+
+/*
+ * This function must be called ONCE on system install. It creates
+ * the initial CLOG segment. (The CLOG directory is assumed to
+ * have been created by initdb, and CLOGShmemInit must have been
+ * called already.)
+ */
+void
+BootStrapCLOG(void)
+{
+ int slotno;
+
+ LWLockAcquire(XactSLRULock, LW_EXCLUSIVE);
+
+ /* Create and zero the first page of the commit log */
+ slotno = ZeroCLOGPage(0, false);
+
+ /* Make sure it's written out */
+ SimpleLruWritePage(XactCtl, slotno);
+ Assert(!XactCtl->shared->page_dirty[slotno]);
+
+ LWLockRelease(XactSLRULock);
+}
+
+/*
+ * Initialize (or reinitialize) a page of CLOG to zeroes.
+ * If writeXlog is true, also emit an XLOG record saying we did this.
+ *
+ * The page is not actually written, just set up in shared memory.
+ * The slot number of the new page is returned.
+ *
+ * Control lock must be held at entry, and will be held at exit.
+ */
+static int
+ZeroCLOGPage(int pageno, bool writeXlog)
+{
+ int slotno;
+
+ slotno = SimpleLruZeroPage(XactCtl, pageno);
+
+ if (writeXlog)
+ WriteZeroPageXlogRec(pageno);
+
+ return slotno;
+}
+
+/*
+ * This must be called ONCE during postmaster or standalone-backend startup,
+ * after StartupXLOG has initialized ShmemVariableCache->nextXid.
+ */
+void
+StartupCLOG(void)
+{
+ TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+ int pageno = TransactionIdToPage(xid);
+
+ LWLockAcquire(XactSLRULock, LW_EXCLUSIVE);
+
+ /*
+ * Initialize our idea of the latest page number.
+ */
+ XactCtl->shared->latest_page_number = pageno;
+
+ LWLockRelease(XactSLRULock);
+}
+
+/*
+ * This must be called ONCE at the end of startup/recovery.
+ */
+void
+TrimCLOG(void)
+{
+ TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+ int pageno = TransactionIdToPage(xid);
+
+ LWLockAcquire(XactSLRULock, LW_EXCLUSIVE);
+
+ /*
+ * Zero out the remainder of the current clog page. Under normal
+ * circumstances it should be zeroes already, but it seems at least
+ * theoretically possible that XLOG replay will have settled on a nextXID
+ * value that is less than the last XID actually used and marked by the
+ * previous database lifecycle (since subtransaction commit writes clog
+ * but makes no WAL entry). Let's just be safe. (We need not worry about
+ * pages beyond the current one, since those will be zeroed when first
+ * used. For the same reason, there is no need to do anything when
+ * nextXid is exactly at a page boundary; and it's likely that the
+ * "current" page doesn't exist yet in that case.)
+ */
+ if (TransactionIdToPgIndex(xid) != 0)
+ {
+ int byteno = TransactionIdToByte(xid);
+ int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
+ int slotno;
+ char *byteptr;
+
+ slotno = SimpleLruReadPage(XactCtl, pageno, false, xid);
+ byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+
+ /* Zero so-far-unused positions in the current byte */
+ *byteptr &= (1 << bshift) - 1;
+ /* Zero the rest of the page */
+ MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1);
+
+ XactCtl->shared->page_dirty[slotno] = true;
+ }
+
+ LWLockRelease(XactSLRULock);
+}
+
+/*
+ * Perform a checkpoint --- either during shutdown, or on-the-fly
+ */
+void
+CheckPointCLOG(void)
+{
+ /*
+ * Write dirty CLOG pages to disk. This may result in sync requests
+ * queued for later handling by ProcessSyncRequests(), as part of the
+ * checkpoint.
+ */
+ TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true);
+ SimpleLruWriteAll(XactCtl, true);
+ TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
+}
+
+
+/*
+ * Make sure that CLOG has room for a newly-allocated XID.
+ *
+ * NB: this is called while holding XidGenLock. We want it to be very fast
+ * most of the time; even when it's not so fast, no actual I/O need happen
+ * unless we're forced to write out a dirty clog or xlog page to make room
+ * in shared memory.
+ */
+void
+ExtendCLOG(TransactionId newestXact)
+{
+ int pageno;
+
+ /*
+ * No work except at first XID of a page. But beware: just after
+ * wraparound, the first XID of page zero is FirstNormalTransactionId.
+ */
+ if (TransactionIdToPgIndex(newestXact) != 0 &&
+ !TransactionIdEquals(newestXact, FirstNormalTransactionId))
+ return;
+
+ pageno = TransactionIdToPage(newestXact);
+
+ LWLockAcquire(XactSLRULock, LW_EXCLUSIVE);
+
+ /* Zero the page and make an XLOG entry about it */
+ ZeroCLOGPage(pageno, true);
+
+ LWLockRelease(XactSLRULock);
+}
+
+
+/*
+ * Remove all CLOG segments before the one holding the passed transaction ID
+ *
+ * Before removing any CLOG data, we must flush XLOG to disk, to ensure
+ * that any recently-emitted FREEZE_PAGE records have reached disk; otherwise
+ * a crash and restart might leave us with some unfrozen tuples referencing
+ * removed CLOG data. We choose to emit a special TRUNCATE XLOG record too.
+ * Replaying the deletion from XLOG is not critical, since the files could
+ * just as well be removed later, but doing so prevents a long-running hot
+ * standby server from acquiring an unreasonably bloated CLOG directory.
+ *
+ * Since CLOG segments hold a large number of transactions, the opportunity to
+ * actually remove a segment is fairly rare, and so it seems best not to do
+ * the XLOG flush unless we have confirmed that there is a removable segment.
+ */
+void
+TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid)
+{
+ int cutoffPage;
+
+ /*
+ * The cutoff point is the start of the segment containing oldestXact. We
+ * pass the *page* containing oldestXact to SimpleLruTruncate.
+ */
+ cutoffPage = TransactionIdToPage(oldestXact);
+
+ /* Check to see if there's any files that could be removed */
+ if (!SlruScanDirectory(XactCtl, SlruScanDirCbReportPresence, &cutoffPage))
+ return; /* nothing to remove */
+
+ /*
+ * Advance oldestClogXid before truncating clog, so concurrent xact status
+ * lookups can ensure they don't attempt to access truncated-away clog.
+ *
+ * It's only necessary to do this if we will actually truncate away clog
+ * pages.
+ */
+ AdvanceOldestClogXid(oldestXact);
+
+ /*
+ * Write XLOG record and flush XLOG to disk. We record the oldest xid
+ * we're keeping information about here so we can ensure that it's always
+ * ahead of clog truncation in case we crash, and so a standby finds out
+ * the new valid xid before the next checkpoint.
+ */
+ WriteTruncateXlogRec(cutoffPage, oldestXact, oldestxid_datoid);
+
+ /* Now we can remove the old CLOG segment(s) */
+ SimpleLruTruncate(XactCtl, cutoffPage);
+}
+
+
+/*
+ * Decide whether a CLOG page number is "older" for truncation purposes.
+ *
+ * We need to use comparison of TransactionIds here in order to do the right
+ * thing with wraparound XID arithmetic. However, TransactionIdPrecedes()
+ * would get weird about permanent xact IDs. So, offset both such that xid1,
+ * xid2, and xid2 + CLOG_XACTS_PER_PAGE - 1 are all normal XIDs; this offset
+ * is relevant to page 0 and to the page preceding page 0.
+ *
+ * The page containing oldestXact-2^31 is the important edge case. The
+ * portion of that page equaling or following oldestXact-2^31 is expendable,
+ * but the portion preceding oldestXact-2^31 is not. When oldestXact-2^31 is
+ * the first XID of a page and segment, the entire page and segment is
+ * expendable, and we could truncate the segment. Recognizing that case would
+ * require making oldestXact, not just the page containing oldestXact,
+ * available to this callback. The benefit would be rare and small, so we
+ * don't optimize that edge case.
+ */
+static bool
+CLOGPagePrecedes(int page1, int page2)
+{
+ TransactionId xid1;
+ TransactionId xid2;
+
+ xid1 = ((TransactionId) page1) * CLOG_XACTS_PER_PAGE;
+ xid1 += FirstNormalTransactionId + 1;
+ xid2 = ((TransactionId) page2) * CLOG_XACTS_PER_PAGE;
+ xid2 += FirstNormalTransactionId + 1;
+
+ return (TransactionIdPrecedes(xid1, xid2) &&
+ TransactionIdPrecedes(xid1, xid2 + CLOG_XACTS_PER_PAGE - 1));
+}
+
+
+/*
+ * Write a ZEROPAGE xlog record
+ */
+static void
+WriteZeroPageXlogRec(int pageno)
+{
+ XLogBeginInsert();
+ XLogRegisterData((char *) (&pageno), sizeof(int));
+ (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
+}
+
+/*
+ * Write a TRUNCATE xlog record
+ *
+ * We must flush the xlog record to disk before returning --- see notes
+ * in TruncateCLOG().
+ */
+static void
+WriteTruncateXlogRec(int pageno, TransactionId oldestXact, Oid oldestXactDb)
+{
+ XLogRecPtr recptr;
+ xl_clog_truncate xlrec;
+
+ xlrec.pageno = pageno;
+ xlrec.oldestXact = oldestXact;
+ xlrec.oldestXactDb = oldestXactDb;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) (&xlrec), sizeof(xl_clog_truncate));
+ recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE);
+ XLogFlush(recptr);
+}
+
+/*
+ * CLOG resource manager's routines
+ */
+void
+clog_redo(XLogReaderState *record)
+{
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+ /* Backup blocks are not used in clog records */
+ Assert(!XLogRecHasAnyBlockRefs(record));
+
+ if (info == CLOG_ZEROPAGE)
+ {
+ int pageno;
+ int slotno;
+
+ memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+
+ LWLockAcquire(XactSLRULock, LW_EXCLUSIVE);
+
+ slotno = ZeroCLOGPage(pageno, false);
+ SimpleLruWritePage(XactCtl, slotno);
+ Assert(!XactCtl->shared->page_dirty[slotno]);
+
+ LWLockRelease(XactSLRULock);
+ }
+ else if (info == CLOG_TRUNCATE)
+ {
+ xl_clog_truncate xlrec;
+
+ memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_clog_truncate));
+
+ AdvanceOldestClogXid(xlrec.oldestXact);
+
+ SimpleLruTruncate(XactCtl, xlrec.pageno);
+ }
+ else
+ elog(PANIC, "clog_redo: unknown op code %u", info);
+}
+
+/*
+ * Entrypoint for sync.c to sync clog files.
+ */
+int
+clogsyncfiletag(const FileTag *ftag, char *path)
+{
+ return SlruSyncFileTag(XactCtl, ftag, path);
+}
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
new file mode 100644
index 0000000..4dc8d40
--- /dev/null
+++ b/src/backend/access/transam/commit_ts.c
@@ -0,0 +1,1035 @@
+/*-------------------------------------------------------------------------
+ *
+ * commit_ts.c
+ * PostgreSQL commit timestamp manager
+ *
+ * This module is a pg_xact-like system that stores the commit timestamp
+ * for each transaction.
+ *
+ * XLOG interactions: this module generates an XLOG record whenever a new
+ * CommitTs page is initialized to zeroes. Also, one XLOG record is
+ * generated for setting of values when the caller requests it; this allows
+ * us to support values coming from places other than transaction commit.
+ * Other writes of CommitTS come from recording of transaction commit in
+ * xact.c, which generates its own XLOG records for these events and will
+ * re-perform the status update on redo; so we need make no additional XLOG
+ * entry here.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/commit_ts.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/commit_ts.h"
+#include "access/htup_details.h"
+#include "access/slru.h"
+#include "access/transam.h"
+#include "access/xloginsert.h"
+#include "access/xlogutils.h"
+#include "catalog/pg_type.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "storage/shmem.h"
+#include "utils/builtins.h"
+#include "utils/snapmgr.h"
+#include "utils/timestamp.h"
+
+/*
+ * Defines for CommitTs page sizes. A page is the same BLCKSZ as is used
+ * everywhere else in Postgres.
+ *
+ * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
+ * CommitTs page numbering also wraps around at
+ * 0xFFFFFFFF/COMMIT_TS_XACTS_PER_PAGE, and CommitTs segment numbering at
+ * 0xFFFFFFFF/COMMIT_TS_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no
+ * explicit notice of that fact in this module, except when comparing segment
+ * and page numbers in TruncateCommitTs (see CommitTsPagePrecedes).
+ */
+
+/*
+ * We need 8+2 bytes per xact. Note that enlarging this struct might mean
+ * the largest possible file name is more than 5 chars long; see
+ * SlruScanDirectory.
+ */
+typedef struct CommitTimestampEntry
+{
+ TimestampTz time;
+ RepOriginId nodeid;
+} CommitTimestampEntry;
+
+#define SizeOfCommitTimestampEntry (offsetof(CommitTimestampEntry, nodeid) + \
+ sizeof(RepOriginId))
+
+#define COMMIT_TS_XACTS_PER_PAGE \
+ (BLCKSZ / SizeOfCommitTimestampEntry)
+
+#define TransactionIdToCTsPage(xid) \
+ ((xid) / (TransactionId) COMMIT_TS_XACTS_PER_PAGE)
+#define TransactionIdToCTsEntry(xid) \
+ ((xid) % (TransactionId) COMMIT_TS_XACTS_PER_PAGE)
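+
+/*
+ * As a worked example (assuming the default BLCKSZ of 8192): each entry is
+ * 8 + 2 = 10 bytes, so COMMIT_TS_XACTS_PER_PAGE is 8192 / 10 = 819, and
+ * xid 100000 maps to page 100000 / 819 = 122, entry 100000 % 819 = 82.
+ */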
+
+/*
+ * Link to shared-memory data structures for CommitTs control
+ */
+static SlruCtlData CommitTsCtlData;
+
+#define CommitTsCtl (&CommitTsCtlData)
+
+/*
+ * We keep a cache of the last value set in shared memory.
+ *
+ * This is also a good place to keep the activation status. We keep this
+ * separate from the GUC so that the standby can activate the module if the
+ * primary has it active independently of the value of the GUC.
+ *
+ * This is protected by CommitTsLock. In some places, we use commitTsActive
+ * without acquiring the lock; where this happens, a comment explains the
+ * rationale for it.
+ */
+typedef struct CommitTimestampShared
+{
+ TransactionId xidLastCommit;
+ CommitTimestampEntry dataLastCommit;
+ bool commitTsActive;
+} CommitTimestampShared;
+
+static CommitTimestampShared *commitTsShared;
+
+
+/* GUC variable */
+bool track_commit_timestamp;
+
+static void SetXidCommitTsInPage(TransactionId xid, int nsubxids,
+ TransactionId *subxids, TimestampTz ts,
+ RepOriginId nodeid, int pageno);
+static void TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts,
+ RepOriginId nodeid, int slotno);
+static void error_commit_ts_disabled(void);
+static int ZeroCommitTsPage(int pageno, bool writeXlog);
+static bool CommitTsPagePrecedes(int page1, int page2);
+static void ActivateCommitTs(void);
+static void DeactivateCommitTs(void);
+static void WriteZeroPageXlogRec(int pageno);
+static void WriteTruncateXlogRec(int pageno, TransactionId oldestXid);
+
+/*
+ * TransactionTreeSetCommitTsData
+ *
+ * Record the final commit timestamp of transaction entries in the commit log
+ * for a transaction and its subtransaction tree, as efficiently as possible.
+ *
+ * xid is the top level transaction id.
+ *
+ * subxids is an array of xids of length nsubxids, representing subtransactions
+ * in the tree of xid. In various cases nsubxids may be zero.
+ *
+ * The reason why tracking just the parent xid commit timestamp is not enough
+ * is that the subtrans SLRU does not stay valid across crashes (it's not
+ * permanent) so we need to keep the information about them here. If the
+ * subtrans implementation changes in the future, we might want to revisit the
+ * decision of storing timestamp info for each subxid.
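+ *
+ * As an illustration with hypothetical page boundaries: if xid, s1 and s2
+ * fall on page p1 while s3 and s4 fall on p2, the loop below makes two
+ * passes, one with head xid setting {xid, s1, s2} on p1, then one with
+ * head s3 setting {s3, s4} on p2.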
+ */
+void
+TransactionTreeSetCommitTsData(TransactionId xid, int nsubxids,
+ TransactionId *subxids, TimestampTz timestamp,
+ RepOriginId nodeid)
+{
+ int i;
+ TransactionId headxid;
+ TransactionId newestXact;
+
+ /*
+ * No-op if the module is not active.
+ *
+ * An unlocked read here is fine, because in a standby (the only place
+ * where the flag can change in flight) this routine is only called by the
+ * recovery process, which is also the only process which can change the
+ * flag.
+ */
+ if (!commitTsShared->commitTsActive)
+ return;
+
+ /*
+ * Figure out the latest Xid in this batch: either the last subxid if
+ * there's any, otherwise the parent xid.
+ */
+ if (nsubxids > 0)
+ newestXact = subxids[nsubxids - 1];
+ else
+ newestXact = xid;
+
+ /*
+	 * We split the xids whose timestamps are to be set into groups sharing
+	 * the same SLRU page; the first element in each such group is its head.
+	 * The
+ * first group has the main XID as the head; subsequent sets use the first
+ * subxid not on the previous page as head. This way, we only have to
+ * lock/modify each SLRU page once.
+ */
+ headxid = xid;
+ i = 0;
+ for (;;)
+ {
+ int pageno = TransactionIdToCTsPage(headxid);
+ int j;
+
+ for (j = i; j < nsubxids; j++)
+ {
+ if (TransactionIdToCTsPage(subxids[j]) != pageno)
+ break;
+ }
+ /* subxids[i..j] are on the same page as the head */
+
+ SetXidCommitTsInPage(headxid, j - i, subxids + i, timestamp, nodeid,
+ pageno);
+
+ /* if we wrote out all subxids, we're done. */
+ if (j >= nsubxids)
+ break;
+
+ /*
+ * Set the new head and skip over it, as well as over the subxids we
+ * just wrote.
+ */
+ headxid = subxids[j];
+ i = j + 1;
+ }
+
+ /* update the cached value in shared memory */
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+ commitTsShared->xidLastCommit = xid;
+ commitTsShared->dataLastCommit.time = timestamp;
+ commitTsShared->dataLastCommit.nodeid = nodeid;
+
+	/* and move our endpoint forwards, if needed */
+ if (TransactionIdPrecedes(ShmemVariableCache->newestCommitTsXid, newestXact))
+ ShmemVariableCache->newestCommitTsXid = newestXact;
+ LWLockRelease(CommitTsLock);
+}
+
+/*
+ * Record the commit timestamp of transaction entries in the commit log for all
+ * entries on a single page. Atomic only on this page.
+ */
+static void
+SetXidCommitTsInPage(TransactionId xid, int nsubxids,
+ TransactionId *subxids, TimestampTz ts,
+ RepOriginId nodeid, int pageno)
+{
+ int slotno;
+ int i;
+
+ LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE);
+
+ slotno = SimpleLruReadPage(CommitTsCtl, pageno, true, xid);
+
+ TransactionIdSetCommitTs(xid, ts, nodeid, slotno);
+ for (i = 0; i < nsubxids; i++)
+ TransactionIdSetCommitTs(subxids[i], ts, nodeid, slotno);
+
+ CommitTsCtl->shared->page_dirty[slotno] = true;
+
+ LWLockRelease(CommitTsSLRULock);
+}
+
+/*
+ * Sets the commit timestamp of a single transaction.
+ *
+ * Must be called with CommitTsSLRULock held
+ */
+static void
+TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts,
+ RepOriginId nodeid, int slotno)
+{
+ int entryno = TransactionIdToCTsEntry(xid);
+ CommitTimestampEntry entry;
+
+ Assert(TransactionIdIsNormal(xid));
+
+ entry.time = ts;
+ entry.nodeid = nodeid;
+
+ memcpy(CommitTsCtl->shared->page_buffer[slotno] +
+ SizeOfCommitTimestampEntry * entryno,
+ &entry, SizeOfCommitTimestampEntry);
+}
+
+/*
+ * Interrogate the commit timestamp of a transaction.
+ *
+ * The return value indicates whether a commit timestamp record was found for
+ * the given xid. The timestamp value is returned in *ts (which must not be
+ * NULL), and the origin node for the Xid is returned in *nodeid, if that is
+ * not NULL.
+ */
+bool
+TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts,
+ RepOriginId *nodeid)
+{
+ int pageno = TransactionIdToCTsPage(xid);
+ int entryno = TransactionIdToCTsEntry(xid);
+ int slotno;
+ CommitTimestampEntry entry;
+ TransactionId oldestCommitTsXid;
+ TransactionId newestCommitTsXid;
+
+ if (!TransactionIdIsValid(xid))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("cannot retrieve commit timestamp for transaction %u", xid)));
+ else if (!TransactionIdIsNormal(xid))
+ {
+ /* frozen and bootstrap xids are always committed far in the past */
+ *ts = 0;
+ if (nodeid)
+ *nodeid = 0;
+ return false;
+ }
+
+ LWLockAcquire(CommitTsLock, LW_SHARED);
+
+ /* Error if module not enabled */
+ if (!commitTsShared->commitTsActive)
+ error_commit_ts_disabled();
+
+ /*
+ * If we're asked for the cached value, return that. Otherwise, fall
+ * through to read from SLRU.
+ */
+ if (commitTsShared->xidLastCommit == xid)
+ {
+ *ts = commitTsShared->dataLastCommit.time;
+ if (nodeid)
+ *nodeid = commitTsShared->dataLastCommit.nodeid;
+
+ LWLockRelease(CommitTsLock);
+ return *ts != 0;
+ }
+
+ oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
+ newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
+ /* neither is invalid, or both are */
+ Assert(TransactionIdIsValid(oldestCommitTsXid) == TransactionIdIsValid(newestCommitTsXid));
+ LWLockRelease(CommitTsLock);
+
+ /*
+ * Return empty if the requested value is outside our valid range.
+ */
+ if (!TransactionIdIsValid(oldestCommitTsXid) ||
+ TransactionIdPrecedes(xid, oldestCommitTsXid) ||
+ TransactionIdPrecedes(newestCommitTsXid, xid))
+ {
+ *ts = 0;
+ if (nodeid)
+ *nodeid = InvalidRepOriginId;
+ return false;
+ }
+
+ /* lock is acquired by SimpleLruReadPage_ReadOnly */
+ slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid);
+ memcpy(&entry,
+ CommitTsCtl->shared->page_buffer[slotno] +
+ SizeOfCommitTimestampEntry * entryno,
+ SizeOfCommitTimestampEntry);
+
+ *ts = entry.time;
+ if (nodeid)
+ *nodeid = entry.nodeid;
+
+ LWLockRelease(CommitTsSLRULock);
+ return *ts != 0;
+}
+
+/*
+ * Return the Xid of the latest committed transaction. (As far as this module
+ * is concerned, anyway; it's up to the caller to ensure the value is useful
+ * for its purposes.)
+ *
+ * ts and nodeid are filled with the corresponding data; they can be passed
+ * as NULL if not wanted.
+ */
+TransactionId
+GetLatestCommitTsData(TimestampTz *ts, RepOriginId *nodeid)
+{
+ TransactionId xid;
+
+ LWLockAcquire(CommitTsLock, LW_SHARED);
+
+ /* Error if module not enabled */
+ if (!commitTsShared->commitTsActive)
+ error_commit_ts_disabled();
+
+ xid = commitTsShared->xidLastCommit;
+ if (ts)
+ *ts = commitTsShared->dataLastCommit.time;
+ if (nodeid)
+ *nodeid = commitTsShared->dataLastCommit.nodeid;
+ LWLockRelease(CommitTsLock);
+
+ return xid;
+}
+
+static void
+error_commit_ts_disabled(void)
+{
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("could not get commit timestamp data"),
+ RecoveryInProgress() ?
+ errhint("Make sure the configuration parameter \"%s\" is set on the primary server.",
+ "track_commit_timestamp") :
+ errhint("Make sure the configuration parameter \"%s\" is set.",
+ "track_commit_timestamp")));
+}
+
+/*
+ * SQL-callable wrapper to obtain commit time of a transaction
+ */
+Datum
+pg_xact_commit_timestamp(PG_FUNCTION_ARGS)
+{
+ TransactionId xid = PG_GETARG_TRANSACTIONID(0);
+ TimestampTz ts;
+ bool found;
+
+ found = TransactionIdGetCommitTsData(xid, &ts, NULL);
+
+ if (!found)
+ PG_RETURN_NULL();
+
+ PG_RETURN_TIMESTAMPTZ(ts);
+}
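+
+/*
+ * Typical usage from SQL (track_commit_timestamp must be enabled;
+ * "some_table" is a stand-in for any user table):
+ *
+ *		SELECT pg_xact_commit_timestamp(xmin) FROM some_table;
+ */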
+
+
+/*
+ * pg_last_committed_xact
+ *
+ * SQL-callable wrapper to obtain some information about the latest
+ * committed transaction: transaction ID, timestamp and replication
+ * origin.
+ */
+Datum
+pg_last_committed_xact(PG_FUNCTION_ARGS)
+{
+ TransactionId xid;
+ RepOriginId nodeid;
+ TimestampTz ts;
+ Datum values[3];
+ bool nulls[3];
+ TupleDesc tupdesc;
+ HeapTuple htup;
+
+	/* fetch the latest commit data, then construct a tuple with it */
+ xid = GetLatestCommitTsData(&ts, &nodeid);
+
+ /*
+ * Construct a tuple descriptor for the result row. This must match this
+ * function's pg_proc entry!
+ */
+ tupdesc = CreateTemplateTupleDesc(3);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 1, "xid",
+ XIDOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 2, "timestamp",
+ TIMESTAMPTZOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 3, "roident",
+ OIDOID, -1, 0);
+ tupdesc = BlessTupleDesc(tupdesc);
+
+ if (!TransactionIdIsNormal(xid))
+ {
+ memset(nulls, true, sizeof(nulls));
+ }
+ else
+ {
+ values[0] = TransactionIdGetDatum(xid);
+ nulls[0] = false;
+
+ values[1] = TimestampTzGetDatum(ts);
+ nulls[1] = false;
+
+ values[2] = ObjectIdGetDatum((Oid) nodeid);
+ nulls[2] = false;
+ }
+
+ htup = heap_form_tuple(tupdesc, values, nulls);
+
+ PG_RETURN_DATUM(HeapTupleGetDatum(htup));
+}
+
+/*
+ * pg_xact_commit_timestamp_origin
+ *
+ * SQL-callable wrapper to obtain commit timestamp and replication origin
+ * of a given transaction.
+ */
+Datum
+pg_xact_commit_timestamp_origin(PG_FUNCTION_ARGS)
+{
+ TransactionId xid = PG_GETARG_TRANSACTIONID(0);
+ RepOriginId nodeid;
+ TimestampTz ts;
+ Datum values[2];
+ bool nulls[2];
+ TupleDesc tupdesc;
+ HeapTuple htup;
+ bool found;
+
+ found = TransactionIdGetCommitTsData(xid, &ts, &nodeid);
+
+ /*
+ * Construct a tuple descriptor for the result row. This must match this
+ * function's pg_proc entry!
+ */
+ tupdesc = CreateTemplateTupleDesc(2);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 1, "timestamp",
+ TIMESTAMPTZOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 2, "roident",
+ OIDOID, -1, 0);
+ tupdesc = BlessTupleDesc(tupdesc);
+
+ if (!found)
+ {
+ memset(nulls, true, sizeof(nulls));
+ }
+ else
+ {
+ values[0] = TimestampTzGetDatum(ts);
+ nulls[0] = false;
+
+ values[1] = ObjectIdGetDatum((Oid) nodeid);
+ nulls[1] = false;
+ }
+
+ htup = heap_form_tuple(tupdesc, values, nulls);
+
+ PG_RETURN_DATUM(HeapTupleGetDatum(htup));
+}
+
+/*
+ * Number of shared CommitTS buffers.
+ *
+ * We use very similar logic to that for the number of CLOG buffers (except we
+ * scale up twice as fast with shared buffers, and the maximum is twice as
+ * high); see comments in CLOGShmemBuffers.
+ */
+Size
+CommitTsShmemBuffers(void)
+{
+ return Min(256, Max(4, NBuffers / 256));
+}
+
+/*
+ * Shared memory sizing for CommitTs
+ */
+Size
+CommitTsShmemSize(void)
+{
+ return SimpleLruShmemSize(CommitTsShmemBuffers(), 0) +
+ sizeof(CommitTimestampShared);
+}
+
+/*
+ * Initialize CommitTs at system startup (postmaster start or standalone
+ * backend)
+ */
+void
+CommitTsShmemInit(void)
+{
+ bool found;
+
+ CommitTsCtl->PagePrecedes = CommitTsPagePrecedes;
+ SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0,
+ CommitTsSLRULock, "pg_commit_ts",
+ LWTRANCHE_COMMITTS_BUFFER,
+ SYNC_HANDLER_COMMIT_TS);
+ SlruPagePrecedesUnitTests(CommitTsCtl, COMMIT_TS_XACTS_PER_PAGE);
+
+ commitTsShared = ShmemInitStruct("CommitTs shared",
+ sizeof(CommitTimestampShared),
+ &found);
+
+ if (!IsUnderPostmaster)
+ {
+ Assert(!found);
+
+ commitTsShared->xidLastCommit = InvalidTransactionId;
+ TIMESTAMP_NOBEGIN(commitTsShared->dataLastCommit.time);
+ commitTsShared->dataLastCommit.nodeid = InvalidRepOriginId;
+ commitTsShared->commitTsActive = false;
+ }
+ else
+ Assert(found);
+}
+
+/*
+ * This function must be called ONCE on system install.
+ *
+ * (The CommitTs directory is assumed to have been created by initdb, and
+ * CommitTsShmemInit must have been called already.)
+ */
+void
+BootStrapCommitTs(void)
+{
+ /*
+ * Nothing to do here at present, unlike most other SLRU modules; segments
+ * are created when the server is started with this module enabled. See
+ * ActivateCommitTs.
+ */
+}
+
+/*
+ * Initialize (or reinitialize) a page of CommitTs to zeroes.
+ * If writeXlog is true, also emit an XLOG record saying we did this.
+ *
+ * The page is not actually written, just set up in shared memory.
+ * The slot number of the new page is returned.
+ *
+ * Control lock must be held at entry, and will be held at exit.
+ */
+static int
+ZeroCommitTsPage(int pageno, bool writeXlog)
+{
+ int slotno;
+
+ slotno = SimpleLruZeroPage(CommitTsCtl, pageno);
+
+ if (writeXlog)
+ WriteZeroPageXlogRec(pageno);
+
+ return slotno;
+}
+
+/*
+ * This must be called ONCE during postmaster or standalone-backend startup,
+ * after StartupXLOG has initialized ShmemVariableCache->nextXid.
+ */
+void
+StartupCommitTs(void)
+{
+ ActivateCommitTs();
+}
+
+/*
+ * This must be called ONCE during postmaster or standalone-backend startup,
+ * after recovery has finished.
+ */
+void
+CompleteCommitTsInitialization(void)
+{
+ /*
+ * If the feature is not enabled, turn it off for good. This also removes
+ * any leftover data.
+ *
+ * Conversely, we activate the module if the feature is enabled. This is
+ * necessary on both primary and standby, as activation depends on the
+ * control file contents at the beginning of recovery or when an
+ * XLOG_PARAMETER_CHANGE record is replayed.
+ */
+ if (!track_commit_timestamp)
+ DeactivateCommitTs();
+ else
+ ActivateCommitTs();
+}
+
+/*
+ * Activate or deactivate CommitTs upon reception of an XLOG_PARAMETER_CHANGE
+ * XLOG record during recovery.
+ */
+void
+CommitTsParameterChange(bool newvalue, bool oldvalue)
+{
+ /*
+ * If the commit_ts module is disabled in this server and we get word from
+ * the primary server that it is enabled there, activate it so that we can
+ * replay future WAL records involving it; also mark it as active on
+ * pg_control. If the old value was already set, we already did this, so
+ * don't do anything.
+ *
+ * If the module is disabled in the primary, disable it here too, unless
+ * the module is enabled locally.
+ *
+ * Note this only runs in the recovery process, so an unlocked read is
+ * fine.
+ */
+ if (newvalue)
+ {
+ if (!commitTsShared->commitTsActive)
+ ActivateCommitTs();
+ }
+ else if (commitTsShared->commitTsActive)
+ DeactivateCommitTs();
+}
+
+/*
+ * Activate this module whenever necessary.
+ * This must happen during postmaster or standalone-backend startup,
+ * or during WAL replay anytime the track_commit_timestamp setting is
+ * changed in the primary.
+ *
+ * The reason this SLRU needs separate activation/deactivation functions is
+ * that it can be enabled or disabled at server start, and any activation or
+ * deactivation on the primary is propagated to the standby via replay.
+ * Other SLRUs don't have this property; they can simply be initialized
+ * during normal startup.
+ *
+ * This is in charge of creating the currently active segment, if it's not
+ * already there. The reason for this is that the server might have been
+ * running with this module disabled for a while and thus might have skipped
+ * the normal creation point.
+ */
+static void
+ActivateCommitTs(void)
+{
+ TransactionId xid;
+ int pageno;
+
+ /* If we've done this already, there's nothing to do */
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+ if (commitTsShared->commitTsActive)
+ {
+ LWLockRelease(CommitTsLock);
+ return;
+ }
+ LWLockRelease(CommitTsLock);
+
+ xid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+ pageno = TransactionIdToCTsPage(xid);
+
+ /*
+ * Reinitialize our idea of the latest page number.
+ */
+ LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE);
+ CommitTsCtl->shared->latest_page_number = pageno;
+ LWLockRelease(CommitTsSLRULock);
+
+ /*
+ * If CommitTs is enabled, but it wasn't in the previous server run, we
+ * need to set the oldest and newest values to the next Xid; that way, we
+ * will not try to read data that might not have been set.
+ *
+ * XXX does this have a problem if a server is started with commitTs
+ * enabled, then started with commitTs disabled, then restarted with it
+ * enabled again? It doesn't look like it does, because there should be a
+ * checkpoint that sets the value to InvalidTransactionId at end of
+ * recovery; and so any chance of injecting new transactions without
+ * CommitTs values would occur after the oldestCommitTsXid has been set to
+ * Invalid temporarily.
+ */
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+ if (ShmemVariableCache->oldestCommitTsXid == InvalidTransactionId)
+ {
+ ShmemVariableCache->oldestCommitTsXid =
+ ShmemVariableCache->newestCommitTsXid = ReadNextTransactionId();
+ }
+ LWLockRelease(CommitTsLock);
+
+ /* Create the current segment file, if necessary */
+ if (!SimpleLruDoesPhysicalPageExist(CommitTsCtl, pageno))
+ {
+ int slotno;
+
+ LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE);
+ slotno = ZeroCommitTsPage(pageno, false);
+ SimpleLruWritePage(CommitTsCtl, slotno);
+ Assert(!CommitTsCtl->shared->page_dirty[slotno]);
+ LWLockRelease(CommitTsSLRULock);
+ }
+
+ /* Change the activation status in shared memory. */
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+ commitTsShared->commitTsActive = true;
+ LWLockRelease(CommitTsLock);
+}
+
+/*
+ * Deactivate this module.
+ *
+ * This must be called when the track_commit_timestamp parameter is turned off.
+ * This happens during postmaster or standalone-backend startup, or during WAL
+ * replay.
+ *
+ * Resets CommitTs into invalid state to make sure we don't hand back
+ * possibly-invalid data; also removes segments of old data.
+ */
+static void
+DeactivateCommitTs(void)
+{
+ /*
+ * Clean up the state in shared memory.
+ *
+ * We reset everything in the commitTsShared record to prevent users from
+ * getting confusing data about the last committed transaction on the
+ * standby when the module was activated repeatedly on the primary.
+ */
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+
+ commitTsShared->commitTsActive = false;
+ commitTsShared->xidLastCommit = InvalidTransactionId;
+ TIMESTAMP_NOBEGIN(commitTsShared->dataLastCommit.time);
+ commitTsShared->dataLastCommit.nodeid = InvalidRepOriginId;
+
+ ShmemVariableCache->oldestCommitTsXid = InvalidTransactionId;
+ ShmemVariableCache->newestCommitTsXid = InvalidTransactionId;
+
+ LWLockRelease(CommitTsLock);
+
+ /*
+ * Remove *all* files. This is necessary so that there are no leftover
+ * files; in the case where this feature is later enabled after running
+ * with it disabled for some time, there may be a gap in the file sequence.
+ * (We can probably tolerate out-of-sequence files, as they are going to
+ * be overwritten anyway when we wrap around, but it seems better to be
+ * tidy.)
+ */
+ LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE);
+ (void) SlruScanDirectory(CommitTsCtl, SlruScanDirCbDeleteAll, NULL);
+ LWLockRelease(CommitTsSLRULock);
+}
+
+/*
+ * Perform a checkpoint --- either during shutdown, or on-the-fly
+ */
+void
+CheckPointCommitTs(void)
+{
+ /*
+ * Write dirty CommitTs pages to disk. This may result in sync requests
+ * queued for later handling by ProcessSyncRequests(), as part of the
+ * checkpoint.
+ */
+ SimpleLruWriteAll(CommitTsCtl, true);
+}
+
+/*
+ * Make sure that CommitTs has room for a newly-allocated XID.
+ *
+ * NB: this is called while holding XidGenLock. We want it to be very fast
+ * most of the time; even when it's not so fast, no actual I/O need happen
+ * unless we're forced to write out a dirty CommitTs or xlog page to make room
+ * in shared memory.
+ *
+ * NB: the current implementation relies on track_commit_timestamp being
+ * PGC_POSTMASTER.
+ */
+void
+ExtendCommitTs(TransactionId newestXact)
+{
+ int pageno;
+
+ /*
+ * Nothing to do if module not enabled. Note we do an unlocked read of
+ * the flag here, which is okay because this routine is only called from
+ * GetNewTransactionId, which is never called in a standby.
+ */
+ Assert(!InRecovery);
+ if (!commitTsShared->commitTsActive)
+ return;
+
+ /*
+ * No work except at first XID of a page. But beware: just after
+ * wraparound, the first XID of page zero is FirstNormalTransactionId.
+ */
+ if (TransactionIdToCTsEntry(newestXact) != 0 &&
+ !TransactionIdEquals(newestXact, FirstNormalTransactionId))
+ return;
+
+ pageno = TransactionIdToCTsPage(newestXact);
+
+ LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE);
+
+ /* Zero the page and make an XLOG entry about it */
+ ZeroCommitTsPage(pageno, !InRecovery);
+
+ LWLockRelease(CommitTsSLRULock);
+}
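+
+/*
+ * For the sake of a worked example (assuming default BLCKSZ, giving 819
+ * commit-timestamp entries per page): the page-zeroing work above happens
+ * only for xids 819, 1638, 2457, and so on, plus the special case of
+ * FirstNormalTransactionId (3) just after xid wraparound, which lands
+ * mid-page.
+ */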
+
+/*
+ * Remove all CommitTs segments before the one holding the passed
+ * transaction ID.
+ *
+ * Note that we don't need to flush XLOG here.
+ */
+void
+TruncateCommitTs(TransactionId oldestXact)
+{
+ int cutoffPage;
+
+ /*
+ * The cutoff point is the start of the segment containing oldestXact. We
+ * pass the *page* containing oldestXact to SimpleLruTruncate.
+ */
+ cutoffPage = TransactionIdToCTsPage(oldestXact);
+
+ /* Check to see if there are any files that could be removed */
+ if (!SlruScanDirectory(CommitTsCtl, SlruScanDirCbReportPresence,
+ &cutoffPage))
+ return; /* nothing to remove */
+
+ /* Write XLOG record */
+ WriteTruncateXlogRec(cutoffPage, oldestXact);
+
+ /* Now we can remove the old CommitTs segment(s) */
+ SimpleLruTruncate(CommitTsCtl, cutoffPage);
+}
+
+/*
+ * Set the limit values between which commit TS can be consulted.
+ */
+void
+SetCommitTsLimit(TransactionId oldestXact, TransactionId newestXact)
+{
+ /*
+ * Be careful not to overwrite values that are either further into the
+ * "future" or that signal a disabled commit_ts module.
+ */
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+ if (ShmemVariableCache->oldestCommitTsXid != InvalidTransactionId)
+ {
+ if (TransactionIdPrecedes(ShmemVariableCache->oldestCommitTsXid, oldestXact))
+ ShmemVariableCache->oldestCommitTsXid = oldestXact;
+ if (TransactionIdPrecedes(newestXact, ShmemVariableCache->newestCommitTsXid))
+ ShmemVariableCache->newestCommitTsXid = newestXact;
+ }
+ else
+ {
+ Assert(ShmemVariableCache->newestCommitTsXid == InvalidTransactionId);
+ ShmemVariableCache->oldestCommitTsXid = oldestXact;
+ ShmemVariableCache->newestCommitTsXid = newestXact;
+ }
+ LWLockRelease(CommitTsLock);
+}
+
+/*
+ * Move forwards the oldest commitTS value that can be consulted
+ */
+void
+AdvanceOldestCommitTsXid(TransactionId oldestXact)
+{
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+ if (ShmemVariableCache->oldestCommitTsXid != InvalidTransactionId &&
+ TransactionIdPrecedes(ShmemVariableCache->oldestCommitTsXid, oldestXact))
+ ShmemVariableCache->oldestCommitTsXid = oldestXact;
+ LWLockRelease(CommitTsLock);
+}
+
+
+/*
+ * Decide whether a commitTS page number is "older" for truncation purposes.
+ * Analogous to CLOGPagePrecedes().
+ *
+ * At default BLCKSZ, (1 << 31) % COMMIT_TS_XACTS_PER_PAGE == 128. This
+ * introduces differences compared to CLOG and the other SLRUs having (1 <<
+ * 31) % per_page == 0. This function never tests exactly
+ * TransactionIdPrecedes(x-2^31, x). When the system reaches xidStopLimit,
+ * there are two possible counts of page boundaries between oldestXact and the
+ * latest XID assigned, depending on whether oldestXact is within the first
+ * 128 entries of its page. Since this function doesn't know the location of
+ * oldestXact within page2, it returns false for one page that actually is
+ * expendable. This is a wider (yet still negligible) version of the
+ * truncation opportunity that CLOGPagePrecedes() cannot recognize.
+ *
+ * For the sake of a worked example, number entries with decimal values such
+ * that page1==1 entries range from 1.0 to 1.999. Let N+0.15 be the number of
+ * pages that 2^31 entries will span (N is an integer). If oldestXact=N+2.1,
+ * then the final safe XID assignment leaves newestXact=1.95. We keep page 2,
+ * because entry=2.85 is the border that toggles whether entries precede the
+ * last entry of the oldestXact page. While page 2 is expendable at
+ * oldestXact=N+2.1, it would be precious at oldestXact=N+2.9.
+ */
+static bool
+CommitTsPagePrecedes(int page1, int page2)
+{
+ TransactionId xid1;
+ TransactionId xid2;
+
+ xid1 = ((TransactionId) page1) * COMMIT_TS_XACTS_PER_PAGE;
+ xid1 += FirstNormalTransactionId + 1;
+ xid2 = ((TransactionId) page2) * COMMIT_TS_XACTS_PER_PAGE;
+ xid2 += FirstNormalTransactionId + 1;
+
+ return (TransactionIdPrecedes(xid1, xid2) &&
+ TransactionIdPrecedes(xid1, xid2 + COMMIT_TS_XACTS_PER_PAGE - 1));
+}
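+
+/*
+ * To spell out the arithmetic behind the "128" quoted above (assuming
+ * default BLCKSZ = 8192): a commit timestamp entry is 10 bytes (an 8-byte
+ * TimestampTz plus a 2-byte RepOriginId), so COMMIT_TS_XACTS_PER_PAGE =
+ * 8192 / 10 = 819, and (1 << 31) % 819 == 128.
+ */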
+
+
+/*
+ * Write a ZEROPAGE xlog record
+ */
+static void
+WriteZeroPageXlogRec(int pageno)
+{
+ XLogBeginInsert();
+ XLogRegisterData((char *) (&pageno), sizeof(int));
+ (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE);
+}
+
+/*
+ * Write a TRUNCATE xlog record
+ */
+static void
+WriteTruncateXlogRec(int pageno, TransactionId oldestXid)
+{
+ xl_commit_ts_truncate xlrec;
+
+ xlrec.pageno = pageno;
+ xlrec.oldestXid = oldestXid;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) (&xlrec), SizeOfCommitTsTruncate);
+ (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_TRUNCATE);
+}
+
+/*
+ * CommitTS resource manager's routines
+ */
+void
+commit_ts_redo(XLogReaderState *record)
+{
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+ /* Backup blocks are not used in commit_ts records */
+ Assert(!XLogRecHasAnyBlockRefs(record));
+
+ if (info == COMMIT_TS_ZEROPAGE)
+ {
+ int pageno;
+ int slotno;
+
+ memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+
+ LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE);
+
+ slotno = ZeroCommitTsPage(pageno, false);
+ SimpleLruWritePage(CommitTsCtl, slotno);
+ Assert(!CommitTsCtl->shared->page_dirty[slotno]);
+
+ LWLockRelease(CommitTsSLRULock);
+ }
+ else if (info == COMMIT_TS_TRUNCATE)
+ {
+ xl_commit_ts_truncate *trunc = (xl_commit_ts_truncate *) XLogRecGetData(record);
+
+ AdvanceOldestCommitTsXid(trunc->oldestXid);
+
+ /*
+ * During XLOG replay, latest_page_number isn't set up yet; insert a
+ * suitable value to bypass the sanity test in SimpleLruTruncate.
+ */
+ CommitTsCtl->shared->latest_page_number = trunc->pageno;
+
+ SimpleLruTruncate(CommitTsCtl, trunc->pageno);
+ }
+ else
+ elog(PANIC, "commit_ts_redo: unknown op code %u", info);
+}
+
+/*
+ * Entrypoint for sync.c to sync commit_ts files.
+ */
+int
+committssyncfiletag(const FileTag *ftag, char *path)
+{
+ return SlruSyncFileTag(CommitTsCtl, ftag, path);
+}
diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c
new file mode 100644
index 0000000..0136ca7
--- /dev/null
+++ b/src/backend/access/transam/generic_xlog.c
@@ -0,0 +1,540 @@
+/*-------------------------------------------------------------------------
+ *
+ * generic_xlog.c
+ * Implementation of generic xlog records.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/generic_xlog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/bufmask.h"
+#include "access/generic_xlog.h"
+#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "utils/memutils.h"
+
+/*-------------------------------------------------------------------------
+ * Internally, a delta between pages consists of a set of fragments. Each
+ * fragment represents changes made in a given region of a page. A fragment
+ * is made up as follows:
+ *
+ * - offset of page region (OffsetNumber)
+ * - length of page region (OffsetNumber)
+ * - data - the data to place into the region ('length' number of bytes)
+ *
+ * Unchanged regions of a page are not represented in its delta. As a result,
+ * a delta can be more compact than the full page image. But having an
+ * unchanged region between two fragments that is smaller than the fragment
+ * header (offset+length) does not pay off in terms of the overall size of
+ * the delta. For this reason, we merge adjacent fragments if the unchanged
+ * region between them is <= MATCH_THRESHOLD bytes.
+ *
+ * We do not bother to merge fragments across the "lower" and "upper" parts
+ * of a page; it's very seldom the case that pd_lower and pd_upper are within
+ * MATCH_THRESHOLD bytes of each other, and handling that infrequent case
+ * would complicate and slow down the delta-computation code unduly.
+ * Therefore, the worst-case delta size includes two fragment headers plus
+ * a full page's worth of data.
+ *-------------------------------------------------------------------------
+ */
+#define FRAGMENT_HEADER_SIZE (2 * sizeof(OffsetNumber))
+#define MATCH_THRESHOLD FRAGMENT_HEADER_SIZE
+#define MAX_DELTA_SIZE (BLCKSZ + 2 * FRAGMENT_HEADER_SIZE)
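+
+/*
+ * For the sake of a worked example (offsets hypothetical): if bytes 100..103
+ * and byte 107 of a page change, the three unchanged bytes in between are
+ * below MATCH_THRESHOLD, so one merged fragment {offset 100, length 8, plus
+ * 8 data bytes} is emitted at a cost of 12 bytes, versus 13 bytes for two
+ * separate fragments (4+4 and 4+1). Had byte 109 changed instead, leaving a
+ * five-byte unchanged gap, two separate fragments would be cheaper, and that
+ * is what the code below produces.
+ */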
+
+/* Struct of generic xlog data for single page */
+typedef struct
+{
+ Buffer buffer; /* registered buffer */
+ int flags; /* flags for this buffer */
+ int deltaLen; /* space consumed in delta field */
+ char *image; /* copy of page image for modification, do not
+ * do it in-place to have aligned memory chunk */
+ char delta[MAX_DELTA_SIZE]; /* delta between page images */
+} PageData;
+
+/* State of generic xlog record construction */
+struct GenericXLogState
+{
+ /* Info about each page, see above */
+ PageData pages[MAX_GENERIC_XLOG_PAGES];
+ bool isLogged;
+ /* Page images (properly aligned) */
+ PGAlignedBlock images[MAX_GENERIC_XLOG_PAGES];
+};
+
+static void writeFragment(PageData *pageData, OffsetNumber offset,
+ OffsetNumber len, const char *data);
+static void computeRegionDelta(PageData *pageData,
+ const char *curpage, const char *targetpage,
+ int targetStart, int targetEnd,
+ int validStart, int validEnd);
+static void computeDelta(PageData *pageData, Page curpage, Page targetpage);
+static void applyPageRedo(Page page, const char *delta, Size deltaSize);
+
+
+/*
+ * Write next fragment into pageData's delta.
+ *
+ * The fragment has the given offset and length, and data points to the
+ * actual data (of length length).
+ */
+static void
+writeFragment(PageData *pageData, OffsetNumber offset, OffsetNumber length,
+ const char *data)
+{
+ char *ptr = pageData->delta + pageData->deltaLen;
+
+ /* Verify we have enough space */
+ Assert(pageData->deltaLen + sizeof(offset) +
+ sizeof(length) + length <= sizeof(pageData->delta));
+
+ /* Write fragment data */
+ memcpy(ptr, &offset, sizeof(offset));
+ ptr += sizeof(offset);
+ memcpy(ptr, &length, sizeof(length));
+ ptr += sizeof(length);
+ memcpy(ptr, data, length);
+ ptr += length;
+
+ pageData->deltaLen = ptr - pageData->delta;
+}
+
+/*
+ * Compute the XLOG fragments needed to transform a region of curpage into the
+ * corresponding region of targetpage, and append them to pageData's delta
+ * field. The region to transform runs from targetStart to targetEnd-1.
+ * Bytes in curpage outside the range validStart to validEnd-1 should be
+ * considered invalid, and always overwritten with target data.
+ *
+ * This function is a hot spot, so it's worth being as tight as possible
+ * about the data-matching loops.
+ */
+static void
+computeRegionDelta(PageData *pageData,
+ const char *curpage, const char *targetpage,
+ int targetStart, int targetEnd,
+ int validStart, int validEnd)
+{
+ int i,
+ loopEnd,
+ fragmentBegin = -1,
+ fragmentEnd = -1;
+
+ /* Deal with any invalid start region by including it in first fragment */
+ if (validStart > targetStart)
+ {
+ fragmentBegin = targetStart;
+ targetStart = validStart;
+ }
+
+ /* We'll deal with any invalid end region after the main loop */
+ loopEnd = Min(targetEnd, validEnd);
+
+ /* Examine all the potentially matchable bytes */
+ i = targetStart;
+ while (i < loopEnd)
+ {
+ if (curpage[i] != targetpage[i])
+ {
+ /* On unmatched byte, start new fragment if not already in one */
+ if (fragmentBegin < 0)
+ fragmentBegin = i;
+ /* Mark unmatched-data endpoint as uncertain */
+ fragmentEnd = -1;
+ /* Extend the fragment as far as possible in a tight loop */
+ i++;
+ while (i < loopEnd && curpage[i] != targetpage[i])
+ i++;
+ if (i >= loopEnd)
+ break;
+ }
+
+ /* Found a matched byte, so remember end of unmatched fragment */
+ fragmentEnd = i;
+
+ /*
+ * Extend the match as far as possible in a tight loop. (On typical
+ * workloads, this inner loop is the bulk of this function's runtime.)
+ */
+ i++;
+ while (i < loopEnd && curpage[i] == targetpage[i])
+ i++;
+
+ /*
+ * There are several possible cases at this point:
+ *
+ * 1. We have no unwritten fragment (fragmentBegin < 0). There's
+ * nothing to write; and it doesn't matter what fragmentEnd is.
+ *
+ * 2. We found more than MATCH_THRESHOLD consecutive matching bytes.
+ * Dump out the unwritten fragment, stopping at fragmentEnd.
+ *
+ * 3. The match extends to loopEnd. We'll do nothing here, exit the
+ * loop, and then dump the unwritten fragment, after merging it with
+ * the invalid end region if any. If we don't so merge, fragmentEnd
+ * establishes how much the final writeFragment call needs to write.
+ *
+ * 4. We found an unmatched byte before loopEnd. The loop will repeat
+ * and will enter the unmatched-byte stanza above. So in this case
+ * also, it doesn't matter what fragmentEnd is. The matched bytes
+ * will get merged into the continuing unmatched fragment.
+ *
+ * Only in case 3 do we reach the bottom of the loop with a meaningful
+ * fragmentEnd value, which is why it's OK that we unconditionally
+ * assign "fragmentEnd = i" above.
+ */
+ if (fragmentBegin >= 0 && i - fragmentEnd > MATCH_THRESHOLD)
+ {
+ writeFragment(pageData, fragmentBegin,
+ fragmentEnd - fragmentBegin,
+ targetpage + fragmentBegin);
+ fragmentBegin = -1;
+ fragmentEnd = -1; /* not really necessary */
+ }
+ }
+
+ /* Deal with any invalid end region by including it in final fragment */
+ if (loopEnd < targetEnd)
+ {
+ if (fragmentBegin < 0)
+ fragmentBegin = loopEnd;
+ fragmentEnd = targetEnd;
+ }
+
+ /* Write final fragment if any */
+ if (fragmentBegin >= 0)
+ {
+ if (fragmentEnd < 0)
+ fragmentEnd = targetEnd;
+ writeFragment(pageData, fragmentBegin,
+ fragmentEnd - fragmentBegin,
+ targetpage + fragmentBegin);
+ }
+}
+
+/*
+ * Compute the XLOG delta record needed to transform curpage into targetpage,
+ * and store it in pageData's delta field.
+ */
+static void
+computeDelta(PageData *pageData, Page curpage, Page targetpage)
+{
+ int targetLower = ((PageHeader) targetpage)->pd_lower,
+ targetUpper = ((PageHeader) targetpage)->pd_upper,
+ curLower = ((PageHeader) curpage)->pd_lower,
+ curUpper = ((PageHeader) curpage)->pd_upper;
+
+ pageData->deltaLen = 0;
+
+ /* Compute delta records for lower part of page ... */
+ computeRegionDelta(pageData, curpage, targetpage,
+ 0, targetLower,
+ 0, curLower);
+ /* ... and for upper part, ignoring what's between */
+ computeRegionDelta(pageData, curpage, targetpage,
+ targetUpper, BLCKSZ,
+ curUpper, BLCKSZ);
+
+ /*
+ * If xlog debug is enabled, then check produced delta. Result of delta
+ * application to curpage should be equivalent to targetpage.
+ */
+#ifdef WAL_DEBUG
+ if (XLOG_DEBUG)
+ {
+ PGAlignedBlock tmp;
+
+ memcpy(tmp.data, curpage, BLCKSZ);
+ applyPageRedo(tmp.data, pageData->delta, pageData->deltaLen);
+ if (memcmp(tmp.data, targetpage, targetLower) != 0 ||
+ memcmp(tmp.data + targetUpper, targetpage + targetUpper,
+ BLCKSZ - targetUpper) != 0)
+ elog(ERROR, "result of generic xlog apply does not match");
+ }
+#endif
+}
+
+/*
+ * Start new generic xlog record for modifications to specified relation.
+ */
+GenericXLogState *
+GenericXLogStart(Relation relation)
+{
+ GenericXLogState *state;
+ int i;
+
+ state = (GenericXLogState *) palloc(sizeof(GenericXLogState));
+ state->isLogged = RelationNeedsWAL(relation);
+
+ for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
+ {
+ state->pages[i].image = state->images[i].data;
+ state->pages[i].buffer = InvalidBuffer;
+ }
+
+ return state;
+}
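+
+/*
+ * A minimal usage sketch (the names "rel" and "buffer" are hypothetical;
+ * the caller is assumed to hold a pin and exclusive lock on the buffer):
+ *
+ *     GenericXLogState *state = GenericXLogStart(rel);
+ *     Page page = GenericXLogRegisterBuffer(state, buffer, 0);
+ *
+ *     (modify the returned page image, not the buffer's page directly)
+ *
+ *     GenericXLogFinish(state);
+ *
+ * On error, GenericXLogAbort should be called instead to discard the
+ * staged changes.
+ */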
+
+/*
+ * Register new buffer for generic xlog record.
+ *
+ * Returns pointer to the page's image in the GenericXLogState, which
+ * is what the caller should modify.
+ *
+ * If the buffer is already registered, just return its existing entry.
+ * (It's not very clear what to do with the flags in such a case, but
+ * for now we stay with the original flags.)
+ */
+Page
+GenericXLogRegisterBuffer(GenericXLogState *state, Buffer buffer, int flags)
+{
+ int block_id;
+
+ /* Search array for existing entry or first unused slot */
+ for (block_id = 0; block_id < MAX_GENERIC_XLOG_PAGES; block_id++)
+ {
+ PageData *page = &state->pages[block_id];
+
+ if (BufferIsInvalid(page->buffer))
+ {
+ /* Empty slot, so use it (there cannot be a match later) */
+ page->buffer = buffer;
+ page->flags = flags;
+ memcpy(page->image, BufferGetPage(buffer), BLCKSZ);
+ return (Page) page->image;
+ }
+ else if (page->buffer == buffer)
+ {
+ /*
+ * Buffer is already registered. Just return the image, which is
+ * already prepared.
+ */
+ return (Page) page->image;
+ }
+ }
+
+ elog(ERROR, "maximum number %d of generic xlog buffers is exceeded",
+ MAX_GENERIC_XLOG_PAGES);
+ /* keep compiler quiet */
+ return NULL;
+}
+
+/*
+ * Apply changes represented by GenericXLogState to the actual buffers,
+ * and emit a generic xlog record.
+ */
+XLogRecPtr
+GenericXLogFinish(GenericXLogState *state)
+{
+ XLogRecPtr lsn;
+ int i;
+
+ if (state->isLogged)
+ {
+ /* Logged relation: make xlog record in critical section. */
+ XLogBeginInsert();
+
+ START_CRIT_SECTION();
+
+ /*
+ * Compute deltas if necessary, write changes to buffers, mark
+ * buffers dirty, and register changes.
+ */
+ for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
+ {
+ PageData *pageData = &state->pages[i];
+ Page page;
+ PageHeader pageHeader;
+
+ if (BufferIsInvalid(pageData->buffer))
+ continue;
+
+ page = BufferGetPage(pageData->buffer);
+ pageHeader = (PageHeader) pageData->image;
+
+ /*
+ * Compute delta while we still have both the unmodified page and
+ * the new image. Not needed if we are logging the full image.
+ */
+ if (!(pageData->flags & GENERIC_XLOG_FULL_IMAGE))
+ computeDelta(pageData, page, (Page) pageData->image);
+
+ /*
+ * Apply the image, being careful to zero the "hole" between
+ * pd_lower and pd_upper in order to avoid divergence between
+ * actual page state and what replay would produce.
+ */
+ memcpy(page, pageData->image, pageHeader->pd_lower);
+ memset(page + pageHeader->pd_lower, 0,
+ pageHeader->pd_upper - pageHeader->pd_lower);
+ memcpy(page + pageHeader->pd_upper,
+ pageData->image + pageHeader->pd_upper,
+ BLCKSZ - pageHeader->pd_upper);
+
+ MarkBufferDirty(pageData->buffer);
+
+ if (pageData->flags & GENERIC_XLOG_FULL_IMAGE)
+ {
+ XLogRegisterBuffer(i, pageData->buffer,
+ REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
+ }
+ else
+ {
+ XLogRegisterBuffer(i, pageData->buffer, REGBUF_STANDARD);
+ XLogRegisterBufData(i, pageData->delta, pageData->deltaLen);
+ }
+ }
+
+ /* Insert xlog record */
+ lsn = XLogInsert(RM_GENERIC_ID, 0);
+
+ /* Set LSN */
+ for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
+ {
+ PageData *pageData = &state->pages[i];
+
+ if (BufferIsInvalid(pageData->buffer))
+ continue;
+ PageSetLSN(BufferGetPage(pageData->buffer), lsn);
+ }
+ END_CRIT_SECTION();
+ }
+ else
+ {
+ /* Unlogged relation: skip xlog-related stuff */
+ START_CRIT_SECTION();
+ for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)
+ {
+ PageData *pageData = &state->pages[i];
+
+ if (BufferIsInvalid(pageData->buffer))
+ continue;
+ memcpy(BufferGetPage(pageData->buffer),
+ pageData->image,
+ BLCKSZ);
+ /* We don't worry about zeroing the "hole" in this case */
+ MarkBufferDirty(pageData->buffer);
+ }
+ END_CRIT_SECTION();
+ /* We don't have an LSN to return in this case */
+ lsn = InvalidXLogRecPtr;
+ }
+
+ pfree(state);
+
+ return lsn;
+}
+
+/*
+ * Abort generic xlog record construction. No changes are applied to buffers.
+ *
+ * Note: caller is responsible for releasing locks/pins on buffers, if needed.
+ */
+void
+GenericXLogAbort(GenericXLogState *state)
+{
+ pfree(state);
+}
+
+/*
+ * Apply delta to given page image.
+ */
+static void
+applyPageRedo(Page page, const char *delta, Size deltaSize)
+{
+ const char *ptr = delta;
+ const char *end = delta + deltaSize;
+
+ while (ptr < end)
+ {
+ OffsetNumber offset,
+ length;
+
+ memcpy(&offset, ptr, sizeof(offset));
+ ptr += sizeof(offset);
+ memcpy(&length, ptr, sizeof(length));
+ ptr += sizeof(length);
+
+ memcpy(page + offset, ptr, length);
+
+ ptr += length;
+ }
+}
+
+/*
+ * Redo function for generic xlog record.
+ */
+void
+generic_redo(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ Buffer buffers[MAX_GENERIC_XLOG_PAGES];
+ uint8 block_id;
+
+ /* Protect limited size of buffers[] array */
+ Assert(XLogRecMaxBlockId(record) < MAX_GENERIC_XLOG_PAGES);
+
+ /* Iterate over blocks */
+ for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
+ {
+ XLogRedoAction action;
+
+ if (!XLogRecHasBlockRef(record, block_id))
+ {
+ buffers[block_id] = InvalidBuffer;
+ continue;
+ }
+
+ action = XLogReadBufferForRedo(record, block_id, &buffers[block_id]);
+
+ /* Apply redo to given block if needed */
+ if (action == BLK_NEEDS_REDO)
+ {
+ Page page;
+ PageHeader pageHeader;
+ char *blockDelta;
+ Size blockDeltaSize;
+
+ page = BufferGetPage(buffers[block_id]);
+ blockDelta = XLogRecGetBlockData(record, block_id, &blockDeltaSize);
+ applyPageRedo(page, blockDelta, blockDeltaSize);
+
+ /*
+ * Since the delta contains no information about what's in the
+ * "hole" between pd_lower and pd_upper, set that to zero to
+ * ensure we produce the same page state that GenericXLogFinish
+ * produced when the action was originally logged.
+ */
+ pageHeader = (PageHeader) page;
+ memset(page + pageHeader->pd_lower, 0,
+ pageHeader->pd_upper - pageHeader->pd_lower);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffers[block_id]);
+ }
+ }
+
+ /* Changes are done: unlock and release all buffers */
+ for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
+ {
+ if (BufferIsValid(buffers[block_id]))
+ UnlockReleaseBuffer(buffers[block_id]);
+ }
+}
+
+/*
+ * Mask a generic page before performing consistency checks on it.
+ */
+void
+generic_mask(char *page, BlockNumber blkno)
+{
+ mask_page_lsn_and_checksum(page);
+
+ mask_unused_space(page);
+}
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
new file mode 100644
index 0000000..b8b1773
--- /dev/null
+++ b/src/backend/access/transam/multixact.c
@@ -0,0 +1,3428 @@
+/*-------------------------------------------------------------------------
+ *
+ * multixact.c
+ * PostgreSQL multi-transaction-log manager
+ *
+ * The pg_multixact manager is a pg_xact-like manager that stores an array of
+ * MultiXactMember for each MultiXactId. It is a fundamental part of the
+ * shared-row-lock implementation. Each MultiXactMember is comprised of a
+ * TransactionId and a set of flag bits. The name is a bit historical:
+ * originally, a MultiXactId consisted of more than one TransactionId (except
+ * in rare corner cases), hence "multi". Nowadays, however, it's perfectly
+ * legitimate to have MultiXactIds that only include a single Xid.
+ *
+ * The meaning of the flag bits is opaque to this module, but they are mostly
+ * used in heapam.c to identify lock modes that each of the member transactions
+ * is holding on any given tuple. This module just contains support to store
+ * and retrieve the arrays.
+ *
+ * We use two SLRU areas: one stores, for each MultiXactId, the offset at
+ * which its member data starts in the other area. This trick allows us to
+ * store variable-length arrays of TransactionIds. (We could alternatively
+ * use one area containing counts and TransactionIds, with valid MultiXactId
+ * values pointing at slots containing counts; but that way seems less robust
+ * since it would get completely confused if someone inquired about a bogus
+ * MultiXactId that pointed to an intermediate slot containing an XID.)
+ *
+ * XLOG interactions: this module generates a record whenever a new OFFSETs or
+ * MEMBERs page is initialized to zeroes, as well as an
+ * XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined.
+ * This module ignores the WAL rule "write xlog before data," because it
+ * suffices that actions recording a MultiXactId in a heap xmax do follow that
+ * rule. The only way for the MXID to be referenced from any data page is for
+ * heap_lock_tuple() or heap_update() to have put it there, and each generates
+ * an XLOG record that must follow ours. The normal LSN interlock between the
+ * data page and that XLOG record will ensure that our XLOG record reaches
+ * disk first. If the SLRU members/offsets data reaches disk sooner than the
+ * XLOG records, we do not care; after recovery, no xmax will refer to it. On
+ * the flip side, to ensure that all referenced entries _do_ reach disk, this
+ * module's XLOG records completely rebuild the data entered since the last
+ * checkpoint. We flush and sync all dirty OFFSETs and MEMBERs pages to disk
+ * before each checkpoint is considered complete.
+ *
+ * Like clog.c, and unlike subtrans.c, we have to preserve state across
+ * crashes and ensure that MXID and offset numbering increases monotonically
+ * across a crash. We do this in the same way as it's done for transaction
+ * IDs: the WAL record is guaranteed to contain evidence of every MXID we
+ * could need to worry about, and we just make sure that at the end of
+ * replay, the next-MXID and next-offset counters are at least as large as
+ * anything we saw during replay.
+ *
+ * We are able to remove segments no longer necessary by carefully tracking
+ * each table's used values: during vacuum, any multixact older than a certain
+ * value is removed; the cutoff value is stored in pg_class. The minimum value
+ * across all tables in each database is stored in pg_database, and the global
+ * minimum across all databases is part of pg_control and is kept in shared
+ * memory. Whenever that minimum is advanced, the SLRUs are truncated.
+ *
+ * When new multixactid values are to be created, care is taken that the
+ * counter does not fall within the wraparound horizon computed from that
+ * global minimum value.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/multixact.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/multixact.h"
+#include "access/slru.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/twophase_rmgr.h"
+#include "access/xact.h"
+#include "access/xloginsert.h"
+#include "access/xlogutils.h"
+#include "catalog/pg_type.h"
+#include "commands/dbcommands.h"
+#include "funcapi.h"
+#include "lib/ilist.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "postmaster/autovacuum.h"
+#include "storage/lmgr.h"
+#include "storage/pmsignal.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+#include "utils/snapmgr.h"
+
+
+/*
+ * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is
+ * used everywhere else in Postgres.
+ *
+ * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
+ * MultiXact page numbering also wraps around at
+ * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
+ * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need
+ * take no explicit notice of that fact in this module, except when comparing
+ * segment and page numbers in TruncateMultiXact (see
+ * MultiXactOffsetPagePrecedes).
+ */
+
+/* We need four bytes per offset */
+#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
+
+#define MultiXactIdToOffsetPage(xid) \
+ ((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
+#define MultiXactIdToOffsetEntry(xid) \
+ ((xid) % (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
+#define MultiXactIdToOffsetSegment(xid) (MultiXactIdToOffsetPage(xid) / SLRU_PAGES_PER_SEGMENT)
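+
+/*
+ * For the sake of a worked example (default BLCKSZ): MultiXactOffset is 4
+ * bytes wide, so MULTIXACT_OFFSETS_PER_PAGE = 8192 / 4 = 2048; page
+ * numbering therefore wraps at 0xFFFFFFFF / 2048 = 2097151, and segment
+ * numbering at 2097151 / 32 = 65535 (with SLRU_PAGES_PER_SEGMENT = 32).
+ */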
+
+/*
+ * The situation for members is a bit more complex: we store one byte of
+ * additional flag bits for each TransactionId. To do this without getting
+ * into alignment issues, we store four bytes of flags, and then the
+ * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and
+ * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
+ * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and
+ * performance) trumps space efficiency here.
+ *
+ * Note that the "offset" macros work with byte offset, not array indexes, so
+ * arithmetic must be done using "char *" pointers.
+ */
+/* We need eight bits per xact, so one xact fits in a byte */
+#define MXACT_MEMBER_BITS_PER_XACT 8
+#define MXACT_MEMBER_FLAGS_PER_BYTE 1
+#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
+
+/* how many full bytes of flags are there in a group? */
+#define MULTIXACT_FLAGBYTES_PER_GROUP 4
+#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \
+ (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
+/* size in bytes of a complete group */
+#define MULTIXACT_MEMBERGROUP_SIZE \
+ (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERS_PER_PAGE \
+ (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
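+
+/*
+ * For the sake of a worked example (default BLCKSZ and 4-byte
+ * TransactionIds): MULTIXACT_MEMBERGROUP_SIZE = 4 * 4 + 4 = 20 bytes, so a
+ * page holds 8192 / 20 = 409 groups = 1636 members, occupying 8180 bytes
+ * and wasting the remaining 12 -- matching the figures quoted above.
+ */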
+
+/*
+ * Because the number of items per page is not a divisor of the last item
+ * number (member 0xFFFFFFFF), the last segment does not use the maximum number
+ * of pages, and moreover the last used page therein does not use the same
+ * number of items as previous pages. (Another way to say it is that the
+ * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page
+ * has some empty space after that item.)
+ *
+ * This constant is the number of members in the last page of the last segment.
+ */
+#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \
+ ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1))
+
+/* page in which a member is to be found */
+#define MXOffsetToMemberPage(xid) ((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE)
+#define MXOffsetToMemberSegment(xid) (MXOffsetToMemberPage(xid) / SLRU_PAGES_PER_SEGMENT)
+
+/* Location (byte offset within page) of flag word for a given member */
+#define MXOffsetToFlagsOffset(xid) \
+ ((((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) % \
+ (TransactionId) MULTIXACT_MEMBERGROUPS_PER_PAGE) * \
+ (TransactionId) MULTIXACT_MEMBERGROUP_SIZE)
+#define MXOffsetToFlagsBitShift(xid) \
+ (((xid) % (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) * \
+ MXACT_MEMBER_BITS_PER_XACT)
+
+/* Location (byte offset within page) of TransactionId of given member */
+#define MXOffsetToMemberOffset(xid) \
+ (MXOffsetToFlagsOffset(xid) + MULTIXACT_FLAGBYTES_PER_GROUP + \
+ ((xid) % MULTIXACT_MEMBERS_PER_MEMBERGROUP) * sizeof(TransactionId))
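+
+/*
+ * Continuing the worked example (member offset 1000, hypothetical): that
+ * member lives on page 1000 / 1636 = 0, in group 1000 / 4 = 250 of that
+ * page, so its flags word starts at byte offset 250 * 20 = 5000; its flag
+ * bits sit at bit shift (1000 % 4) * 8 = 0 within that word, and its
+ * TransactionId is stored at byte offset 5000 + 4 + 0 * 4 = 5004.
+ */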
+
+/* Multixact members wraparound thresholds. */
+#define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2)
+#define MULTIXACT_MEMBER_DANGER_THRESHOLD \
+ (MaxMultiXactOffset - MaxMultiXactOffset / 4)
+
+#define PreviousMultiXactId(xid) \
+ ((xid) == FirstMultiXactId ? MaxMultiXactId : (xid) - 1)
+
+/*
+ * Links to shared-memory data structures for MultiXact control
+ */
+static SlruCtlData MultiXactOffsetCtlData;
+static SlruCtlData MultiXactMemberCtlData;
+
+#define MultiXactOffsetCtl (&MultiXactOffsetCtlData)
+#define MultiXactMemberCtl (&MultiXactMemberCtlData)
+
+/*
+ * MultiXact state shared across all backends. All this state is protected
+ * by MultiXactGenLock. (We also use MultiXactOffsetSLRULock and
+ * MultiXactMemberSLRULock to guard accesses to the two sets of SLRU
+ * buffers. For concurrency's sake, we avoid holding more than one of these
+ * locks at a time.)
+ */
+typedef struct MultiXactStateData
+{
+ /* next-to-be-assigned MultiXactId */
+ MultiXactId nextMXact;
+
+ /* next-to-be-assigned offset */
+ MultiXactOffset nextOffset;
+
+ /* Have we completed multixact startup? */
+ bool finishedStartup;
+
+ /*
+ * Oldest multixact that is still potentially referenced by a relation.
+ * Anything older than this should not be consulted. These values are
+ * updated by vacuum.
+ */
+ MultiXactId oldestMultiXactId;
+ Oid oldestMultiXactDB;
+
+ /*
+ * Oldest multixact offset that is potentially referenced by a multixact
+ * referenced by a relation. We don't always know this value, so there's
+ * a flag here to indicate whether or not we currently do.
+ */
+ MultiXactOffset oldestOffset;
+ bool oldestOffsetKnown;
+
+ /* support for anti-wraparound measures */
+ MultiXactId multiVacLimit;
+ MultiXactId multiWarnLimit;
+ MultiXactId multiStopLimit;
+ MultiXactId multiWrapLimit;
+
+ /* support for members anti-wraparound measures */
+ MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */
+
+ /*
+ * Per-backend data starts here. We have two arrays stored in the area
+ * immediately following the MultiXactStateData struct. Each is indexed by
+ * BackendId.
+ *
+ * In both arrays, there are slots for all normal backends (1..MaxBackends),
+ * followed by slots for max_prepared_xacts prepared transactions. Valid
+ * BackendIds start from 1; element zero of each array is never used.
+ *
+ * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current
+ * transaction(s) could possibly be a member of, or InvalidMultiXactId
+ * when the backend has no live transaction that could possibly be a
+ * member of a MultiXact. Each backend sets its entry to the current
+ * nextMXact counter just before first acquiring a shared lock in a given
+ * transaction, and clears it at transaction end. (This works because only
+ * during or after acquiring a shared lock could an XID possibly become a
+ * member of a MultiXact, and that MultiXact would have to be created
+ * during or after the lock acquisition.)
+ *
+ * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's
+ * current transaction(s) think is potentially live, or InvalidMultiXactId
+ * when not in a transaction or not in a transaction that's paid any
+ * attention to MultiXacts yet. This is computed when first needed in a
+ * given transaction, and cleared at transaction end. We can compute it
+ * as the minimum of the valid OldestMemberMXactId[] entries at the time
+ * we compute it (using nextMXact if none are valid). Each backend is
+ * required not to attempt to access any SLRU data for MultiXactIds older
+ * than its own OldestVisibleMXactId[] setting; this is necessary because
+ * the checkpointer could truncate away such data at any instant.
+ *
+ * The oldest valid value among all of the OldestMemberMXactId[] and
+ * OldestVisibleMXactId[] entries is considered by vacuum as the earliest
+ * possible value still having any live member transaction. Subtracting
+ * vacuum_multixact_freeze_min_age from that value we obtain the freezing
+ * point for multixacts for that table. Any value older than that is
+ * removed from tuple headers (or "frozen"; see FreezeMultiXactId. Note
+ * that multis that have member xids that are older than the cutoff point
+ * for xids must also be frozen, even if the multis themselves are newer
+ * than the multixid cutoff point). Whenever a full table vacuum happens,
+ * the freezing point so computed is used as the new pg_class.relminmxid
+ * value. The minimum of all those values in a database is stored as
+ * pg_database.datminmxid. In turn, the minimum of all of those values is
+ * stored in pg_control and used as truncation point for pg_multixact. At
+ * checkpoint or restartpoint, unneeded segments are removed.
+ */
+ MultiXactId perBackendXactIds[FLEXIBLE_ARRAY_MEMBER];
+} MultiXactStateData;
+
+/*
+ * Last element of OldestMemberMXactId and OldestVisibleMXactId arrays.
+ * Valid elements are (1..MaxOldestSlot); element 0 is never used.
+ */
+#define MaxOldestSlot (MaxBackends + max_prepared_xacts)
+
+/* Pointers to the state data in shared memory */
+static MultiXactStateData *MultiXactState;
+static MultiXactId *OldestMemberMXactId;
+static MultiXactId *OldestVisibleMXactId;
+
+
+/*
+ * Definitions for the backend-local MultiXactId cache.
+ *
+ * We use this cache to store known MultiXacts, so we don't need to go to
+ * SLRU areas every time.
+ *
+ * The cache lasts for the duration of a single transaction, the rationale
+ * for this being that most entries will contain our own TransactionId and
+ * so they will be uninteresting by the time our next transaction starts.
+ * (XXX not clear that this is correct --- other members of the MultiXact
+ * could hang around longer than we did. However, it's not clear what a
+ * better policy for flushing old cache entries would be.) FIXME actually
+ * this is plain wrong now that multixacts may contain update Xids.
+ *
+ * We allocate the cache entries in a memory context that is deleted at
+ * transaction end, so we don't need to do retail freeing of entries.
+ */
+typedef struct mXactCacheEnt
+{
+ MultiXactId multi;
+ int nmembers;
+ dlist_node node;
+ MultiXactMember members[FLEXIBLE_ARRAY_MEMBER];
+} mXactCacheEnt;
+
+#define MAX_CACHE_ENTRIES 256
+static dlist_head MXactCache = DLIST_STATIC_INIT(MXactCache);
+static int MXactCacheMembers = 0;
+static MemoryContext MXactContext = NULL;
+
+#ifdef MULTIXACT_DEBUG
+#define debug_elog2(a,b) elog(a,b)
+#define debug_elog3(a,b,c) elog(a,b,c)
+#define debug_elog4(a,b,c,d) elog(a,b,c,d)
+#define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e)
+#define debug_elog6(a,b,c,d,e,f) elog(a,b,c,d,e,f)
+#else
+#define debug_elog2(a,b)
+#define debug_elog3(a,b,c)
+#define debug_elog4(a,b,c,d)
+#define debug_elog5(a,b,c,d,e)
+#define debug_elog6(a,b,c,d,e,f)
+#endif
+
+/* internal MultiXactId management */
+static void MultiXactIdSetOldestVisible(void);
+static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
+ int nmembers, MultiXactMember *members);
+static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
+
+/* MultiXact cache management */
+static int mxactMemberComparator(const void *arg1, const void *arg2);
+static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members);
+static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members);
+static void mXactCachePut(MultiXactId multi, int nmembers,
+ MultiXactMember *members);
+
+static char *mxstatus_to_string(MultiXactStatus status);
+
+/* management of SLRU infrastructure */
+static int ZeroMultiXactOffsetPage(int pageno, bool writeXlog);
+static int ZeroMultiXactMemberPage(int pageno, bool writeXlog);
+static bool MultiXactOffsetPagePrecedes(int page1, int page2);
+static bool MultiXactMemberPagePrecedes(int page1, int page2);
+static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
+ MultiXactOffset offset2);
+static void ExtendMultiXactOffset(MultiXactId multi);
+static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
+static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
+ MultiXactOffset start, uint32 distance);
+static bool SetOffsetVacuumLimit(bool is_startup);
+static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
+static void WriteMZeroPageXlogRec(int pageno, uint8 info);
+static void WriteMTruncateXlogRec(Oid oldestMultiDB,
+ MultiXactId startTruncOff,
+ MultiXactId endTruncOff,
+ MultiXactOffset startTruncMemb,
+ MultiXactOffset endTruncMemb);
+
+
+/*
+ * MultiXactIdCreate
+ * Construct a MultiXactId representing two TransactionIds.
+ *
+ * The two XIDs must be different, or be requesting different statuses.
+ *
+ * NB - we don't worry about our local MultiXactId cache here, because that
+ * is handled by the lower-level routines.
+ */
+MultiXactId
+MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
+ TransactionId xid2, MultiXactStatus status2)
+{
+ MultiXactId newMulti;
+ MultiXactMember members[2];
+
+ AssertArg(TransactionIdIsValid(xid1));
+ AssertArg(TransactionIdIsValid(xid2));
+
+ Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2));
+
+ /* MultiXactIdSetOldestMember() must have been called already. */
+ Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));
+
+ /*
+ * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
+ * are still running. In typical usage, xid2 will be our own XID and the
+ * caller just did a check on xid1, so it'd be wasted effort.
+ */
+
+ members[0].xid = xid1;
+ members[0].status = status1;
+ members[1].xid = xid2;
+ members[1].status = status2;
+
+ newMulti = MultiXactIdCreateFromMembers(2, members);
+
+ debug_elog3(DEBUG2, "Create: %s",
+ mxid_to_string(newMulti, 2, members));
+
+ return newMulti;
+}
+
+/*
+ * MultiXactIdExpand
+ * Add a TransactionId to a pre-existing MultiXactId.
+ *
+ * If the TransactionId is already a member of the passed MultiXactId with the
+ * same status, just return it as-is.
+ *
+ * Note that we do NOT actually modify the membership of a pre-existing
+ * MultiXactId; instead we create a new one. This is necessary to avoid
+ * a race condition against code trying to wait for one MultiXactId to finish;
+ * see notes in heapam.c.
+ *
+ * NB - we don't worry about our local MultiXactId cache here, because that
+ * is handled by the lower-level routines.
+ *
+ * Note: It is critical that MultiXactIds that come from an old cluster (i.e.
+ * one upgraded by pg_upgrade from a cluster older than this feature) are not
+ * passed in.
+ */
+MultiXactId
+MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
+{
+ MultiXactId newMulti;
+ MultiXactMember *members;
+ MultiXactMember *newMembers;
+ int nmembers;
+ int i;
+ int j;
+
+ AssertArg(MultiXactIdIsValid(multi));
+ AssertArg(TransactionIdIsValid(xid));
+
+ /* MultiXactIdSetOldestMember() must have been called already. */
+ Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));
+
+ debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s",
+ multi, xid, mxstatus_to_string(status));
+
+ /*
+ * Note: we don't allow for old multis here. The reason is that the only
+ * caller of this function does a check that the multixact is no longer
+ * running.
+ */
+ nmembers = GetMultiXactIdMembers(multi, &members, false, false);
+
+ if (nmembers < 0)
+ {
+ MultiXactMember member;
+
+ /*
+ * The MultiXactId is obsolete. This can only happen if all the
+ * MultiXactId members stop running between the caller checking and
+ * passing it to us. It would be better to return that fact to the
+ * caller, but it would complicate the API and it's unlikely to happen
+ * too often, so just deal with it by creating a singleton MultiXact.
+ */
+ member.xid = xid;
+ member.status = status;
+ newMulti = MultiXactIdCreateFromMembers(1, &member);
+
+ debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
+ multi, newMulti);
+ return newMulti;
+ }
+
+ /*
+ * If the TransactionId is already a member of the MultiXactId with the
+ * same status, just return the existing MultiXactId.
+ */
+ for (i = 0; i < nmembers; i++)
+ {
+ if (TransactionIdEquals(members[i].xid, xid) &&
+ (members[i].status == status))
+ {
+ debug_elog4(DEBUG2, "Expand: %u is already a member of %u",
+ xid, multi);
+ pfree(members);
+ return multi;
+ }
+ }
+
+ /*
+ * Determine which of the members of the MultiXactId are still of
+ * interest. This is any running transaction, and also any transaction
+ * that grabbed something stronger than just a lock and was committed. (An
+ * update that aborted is of no interest here; and having more than one
+ * update Xid in a multixact would cause errors elsewhere.)
+ *
+ * Removing dead members is not just an optimization: freezing of tuples
+ * whose Xmax are multis depends on this behavior.
+ *
+ * Note we have the same race condition here as above: j could be 0 at the
+ * end of the loop.
+ */
+ newMembers = (MultiXactMember *)
+ palloc(sizeof(MultiXactMember) * (nmembers + 1));
+
+ for (i = 0, j = 0; i < nmembers; i++)
+ {
+ if (TransactionIdIsInProgress(members[i].xid) ||
+ (ISUPDATE_from_mxstatus(members[i].status) &&
+ TransactionIdDidCommit(members[i].xid)))
+ {
+ newMembers[j].xid = members[i].xid;
+ newMembers[j++].status = members[i].status;
+ }
+ }
+
+ newMembers[j].xid = xid;
+ newMembers[j++].status = status;
+ newMulti = MultiXactIdCreateFromMembers(j, newMembers);
+
+ pfree(members);
+ pfree(newMembers);
+
+ debug_elog3(DEBUG2, "Expand: returning new multi %u", newMulti);
+
+ return newMulti;
+}
+
+/*
+ * MultiXactIdIsRunning
+ * Returns whether a MultiXactId is "running".
+ *
+ * We return true if at least one member of the given MultiXactId is still
+ * running. Note that a "false" result is certain not to change,
+ * because it is not legal to add members to an existing MultiXactId.
+ *
+ * Caller is expected to have verified that the multixact does not come from
+ * a pg_upgraded share-locked tuple.
+ */
+bool
+MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
+{
+ MultiXactMember *members;
+ int nmembers;
+ int i;
+
+ debug_elog3(DEBUG2, "IsRunning %u?", multi);
+
+ /*
+ * "false" here means we assume our callers have checked that the given
+ * multi cannot possibly come from a pg_upgraded database.
+ */
+ nmembers = GetMultiXactIdMembers(multi, &members, false, isLockOnly);
+
+ if (nmembers <= 0)
+ {
+ debug_elog2(DEBUG2, "IsRunning: no members");
+ return false;
+ }
+
+ /*
+ * Checking for myself is cheap compared to looking in shared memory;
+ * return true if any live subtransaction of the current top-level
+ * transaction is a member.
+ *
+ * This is not needed for correctness, it's just a fast path.
+ */
+ for (i = 0; i < nmembers; i++)
+ {
+ if (TransactionIdIsCurrentTransactionId(members[i].xid))
+ {
+ debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i);
+ pfree(members);
+ return true;
+ }
+ }
+
+ /*
+ * This could be made faster by having another entry point in procarray.c,
+ * walking the PGPROC array only once for all the members. But in most
+ * cases nmembers should be small enough that it doesn't much matter.
+ */
+ for (i = 0; i < nmembers; i++)
+ {
+ if (TransactionIdIsInProgress(members[i].xid))
+ {
+ debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running",
+ i, members[i].xid);
+ pfree(members);
+ return true;
+ }
+ }
+
+ pfree(members);
+
+ debug_elog3(DEBUG2, "IsRunning: %u is not running", multi);
+
+ return false;
+}
+
+/*
+ * MultiXactIdSetOldestMember
+ * Save the oldest MultiXactId this transaction could be a member of.
+ *
+ * We set the OldestMemberMXactId for a given transaction the first time it's
+ * going to do some operation that might require a MultiXactId (tuple lock,
+ * update or delete). We need to do this even if we end up using a
+ * TransactionId instead of a MultiXactId, because there is a chance that
+ * another transaction would add our XID to a MultiXactId.
+ *
+ * The value to set is the next-to-be-assigned MultiXactId, so this is meant to
+ * be called just before doing any such possibly-MultiXactId-able operation.
+ */
+void
+MultiXactIdSetOldestMember(void)
+{
+ if (!MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]))
+ {
+ MultiXactId nextMXact;
+
+ /*
+ * You might think we don't need to acquire a lock here, since
+ * fetching and storing of TransactionIds is probably atomic, but in
+ * fact we do: suppose we pick up nextMXact and then lose the CPU for
+ * a long time. Someone else could advance nextMXact, and then
+ * another someone else could compute an OldestVisibleMXactId that
+ * would be after the value we are going to store when we get control
+ * back. Which would be wrong.
+ *
+ * Note that a shared lock is sufficient, because it's enough to stop
+ * someone from advancing nextMXact; and nobody else could be trying
+ * to write to our OldestMember entry, only reading (and we assume
+ * storing it is atomic.)
+ */
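+
+		/*
+		 * Illustrative interleaving that the lock prevents (a sketch, not
+		 * an actual trace):
+		 *
+		 *		backend A: reads nextMXact = 100 without the lock
+		 *		backend B: advances nextMXact to 150
+		 *		backend C: computes an OldestVisibleMXactId of 150
+		 *		backend A: stores OldestMember[A] = 100, older than C's answer
+		 */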
+ LWLockAcquire(MultiXactGenLock, LW_SHARED);
+
+ /*
+ * We have to beware of the possibility that nextMXact is in the
+ * wrapped-around state. We don't fix the counter itself here, but we
+ * must be sure to store a valid value in our array entry.
+ */
+ nextMXact = MultiXactState->nextMXact;
+ if (nextMXact < FirstMultiXactId)
+ nextMXact = FirstMultiXactId;
+
+ OldestMemberMXactId[MyBackendId] = nextMXact;
+
+ LWLockRelease(MultiXactGenLock);
+
+ debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u",
+ MyBackendId, nextMXact);
+ }
+}
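+
+/*
+ * Sketch of the expected call order (illustrative only; the real callers
+ * live in heapam.c):
+ *
+ *		MultiXactIdSetOldestMember();
+ *		multi = MultiXactIdCreate(xid1, status1, xid2, status2);
+ *
+ * Setting the oldest-member marker first ensures the new multi cannot be
+ * truncated away while this transaction might still be a member of it.
+ */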
+
+/*
+ * MultiXactIdSetOldestVisible
+ * Save the oldest MultiXactId this transaction considers possibly live.
+ *
+ * We set the OldestVisibleMXactId for a given transaction the first time
+ * it's going to inspect any MultiXactId. Once we have set this, we are
+ * guaranteed that the checkpointer won't truncate off SLRU data for
+ * MultiXactIds at or after our OldestVisibleMXactId.
+ *
+ * The value to set is the oldest of nextMXact and all the valid per-backend
+ * OldestMemberMXactId[] entries. Because of the locking we do, we can be
+ * certain that no subsequent call to MultiXactIdSetOldestMember can set
+ * an OldestMemberMXactId[] entry older than what we compute here. Therefore
+ * there is no live transaction, now or later, that can be a member of any
+ * MultiXactId older than the OldestVisibleMXactId we compute here.
+ */
+static void
+MultiXactIdSetOldestVisible(void)
+{
+ if (!MultiXactIdIsValid(OldestVisibleMXactId[MyBackendId]))
+ {
+ MultiXactId oldestMXact;
+ int i;
+
+ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
+
+ /*
+ * We have to beware of the possibility that nextMXact is in the
+ * wrapped-around state. We don't fix the counter itself here, but we
+ * must be sure to store a valid value in our array entry.
+ */
+ oldestMXact = MultiXactState->nextMXact;
+ if (oldestMXact < FirstMultiXactId)
+ oldestMXact = FirstMultiXactId;
+
+ for (i = 1; i <= MaxOldestSlot; i++)
+ {
+ MultiXactId thisoldest = OldestMemberMXactId[i];
+
+ if (MultiXactIdIsValid(thisoldest) &&
+ MultiXactIdPrecedes(thisoldest, oldestMXact))
+ oldestMXact = thisoldest;
+ }
+
+ OldestVisibleMXactId[MyBackendId] = oldestMXact;
+
+ LWLockRelease(MultiXactGenLock);
+
+ debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u",
+ MyBackendId, oldestMXact);
+ }
+}
+
+/*
+ * ReadNextMultiXactId
+ * Return the next MultiXactId to be assigned, but don't allocate it
+ */
+MultiXactId
+ReadNextMultiXactId(void)
+{
+ MultiXactId mxid;
+
+ /* XXX we could presumably do this without a lock. */
+ LWLockAcquire(MultiXactGenLock, LW_SHARED);
+ mxid = MultiXactState->nextMXact;
+ LWLockRelease(MultiXactGenLock);
+
+ if (mxid < FirstMultiXactId)
+ mxid = FirstMultiXactId;
+
+ return mxid;
+}
+
+/*
+ * ReadMultiXactIdRange
+ * Get the range of IDs that may still be referenced by a relation.
+ */
+void
+ReadMultiXactIdRange(MultiXactId *oldest, MultiXactId *next)
+{
+ LWLockAcquire(MultiXactGenLock, LW_SHARED);
+ *oldest = MultiXactState->oldestMultiXactId;
+ *next = MultiXactState->nextMXact;
+ LWLockRelease(MultiXactGenLock);
+
+ if (*oldest < FirstMultiXactId)
+ *oldest = FirstMultiXactId;
+ if (*next < FirstMultiXactId)
+ *next = FirstMultiXactId;
+}
+
+
+/*
+ * MultiXactIdCreateFromMembers
+ * Make a new MultiXactId from the specified set of members
+ *
+ * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
+ * given TransactionIds as members. Returns the newly created MultiXactId.
+ *
+ * NB: the passed members[] array will be sorted in-place.
+ */
+MultiXactId
+MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
+{
+ MultiXactId multi;
+ MultiXactOffset offset;
+ xl_multixact_create xlrec;
+
+ debug_elog3(DEBUG2, "Create: %s",
+ mxid_to_string(InvalidMultiXactId, nmembers, members));
+
+ /*
+ * See if the same set of members already exists in our cache; if so, just
+ * re-use that MultiXactId. (Note: it might seem that looking in our
+ * cache is insufficient, and we ought to search disk to see if a
+ * duplicate definition already exists. But since we only ever create
+ * MultiXacts containing our own XID, in most cases any such MultiXacts
+ * were in fact created by us, and so will be in our cache. There are
+ * corner cases where someone else added us to a MultiXact without our
+ * knowledge, but it's not worth checking for.)
+ */
+ multi = mXactCacheGetBySet(nmembers, members);
+ if (MultiXactIdIsValid(multi))
+ {
+ debug_elog2(DEBUG2, "Create: in cache!");
+ return multi;
+ }
+
+ /* Verify that there is a single update Xid among the given members. */
+ {
+ int i;
+ bool has_update = false;
+
+ for (i = 0; i < nmembers; i++)
+ {
+ if (ISUPDATE_from_mxstatus(members[i].status))
+ {
+ if (has_update)
+ elog(ERROR, "new multixact has more than one updating member: %s",
+ mxid_to_string(InvalidMultiXactId, nmembers, members));
+ has_update = true;
+ }
+ }
+ }
+
+ /*
+ * Assign the MXID and offsets range to use, and make sure there is space
+ * in the OFFSETs and MEMBERs files. NB: this routine does
+ * START_CRIT_SECTION().
+ *
+ * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check
+ * that we've called MultiXactIdSetOldestMember here. This is because
+ * this routine is used in some places to create new MultiXactIds of which
+ * the current backend is not a member, notably during freezing of multis
+ * in vacuum. During vacuum, in particular, it would be unacceptable to
+	 * keep OldestMulti set, since vacuum may run for a long time.
+ */
+ multi = GetNewMultiXactId(nmembers, &offset);
+
+ /* Make an XLOG entry describing the new MXID. */
+ xlrec.mid = multi;
+ xlrec.moff = offset;
+ xlrec.nmembers = nmembers;
+
+ /*
+ * XXX Note: there's a lot of padding space in MultiXactMember. We could
+ * find a more compact representation of this Xlog record -- perhaps all
+ * the status flags in one XLogRecData, then all the xids in another one?
+ * Not clear that it's worth the trouble though.
+ */
+ XLogBeginInsert();
+ XLogRegisterData((char *) (&xlrec), SizeOfMultiXactCreate);
+ XLogRegisterData((char *) members, nmembers * sizeof(MultiXactMember));
+
+ (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID);
+
+ /* Now enter the information into the OFFSETs and MEMBERs logs */
+ RecordNewMultiXact(multi, offset, nmembers, members);
+
+ /* Done with critical section */
+ END_CRIT_SECTION();
+
+ /* Store the new MultiXactId in the local cache, too */
+ mXactCachePut(multi, nmembers, members);
+
+ debug_elog2(DEBUG2, "Create: all done");
+
+ return multi;
+}
+
+/*
+ * RecordNewMultiXact
+ * Write info about a new multixact into the offsets and members files
+ *
+ * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can
+ * use it.
+ */
+static void
+RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
+ int nmembers, MultiXactMember *members)
+{
+ int pageno;
+ int prev_pageno;
+ int entryno;
+ int slotno;
+ MultiXactOffset *offptr;
+ int i;
+
+ LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE);
+
+ pageno = MultiXactIdToOffsetPage(multi);
+ entryno = MultiXactIdToOffsetEntry(multi);
+
+ /*
+ * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction"
+ * to complain about if there's any I/O error. This is kinda bogus, but
+ * since the errors will always give the full pathname, it should be clear
+ * enough that a MultiXactId is really involved. Perhaps someday we'll
+ * take the trouble to generalize the slru.c error reporting code.
+ */
+ slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
+ offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr += entryno;
+
+ *offptr = offset;
+
+ MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
+
+ /* Exchange our lock */
+ LWLockRelease(MultiXactOffsetSLRULock);
+
+ LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE);
+
+ prev_pageno = -1;
+
+ for (i = 0; i < nmembers; i++, offset++)
+ {
+ TransactionId *memberptr;
+ uint32 *flagsptr;
+ uint32 flagsval;
+ int bshift;
+ int flagsoff;
+ int memberoff;
+
+ Assert(members[i].status <= MultiXactStatusUpdate);
+
+ pageno = MXOffsetToMemberPage(offset);
+ memberoff = MXOffsetToMemberOffset(offset);
+ flagsoff = MXOffsetToFlagsOffset(offset);
+ bshift = MXOffsetToFlagsBitShift(offset);
+
+ if (pageno != prev_pageno)
+ {
+ slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
+ prev_pageno = pageno;
+ }
+
+ memberptr = (TransactionId *)
+ (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+
+ *memberptr = members[i].xid;
+
+ flagsptr = (uint32 *)
+ (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+
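+		/*
+		 * Clear this member's old flag bits in the shared flags word, then
+		 * OR in the new status; each member gets MXACT_MEMBER_BITS_PER_XACT
+		 * bits.
+		 */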
+ flagsval = *flagsptr;
+ flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
+ flagsval |= (members[i].status << bshift);
+ *flagsptr = flagsval;
+
+ MultiXactMemberCtl->shared->page_dirty[slotno] = true;
+ }
+
+ LWLockRelease(MultiXactMemberSLRULock);
+}
+
+/*
+ * GetNewMultiXactId
+ * Get the next MultiXactId.
+ *
+ * Also, reserve the needed amount of space in the "members" area. The
+ * starting offset of the reserved space is returned in *offset.
+ *
+ * This may generate XLOG records for expansion of the offsets and/or members
+ * files. Unfortunately, we have to do that while holding MultiXactGenLock
+ * to avoid race conditions --- the XLOG record for zeroing a page must appear
+ * before any backend can possibly try to store data in that page!
+ *
+ * We start a critical section before advancing the shared counters. The
+ * caller must end the critical section after writing SLRU data.
+ */
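+/*
+ * Caller contract, sketched from MultiXactIdCreateFromMembers() above:
+ *
+ *		multi = GetNewMultiXactId(nmembers, &offset);	(starts critical section)
+ *		XLogInsert(...);
+ *		RecordNewMultiXact(multi, offset, nmembers, members);
+ *		END_CRIT_SECTION();
+ */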
+static MultiXactId
+GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
+{
+ MultiXactId result;
+ MultiXactOffset nextOffset;
+
+ debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers);
+
+ /* safety check, we should never get this far in a HS standby */
+ if (RecoveryInProgress())
+ elog(ERROR, "cannot assign MultiXactIds during recovery");
+
+ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
+
+ /* Handle wraparound of the nextMXact counter */
+ if (MultiXactState->nextMXact < FirstMultiXactId)
+ MultiXactState->nextMXact = FirstMultiXactId;
+
+ /* Assign the MXID */
+ result = MultiXactState->nextMXact;
+
+ /*----------
+ * Check to see if it's safe to assign another MultiXactId. This protects
+ * against catastrophic data loss due to multixact wraparound. The basic
+ * rules are:
+ *
+ * If we're past multiVacLimit or the safe threshold for member storage
+ * space, or we don't know what the safe threshold for member storage is,
+ * start trying to force autovacuum cycles.
+ * If we're past multiWarnLimit, start issuing warnings.
+ * If we're past multiStopLimit, refuse to create new MultiXactIds.
+ *
+	 * Note these are pretty much the same protections as in
+	 * GetNewTransactionId.
+ *----------
+ */
+ if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit))
+ {
+ /*
+ * For safety's sake, we release MultiXactGenLock while sending
+ * signals, warnings, etc. This is not so much because we care about
+ * preserving concurrency in this situation, as to avoid any
+ * possibility of deadlock while doing get_database_name(). First,
+ * copy all the shared values we'll need in this path.
+ */
+ MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit;
+ MultiXactId multiStopLimit = MultiXactState->multiStopLimit;
+ MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit;
+ Oid oldest_datoid = MultiXactState->oldestMultiXactDB;
+
+ LWLockRelease(MultiXactGenLock);
+
+ if (IsUnderPostmaster &&
+ !MultiXactIdPrecedes(result, multiStopLimit))
+ {
+ char *oldest_datname = get_database_name(oldest_datoid);
+
+ /*
+ * Immediately kick autovacuum into action as we're already in
+ * ERROR territory.
+ */
+ SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
+
+ /* complain even if that DB has disappeared */
+ if (oldest_datname)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database \"%s\"",
+ oldest_datname),
+ errhint("Execute a database-wide VACUUM in that database.\n"
+ "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database with OID %u",
+ oldest_datoid),
+ errhint("Execute a database-wide VACUUM in that database.\n"
+ "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
+ }
+
+ /*
+ * To avoid swamping the postmaster with signals, we issue the autovac
+ * request only once per 64K multis generated. This still gives
+ * plenty of chances before we get into real trouble.
+ */
+ if (IsUnderPostmaster && (result % 65536) == 0)
+ SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
+
+ if (!MultiXactIdPrecedes(result, multiWarnLimit))
+ {
+ char *oldest_datname = get_database_name(oldest_datoid);
+
+ /* complain even if that DB has disappeared */
+ if (oldest_datname)
+ ereport(WARNING,
+ (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
+ "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
+ multiWrapLimit - result,
+ oldest_datname,
+ multiWrapLimit - result),
+ errhint("Execute a database-wide VACUUM in that database.\n"
+ "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
+ else
+ ereport(WARNING,
+ (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
+ "database with OID %u must be vacuumed before %u more MultiXactIds are used",
+ multiWrapLimit - result,
+ oldest_datoid,
+ multiWrapLimit - result),
+ errhint("Execute a database-wide VACUUM in that database.\n"
+ "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
+ }
+
+ /* Re-acquire lock and start over */
+ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
+ result = MultiXactState->nextMXact;
+ if (result < FirstMultiXactId)
+ result = FirstMultiXactId;
+ }
+
+ /* Make sure there is room for the MXID in the file. */
+ ExtendMultiXactOffset(result);
+
+ /*
+ * Reserve the members space, similarly to above. Also, be careful not to
+ * return zero as the starting offset for any multixact. See
+ * GetMultiXactIdMembers() for motivation.
+ */
+ nextOffset = MultiXactState->nextOffset;
+ if (nextOffset == 0)
+ {
+ *offset = 1;
+ nmembers++; /* allocate member slot 0 too */
+ }
+ else
+ *offset = nextOffset;
+
+ /*----------
+ * Protect against overrun of the members space as well, with the
+ * following rules:
+ *
+ * If we're past offsetStopLimit, refuse to generate more multis.
+ * If we're close to offsetStopLimit, emit a warning.
+ *
+ * Arbitrarily, we start emitting warnings when we're 20 segments or less
+ * from offsetStopLimit.
+ *
+ * Note we haven't updated the shared state yet, so if we fail at this
+ * point, the multixact ID we grabbed can still be used by the next guy.
+ *
+ * Note that there is no point in forcing autovacuum runs here: the
+ * multixact freeze settings would have to be reduced for that to have any
+ * effect.
+ *----------
+ */
+#define OFFSET_WARN_SEGMENTS 20
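+	/*
+	 * With default build settings (1636 members per page, 32 pages per
+	 * segment), 20 segments is roughly a million members of headroom; the
+	 * exact figures depend on BLCKSZ, so treat them as illustrative.
+	 */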
+ if (MultiXactState->oldestOffsetKnown &&
+ MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset,
+ nmembers))
+ {
+ /* see comment in the corresponding offsets wraparound case */
+ SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
+
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("multixact \"members\" limit exceeded"),
+ errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.",
+ "This command would create a multixact with %u members, but the remaining space is only enough for %u members.",
+ MultiXactState->offsetStopLimit - nextOffset - 1,
+ nmembers,
+ MultiXactState->offsetStopLimit - nextOffset - 1),
+ errhint("Execute a database-wide VACUUM in database with OID %u with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings.",
+ MultiXactState->oldestMultiXactDB)));
+ }
+
+ /*
+ * Check whether we should kick autovacuum into action, to prevent members
+ * wraparound. NB we use a much larger window to trigger autovacuum than
+ * just the warning limit. The warning is just a measure of last resort -
+ * this is in line with GetNewTransactionId's behaviour.
+ */
+ if (!MultiXactState->oldestOffsetKnown ||
+ (MultiXactState->nextOffset - MultiXactState->oldestOffset
+ > MULTIXACT_MEMBER_SAFE_THRESHOLD))
+ {
+ /*
+ * To avoid swamping the postmaster with signals, we issue the autovac
+ * request only when crossing a segment boundary. With default
+ * compilation settings that's roughly after 50k members. This still
+ * gives plenty of chances before we get into real trouble.
+ */
+ if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) !=
+ (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT))
+ SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
+ }
+
+ if (MultiXactState->oldestOffsetKnown &&
+ MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit,
+ nextOffset,
+ nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS))
+ ereport(WARNING,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used",
+ "database with OID %u must be vacuumed before %d more multixact members are used",
+ MultiXactState->offsetStopLimit - nextOffset + nmembers,
+ MultiXactState->oldestMultiXactDB,
+ MultiXactState->offsetStopLimit - nextOffset + nmembers),
+ errhint("Execute a database-wide VACUUM in that database with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings.")));
+
+ ExtendMultiXactMember(nextOffset, nmembers);
+
+ /*
+ * Critical section from here until caller has written the data into the
+ * just-reserved SLRU space; we don't want to error out with a partly
+ * written MultiXact structure. (In particular, failing to write our
+ * start offset after advancing nextMXact would effectively corrupt the
+ * previous MultiXact.)
+ */
+ START_CRIT_SECTION();
+
+ /*
+ * Advance counters. As in GetNewTransactionId(), this must not happen
+ * until after file extension has succeeded!
+ *
+ * We don't care about MultiXactId wraparound here; it will be handled by
+ * the next iteration. But note that nextMXact may be InvalidMultiXactId
+ * or the first value on a segment-beginning page after this routine
+ * exits, so anyone else looking at the variable must be prepared to deal
+ * with either case. Similarly, nextOffset may be zero, but we won't use
+ * that as the actual start offset of the next multixact.
+ */
+ (MultiXactState->nextMXact)++;
+
+ MultiXactState->nextOffset += nmembers;
+
+ LWLockRelease(MultiXactGenLock);
+
+ debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset);
+ return result;
+}
+
+/*
+ * GetMultiXactIdMembers
+ * Return the set of MultiXactMembers that make up a MultiXactId
+ *
+ * Return value is the number of members found, or -1 if there are none,
+ * and *members is set to a newly palloc'ed array of members. It's the
+ * caller's responsibility to free it when done with it.
+ *
+ * from_pgupgrade must be passed as true if and only if the multixact
+ * corresponds to a value from a tuple that was locked in a 9.2-or-older
+ * installation and later pg_upgrade'd (that is, the infomask is
+ * HEAP_LOCKED_UPGRADED). In this case, we know for certain that no members
+ * can still be running, so we return -1 just like for an empty multixact
+ * without any further checking. It would be wrong to try to resolve such a
+ * multixact: either the multixact is within the current valid multixact
+ * range, in which case the returned result would be bogus, or outside that
+ * range, in which case an error would be raised.
+ *
+ * In all other cases, the passed multixact must be within the known valid
+ * range, that is, greater than or equal to oldestMultiXactId, and less than
+ * nextMXact. Otherwise, an error is raised.
+ *
+ * onlyLock must be set to true if the caller is certain that the given multi
+ * is used only to lock tuples; it can be false without loss of correctness,
+ * but passing true means we can return quickly without checking for
+ * old updates.
+ */
+int
+GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
+ bool from_pgupgrade, bool onlyLock)
+{
+ int pageno;
+ int prev_pageno;
+ int entryno;
+ int slotno;
+ MultiXactOffset *offptr;
+ MultiXactOffset offset;
+ int length;
+ int truelength;
+ int i;
+ MultiXactId oldestMXact;
+ MultiXactId nextMXact;
+ MultiXactId tmpMXact;
+ MultiXactOffset nextOffset;
+ MultiXactMember *ptr;
+
+ debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
+
+ if (!MultiXactIdIsValid(multi) || from_pgupgrade)
+ {
+ *members = NULL;
+ return -1;
+ }
+
+ /* See if the MultiXactId is in the local cache */
+ length = mXactCacheGetById(multi, members);
+ if (length >= 0)
+ {
+ debug_elog3(DEBUG2, "GetMembers: found %s in the cache",
+ mxid_to_string(multi, length, *members));
+ return length;
+ }
+
+ /* Set our OldestVisibleMXactId[] entry if we didn't already */
+ MultiXactIdSetOldestVisible();
+
+ /*
+	 * If we know the multi is used only for locking and not for updates,
+	 * then, when the value is older than our oldest visible multi, we can
+	 * skip fetching its members: it cannot possibly still be running.
+ */
+ if (onlyLock &&
+ MultiXactIdPrecedes(multi, OldestVisibleMXactId[MyBackendId]))
+ {
+ debug_elog2(DEBUG2, "GetMembers: a locker-only multi is too old");
+ *members = NULL;
+ return -1;
+ }
+
+ /*
+ * We check known limits on MultiXact before resorting to the SLRU area.
+ *
+ * An ID older than MultiXactState->oldestMultiXactId cannot possibly be
+ * useful; it has already been removed, or will be removed shortly, by
+ * truncation. If one is passed, an error is raised.
+ *
+ * Also, an ID >= nextMXact shouldn't ever be seen here; if it is seen, it
+ * implies undetected ID wraparound has occurred. This raises a hard
+ * error.
+ *
+ * Shared lock is enough here since we aren't modifying any global state.
+ * Acquire it just long enough to grab the current counter values. We may
+ * need both nextMXact and nextOffset; see below.
+ */
+ LWLockAcquire(MultiXactGenLock, LW_SHARED);
+
+ oldestMXact = MultiXactState->oldestMultiXactId;
+ nextMXact = MultiXactState->nextMXact;
+ nextOffset = MultiXactState->nextOffset;
+
+ LWLockRelease(MultiXactGenLock);
+
+ if (MultiXactIdPrecedes(multi, oldestMXact))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("MultiXactId %u does no longer exist -- apparent wraparound",
+ multi)));
+
+ if (!MultiXactIdPrecedes(multi, nextMXact))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("MultiXactId %u has not been created yet -- apparent wraparound",
+ multi)));
+
+ /*
+ * Find out the offset at which we need to start reading MultiXactMembers
+ * and the number of members in the multixact. We determine the latter as
+ * the difference between this multixact's starting offset and the next
+ * one's. However, there are some corner cases to worry about:
+ *
+ * 1. This multixact may be the latest one created, in which case there is
+ * no next one to look at. In this case the nextOffset value we just
+ * saved is the correct endpoint.
+ *
+ * 2. The next multixact may still be in process of being filled in: that
+ * is, another process may have done GetNewMultiXactId but not yet written
+ * the offset entry for that ID. In that scenario, it is guaranteed that
+ * the offset entry for that multixact exists (because GetNewMultiXactId
+ * won't release MultiXactGenLock until it does) but contains zero
+ * (because we are careful to pre-zero offset pages). Because
+ * GetNewMultiXactId will never return zero as the starting offset for a
+ * multixact, when we read zero as the next multixact's offset, we know we
+ * have this case. We sleep for a bit and try again.
+ *
+ * 3. Because GetNewMultiXactId increments offset zero to offset one to
+ * handle case #2, there is an ambiguity near the point of offset
+ * wraparound. If we see next multixact's offset is one, is that our
+ * multixact's actual endpoint, or did it end at zero with a subsequent
+ * increment? We handle this using the knowledge that if the zero'th
+ * member slot wasn't filled, it'll contain zero, and zero isn't a valid
+ * transaction ID so it can't be a multixact member. Therefore, if we
+ * read a zero from the members array, just ignore it.
+ *
+ * This is all pretty messy, but the mess occurs only in infrequent corner
+ * cases, so it seems better than holding the MultiXactGenLock for a long
+ * time on every multixact creation.
+ */
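+	/*
+	 * Worked example with made-up numbers: if multi 17 starts at offset 42
+	 * and multi 18's offset entry reads 45, then multi 17 has 45 - 42 = 3
+	 * members.  If multi 18's entry still reads zero, its creator has not
+	 * filled it in yet (case 2 above), so we sleep and retry.
+	 */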
+retry:
+ LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE);
+
+ pageno = MultiXactIdToOffsetPage(multi);
+ entryno = MultiXactIdToOffsetEntry(multi);
+
+ slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
+ offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr += entryno;
+ offset = *offptr;
+
+ Assert(offset != 0);
+
+ /*
+ * Use the same increment rule as GetNewMultiXactId(), that is, don't
+ * handle wraparound explicitly until needed.
+ */
+ tmpMXact = multi + 1;
+
+ if (nextMXact == tmpMXact)
+ {
+ /* Corner case 1: there is no next multixact */
+ length = nextOffset - offset;
+ }
+ else
+ {
+ MultiXactOffset nextMXOffset;
+
+ /* handle wraparound if needed */
+ if (tmpMXact < FirstMultiXactId)
+ tmpMXact = FirstMultiXactId;
+
+ prev_pageno = pageno;
+
+ pageno = MultiXactIdToOffsetPage(tmpMXact);
+ entryno = MultiXactIdToOffsetEntry(tmpMXact);
+
+ if (pageno != prev_pageno)
+ slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
+
+ offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr += entryno;
+ nextMXOffset = *offptr;
+
+ if (nextMXOffset == 0)
+ {
+ /* Corner case 2: next multixact is still being filled in */
+ LWLockRelease(MultiXactOffsetSLRULock);
+ CHECK_FOR_INTERRUPTS();
+ pg_usleep(1000L);
+ goto retry;
+ }
+
+ length = nextMXOffset - offset;
+ }
+
+ LWLockRelease(MultiXactOffsetSLRULock);
+
+ ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
+
+ /* Now get the members themselves. */
+ LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE);
+
+ truelength = 0;
+ prev_pageno = -1;
+ for (i = 0; i < length; i++, offset++)
+ {
+ TransactionId *xactptr;
+ uint32 *flagsptr;
+ int flagsoff;
+ int bshift;
+ int memberoff;
+
+ pageno = MXOffsetToMemberPage(offset);
+ memberoff = MXOffsetToMemberOffset(offset);
+
+ if (pageno != prev_pageno)
+ {
+ slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
+ prev_pageno = pageno;
+ }
+
+ xactptr = (TransactionId *)
+ (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+
+ if (!TransactionIdIsValid(*xactptr))
+ {
+ /* Corner case 3: we must be looking at unused slot zero */
+ Assert(offset == 0);
+ continue;
+ }
+
+ flagsoff = MXOffsetToFlagsOffset(offset);
+ bshift = MXOffsetToFlagsBitShift(offset);
+ flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+
+ ptr[truelength].xid = *xactptr;
+ ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
+ truelength++;
+ }
+
+ LWLockRelease(MultiXactMemberSLRULock);
+
+	/* A multixact with zero members should not happen */
+ Assert(truelength > 0);
+
+ /*
+ * Copy the result into the local cache.
+ */
+ mXactCachePut(multi, truelength, ptr);
+
+ debug_elog3(DEBUG2, "GetMembers: no cache for %s",
+ mxid_to_string(multi, truelength, ptr));
+ *members = ptr;
+ return truelength;
+}
+
+/*
+ * mxactMemberComparator
+ * qsort comparison function for MultiXactMember
+ *
+ * We can't use wraparound comparison for XIDs because that does not respect
+ * the triangle inequality! Any old sort order will do.
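+ * (Circular comparison could yield a < b and b < c and yet c < a, which
+ * would break qsort's ordering assumptions.)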
+ */
+static int
+mxactMemberComparator(const void *arg1, const void *arg2)
+{
+ MultiXactMember member1 = *(const MultiXactMember *) arg1;
+ MultiXactMember member2 = *(const MultiXactMember *) arg2;
+
+ if (member1.xid > member2.xid)
+ return 1;
+ if (member1.xid < member2.xid)
+ return -1;
+ if (member1.status > member2.status)
+ return 1;
+ if (member1.status < member2.status)
+ return -1;
+ return 0;
+}
+
+/*
+ * mXactCacheGetBySet
+ * returns a MultiXactId from the cache based on the set of
+ * TransactionIds that compose it, or InvalidMultiXactId if
+ * none matches.
+ *
+ * This is helpful, for example, if two transactions want to lock a huge
+ * table. By using the cache, the second will use the same MultiXactId
+ * for the majority of tuples, thus keeping MultiXactId usage low (saving
+ * both I/O and wraparound issues).
+ *
+ * NB: the passed members array will be sorted in-place.
+ */
+static MultiXactId
+mXactCacheGetBySet(int nmembers, MultiXactMember *members)
+{
+ dlist_iter iter;
+
+ debug_elog3(DEBUG2, "CacheGet: looking for %s",
+ mxid_to_string(InvalidMultiXactId, nmembers, members));
+
+ /* sort the array so comparison is easy */
+ qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
+
+ dlist_foreach(iter, &MXactCache)
+ {
+ mXactCacheEnt *entry = dlist_container(mXactCacheEnt, node, iter.cur);
+
+ if (entry->nmembers != nmembers)
+ continue;
+
+ /*
+ * We assume the cache entries are sorted, and that the unused bits in
+ * "status" are zeroed.
+ */
+ if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0)
+ {
+ debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi);
+ dlist_move_head(&MXactCache, iter.cur);
+ return entry->multi;
+ }
+ }
+
+ debug_elog2(DEBUG2, "CacheGet: not found :-(");
+ return InvalidMultiXactId;
+}
+
+/*
+ * mXactCacheGetById
+ * returns the composing MultiXactMember set from the cache for a
+ * given MultiXactId, if present.
+ *
+ * If successful, *members is set to the address of a palloc'd copy of the
+ * MultiXactMember set. Return value is number of members, or -1 on failure.
+ */
+static int
+mXactCacheGetById(MultiXactId multi, MultiXactMember **members)
+{
+ dlist_iter iter;
+
+ debug_elog3(DEBUG2, "CacheGet: looking for %u", multi);
+
+ dlist_foreach(iter, &MXactCache)
+ {
+ mXactCacheEnt *entry = dlist_container(mXactCacheEnt, node, iter.cur);
+
+ if (entry->multi == multi)
+ {
+ MultiXactMember *ptr;
+ Size size;
+
+ size = sizeof(MultiXactMember) * entry->nmembers;
+ ptr = (MultiXactMember *) palloc(size);
+
+ memcpy(ptr, entry->members, size);
+
+ debug_elog3(DEBUG2, "CacheGet: found %s",
+ mxid_to_string(multi,
+ entry->nmembers,
+ entry->members));
+
+ /*
+ * Note we modify the list while not using a modifiable iterator.
+ * This is acceptable only because we exit the iteration
+ * immediately afterwards.
+ */
+ dlist_move_head(&MXactCache, iter.cur);
+
+ *members = ptr;
+ return entry->nmembers;
+ }
+ }
+
+ debug_elog2(DEBUG2, "CacheGet: not found");
+ return -1;
+}
+
+/*
+ * mXactCachePut
+ * Add a new MultiXactId and its composing set into the local cache.
+ */
+static void
+mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
+{
+ mXactCacheEnt *entry;
+
+ debug_elog3(DEBUG2, "CachePut: storing %s",
+ mxid_to_string(multi, nmembers, members));
+
+ if (MXactContext == NULL)
+ {
+ /* The cache only lives as long as the current transaction */
+ debug_elog2(DEBUG2, "CachePut: initializing memory context");
+ MXactContext = AllocSetContextCreate(TopTransactionContext,
+ "MultiXact cache context",
+ ALLOCSET_SMALL_SIZES);
+ }
+
+ entry = (mXactCacheEnt *)
+ MemoryContextAlloc(MXactContext,
+ offsetof(mXactCacheEnt, members) +
+ nmembers * sizeof(MultiXactMember));
+
+ entry->multi = multi;
+ entry->nmembers = nmembers;
+ memcpy(entry->members, members, nmembers * sizeof(MultiXactMember));
+
+ /* mXactCacheGetBySet assumes the entries are sorted, so sort them */
+ qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
+
+ dlist_push_head(&MXactCache, &entry->node);
+ if (MXactCacheMembers++ >= MAX_CACHE_ENTRIES)
+ {
+ dlist_node *node;
+ mXactCacheEnt *entry;
+
+ node = dlist_tail_node(&MXactCache);
+ dlist_delete(node);
+ MXactCacheMembers--;
+
+ entry = dlist_container(mXactCacheEnt, node, node);
+ debug_elog3(DEBUG2, "CachePut: pruning cached multi %u",
+ entry->multi);
+
+ pfree(entry);
+ }
+}
+
+static char *
+mxstatus_to_string(MultiXactStatus status)
+{
+ switch (status)
+ {
+ case MultiXactStatusForKeyShare:
+ return "keysh";
+ case MultiXactStatusForShare:
+ return "sh";
+ case MultiXactStatusForNoKeyUpdate:
+ return "fornokeyupd";
+ case MultiXactStatusForUpdate:
+ return "forupd";
+ case MultiXactStatusNoKeyUpdate:
+ return "nokeyupd";
+ case MultiXactStatusUpdate:
+ return "upd";
+ default:
+ elog(ERROR, "unrecognized multixact status %d", status);
+ return "";
+ }
+}
+
+char *
+mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
+{
+ static char *str = NULL;
+ StringInfoData buf;
+ int i;
+
+ if (str != NULL)
+ pfree(str);
+
+ initStringInfo(&buf);
+
+ appendStringInfo(&buf, "%u %d[%u (%s)", multi, nmembers, members[0].xid,
+ mxstatus_to_string(members[0].status));
+
+ for (i = 1; i < nmembers; i++)
+ appendStringInfo(&buf, ", %u (%s)", members[i].xid,
+ mxstatus_to_string(members[i].status));
+
+ appendStringInfoChar(&buf, ']');
+ str = MemoryContextStrdup(TopMemoryContext, buf.data);
+ pfree(buf.data);
+ return str;
+}
+
+/*
+ * AtEOXact_MultiXact
+ * Handle transaction end for MultiXact
+ *
+ * This is called at top transaction commit or abort (we don't care which).
+ */
+void
+AtEOXact_MultiXact(void)
+{
+ /*
+ * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of
+ * which should only be valid while within a transaction.
+ *
+ * We assume that storing a MultiXactId is atomic and so we need not take
+ * MultiXactGenLock to do this.
+ */
+ OldestMemberMXactId[MyBackendId] = InvalidMultiXactId;
+ OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId;
+
+ /*
+ * Discard the local MultiXactId cache. Since MXactContext was created as
+ * a child of TopTransactionContext, we needn't delete it explicitly.
+ */
+ MXactContext = NULL;
+ dlist_init(&MXactCache);
+ MXactCacheMembers = 0;
+}
+
+/*
+ * AtPrepare_MultiXact
+ * Save multixact state at 2PC transaction prepare
+ *
+ * In this phase, we only store our OldestMemberMXactId value in the two-phase
+ * state file.
+ */
+void
+AtPrepare_MultiXact(void)
+{
+ MultiXactId myOldestMember = OldestMemberMXactId[MyBackendId];
+
+ if (MultiXactIdIsValid(myOldestMember))
+ RegisterTwoPhaseRecord(TWOPHASE_RM_MULTIXACT_ID, 0,
+ &myOldestMember, sizeof(MultiXactId));
+}
+
+/*
+ * PostPrepare_MultiXact
+ * Clean up after successful PREPARE TRANSACTION
+ */
+void
+PostPrepare_MultiXact(TransactionId xid)
+{
+ MultiXactId myOldestMember;
+
+ /*
+ * Transfer our OldestMemberMXactId value to the slot reserved for the
+ * prepared transaction.
+ */
+ myOldestMember = OldestMemberMXactId[MyBackendId];
+ if (MultiXactIdIsValid(myOldestMember))
+ {
+ BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, false);
+
+ /*
+ * Even though storing MultiXactId is atomic, acquire lock to make
+ * sure others see both changes, not just the reset of the slot of the
+ * current backend. Using a volatile pointer might suffice, but this
+ * isn't a hot spot.
+ */
+ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
+
+ OldestMemberMXactId[dummyBackendId] = myOldestMember;
+ OldestMemberMXactId[MyBackendId] = InvalidMultiXactId;
+
+ LWLockRelease(MultiXactGenLock);
+ }
+
+ /*
+ * We don't need to transfer OldestVisibleMXactId value, because the
+ * transaction is not going to be looking at any more multixacts once it's
+ * prepared.
+ *
+ * We assume that storing a MultiXactId is atomic and so we need not take
+ * MultiXactGenLock to do this.
+ */
+ OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId;
+
+ /*
+ * Discard the local MultiXactId cache like in AtEOXact_MultiXact.
+ */
+ MXactContext = NULL;
+ dlist_init(&MXactCache);
+ MXactCacheMembers = 0;
+}
+
+/*
+ * multixact_twophase_recover
+ * Recover the state of a prepared transaction at startup
+ */
+void
+multixact_twophase_recover(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, false);
+ MultiXactId oldestMember;
+
+ /*
+ * Get the oldest member XID from the state file record, and set it in the
+ * OldestMemberMXactId slot reserved for this prepared transaction.
+ */
+ Assert(len == sizeof(MultiXactId));
+ oldestMember = *((MultiXactId *) recdata);
+
+ OldestMemberMXactId[dummyBackendId] = oldestMember;
+}
+
+/*
+ * multixact_twophase_postcommit
+ * Similar to AtEOXact_MultiXact but for COMMIT PREPARED
+ */
+void
+multixact_twophase_postcommit(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, true);
+
+ Assert(len == sizeof(MultiXactId));
+
+ OldestMemberMXactId[dummyBackendId] = InvalidMultiXactId;
+}
+
+/*
+ * multixact_twophase_postabort
+ * This is actually just the same as the COMMIT case.
+ */
+void
+multixact_twophase_postabort(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ multixact_twophase_postcommit(xid, info, recdata, len);
+}
+
+/*
+ * Initialization of shared memory for MultiXact. We use two SLRU areas,
+ * thus double memory. Also, reserve space for the shared MultiXactState
+ * struct and the per-backend MultiXactId arrays (two of those, too).
+ */
+Size
+MultiXactShmemSize(void)
+{
+ Size size;
+
+ /* We need 2*MaxOldestSlot + 1 perBackendXactIds[] entries */
+#define SHARED_MULTIXACT_STATE_SIZE \
+ add_size(offsetof(MultiXactStateData, perBackendXactIds) + sizeof(MultiXactId), \
+ mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
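+
+	/*
+	 * That is: the struct header, the wasted perBackendXactIds[0] entry, and
+	 * MaxOldestSlot entries for each of the two arrays that
+	 * MultiXactShmemInit() lays out below.
+	 */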
+
+ size = SHARED_MULTIXACT_STATE_SIZE;
+ size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTOFFSET_BUFFERS, 0));
+ size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTMEMBER_BUFFERS, 0));
+
+ return size;
+}
+
+void
+MultiXactShmemInit(void)
+{
+ bool found;
+
+ debug_elog2(DEBUG2, "Shared Memory Init for MultiXact");
+
+ MultiXactOffsetCtl->PagePrecedes = MultiXactOffsetPagePrecedes;
+ MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes;
+
+ SimpleLruInit(MultiXactOffsetCtl,
+ "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0,
+ MultiXactOffsetSLRULock, "pg_multixact/offsets",
+ LWTRANCHE_MULTIXACTOFFSET_BUFFER,
+ SYNC_HANDLER_MULTIXACT_OFFSET);
+ SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE);
+ SimpleLruInit(MultiXactMemberCtl,
+ "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0,
+ MultiXactMemberSLRULock, "pg_multixact/members",
+ LWTRANCHE_MULTIXACTMEMBER_BUFFER,
+ SYNC_HANDLER_MULTIXACT_MEMBER);
+ /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */
+
+ /* Initialize our shared state struct */
+ MultiXactState = ShmemInitStruct("Shared MultiXact State",
+ SHARED_MULTIXACT_STATE_SIZE,
+ &found);
+ if (!IsUnderPostmaster)
+ {
+ Assert(!found);
+
+ /* Make sure we zero out the per-backend state */
+ MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
+ }
+ else
+ Assert(found);
+
+ /*
+ * Set up array pointers. Note that perBackendXactIds[0] is wasted space
+ * since we only use indexes 1..MaxOldestSlot in each array.
+ */
+ OldestMemberMXactId = MultiXactState->perBackendXactIds;
+ OldestVisibleMXactId = OldestMemberMXactId + MaxOldestSlot;
+}
+
+/*
+ * This function must be called ONCE on system install.  It creates the
+ * initial
+ * MultiXact segments. (The MultiXacts directories are assumed to have been
+ * created by initdb, and MultiXactShmemInit must have been called already.)
+ */
+void
+BootStrapMultiXact(void)
+{
+ int slotno;
+
+ LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE);
+
+ /* Create and zero the first page of the offsets log */
+ slotno = ZeroMultiXactOffsetPage(0, false);
+
+ /* Make sure it's written out */
+ SimpleLruWritePage(MultiXactOffsetCtl, slotno);
+ Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
+
+ LWLockRelease(MultiXactOffsetSLRULock);
+
+ LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE);
+
+ /* Create and zero the first page of the members log */
+ slotno = ZeroMultiXactMemberPage(0, false);
+
+ /* Make sure it's written out */
+ SimpleLruWritePage(MultiXactMemberCtl, slotno);
+ Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);
+
+ LWLockRelease(MultiXactMemberSLRULock);
+}
+
+/*
+ * Initialize (or reinitialize) a page of MultiXactOffset to zeroes.
+ * If writeXlog is true, also emit an XLOG record saying we did this.
+ *
+ * The page is not actually written, just set up in shared memory.
+ * The slot number of the new page is returned.
+ *
+ * Control lock must be held at entry, and will be held at exit.
+ */
+static int
+ZeroMultiXactOffsetPage(int pageno, bool writeXlog)
+{
+ int slotno;
+
+ slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
+
+ if (writeXlog)
+ WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+
+ return slotno;
+}
+
+/*
+ * Ditto, for MultiXactMember
+ */
+static int
+ZeroMultiXactMemberPage(int pageno, bool writeXlog)
+{
+ int slotno;
+
+ slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno);
+
+ if (writeXlog)
+ WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+
+ return slotno;
+}
+
+/*
+ * MaybeExtendOffsetSlru
+ * Extend the offsets SLRU area, if necessary
+ *
+ * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might
+ * contain files that are shorter than necessary; this would occur if the old
+ * installation had used multixacts beyond the first page (files cannot be
+ * copied, because the on-disk representation is different). pg_upgrade would
+ * update pg_control to set the next offset value to be at that position, so
+ * that tuples marked as locked by such MultiXacts would be seen as visible
+ * without having to consult multixact. However, trying to create and use a
+ * new MultiXactId would result in an error because the page on which the new
+ * value would reside does not exist. This routine is in charge of creating
+ * such pages.
+ */
+static void
+MaybeExtendOffsetSlru(void)
+{
+ int pageno;
+
+ pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact);
+
+ LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE);
+
+ if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
+ {
+ int slotno;
+
+ /*
+ * Fortunately for us, SimpleLruWritePage is already prepared to deal
+ * with creating a new segment file even if the page we're writing is
+ * not the first in it, so this is enough.
+ */
+ slotno = ZeroMultiXactOffsetPage(pageno, false);
+ SimpleLruWritePage(MultiXactOffsetCtl, slotno);
+ }
+
+ LWLockRelease(MultiXactOffsetSLRULock);
+}
+
+/*
+ * This must be called ONCE during postmaster or standalone-backend startup.
+ *
+ * StartupXLOG has already established nextMXact/nextOffset by calling
+ * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti
+ * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet
+ * replayed WAL.
+ */
+void
+StartupMultiXact(void)
+{
+ MultiXactId multi = MultiXactState->nextMXact;
+ MultiXactOffset offset = MultiXactState->nextOffset;
+ int pageno;
+
+ /*
+ * Initialize offset's idea of the latest page number.
+ */
+ pageno = MultiXactIdToOffsetPage(multi);
+ MultiXactOffsetCtl->shared->latest_page_number = pageno;
+
+ /*
+ * Initialize member's idea of the latest page number.
+ */
+ pageno = MXOffsetToMemberPage(offset);
+ MultiXactMemberCtl->shared->latest_page_number = pageno;
+}
+
+/*
+ * This must be called ONCE at the end of startup/recovery.
+ */
+void
+TrimMultiXact(void)
+{
+ MultiXactId nextMXact;
+ MultiXactOffset offset;
+ MultiXactId oldestMXact;
+ Oid oldestMXactDB;
+ int pageno;
+ int entryno;
+ int flagsoff;
+
+ LWLockAcquire(MultiXactGenLock, LW_SHARED);
+ nextMXact = MultiXactState->nextMXact;
+ offset = MultiXactState->nextOffset;
+ oldestMXact = MultiXactState->oldestMultiXactId;
+ oldestMXactDB = MultiXactState->oldestMultiXactDB;
+ LWLockRelease(MultiXactGenLock);
+
+ /* Clean up offsets state */
+ LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE);
+
+ /*
+ * (Re-)Initialize our idea of the latest page number for offsets.
+ */
+ pageno = MultiXactIdToOffsetPage(nextMXact);
+ MultiXactOffsetCtl->shared->latest_page_number = pageno;
+
+ /*
+ * Zero out the remainder of the current offsets page. See notes in
+ * TrimCLOG() for background. Unlike CLOG, some WAL record covers every
+ * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
+ * rule "write xlog before data," nextMXact successors may carry obsolete,
+ * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
+ * operates normally.
+ */
+ entryno = MultiXactIdToOffsetEntry(nextMXact);
+ if (entryno != 0)
+ {
+ int slotno;
+ MultiXactOffset *offptr;
+
+ slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
+ offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr += entryno;
+
+ MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
+
+ MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
+ }
+
+ LWLockRelease(MultiXactOffsetSLRULock);
+
+ /* And the same for members */
+ LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE);
+
+ /*
+ * (Re-)Initialize our idea of the latest page number for members.
+ */
+ pageno = MXOffsetToMemberPage(offset);
+ MultiXactMemberCtl->shared->latest_page_number = pageno;
+
+ /*
+ * Zero out the remainder of the current members page. See notes in
+ * TrimCLOG() for motivation.
+ */
+ flagsoff = MXOffsetToFlagsOffset(offset);
+ if (flagsoff != 0)
+ {
+ int slotno;
+ TransactionId *xidptr;
+ int memberoff;
+
+ memberoff = MXOffsetToMemberOffset(offset);
+ slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
+ xidptr = (TransactionId *)
+ (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+
+ MemSet(xidptr, 0, BLCKSZ - memberoff);
+
+ /*
+ * Note: we don't need to zero out the flag bits in the remaining
+ * members of the current group, because they are always reset before
+ * writing.
+ */
+
+ MultiXactMemberCtl->shared->page_dirty[slotno] = true;
+ }
+
+ LWLockRelease(MultiXactMemberSLRULock);
+
+ /* signal that we're officially up */
+ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
+ MultiXactState->finishedStartup = true;
+ LWLockRelease(MultiXactGenLock);
+
+ /* Now compute how far away the next members wraparound is. */
+ SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true);
+}
+
+/*
+ * Get the MultiXact data to save in a checkpoint record
+ */
+void
+MultiXactGetCheckptMulti(bool is_shutdown,
+ MultiXactId *nextMulti,
+ MultiXactOffset *nextMultiOffset,
+ MultiXactId *oldestMulti,
+ Oid *oldestMultiDB)
+{
+ LWLockAcquire(MultiXactGenLock, LW_SHARED);
+ *nextMulti = MultiXactState->nextMXact;
+ *nextMultiOffset = MultiXactState->nextOffset;
+ *oldestMulti = MultiXactState->oldestMultiXactId;
+ *oldestMultiDB = MultiXactState->oldestMultiXactDB;
+ LWLockRelease(MultiXactGenLock);
+
+ debug_elog6(DEBUG2,
+ "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u",
+ *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB);
+}
+
+/*
+ * Perform a checkpoint --- either during shutdown, or on-the-fly
+ */
+void
+CheckPointMultiXact(void)
+{
+ TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);
+
+ /*
+ * Write dirty MultiXact pages to disk. This may result in sync requests
+ * queued for later handling by ProcessSyncRequests(), as part of the
+ * checkpoint.
+ */
+ SimpleLruWriteAll(MultiXactOffsetCtl, true);
+ SimpleLruWriteAll(MultiXactMemberCtl, true);
+
+ TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
+}
+
+/*
+ * Set the next-to-be-assigned MultiXactId and offset
+ *
+ * This is used when we can determine the correct next ID/offset exactly
+ * from a checkpoint record. Although this is only called during bootstrap
+ * and XLog replay, we take the lock in case any hot-standby backends are
+ * examining the values.
+ */
+void
+MultiXactSetNextMXact(MultiXactId nextMulti,
+ MultiXactOffset nextMultiOffset)
+{
+ debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u",
+ nextMulti, nextMultiOffset);
+ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
+ MultiXactState->nextMXact = nextMulti;
+ MultiXactState->nextOffset = nextMultiOffset;
+ LWLockRelease(MultiXactGenLock);
+
+ /*
+ * During a binary upgrade, make sure that the offsets SLRU is large
+ * enough to contain the next value that would be created.
+ *
+ * We need to do this pretty early during the first startup in binary
+ * upgrade mode: before StartupMultiXact() in fact, because this routine
+ * is called even before that by StartupXLOG(). And we can't do it
+ * earlier than at this point, because during that first call of this
+ * routine we determine the MultiXactState->nextMXact value that
+ * MaybeExtendOffsetSlru needs.
+ */
+ if (IsBinaryUpgrade)
+ MaybeExtendOffsetSlru();
+}
+
+/*
+ * Determine the last safe MultiXactId to allocate given the currently oldest
+ * datminmxid (ie, the oldest MultiXactId that might exist in any database
+ * of our cluster), and the OID of the (or a) database with that value.
+ *
+ * is_startup is true when we are just starting the cluster, false when we
+ * are updating state in a running cluster. This only affects log messages.
+ */
+void
+SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid,
+ bool is_startup)
+{
+ MultiXactId multiVacLimit;
+ MultiXactId multiWarnLimit;
+ MultiXactId multiStopLimit;
+ MultiXactId multiWrapLimit;
+ MultiXactId curMulti;
+ bool needs_offset_vacuum;
+
+ Assert(MultiXactIdIsValid(oldest_datminmxid));
+
+ /*
+ * We pretend that a wrap will happen halfway through the multixact ID
+ * space, but that's not really true, because multixacts wrap differently
+ * from transaction IDs. Note that, separately from any concern about
+ * multixact IDs wrapping, we must ensure that multixact members do not
+ * wrap. Limits for that are set in SetOffsetVacuumLimit, not here.
+ */
+ multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1);
+ if (multiWrapLimit < FirstMultiXactId)
+ multiWrapLimit += FirstMultiXactId;
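+
+	/*
+	 * Illustrative arithmetic only: oldest_datminmxid = 100 yields
+	 * multiWrapLimit = 100 + 0x7FFFFFFF; if the unsigned addition wraps to a
+	 * value below FirstMultiXactId, we bump it past the invalid id.
+	 */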
+
+ /*
+ * We'll refuse to continue assigning MultiXactIds once we get within 3M
+	 * multis of data loss.  See SetTransactionIdLimit.
+ */
+ multiStopLimit = multiWrapLimit - 3000000;
+ if (multiStopLimit < FirstMultiXactId)
+ multiStopLimit -= FirstMultiXactId;
+
+ /*
+ * We'll start complaining loudly when we get within 40M multis of data
+ * loss. This is kind of arbitrary, but if you let your gas gauge get
+ * down to 2% of full, would you be looking for the next gas station? We
+ * need to be fairly liberal about this number because there are lots of
+ * scenarios where most transactions are done by automatic clients that
+ * won't pay attention to warnings. (No, we're not gonna make this
+ * configurable. If you know enough to configure it, you know enough to
+ * not get in this kind of trouble in the first place.)
+ */
+ multiWarnLimit = multiWrapLimit - 40000000;
+ if (multiWarnLimit < FirstMultiXactId)
+ multiWarnLimit -= FirstMultiXactId;
+
+ /*
+ * We'll start trying to force autovacuums when oldest_datminmxid gets to
+ * be more than autovacuum_multixact_freeze_max_age mxids old.
+ *
+ * Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter
+ * so that we don't have to worry about dealing with on-the-fly changes in
+ * its value. See SetTransactionIdLimit.
+ */
+ multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age;
+ if (multiVacLimit < FirstMultiXactId)
+ multiVacLimit += FirstMultiXactId;
+
+ /* Grab lock for just long enough to set the new limit values */
+ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
+ MultiXactState->oldestMultiXactId = oldest_datminmxid;
+ MultiXactState->oldestMultiXactDB = oldest_datoid;
+ MultiXactState->multiVacLimit = multiVacLimit;
+ MultiXactState->multiWarnLimit = multiWarnLimit;
+ MultiXactState->multiStopLimit = multiStopLimit;
+ MultiXactState->multiWrapLimit = multiWrapLimit;
+ curMulti = MultiXactState->nextMXact;
+ LWLockRelease(MultiXactGenLock);
+
+ /* Log the info */
+ ereport(DEBUG1,
+ (errmsg_internal("MultiXactId wrap limit is %u, limited by database with OID %u",
+ multiWrapLimit, oldest_datoid)));
+
+ /*
+ * Computing the actual limits is only possible once the data directory is
+ * in a consistent state. There's no need to compute the limits while
+ * still replaying WAL - no decisions about new multis are made even
+ * though multixact creations might be replayed. So we'll only do further
+ * checks after TrimMultiXact() has been called.
+ */
+ if (!MultiXactState->finishedStartup)
+ return;
+
+ Assert(!InRecovery);
+
+ /* Set limits for offset vacuum. */
+ needs_offset_vacuum = SetOffsetVacuumLimit(is_startup);
+
+ /*
+ * If past the autovacuum force point, immediately signal an autovac
+ * request. The reason for this is that autovac only processes one
+ * database per invocation. Once it's finished cleaning up the oldest
+ * database, it'll call here, and we'll signal the postmaster to start
+ * another iteration immediately if there are still any old databases.
+ */
+ if ((MultiXactIdPrecedes(multiVacLimit, curMulti) ||
+ needs_offset_vacuum) && IsUnderPostmaster)
+ SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
+
+ /* Give an immediate warning if past the wrap warn point */
+ if (MultiXactIdPrecedes(multiWarnLimit, curMulti))
+ {
+ char *oldest_datname;
+
+ /*
+ * We can be called when not inside a transaction, for example during
+ * StartupXLOG(). In such a case we cannot do database access, so we
+ * must just report the oldest DB's OID.
+ *
+ * Note: it's also possible that get_database_name fails and returns
+ * NULL, for example because the database just got dropped. We'll
+ * still warn, even though the warning might now be unnecessary.
+ */
+ if (IsTransactionState())
+ oldest_datname = get_database_name(oldest_datoid);
+ else
+ oldest_datname = NULL;
+
+ if (oldest_datname)
+ ereport(WARNING,
+ (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
+ "database \"%s\" must be vacuumed before %u more MultiXactIds are used",
+ multiWrapLimit - curMulti,
+ oldest_datname,
+ multiWrapLimit - curMulti),
+ errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
+ "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
+ else
+ ereport(WARNING,
+ (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
+ "database with OID %u must be vacuumed before %u more MultiXactIds are used",
+ multiWrapLimit - curMulti,
+ oldest_datoid,
+ multiWrapLimit - curMulti),
+ errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
+ "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
+ }
+}
+
+/*
+ * Ensure the next-to-be-assigned MultiXactId is at least minMulti,
+ * and similarly nextOffset is at least minMultiOffset.
+ *
+ * This is used when we can determine minimum safe values from an XLog
+ * record (either an on-line checkpoint or an mxact creation log entry).
+ * Although this is only called during XLog replay, we take the lock in case
+ * any hot-standby backends are examining the values.
+ */
+void
+MultiXactAdvanceNextMXact(MultiXactId minMulti,
+ MultiXactOffset minMultiOffset)
+{
+ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
+ if (MultiXactIdPrecedes(MultiXactState->nextMXact, minMulti))
+ {
+ debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti);
+ MultiXactState->nextMXact = minMulti;
+ }
+ if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset))
+ {
+ debug_elog3(DEBUG2, "MultiXact: setting next offset to %u",
+ minMultiOffset);
+ MultiXactState->nextOffset = minMultiOffset;
+ }
+ LWLockRelease(MultiXactGenLock);
+}
+
+/*
+ * Update our oldestMultiXactId value, but only if it's more recent than what
+ * we had.
+ *
+ * This may only be called during WAL replay.
+ */
+void
+MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
+{
+ Assert(InRecovery);
+
+ if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti))
+ SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false);
+}
+
+/*
+ * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
+ *
+ * NB: this is called while holding MultiXactGenLock. We want it to be very
+ * fast most of the time; even when it's not so fast, no actual I/O need
+ * happen unless we're forced to write out a dirty log or xlog page to make
+ * room in shared memory.
+ */
+static void
+ExtendMultiXactOffset(MultiXactId multi)
+{
+ int pageno;
+
+ /*
+ * No work except at first MultiXactId of a page. But beware: just after
+ * wraparound, the first MultiXactId of page zero is FirstMultiXactId.
+ */
+ if (MultiXactIdToOffsetEntry(multi) != 0 &&
+ multi != FirstMultiXactId)
+ return;
+
+ pageno = MultiXactIdToOffsetPage(multi);
+
+ LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE);
+
+ /* Zero the page and make an XLOG entry about it */
+ ZeroMultiXactOffsetPage(pageno, true);
+
+ LWLockRelease(MultiXactOffsetSLRULock);
+}
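+
+/*
+ * For illustration: with the standard 8 kB BLCKSZ, MULTIXACT_OFFSETS_PER_PAGE
+ * works out to 2048, so MultiXactIdToOffsetEntry() is zero only at multiples
+ * of 2048. Just after wraparound the counter skips the invalid value 0 and
+ * resumes at FirstMultiXactId (1), whose entry number is 1, not 0; the
+ * explicit FirstMultiXactId test above is what gets page zero zeroed again
+ * in that case.
+ */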
+
+/*
+ * Make sure that MultiXactMember has room for the members of a newly-
+ * allocated MultiXactId.
+ *
+ * Like the above routine, this is called while holding MultiXactGenLock;
+ * same comments apply.
+ */
+static void
+ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
+{
+ /*
+ * It's possible that the members span more than one page of the members
+ * file, so we loop to ensure we consider each page. The coding is not
+ * optimal if the members span several pages, but that seems unusual
+ * enough to not worry much about.
+ */
+ while (nmembers > 0)
+ {
+ int flagsoff;
+ int flagsbit;
+ uint32 difference;
+
+ /*
+ * Only zero when at first entry of a page.
+ */
+ flagsoff = MXOffsetToFlagsOffset(offset);
+ flagsbit = MXOffsetToFlagsBitShift(offset);
+ if (flagsoff == 0 && flagsbit == 0)
+ {
+ int pageno;
+
+ pageno = MXOffsetToMemberPage(offset);
+
+ LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE);
+
+ /* Zero the page and make an XLOG entry about it */
+ ZeroMultiXactMemberPage(pageno, true);
+
+ LWLockRelease(MultiXactMemberSLRULock);
+ }
+
+ /*
+ * Compute the number of items remaining on the current page. Careful: if
+ * addition of unsigned ints wraps around, we're at the last page of
+ * the last segment; since that page holds a different number of items
+ * than other pages, we need to do it differently.
+ */
+ if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset)
+ {
+ /*
+ * This is the last page of the last segment; we can compute the
+ * number of items left to allocate in it without modulo
+ * arithmetic.
+ */
+ difference = MaxMultiXactOffset - offset + 1;
+ }
+ else
+ difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE;
+
+ /*
+ * Advance to next page, taking care to properly handle the wraparound
+ * case. OK if nmembers goes negative.
+ */
+ nmembers -= difference;
+ offset += difference;
+ }
+}
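+
+/*
+ * A worked example of the loop above, taking MULTIXACT_MEMBERS_PER_PAGE as
+ * 1636 (its value for the standard 8 kB BLCKSZ): for offset = 1630 and
+ * nmembers = 20, the first iteration zeroes nothing (we're mid-page) and
+ * computes difference = 1636 - (1630 % 1636) = 6, leaving nmembers = 14 and
+ * offset = 1636. The second iteration lands on a page start (flagsoff and
+ * flagsbit both zero), zeroes that page, and subtracts difference = 1636,
+ * driving nmembers negative and ending the loop.
+ */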
+
+/*
+ * GetOldestMultiXactId
+ *
+ * Return the oldest MultiXactId that's possibly still seen as live by
+ * any running transaction. Older ones might still exist on disk, but they no
+ * longer have any running member transaction.
+ *
+ * It's not safe to truncate MultiXact SLRU segments on the value returned by
+ * this function; however, it can be used by a full-table vacuum to set the
+ * point at which it will be possible to truncate SLRU for that table.
+ */
+MultiXactId
+GetOldestMultiXactId(void)
+{
+ MultiXactId oldestMXact;
+ MultiXactId nextMXact;
+ int i;
+
+ /*
+ * This is the oldest valid value among all the OldestMemberMXactId[] and
+ * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
+ */
+ LWLockAcquire(MultiXactGenLock, LW_SHARED);
+
+ /*
+ * We have to beware of the possibility that nextMXact is in the
+ * wrapped-around state. We don't fix the counter itself here, but we
+ * must be sure to use a valid value in our calculation.
+ */
+ nextMXact = MultiXactState->nextMXact;
+ if (nextMXact < FirstMultiXactId)
+ nextMXact = FirstMultiXactId;
+
+ oldestMXact = nextMXact;
+ for (i = 1; i <= MaxOldestSlot; i++)
+ {
+ MultiXactId thisoldest;
+
+ thisoldest = OldestMemberMXactId[i];
+ if (MultiXactIdIsValid(thisoldest) &&
+ MultiXactIdPrecedes(thisoldest, oldestMXact))
+ oldestMXact = thisoldest;
+ thisoldest = OldestVisibleMXactId[i];
+ if (MultiXactIdIsValid(thisoldest) &&
+ MultiXactIdPrecedes(thisoldest, oldestMXact))
+ oldestMXact = thisoldest;
+ }
+
+ LWLockRelease(MultiXactGenLock);
+
+ return oldestMXact;
+}
+
+/*
+ * Determine how aggressively we need to vacuum in order to prevent member
+ * wraparound.
+ *
+ * To do so, determine the oldest member offset and install the limit
+ * info in MultiXactState, where it can be used to prevent overrun of old data
+ * in the members SLRU area.
+ *
+ * The return value is true if emergency autovacuum is required and false
+ * otherwise.
+ */
+static bool
+SetOffsetVacuumLimit(bool is_startup)
+{
+ MultiXactId oldestMultiXactId;
+ MultiXactId nextMXact;
+ MultiXactOffset oldestOffset = 0; /* placate compiler */
+ MultiXactOffset prevOldestOffset;
+ MultiXactOffset nextOffset;
+ bool oldestOffsetKnown = false;
+ bool prevOldestOffsetKnown;
+ MultiXactOffset offsetStopLimit = 0;
+ MultiXactOffset prevOffsetStopLimit;
+
+ /*
+ * NB: Have to prevent concurrent truncation; we might otherwise try to
+ * look up an oldestMulti that's concurrently getting truncated away.
+ */
+ LWLockAcquire(MultiXactTruncationLock, LW_SHARED);
+
+ /* Read relevant fields from shared memory. */
+ LWLockAcquire(MultiXactGenLock, LW_SHARED);
+ oldestMultiXactId = MultiXactState->oldestMultiXactId;
+ nextMXact = MultiXactState->nextMXact;
+ nextOffset = MultiXactState->nextOffset;
+ prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown;
+ prevOldestOffset = MultiXactState->oldestOffset;
+ prevOffsetStopLimit = MultiXactState->offsetStopLimit;
+ Assert(MultiXactState->finishedStartup);
+ LWLockRelease(MultiXactGenLock);
+
+ /*
+ * Determine the offset of the oldest multixact. Normally, we can read
+ * the offset from the multixact itself, but there's an important special
+ * case: if there are no multixacts in existence at all, oldestMultiXactId
+ * obviously can't point to one. It will instead point to the multixact
+ * ID that will be assigned the next time one is needed.
+ */
+ if (oldestMultiXactId == nextMXact)
+ {
+ /*
+ * When the next multixact gets created, it will be stored at the next
+ * offset.
+ */
+ oldestOffset = nextOffset;
+ oldestOffsetKnown = true;
+ }
+ else
+ {
+ /*
+ * Figure out where the oldest existing multixact's offsets are
+ * stored. Due to bugs in early releases of PostgreSQL 9.3.X and 9.4.X,
+ * the supposedly-earliest multixact might not really exist. We are
+ * careful not to fail in that case.
+ */
+ oldestOffsetKnown =
+ find_multixact_start(oldestMultiXactId, &oldestOffset);
+
+ if (oldestOffsetKnown)
+ ereport(DEBUG1,
+ (errmsg_internal("oldest MultiXactId member is at offset %u",
+ oldestOffset)));
+ else
+ ereport(LOG,
+ (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk",
+ oldestMultiXactId)));
+ }
+
+ LWLockRelease(MultiXactTruncationLock);
+
+ /*
+ * If we can, compute limits (and install them in MultiXactState) to prevent
+ * overrun of old data in the members SLRU area. We can only do so if the
+ * oldest offset is known though.
+ */
+ if (oldestOffsetKnown)
+ {
+ /* move back to start of the corresponding segment */
+ offsetStopLimit = oldestOffset - (oldestOffset %
+ (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT));
+
+ /* always leave one segment before the wraparound point */
+ offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT);
+
+ if (!prevOldestOffsetKnown && !is_startup)
+ ereport(LOG,
+ (errmsg("MultiXact member wraparound protections are now enabled")));
+
+ ereport(DEBUG1,
+ (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u",
+ offsetStopLimit, oldestMultiXactId)));
+ }
+ else if (prevOldestOffsetKnown)
+ {
+ /*
+ * If we failed to get the oldest offset this time, but we have a
+ * value from a previous pass through this function, use the old
+ * values rather than automatically forcing an emergency autovacuum
+ * cycle again.
+ */
+ oldestOffset = prevOldestOffset;
+ oldestOffsetKnown = true;
+ offsetStopLimit = prevOffsetStopLimit;
+ }
+
+ /* Install the computed values */
+ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
+ MultiXactState->oldestOffset = oldestOffset;
+ MultiXactState->oldestOffsetKnown = oldestOffsetKnown;
+ MultiXactState->offsetStopLimit = offsetStopLimit;
+ LWLockRelease(MultiXactGenLock);
+
+ /*
+ * Do we need an emergency autovacuum? If we're not sure, assume yes.
+ */
+ return !oldestOffsetKnown ||
+ (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD);
+}
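+
+/*
+ * A worked example of the stop-limit arithmetic above, taking
+ * MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT as 52352 (its value
+ * for the standard 8 kB BLCKSZ and 32-page segments): for
+ * oldestOffset = 110000,
+ *
+ * offsetStopLimit = 110000 - (110000 % 52352) = 104704 (segment start)
+ * offsetStopLimit -= 52352, giving 52352
+ *
+ * so new member allocation stops one full segment short of the oldest
+ * member data that is still needed.
+ */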
+
+/*
+ * Return whether adding "distance" to "start" would move past "boundary".
+ *
+ * We use this to determine whether the addition is "wrapping around" the
+ * boundary point, hence the name. The reason we don't want to use the regular
+ * 2^31-modulo arithmetic here is that we want to be able to use the whole of
+ * the 2^32-1 space, allowing for more multixacts than would fit
+ * otherwise.
+ */
+static bool
+MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start,
+ uint32 distance)
+{
+ MultiXactOffset finish;
+
+ /*
+ * Note that offset number 0 is not used (see GetMultiXactIdMembers), so
+ * if the addition wraps around the UINT_MAX boundary, skip that value.
+ */
+ finish = start + distance;
+ if (finish < start)
+ finish++;
+
+ /*-----------------------------------------------------------------------
+ * When the boundary is numerically greater than the starting point, any
+ * value numerically between the two is not wrapped:
+ *
+ * <----S----B---->
+ * [---) = F wrapped past B (and UINT_MAX)
+ * [---) = F not wrapped
+ * [----] = F wrapped past B
+ *
+ * When the boundary is numerically less than the starting point (i.e. the
+ * UINT_MAX wraparound occurs somewhere in between) then all values in
+ * between are wrapped:
+ *
+ * <----B----S---->
+ * [---) = F not wrapped past B (but wrapped past UINT_MAX)
+ * [---) = F wrapped past B (and UINT_MAX)
+ * [----] = F not wrapped
+ *-----------------------------------------------------------------------
+ */
+ if (start < boundary)
+ return finish >= boundary || finish < start;
+ else
+ return finish >= boundary && finish < start;
+}
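+
+/*
+ * Two concrete cases of the test above, with UINT_MAX = 4294967295:
+ *
+ * boundary = 100, start = 4294967290, distance = 20:
+ * finish wraps to 14 and is bumped to 15 (offset 0 is skipped);
+ * start >= boundary, and 15 < 100, so the result is false: the
+ * addition wraps past UINT_MAX but does not reach the boundary.
+ *
+ * boundary = 10, start = 4294967290, distance = 20:
+ * finish is again 15; 15 >= 10 and 15 < start, so the result is
+ * true: the addition moves past the boundary.
+ */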
+
+/*
+ * Find the starting offset of the given MultiXactId.
+ *
+ * Returns false if the file containing the multi does not exist on disk.
+ * Otherwise, returns true and sets *result to the starting member offset.
+ *
+ * This function does not prevent concurrent truncation, so if that's
+ * required, the caller has to protect against that.
+ */
+static bool
+find_multixact_start(MultiXactId multi, MultiXactOffset *result)
+{
+ MultiXactOffset offset;
+ int pageno;
+ int entryno;
+ int slotno;
+ MultiXactOffset *offptr;
+
+ Assert(MultiXactState->finishedStartup);
+
+ pageno = MultiXactIdToOffsetPage(multi);
+ entryno = MultiXactIdToOffsetEntry(multi);
+
+ /*
+ * Write out dirty data, so SimpleLruDoesPhysicalPageExist can work
+ * correctly.
+ */
+ SimpleLruWriteAll(MultiXactOffsetCtl, true);
+ SimpleLruWriteAll(MultiXactMemberCtl, true);
+
+ if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
+ return false;
+
+ /* lock is acquired by SimpleLruReadPage_ReadOnly */
+ slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
+ offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr += entryno;
+ offset = *offptr;
+ LWLockRelease(MultiXactOffsetSLRULock);
+
+ *result = offset;
+ return true;
+}
+
+/*
+ * Determine how many multixacts, and how many multixact members, currently
+ * exist. Return false if unable to determine.
+ */
+static bool
+ReadMultiXactCounts(uint32 *multixacts, MultiXactOffset *members)
+{
+ MultiXactOffset nextOffset;
+ MultiXactOffset oldestOffset;
+ MultiXactId oldestMultiXactId;
+ MultiXactId nextMultiXactId;
+ bool oldestOffsetKnown;
+
+ LWLockAcquire(MultiXactGenLock, LW_SHARED);
+ nextOffset = MultiXactState->nextOffset;
+ oldestMultiXactId = MultiXactState->oldestMultiXactId;
+ nextMultiXactId = MultiXactState->nextMXact;
+ oldestOffset = MultiXactState->oldestOffset;
+ oldestOffsetKnown = MultiXactState->oldestOffsetKnown;
+ LWLockRelease(MultiXactGenLock);
+
+ if (!oldestOffsetKnown)
+ return false;
+
+ *members = nextOffset - oldestOffset;
+ *multixacts = nextMultiXactId - oldestMultiXactId;
+ return true;
+}
+
+/*
+ * Multixact members can be removed once the multixacts that refer to them
+ * are older than every datminmxid. autovacuum_multixact_freeze_max_age and
+ * vacuum_multixact_freeze_table_age work together to make sure we never have
+ * too many multixacts; we hope that, at least under normal circumstances,
+ * this will also be sufficient to keep us from using too many offsets.
+ * However, if the average multixact has many members, we might exhaust the
+ * members space while still using few enough multixacts that these limits
+ * fail to trigger full table scans for relminmxid advancement. At that point,
+ * we'd have no choice but to start failing multixact-creating operations
+ * with an error.
+ *
+ * To prevent that, if more than a threshold portion of the members space is
+ * used, we effectively reduce autovacuum_multixact_freeze_max_age
+ * to a value just less than the number of multixacts in use. We hope that
+ * this will quickly trigger autovacuuming on the table or tables with the
+ * oldest relminmxid, thus allowing datminmxid values to advance and removing
+ * some members.
+ *
+ * As the fraction of the member space currently in use grows, we become
+ * more aggressive in clamping this value. That not only causes autovacuum
+ * to ramp up, but also makes any manual vacuums the user issues more
+ * aggressive. This happens because vacuum_set_xid_limits() clamps the
+ * freeze table age and the minimum freeze age based on the effective
+ * autovacuum_multixact_freeze_max_age this function returns. In the worst
+ * case, we'll clamp the freeze_max_age to zero, and every vacuum of any
+ * table will try to freeze every multixact.
+ *
+ * It's possible that these thresholds should be user-tunable, but for now
+ * we keep it simple.
+ */
+int
+MultiXactMemberFreezeThreshold(void)
+{
+ MultiXactOffset members;
+ uint32 multixacts;
+ uint32 victim_multixacts;
+ double fraction;
+
+ /* If we can't determine member space utilization, assume the worst. */
+ if (!ReadMultiXactCounts(&multixacts, &members))
+ return 0;
+
+ /* If member space utilization is low, no special action is required. */
+ if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD)
+ return autovacuum_multixact_freeze_max_age;
+
+ /*
+ * Compute a target for relminmxid advancement. The number of multixacts
+ * we try to eliminate from the system is based on how far we are past
+ * MULTIXACT_MEMBER_SAFE_THRESHOLD.
+ */
+ fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) /
+ (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD);
+ victim_multixacts = multixacts * fraction;
+
+ /* fraction could be > 1.0, but lowest possible freeze age is zero */
+ if (victim_multixacts > multixacts)
+ return 0;
+ return multixacts - victim_multixacts;
+}
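+
+/*
+ * For illustration, the safe and danger thresholds are defined elsewhere in
+ * this file as roughly 50% and 75% of the member space. If 62.5% of the
+ * space is in use, we are halfway between them, so fraction = 0.5 and the
+ * effective freeze age becomes about half the number of live multixacts; at
+ * 75% or beyond, fraction >= 1.0 and the result clamps to zero, making every
+ * vacuum try to freeze every multixact it sees.
+ */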
+
+typedef struct mxtruncinfo
+{
+ int earliestExistingPage;
+} mxtruncinfo;
+
+/*
+ * SlruScanDirectory callback
+ * This callback determines the earliest existing page number.
+ */
+static bool
+SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int segpage, void *data)
+{
+ mxtruncinfo *trunc = (mxtruncinfo *) data;
+
+ if (trunc->earliestExistingPage == -1 ||
+ ctl->PagePrecedes(segpage, trunc->earliestExistingPage))
+ {
+ trunc->earliestExistingPage = segpage;
+ }
+
+ return false; /* keep going */
+}
+
+
+/*
+ * Delete members segments [oldest, newOldest)
+ *
+ * The members SLRU can, in contrast to the offsets one, be filled to almost
+ * the full range at once. This means SimpleLruTruncate() can't trivially be
+ * used - instead the to-be-deleted range is computed using the offsets
+ * SLRU. C.f. TruncateMultiXact().
+ */
+static void
+PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset)
+{
+ const int maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset);
+ int startsegment = MXOffsetToMemberSegment(oldestOffset);
+ int endsegment = MXOffsetToMemberSegment(newOldestOffset);
+ int segment = startsegment;
+
+ /*
+ * Delete all the segments but the last one. The last segment can still
+ * contain, possibly partially, valid data.
+ */
+ while (segment != endsegment)
+ {
+ elog(DEBUG2, "truncating multixact members segment %x", segment);
+ SlruDeleteSegment(MultiXactMemberCtl, segment);
+
+ /* move to next segment, handling wraparound correctly */
+ if (segment == maxsegment)
+ segment = 0;
+ else
+ segment += 1;
+ }
+}
+
+/*
+ * Delete offsets segments [oldest, newOldest)
+ */
+static void
+PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti)
+{
+ /*
+ * We step back one multixact to avoid passing a cutoff page that hasn't
+ * been created yet in the rare case that oldestMulti would be the first
+ * item on a page and oldestMulti == nextMulti. In that case, if we
+ * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound
+ * detection.
+ */
+ SimpleLruTruncate(MultiXactOffsetCtl,
+ MultiXactIdToOffsetPage(PreviousMultiXactId(newOldestMulti)));
+}
+
+/*
+ * Remove all MultiXactOffset and MultiXactMember segments before the oldest
+ * ones still of interest.
+ *
+ * This is only called on a primary as part of vacuum (via
+ * vac_truncate_clog()). During recovery truncation is done by replaying
+ * truncation WAL records logged here.
+ *
+ * newOldestMulti is the oldest currently required multixact, newOldestMultiDB
+ * is one of the databases preventing newOldestMulti from increasing.
+ */
+void
+TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
+{
+ MultiXactId oldestMulti;
+ MultiXactId nextMulti;
+ MultiXactOffset newOldestOffset;
+ MultiXactOffset oldestOffset;
+ MultiXactOffset nextOffset;
+ mxtruncinfo trunc;
+ MultiXactId earliest;
+
+ Assert(!RecoveryInProgress());
+ Assert(MultiXactState->finishedStartup);
+
+ /*
+ * We can only allow one truncation to happen at once. Otherwise parts of
+ * members might vanish while we're doing lookups or similar. There's no
+ * need to have an interlock with creating new multis or such, since those
+ * are constrained by the limits (which only grow, never shrink).
+ */
+ LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
+
+ LWLockAcquire(MultiXactGenLock, LW_SHARED);
+ nextMulti = MultiXactState->nextMXact;
+ nextOffset = MultiXactState->nextOffset;
+ oldestMulti = MultiXactState->oldestMultiXactId;
+ LWLockRelease(MultiXactGenLock);
+ Assert(MultiXactIdIsValid(oldestMulti));
+
+ /*
+ * Make sure to attempt truncation only if there are values to truncate
+ * away. In normal processing values shouldn't go backwards, but there are
+ * some corner cases (due to bugs) where that's possible.
+ */
+ if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti))
+ {
+ LWLockRelease(MultiXactTruncationLock);
+ return;
+ }
+
+ /*
+ * Note we can't just plow ahead with the truncation; it's possible that
+ * there are no segments to truncate, which is a problem because we are
+ * going to attempt to read the offsets page to determine where to
+ * truncate the members SLRU. So we first scan the directory to determine
+ * the earliest offsets page number that we can read without error.
+ *
+ * When nextMXact is less than one segment away from multiWrapLimit,
+ * SlruScanDirCbFindEarliest can find some early segment other than the
+ * actual earliest. (MultiXactOffsetPagePrecedes(EARLIEST, LATEST)
+ * returns false, because not all pairs of entries have the same answer.)
+ * That can also arise when an earlier truncation attempt failed unlink()
+ * or returned early from this function. The only consequence is
+ * returning early, which wastes space that we could have liberated.
+ *
+ * NB: It's also possible that the page that oldestMulti is on has already
+ * been truncated away, and we crashed before updating oldestMulti.
+ */
+ trunc.earliestExistingPage = -1;
+ SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc);
+ earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE;
+ if (earliest < FirstMultiXactId)
+ earliest = FirstMultiXactId;
+
+ /* If there's nothing to remove, we can bail out early. */
+ if (MultiXactIdPrecedes(oldestMulti, earliest))
+ {
+ LWLockRelease(MultiXactTruncationLock);
+ return;
+ }
+
+ /*
+ * First, compute the safe truncation point for MultiXactMember. This is
+ * the starting offset of the oldest multixact.
+ *
+ * Hopefully, find_multixact_start will always work here, because we've
+ * already checked that it doesn't precede the earliest MultiXact on disk.
+ * But if it fails, don't truncate anything, and log a message.
+ */
+ if (oldestMulti == nextMulti)
+ {
+ /* there are NO MultiXacts */
+ oldestOffset = nextOffset;
+ }
+ else if (!find_multixact_start(oldestMulti, &oldestOffset))
+ {
+ ereport(LOG,
+ (errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation",
+ oldestMulti, earliest)));
+ LWLockRelease(MultiXactTruncationLock);
+ return;
+ }
+
+ /*
+ * Second, compute up to where to truncate. For that, look up the
+ * member offset corresponding to newOldestMulti.
+ */
+ if (newOldestMulti == nextMulti)
+ {
+ /* there are NO MultiXacts */
+ newOldestOffset = nextOffset;
+ }
+ else if (!find_multixact_start(newOldestMulti, &newOldestOffset))
+ {
+ ereport(LOG,
+ (errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation",
+ newOldestMulti)));
+ LWLockRelease(MultiXactTruncationLock);
+ return;
+ }
+
+ elog(DEBUG1, "performing multixact truncation: "
+ "offsets [%u, %u), offsets segments [%x, %x), "
+ "members [%u, %u), members segments [%x, %x)",
+ oldestMulti, newOldestMulti,
+ MultiXactIdToOffsetSegment(oldestMulti),
+ MultiXactIdToOffsetSegment(newOldestMulti),
+ oldestOffset, newOldestOffset,
+ MXOffsetToMemberSegment(oldestOffset),
+ MXOffsetToMemberSegment(newOldestOffset));
+
+ /*
+ * Do truncation, and the WAL logging of the truncation, in a critical
+ * section. That way offsets/members cannot get out of sync anymore, i.e.
+ * once consistent, the newOldestMulti will always exist in members, even
+ * if we crash at the wrong moment.
+ */
+ START_CRIT_SECTION();
+
+ /*
+ * Prevent checkpoints from being scheduled concurrently. This is critical
+ * because otherwise a truncation record might not be replayed after a
+ * crash/basebackup, even though the state of the data directory would
+ * require it.
+ */
+ Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
+ MyProc->delayChkptFlags |= DELAY_CHKPT_START;
+
+ /* WAL log truncation */
+ WriteMTruncateXlogRec(newOldestMultiDB,
+ oldestMulti, newOldestMulti,
+ oldestOffset, newOldestOffset);
+
+ /*
+ * Update in-memory limits before performing the truncation, while inside
+ * the critical section: Have to do it before truncation, to prevent
+ * concurrent lookups of those values. Has to be inside the critical
+ * section as otherwise a future call to this function would error out,
+ * while looking up the oldest member in offsets, if our caller crashes
+ * before updating the limits.
+ */
+ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
+ MultiXactState->oldestMultiXactId = newOldestMulti;
+ MultiXactState->oldestMultiXactDB = newOldestMultiDB;
+ LWLockRelease(MultiXactGenLock);
+
+ /* First truncate members */
+ PerformMembersTruncation(oldestOffset, newOldestOffset);
+
+ /* Then offsets */
+ PerformOffsetsTruncation(oldestMulti, newOldestMulti);
+
+ MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+
+ END_CRIT_SECTION();
+ LWLockRelease(MultiXactTruncationLock);
+}
+
+/*
+ * Decide whether a MultiXactOffset page number is "older" for truncation
+ * purposes. Analogous to CLOGPagePrecedes().
+ *
+ * Offsetting the values is optional, because MultiXactIdPrecedes() has
+ * translational symmetry.
+ */
+static bool
+MultiXactOffsetPagePrecedes(int page1, int page2)
+{
+ MultiXactId multi1;
+ MultiXactId multi2;
+
+ multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE;
+ multi1 += FirstMultiXactId + 1;
+ multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE;
+ multi2 += FirstMultiXactId + 1;
+
+ return (MultiXactIdPrecedes(multi1, multi2) &&
+ MultiXactIdPrecedes(multi1,
+ multi2 + MULTIXACT_OFFSETS_PER_PAGE - 1));
+}
+
+/*
+ * Decide whether a MultiXactMember page number is "older" for truncation
+ * purposes. There is no "invalid offset number" so use the numbers verbatim.
+ */
+static bool
+MultiXactMemberPagePrecedes(int page1, int page2)
+{
+ MultiXactOffset offset1;
+ MultiXactOffset offset2;
+
+ offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE;
+ offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE;
+
+ return (MultiXactOffsetPrecedes(offset1, offset2) &&
+ MultiXactOffsetPrecedes(offset1,
+ offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1));
+}
+
+/*
+ * Decide which of two MultiXactIds is earlier.
+ *
+ * XXX do we need to do something special for InvalidMultiXactId?
+ * (Doesn't look like it.)
+ */
+bool
+MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
+{
+ int32 diff = (int32) (multi1 - multi2);
+
+ return (diff < 0);
+}
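+
+/*
+ * The signed-difference trick gives a circular ordering. For example, with
+ * multi1 = 4294967290 and multi2 = 10:
+ *
+ * (int32) (4294967290 - 10) = (int32) 4294967280 = -16 < 0
+ *
+ * so a multi just below the 2^32 wrap point precedes one just past it, even
+ * though it is numerically larger. As with XIDs, the comparison is only
+ * meaningful while all live values stay within 2^31 of each other, which
+ * the freeze/vacuum machinery is responsible for ensuring.
+ */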
+
+/*
+ * MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2?
+ *
+ * XXX do we need to do something special for InvalidMultiXactId?
+ * (Doesn't look like it.)
+ */
+bool
+MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
+{
+ int32 diff = (int32) (multi1 - multi2);
+
+ return (diff <= 0);
+}
+
+
+/*
+ * Decide which of two offsets is earlier.
+ */
+static bool
+MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
+{
+ int32 diff = (int32) (offset1 - offset2);
+
+ return (diff < 0);
+}
+
+/*
+ * Write an xlog record reflecting the zeroing of either a MEMBERs or
+ * OFFSETs page (info shows which)
+ */
+static void
+WriteMZeroPageXlogRec(int pageno, uint8 info)
+{
+ XLogBeginInsert();
+ XLogRegisterData((char *) (&pageno), sizeof(int));
+ (void) XLogInsert(RM_MULTIXACT_ID, info);
+}
+
+/*
+ * Write a TRUNCATE xlog record
+ *
+ * We must flush the xlog record to disk before returning --- see notes in
+ * TruncateCLOG().
+ */
+static void
+WriteMTruncateXlogRec(Oid oldestMultiDB,
+ MultiXactId startTruncOff, MultiXactId endTruncOff,
+ MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb)
+{
+ XLogRecPtr recptr;
+ xl_multixact_truncate xlrec;
+
+ xlrec.oldestMultiDB = oldestMultiDB;
+
+ xlrec.startTruncOff = startTruncOff;
+ xlrec.endTruncOff = endTruncOff;
+
+ xlrec.startTruncMemb = startTruncMemb;
+ xlrec.endTruncMemb = endTruncMemb;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) (&xlrec), SizeOfMultiXactTruncate);
+ recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID);
+ XLogFlush(recptr);
+}
+
+/*
+ * MULTIXACT resource manager's routines
+ */
+void
+multixact_redo(XLogReaderState *record)
+{
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+ /* Backup blocks are not used in multixact records */
+ Assert(!XLogRecHasAnyBlockRefs(record));
+
+ if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE)
+ {
+ int pageno;
+ int slotno;
+
+ memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+
+ LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE);
+
+ slotno = ZeroMultiXactOffsetPage(pageno, false);
+ SimpleLruWritePage(MultiXactOffsetCtl, slotno);
+ Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
+
+ LWLockRelease(MultiXactOffsetSLRULock);
+ }
+ else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
+ {
+ int pageno;
+ int slotno;
+
+ memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+
+ LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE);
+
+ slotno = ZeroMultiXactMemberPage(pageno, false);
+ SimpleLruWritePage(MultiXactMemberCtl, slotno);
+ Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);
+
+ LWLockRelease(MultiXactMemberSLRULock);
+ }
+ else if (info == XLOG_MULTIXACT_CREATE_ID)
+ {
+ xl_multixact_create *xlrec =
+ (xl_multixact_create *) XLogRecGetData(record);
+ TransactionId max_xid;
+ int i;
+
+ /* Store the data back into the SLRU files */
+ RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
+ xlrec->members);
+
+ /* Make sure nextMXact/nextOffset are beyond what this record has */
+ MultiXactAdvanceNextMXact(xlrec->mid + 1,
+ xlrec->moff + xlrec->nmembers);
+
+ /*
+ * Make sure nextXid is beyond any XID mentioned in the record. This
+ * should be unnecessary, since any XID found here ought to have other
+ * evidence in the XLOG, but let's be safe.
+ */
+ max_xid = XLogRecGetXid(record);
+ for (i = 0; i < xlrec->nmembers; i++)
+ {
+ if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid))
+ max_xid = xlrec->members[i].xid;
+ }
+
+ AdvanceNextFullTransactionIdPastXid(max_xid);
+ }
+ else if (info == XLOG_MULTIXACT_TRUNCATE_ID)
+ {
+ xl_multixact_truncate xlrec;
+ int pageno;
+
+ memcpy(&xlrec, XLogRecGetData(record),
+ SizeOfMultiXactTruncate);
+
+ elog(DEBUG1, "replaying multixact truncation: "
+ "offsets [%u, %u), offsets segments [%x, %x), "
+ "members [%u, %u), members segments [%x, %x)",
+ xlrec.startTruncOff, xlrec.endTruncOff,
+ MultiXactIdToOffsetSegment(xlrec.startTruncOff),
+ MultiXactIdToOffsetSegment(xlrec.endTruncOff),
+ xlrec.startTruncMemb, xlrec.endTruncMemb,
+ MXOffsetToMemberSegment(xlrec.startTruncMemb),
+ MXOffsetToMemberSegment(xlrec.endTruncMemb));
+
+ /* should not be required, but more than cheap enough */
+ LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE);
+
+ /*
+ * Advance the horizon values, so they're current at the end of
+ * recovery.
+ */
+ SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false);
+
+ PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb);
+
+ /*
+ * During XLOG replay, latest_page_number isn't necessarily set up
+ * yet; insert a suitable value to bypass the sanity test in
+ * SimpleLruTruncate.
+ */
+ pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff);
+ MultiXactOffsetCtl->shared->latest_page_number = pageno;
+ PerformOffsetsTruncation(xlrec.startTruncOff, xlrec.endTruncOff);
+
+ LWLockRelease(MultiXactTruncationLock);
+ }
+ else
+ elog(PANIC, "multixact_redo: unknown op code %u", info);
+}
+
+Datum
+pg_get_multixact_members(PG_FUNCTION_ARGS)
+{
+ typedef struct
+ {
+ MultiXactMember *members;
+ int nmembers;
+ int iter;
+ } mxact;
+ MultiXactId mxid = PG_GETARG_TRANSACTIONID(0);
+ mxact *multi;
+ FuncCallContext *funccxt;
+
+ if (mxid < FirstMultiXactId)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid MultiXactId: %u", mxid)));
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ MemoryContext oldcxt;
+ TupleDesc tupdesc;
+
+ funccxt = SRF_FIRSTCALL_INIT();
+ oldcxt = MemoryContextSwitchTo(funccxt->multi_call_memory_ctx);
+
+ multi = palloc(sizeof(mxact));
+ /* no need to allow for old values here */
+ multi->nmembers = GetMultiXactIdMembers(mxid, &multi->members, false,
+ false);
+ multi->iter = 0;
+
+ tupdesc = CreateTemplateTupleDesc(2);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 1, "xid",
+ XIDOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 2, "mode",
+ TEXTOID, -1, 0);
+
+ funccxt->attinmeta = TupleDescGetAttInMetadata(tupdesc);
+ funccxt->user_fctx = multi;
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+
+ funccxt = SRF_PERCALL_SETUP();
+ multi = (mxact *) funccxt->user_fctx;
+
+ while (multi->iter < multi->nmembers)
+ {
+ HeapTuple tuple;
+ char *values[2];
+
+ values[0] = psprintf("%u", multi->members[multi->iter].xid);
+ values[1] = mxstatus_to_string(multi->members[multi->iter].status);
+
+ tuple = BuildTupleFromCStrings(funccxt->attinmeta, values);
+
+ multi->iter++;
+ pfree(values[0]);
+ SRF_RETURN_NEXT(funccxt, HeapTupleGetDatum(tuple));
+ }
+
+ SRF_RETURN_DONE(funccxt);
+}
+
+/*
+ * Entrypoint for sync.c to sync offsets files.
+ */
+int
+multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
+{
+ return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path);
+}
+
+/*
+ * Entrypoint for sync.c to sync members files.
+ */
+int
+multixactmemberssyncfiletag(const FileTag *ftag, char *path)
+{
+ return SlruSyncFileTag(MultiXactMemberCtl, ftag, path);
+}
diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c
new file mode 100644
index 0000000..df0cd77
--- /dev/null
+++ b/src/backend/access/transam/parallel.c
@@ -0,0 +1,1597 @@
+/*-------------------------------------------------------------------------
+ *
+ * parallel.c
+ * Infrastructure for launching parallel workers
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/transam/parallel.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/nbtree.h"
+#include "access/parallel.h"
+#include "access/session.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "catalog/index.h"
+#include "catalog/namespace.h"
+#include "catalog/pg_enum.h"
+#include "catalog/storage.h"
+#include "commands/async.h"
+#include "commands/vacuum.h"
+#include "executor/execParallel.h"
+#include "libpq/libpq.h"
+#include "libpq/pqformat.h"
+#include "libpq/pqmq.h"
+#include "miscadmin.h"
+#include "optimizer/optimizer.h"
+#include "pgstat.h"
+#include "storage/ipc.h"
+#include "storage/predicate.h"
+#include "storage/sinval.h"
+#include "storage/spin.h"
+#include "tcop/tcopprot.h"
+#include "utils/combocid.h"
+#include "utils/guc.h"
+#include "utils/inval.h"
+#include "utils/memutils.h"
+#include "utils/relmapper.h"
+#include "utils/snapmgr.h"
+#include "utils/typcache.h"
+
+/*
+ * We don't want to waste a lot of memory on an error queue which, most of
+ * the time, will process only a handful of small messages. However, it is
+ * desirable to make it large enough that a typical ErrorResponse can be sent
+ * without blocking. That way, a worker that errors out can write the whole
+ * message into the queue and terminate without waiting for the user backend.
+ */
+#define PARALLEL_ERROR_QUEUE_SIZE 16384
+
+/* Magic number for parallel context TOC. */
+#define PARALLEL_MAGIC 0x50477c7c
+
+/*
+ * Magic numbers for per-context parallel state sharing. Higher-level code
+ * should use smaller values, leaving these very large ones for use by this
+ * module.
+ */
+#define PARALLEL_KEY_FIXED UINT64CONST(0xFFFFFFFFFFFF0001)
+#define PARALLEL_KEY_ERROR_QUEUE UINT64CONST(0xFFFFFFFFFFFF0002)
+#define PARALLEL_KEY_LIBRARY UINT64CONST(0xFFFFFFFFFFFF0003)
+#define PARALLEL_KEY_GUC UINT64CONST(0xFFFFFFFFFFFF0004)
+#define PARALLEL_KEY_COMBO_CID UINT64CONST(0xFFFFFFFFFFFF0005)
+#define PARALLEL_KEY_TRANSACTION_SNAPSHOT UINT64CONST(0xFFFFFFFFFFFF0006)
+#define PARALLEL_KEY_ACTIVE_SNAPSHOT UINT64CONST(0xFFFFFFFFFFFF0007)
+#define PARALLEL_KEY_TRANSACTION_STATE UINT64CONST(0xFFFFFFFFFFFF0008)
+#define PARALLEL_KEY_ENTRYPOINT UINT64CONST(0xFFFFFFFFFFFF0009)
+#define PARALLEL_KEY_SESSION_DSM UINT64CONST(0xFFFFFFFFFFFF000A)
+#define PARALLEL_KEY_PENDING_SYNCS UINT64CONST(0xFFFFFFFFFFFF000B)
+#define PARALLEL_KEY_REINDEX_STATE UINT64CONST(0xFFFFFFFFFFFF000C)
+#define PARALLEL_KEY_RELMAPPER_STATE UINT64CONST(0xFFFFFFFFFFFF000D)
+#define PARALLEL_KEY_UNCOMMITTEDENUMS UINT64CONST(0xFFFFFFFFFFFF000E)
+
+/* Fixed-size parallel state. */
+typedef struct FixedParallelState
+{
+ /* Fixed-size state that workers must restore. */
+ Oid database_id;
+ Oid authenticated_user_id;
+ Oid current_user_id;
+ Oid outer_user_id;
+ Oid temp_namespace_id;
+ Oid temp_toast_namespace_id;
+ int sec_context;
+ bool is_superuser;
+ PGPROC *parallel_leader_pgproc;
+ pid_t parallel_leader_pid;
+ BackendId parallel_leader_backend_id;
+ TimestampTz xact_ts;
+ TimestampTz stmt_ts;
+ SerializableXactHandle serializable_xact_handle;
+
+ /* Mutex protects remaining fields. */
+ slock_t mutex;
+
+ /* Maximum XactLastRecEnd of any worker. */
+ XLogRecPtr last_xlog_end;
+} FixedParallelState;
+
+/*
+ * Our parallel worker number. We initialize this to -1, meaning that we are
+ * not a parallel worker. In parallel workers, it will be set to a value >= 0
+ * and < the number of workers before any user code is invoked; each parallel
+ * worker will get a different parallel worker number.
+ */
+int ParallelWorkerNumber = -1;
+
+/* Is there a parallel message pending which we need to receive? */
+volatile bool ParallelMessagePending = false;
+
+/* Are we initializing a parallel worker? */
+bool InitializingParallelWorker = false;
+
+/* Pointer to our fixed parallel state. */
+static FixedParallelState *MyFixedParallelState;
+
+/* List of active parallel contexts. */
+static dlist_head pcxt_list = DLIST_STATIC_INIT(pcxt_list);
+
+/* Backend-local copy of data from FixedParallelState. */
+static pid_t ParallelLeaderPid;
+
+/*
+ * List of internal parallel worker entry points. We need this for
+ * reasons explained in LookupParallelWorkerFunction(), below.
+ */
+static const struct
+{
+ const char *fn_name;
+ parallel_worker_main_type fn_addr;
+} InternalParallelWorkers[] =
+{
+ {
+ "ParallelQueryMain", ParallelQueryMain
+ },
+ {
+ "_bt_parallel_build_main", _bt_parallel_build_main
+ },
+ {
+ "parallel_vacuum_main", parallel_vacuum_main
+ }
+};
+
+/* Private functions. */
+static void HandleParallelMessage(ParallelContext *pcxt, int i, StringInfo msg);
+static void WaitForParallelWorkersToExit(ParallelContext *pcxt);
+static parallel_worker_main_type LookupParallelWorkerFunction(const char *libraryname, const char *funcname);
+static void ParallelWorkerShutdown(int code, Datum arg);
+
+
+/*
+ * Establish a new parallel context. This should be done after entering
+ * parallel mode, and (unless there is an error) the context should be
+ * destroyed before exiting the current subtransaction.
+ */
+ParallelContext *
+CreateParallelContext(const char *library_name, const char *function_name,
+ int nworkers)
+{
+ MemoryContext oldcontext;
+ ParallelContext *pcxt;
+
+ /* It is unsafe to create a parallel context if not in parallel mode. */
+ Assert(IsInParallelMode());
+
+ /* Number of workers should be non-negative. */
+ Assert(nworkers >= 0);
+
+ /* We might be running in a short-lived memory context. */
+ oldcontext = MemoryContextSwitchTo(TopTransactionContext);
+
+ /* Initialize a new ParallelContext. */
+ pcxt = palloc0(sizeof(ParallelContext));
+ pcxt->subid = GetCurrentSubTransactionId();
+ pcxt->nworkers = nworkers;
+ pcxt->nworkers_to_launch = nworkers;
+ pcxt->library_name = pstrdup(library_name);
+ pcxt->function_name = pstrdup(function_name);
+ pcxt->error_context_stack = error_context_stack;
+ shm_toc_initialize_estimator(&pcxt->estimator);
+ dlist_push_head(&pcxt_list, &pcxt->node);
+
+ /* Restore previous memory context. */
+ MemoryContextSwitchTo(oldcontext);
+
+ return pcxt;
+}
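+
+/*
+ * A minimal sketch of the lifecycle a caller drives (see README.parallel;
+ * "key" and "size" stand in for caller-specific values):
+ *
+ * EnterParallelMode(); // prohibit unsafe state changes
+ * pcxt = CreateParallelContext("postgres", "function_name", nworkers);
+ * shm_toc_estimate_chunk(&pcxt->estimator, size);
+ * shm_toc_estimate_keys(&pcxt->estimator, keys);
+ * InitializeParallelDSM(pcxt); // create DSM and copy state to it
+ * space = shm_toc_allocate(pcxt->toc, size);
+ * shm_toc_insert(pcxt->toc, key, space);
+ * LaunchParallelWorkers(pcxt);
+ * // ... do parallel stuff ...
+ * WaitForParallelWorkersToFinish(pcxt);
+ * // ... read any final results from dynamic shared memory ...
+ * DestroyParallelContext(pcxt);
+ * ExitParallelMode();
+ */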
+
+/*
+ * Establish the dynamic shared memory segment for a parallel context and
+ * copy state and other bookkeeping information that will be needed by
+ * parallel workers into it.
+ */
+void
+InitializeParallelDSM(ParallelContext *pcxt)
+{
+ MemoryContext oldcontext;
+ Size library_len = 0;
+ Size guc_len = 0;
+ Size combocidlen = 0;
+ Size tsnaplen = 0;
+ Size asnaplen = 0;
+ Size tstatelen = 0;
+ Size pendingsyncslen = 0;
+ Size reindexlen = 0;
+ Size relmapperlen = 0;
+ Size uncommittedenumslen = 0;
+ Size segsize = 0;
+ int i;
+ FixedParallelState *fps;
+ dsm_handle session_dsm_handle = DSM_HANDLE_INVALID;
+ Snapshot transaction_snapshot = GetTransactionSnapshot();
+ Snapshot active_snapshot = GetActiveSnapshot();
+
+ /* We might be running in a very short-lived memory context. */
+ oldcontext = MemoryContextSwitchTo(TopTransactionContext);
+
+ /* Allow space to store the fixed-size parallel state. */
+ shm_toc_estimate_chunk(&pcxt->estimator, sizeof(FixedParallelState));
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+
+ /*
+ * Normally, the user will have requested at least one worker process, but
+ * if by chance they have not, we can skip a bunch of things here.
+ */
+ if (pcxt->nworkers > 0)
+ {
+ /* Get (or create) the per-session DSM segment's handle. */
+ session_dsm_handle = GetSessionDsmHandle();
+
+ /*
+ * If we weren't able to create a per-session DSM segment, then we can
+ * continue, but we can't safely launch any workers: their record
+ * typmods would be incompatible, so they couldn't exchange tuples.
+ */
+ if (session_dsm_handle == DSM_HANDLE_INVALID)
+ pcxt->nworkers = 0;
+ }
+
+ if (pcxt->nworkers > 0)
+ {
+ /* Estimate space for various kinds of state sharing. */
+ library_len = EstimateLibraryStateSpace();
+ shm_toc_estimate_chunk(&pcxt->estimator, library_len);
+ guc_len = EstimateGUCStateSpace();
+ shm_toc_estimate_chunk(&pcxt->estimator, guc_len);
+ combocidlen = EstimateComboCIDStateSpace();
+ shm_toc_estimate_chunk(&pcxt->estimator, combocidlen);
+ if (IsolationUsesXactSnapshot())
+ {
+ tsnaplen = EstimateSnapshotSpace(transaction_snapshot);
+ shm_toc_estimate_chunk(&pcxt->estimator, tsnaplen);
+ }
+ asnaplen = EstimateSnapshotSpace(active_snapshot);
+ shm_toc_estimate_chunk(&pcxt->estimator, asnaplen);
+ tstatelen = EstimateTransactionStateSpace();
+ shm_toc_estimate_chunk(&pcxt->estimator, tstatelen);
+ shm_toc_estimate_chunk(&pcxt->estimator, sizeof(dsm_handle));
+ pendingsyncslen = EstimatePendingSyncsSpace();
+ shm_toc_estimate_chunk(&pcxt->estimator, pendingsyncslen);
+ reindexlen = EstimateReindexStateSpace();
+ shm_toc_estimate_chunk(&pcxt->estimator, reindexlen);
+ relmapperlen = EstimateRelationMapSpace();
+ shm_toc_estimate_chunk(&pcxt->estimator, relmapperlen);
+ uncommittedenumslen = EstimateUncommittedEnumsSpace();
+ shm_toc_estimate_chunk(&pcxt->estimator, uncommittedenumslen);
+ /* If you add more chunks here, you probably need to add keys. */
+ shm_toc_estimate_keys(&pcxt->estimator, 11);
+
+ /* Estimate space need for error queues. */
+ StaticAssertStmt(BUFFERALIGN(PARALLEL_ERROR_QUEUE_SIZE) ==
+ PARALLEL_ERROR_QUEUE_SIZE,
+ "parallel error queue size not buffer-aligned");
+ shm_toc_estimate_chunk(&pcxt->estimator,
+ mul_size(PARALLEL_ERROR_QUEUE_SIZE,
+ pcxt->nworkers));
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+
+ /* Estimate how much we'll need for the entrypoint info. */
+ shm_toc_estimate_chunk(&pcxt->estimator, strlen(pcxt->library_name) +
+ strlen(pcxt->function_name) + 2);
+ shm_toc_estimate_keys(&pcxt->estimator, 1);
+ }
+
+ /*
+ * Create DSM and initialize with new table of contents. But if the user
+ * didn't request any workers, then don't bother creating a dynamic shared
+ * memory segment; instead, just use backend-private memory.
+ *
+ * Also, if we can't create a dynamic shared memory segment because the
+ * maximum number of segments has already been created, then fall back to
+ * backend-private memory, and plan not to use any workers. We hope this
+ * won't happen very often, but it's better to abandon the use of
+ * parallelism than to fail outright.
+ */
+ segsize = shm_toc_estimate(&pcxt->estimator);
+ if (pcxt->nworkers > 0)
+ pcxt->seg = dsm_create(segsize, DSM_CREATE_NULL_IF_MAXSEGMENTS);
+ if (pcxt->seg != NULL)
+ pcxt->toc = shm_toc_create(PARALLEL_MAGIC,
+ dsm_segment_address(pcxt->seg),
+ segsize);
+ else
+ {
+ pcxt->nworkers = 0;
+ pcxt->private_memory = MemoryContextAlloc(TopMemoryContext, segsize);
+ pcxt->toc = shm_toc_create(PARALLEL_MAGIC, pcxt->private_memory,
+ segsize);
+ }
+
+ /* Initialize fixed-size state in shared memory. */
+ fps = (FixedParallelState *)
+ shm_toc_allocate(pcxt->toc, sizeof(FixedParallelState));
+ fps->database_id = MyDatabaseId;
+ fps->authenticated_user_id = GetAuthenticatedUserId();
+ fps->outer_user_id = GetCurrentRoleId();
+ fps->is_superuser = session_auth_is_superuser;
+ GetUserIdAndSecContext(&fps->current_user_id, &fps->sec_context);
+ GetTempNamespaceState(&fps->temp_namespace_id,
+ &fps->temp_toast_namespace_id);
+ fps->parallel_leader_pgproc = MyProc;
+ fps->parallel_leader_pid = MyProcPid;
+ fps->parallel_leader_backend_id = MyBackendId;
+ fps->xact_ts = GetCurrentTransactionStartTimestamp();
+ fps->stmt_ts = GetCurrentStatementStartTimestamp();
+ fps->serializable_xact_handle = ShareSerializableXact();
+ SpinLockInit(&fps->mutex);
+ fps->last_xlog_end = 0;
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_FIXED, fps);
+
+ /* We can skip the rest of this if we're not budgeting for any workers. */
+ if (pcxt->nworkers > 0)
+ {
+ char *libraryspace;
+ char *gucspace;
+ char *combocidspace;
+ char *tsnapspace;
+ char *asnapspace;
+ char *tstatespace;
+ char *pendingsyncsspace;
+ char *reindexspace;
+ char *relmapperspace;
+ char *error_queue_space;
+ char *session_dsm_handle_space;
+ char *entrypointstate;
+ char *uncommittedenumsspace;
+ Size lnamelen;
+
+ /* Serialize shared libraries we have loaded. */
+ libraryspace = shm_toc_allocate(pcxt->toc, library_len);
+ SerializeLibraryState(library_len, libraryspace);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_LIBRARY, libraryspace);
+
+ /* Serialize GUC settings. */
+ gucspace = shm_toc_allocate(pcxt->toc, guc_len);
+ SerializeGUCState(guc_len, gucspace);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_GUC, gucspace);
+
+ /* Serialize combo CID state. */
+ combocidspace = shm_toc_allocate(pcxt->toc, combocidlen);
+ SerializeComboCIDState(combocidlen, combocidspace);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_COMBO_CID, combocidspace);
+
+ /*
+ * Serialize the transaction snapshot if the transaction
+ * isolation level uses a transaction snapshot.
+ */
+ if (IsolationUsesXactSnapshot())
+ {
+ tsnapspace = shm_toc_allocate(pcxt->toc, tsnaplen);
+ SerializeSnapshot(transaction_snapshot, tsnapspace);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_TRANSACTION_SNAPSHOT,
+ tsnapspace);
+ }
+
+ /* Serialize the active snapshot. */
+ asnapspace = shm_toc_allocate(pcxt->toc, asnaplen);
+ SerializeSnapshot(active_snapshot, asnapspace);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_ACTIVE_SNAPSHOT, asnapspace);
+
+ /* Provide the handle for per-session segment. */
+ session_dsm_handle_space = shm_toc_allocate(pcxt->toc,
+ sizeof(dsm_handle));
+ *(dsm_handle *) session_dsm_handle_space = session_dsm_handle;
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_SESSION_DSM,
+ session_dsm_handle_space);
+
+ /* Serialize transaction state. */
+ tstatespace = shm_toc_allocate(pcxt->toc, tstatelen);
+ SerializeTransactionState(tstatelen, tstatespace);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_TRANSACTION_STATE, tstatespace);
+
+ /* Serialize pending syncs. */
+ pendingsyncsspace = shm_toc_allocate(pcxt->toc, pendingsyncslen);
+ SerializePendingSyncs(pendingsyncslen, pendingsyncsspace);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_PENDING_SYNCS,
+ pendingsyncsspace);
+
+ /* Serialize reindex state. */
+ reindexspace = shm_toc_allocate(pcxt->toc, reindexlen);
+ SerializeReindexState(reindexlen, reindexspace);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_REINDEX_STATE, reindexspace);
+
+ /* Serialize relmapper state. */
+ relmapperspace = shm_toc_allocate(pcxt->toc, relmapperlen);
+ SerializeRelationMap(relmapperlen, relmapperspace);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_RELMAPPER_STATE,
+ relmapperspace);
+
+ /* Serialize uncommitted enum state. */
+ uncommittedenumsspace = shm_toc_allocate(pcxt->toc,
+ uncommittedenumslen);
+ SerializeUncommittedEnums(uncommittedenumsspace, uncommittedenumslen);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_UNCOMMITTEDENUMS,
+ uncommittedenumsspace);
+
+ /* Allocate space for worker information. */
+ pcxt->worker = palloc0(sizeof(ParallelWorkerInfo) * pcxt->nworkers);
+
+ /*
+ * Establish error queues in dynamic shared memory.
+ *
+ * These queues should be used only for transmitting ErrorResponse,
+ * NoticeResponse, and NotifyResponse protocol messages. Tuple data
+ * should be transmitted via separate (possibly larger?) queues.
+ */
+ error_queue_space =
+ shm_toc_allocate(pcxt->toc,
+ mul_size(PARALLEL_ERROR_QUEUE_SIZE,
+ pcxt->nworkers));
+ for (i = 0; i < pcxt->nworkers; ++i)
+ {
+ char *start;
+ shm_mq *mq;
+
+ start = error_queue_space + i * PARALLEL_ERROR_QUEUE_SIZE;
+ mq = shm_mq_create(start, PARALLEL_ERROR_QUEUE_SIZE);
+ shm_mq_set_receiver(mq, MyProc);
+ pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL);
+ }
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE, error_queue_space);
+
+ /*
+ * Serialize entrypoint information. It's unsafe to pass function
+ * pointers across processes, as the function pointer may be different
+ * in each process in EXEC_BACKEND builds, so we always pass library
+ * and function name. (We use library name "postgres" for functions
+ * in the core backend.)
+ */
+ lnamelen = strlen(pcxt->library_name);
+ entrypointstate = shm_toc_allocate(pcxt->toc, lnamelen +
+ strlen(pcxt->function_name) + 2);
+ strcpy(entrypointstate, pcxt->library_name);
+ strcpy(entrypointstate + lnamelen + 1, pcxt->function_name);
+ shm_toc_insert(pcxt->toc, PARALLEL_KEY_ENTRYPOINT, entrypointstate);
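+
+ /*
+ * For example, a built-in parallel query entrypoint is stored as the
+ * 27-byte chunk "postgres\0ParallelQueryMain\0"; the worker recovers
+ * the two names by splitting at the first NUL.
+ */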
+ }
+
+ /* Restore previous memory context. */
+ MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Reinitialize the dynamic shared memory segment for a parallel context such
+ * that we could launch workers for it again.
+ */
+void
+ReinitializeParallelDSM(ParallelContext *pcxt)
+{
+ FixedParallelState *fps;
+
+ /* Wait for any old workers to exit. */
+ if (pcxt->nworkers_launched > 0)
+ {
+ WaitForParallelWorkersToFinish(pcxt);
+ WaitForParallelWorkersToExit(pcxt);
+ pcxt->nworkers_launched = 0;
+ if (pcxt->known_attached_workers)
+ {
+ pfree(pcxt->known_attached_workers);
+ pcxt->known_attached_workers = NULL;
+ pcxt->nknown_attached_workers = 0;
+ }
+ }
+
+ /* Reset a few bits of fixed parallel state to a clean state. */
+ fps = shm_toc_lookup(pcxt->toc, PARALLEL_KEY_FIXED, false);
+ fps->last_xlog_end = 0;
+
+ /* Recreate error queues (if they exist). */
+ if (pcxt->nworkers > 0)
+ {
+ char *error_queue_space;
+ int i;
+
+ error_queue_space =
+ shm_toc_lookup(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE, false);
+ for (i = 0; i < pcxt->nworkers; ++i)
+ {
+ char *start;
+ shm_mq *mq;
+
+ start = error_queue_space + i * PARALLEL_ERROR_QUEUE_SIZE;
+ mq = shm_mq_create(start, PARALLEL_ERROR_QUEUE_SIZE);
+ shm_mq_set_receiver(mq, MyProc);
+ pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL);
+ }
+ }
+}
+
+/*
+ * Reinitialize parallel workers for a parallel context such that we could
+ * launch a different number of workers. This is required for cases where
+ * we need to reuse the same DSM segment, but the number of workers can
+ * vary from run-to-run.
+ */
+void
+ReinitializeParallelWorkers(ParallelContext *pcxt, int nworkers_to_launch)
+{
+ /*
+ * The number of workers that need to be launched must not exceed the
+ * number of workers with which the parallel context is initialized.
+ */
+ Assert(pcxt->nworkers >= nworkers_to_launch);
+ pcxt->nworkers_to_launch = nworkers_to_launch;
+}
+
+/*
+ * Launch parallel workers.
+ */
+void
+LaunchParallelWorkers(ParallelContext *pcxt)
+{
+ MemoryContext oldcontext;
+ BackgroundWorker worker;
+ int i;
+ bool any_registrations_failed = false;
+
+ /* Skip this if we have no workers. */
+ if (pcxt->nworkers == 0 || pcxt->nworkers_to_launch == 0)
+ return;
+
+ /* We need to be a lock group leader. */
+ BecomeLockGroupLeader();
+
+ /* If we do have workers, we'd better have a DSM segment. */
+ Assert(pcxt->seg != NULL);
+
+ /* We might be running in a short-lived memory context. */
+ oldcontext = MemoryContextSwitchTo(TopTransactionContext);
+
+ /* Configure a worker. */
+ memset(&worker, 0, sizeof(worker));
+ snprintf(worker.bgw_name, BGW_MAXLEN, "parallel worker for PID %d",
+ MyProcPid);
+ snprintf(worker.bgw_type, BGW_MAXLEN, "parallel worker");
+ worker.bgw_flags =
+ BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION
+ | BGWORKER_CLASS_PARALLEL;
+ worker.bgw_start_time = BgWorkerStart_ConsistentState;
+ worker.bgw_restart_time = BGW_NEVER_RESTART;
+ sprintf(worker.bgw_library_name, "postgres");
+ sprintf(worker.bgw_function_name, "ParallelWorkerMain");
+ worker.bgw_main_arg = UInt32GetDatum(dsm_segment_handle(pcxt->seg));
+ worker.bgw_notify_pid = MyProcPid;
+
+ /*
+ * Start workers.
+ *
+ * The caller must be able to tolerate ending up with fewer workers than
+ * expected, so there is no need to throw an error here if registration
+ * fails. It wouldn't help much anyway, because registering the worker in
+ * no way guarantees that it will start up and initialize successfully.
+ */
+ for (i = 0; i < pcxt->nworkers_to_launch; ++i)
+ {
+ memcpy(worker.bgw_extra, &i, sizeof(int));
+ if (!any_registrations_failed &&
+ RegisterDynamicBackgroundWorker(&worker,
+ &pcxt->worker[i].bgwhandle))
+ {
+ shm_mq_set_handle(pcxt->worker[i].error_mqh,
+ pcxt->worker[i].bgwhandle);
+ pcxt->nworkers_launched++;
+ }
+ else
+ {
+ /*
+ * If we weren't able to register the worker, then we've bumped up
+ * against the max_worker_processes limit, and future
+ * registrations will probably fail too, so arrange to skip them.
+ * But we still have to execute this code for the remaining slots
+ * to make sure that we forget about the error queues we budgeted
+ * for those workers. Otherwise, we'll wait for them to start,
+ * but they never will.
+ */
+ any_registrations_failed = true;
+ pcxt->worker[i].bgwhandle = NULL;
+ shm_mq_detach(pcxt->worker[i].error_mqh);
+ pcxt->worker[i].error_mqh = NULL;
+ }
+ }
+
+ /*
+ * Now that nworkers_launched has taken its final value, we can initialize
+ * known_attached_workers.
+ */
+ if (pcxt->nworkers_launched > 0)
+ {
+ pcxt->known_attached_workers =
+ palloc0(sizeof(bool) * pcxt->nworkers_launched);
+ pcxt->nknown_attached_workers = 0;
+ }
+
+ /* Restore previous memory context. */
+ MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Wait for all workers to attach to their error queues, and throw an error if
+ * any worker fails to do this.
+ *
+ * Callers can assume that if this function returns successfully, then the
+ * number of workers given by pcxt->nworkers_launched have initialized and
+ * attached to their error queues. Whether or not these workers are guaranteed
+ * to still be running depends on what code the caller asked them to run;
+ * this function does not guarantee that they have not exited. However, it
+ * does guarantee that any workers which exited must have done so cleanly and
+ * after successfully performing the work with which they were tasked.
+ *
+ * If this function is not called, then some of the workers that were launched
+ * may not have been started due to a fork() failure, or may have exited during
+ * early startup prior to attaching to the error queue, so nworkers_launched
+ * cannot be viewed as completely reliable. It will never be less than the
+ * number of workers which actually started, but it might be more. Any workers
+ * that failed to start will still be discovered by
+ * WaitForParallelWorkersToFinish and an error will be thrown at that time,
+ * provided that function is eventually reached.
+ *
+ * In general, the leader process should do as much work as possible before
+ * calling this function. fork() failures and other early-startup failures
+ * are very uncommon, and having the leader sit idle when it could be doing
+ * useful work is undesirable. However, if the leader needs to wait for
+ * all of its workers or for a specific worker, it may want to call this
+ * function before doing so. If not, it must make some other provision for
+ * the failure-to-start case, lest it wait forever. On the other hand, a
+ * leader which never waits for a worker that might not be started yet, or
+ * at least never does so prior to WaitForParallelWorkersToFinish(), need not
+ * call this function at all.
+ */
+void
+WaitForParallelWorkersToAttach(ParallelContext *pcxt)
+{
+ int i;
+
+ /* Skip this if we have no launched workers. */
+ if (pcxt->nworkers_launched == 0)
+ return;
+
+ for (;;)
+ {
+ /*
+ * This will process any parallel messages that are pending and it may
+ * also throw an error propagated from a worker.
+ */
+ CHECK_FOR_INTERRUPTS();
+
+ for (i = 0; i < pcxt->nworkers_launched; ++i)
+ {
+ BgwHandleStatus status;
+ shm_mq *mq;
+ int rc;
+ pid_t pid;
+
+ if (pcxt->known_attached_workers[i])
+ continue;
+
+ /*
+ * If error_mqh is NULL, then the worker has already exited
+ * cleanly.
+ */
+ if (pcxt->worker[i].error_mqh == NULL)
+ {
+ pcxt->known_attached_workers[i] = true;
+ ++pcxt->nknown_attached_workers;
+ continue;
+ }
+
+ status = GetBackgroundWorkerPid(pcxt->worker[i].bgwhandle, &pid);
+ if (status == BGWH_STARTED)
+ {
+ /* Has the worker attached to the error queue? */
+ mq = shm_mq_get_queue(pcxt->worker[i].error_mqh);
+ if (shm_mq_get_sender(mq) != NULL)
+ {
+ /* Yes, so it is known to be attached. */
+ pcxt->known_attached_workers[i] = true;
+ ++pcxt->nknown_attached_workers;
+ }
+ }
+ else if (status == BGWH_STOPPED)
+ {
+ /*
+ * If the worker stopped without attaching to the error queue,
+ * throw an error.
+ */
+ mq = shm_mq_get_queue(pcxt->worker[i].error_mqh);
+ if (shm_mq_get_sender(mq) == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("parallel worker failed to initialize"),
+ errhint("More details may be available in the server log.")));
+
+ pcxt->known_attached_workers[i] = true;
+ ++pcxt->nknown_attached_workers;
+ }
+ else
+ {
+ /*
+ * Worker not yet started, so we must wait. The postmaster
+ * will notify us if the worker's state changes. Our latch
+ * might also get set for some other reason, but if so we'll
+ * just end up waiting for the same worker again.
+ */
+ rc = WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
+ -1, WAIT_EVENT_BGWORKER_STARTUP);
+
+ if (rc & WL_LATCH_SET)
+ ResetLatch(MyLatch);
+ }
+ }
+
+ /* If all workers are known to have started, we're done. */
+ if (pcxt->nknown_attached_workers >= pcxt->nworkers_launched)
+ {
+ Assert(pcxt->nknown_attached_workers == pcxt->nworkers_launched);
+ break;
+ }
+ }
+}
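+
+/*
+ * An illustrative sketch (not part of the original file): a leader that
+ * needs to wait on a specific worker, e.g. on a shared tuple queue, might
+ * call this function first so that a fork() failure cannot leave it waiting
+ * forever.  DoLeaderLocalWork() and WaitForMySpecificWorker() are
+ * hypothetical placeholders:
+ *
+ *		LaunchParallelWorkers(pcxt);
+ *		DoLeaderLocalWork();
+ *		WaitForParallelWorkersToAttach(pcxt);
+ *		WaitForMySpecificWorker(pcxt);
+ */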
+
+/*
+ * Wait for all workers to finish computing.
+ *
+ * Even if the parallel operation seems to have completed successfully, it's
+ * important to call this function afterwards. We must not miss any errors
+ * the workers may have thrown during the parallel operation, or any that they
+ * may yet throw while shutting down.
+ *
+ * Also, we want to update our notion of XactLastRecEnd based on worker
+ * feedback.
+ */
+void
+WaitForParallelWorkersToFinish(ParallelContext *pcxt)
+{
+ for (;;)
+ {
+ bool anyone_alive = false;
+ int nfinished = 0;
+ int i;
+
+ /*
+ * This will process any parallel messages that are pending, which may
+ * change the outcome of the loop that follows. It may also throw an
+ * error propagated from a worker.
+ */
+ CHECK_FOR_INTERRUPTS();
+
+ for (i = 0; i < pcxt->nworkers_launched; ++i)
+ {
+ /*
+ * If error_mqh is NULL, then the worker has already exited
+ * cleanly. If we have received a message through error_mqh from
+ * the worker, we know it started up cleanly, and therefore we're
+ * certain to be notified when it exits.
+ */
+ if (pcxt->worker[i].error_mqh == NULL)
+ ++nfinished;
+ else if (pcxt->known_attached_workers[i])
+ {
+ anyone_alive = true;
+ break;
+ }
+ }
+
+ if (!anyone_alive)
+ {
+ /* If all workers are known to have finished, we're done. */
+ if (nfinished >= pcxt->nworkers_launched)
+ {
+ Assert(nfinished == pcxt->nworkers_launched);
+ break;
+ }
+
+ /*
+ * We didn't detect any living workers, but not all workers are
+ * known to have exited cleanly. Either not all workers have
+ * launched yet, or maybe some of them failed to start or
+ * terminated abnormally.
+ */
+ for (i = 0; i < pcxt->nworkers_launched; ++i)
+ {
+ pid_t pid;
+ shm_mq *mq;
+
+ /*
+ * If the worker is BGWH_NOT_YET_STARTED or BGWH_STARTED, we
+ * should just keep waiting. If it is BGWH_STOPPED, then
+ * further investigation is needed.
+ */
+ if (pcxt->worker[i].error_mqh == NULL ||
+ pcxt->worker[i].bgwhandle == NULL ||
+ GetBackgroundWorkerPid(pcxt->worker[i].bgwhandle,
+ &pid) != BGWH_STOPPED)
+ continue;
+
+ /*
+ * Check whether the worker ended up stopped without ever
+ * attaching to the error queue. If so, the postmaster was
+ * unable to fork the worker or it exited without initializing
+ * properly. We must throw an error, since the caller may
+ * have been expecting the worker to do some work before
+ * exiting.
+ */
+ mq = shm_mq_get_queue(pcxt->worker[i].error_mqh);
+ if (shm_mq_get_sender(mq) == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("parallel worker failed to initialize"),
+ errhint("More details may be available in the server log.")));
+
+ /*
+ * The worker is stopped, but is attached to the error queue.
+ * Unless there's a bug somewhere, this will only happen when
+ * the worker writes messages and terminates after the
+ * CHECK_FOR_INTERRUPTS() near the top of this function and
+ * before the call to GetBackgroundWorkerPid(). In that case,
+			 * our latch should have been set as well and the right things
+ * will happen on the next pass through the loop.
+ */
+ }
+ }
+
+ (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, -1,
+ WAIT_EVENT_PARALLEL_FINISH);
+ ResetLatch(MyLatch);
+ }
+
+ if (pcxt->toc != NULL)
+ {
+ FixedParallelState *fps;
+
+ fps = shm_toc_lookup(pcxt->toc, PARALLEL_KEY_FIXED, false);
+ if (fps->last_xlog_end > XactLastRecEnd)
+ XactLastRecEnd = fps->last_xlog_end;
+ }
+}
+
+/*
+ * Wait for all workers to exit.
+ *
+ * This function ensures that the workers have been completely shut down.  The
+ * difference between WaitForParallelWorkersToFinish and this function is
+ * that the former just ensures that the last message sent by a worker backend
+ * is received by the leader backend, whereas this one ensures a complete
+ * shutdown.
+ */
+static void
+WaitForParallelWorkersToExit(ParallelContext *pcxt)
+{
+ int i;
+
+ /* Wait until the workers actually die. */
+ for (i = 0; i < pcxt->nworkers_launched; ++i)
+ {
+ BgwHandleStatus status;
+
+ if (pcxt->worker == NULL || pcxt->worker[i].bgwhandle == NULL)
+ continue;
+
+ status = WaitForBackgroundWorkerShutdown(pcxt->worker[i].bgwhandle);
+
+ /*
+ * If the postmaster kicked the bucket, we have no chance of cleaning
+ * up safely -- we won't be able to tell when our workers are actually
+ * dead. This doesn't necessitate a PANIC since they will all abort
+ * eventually, but we can't safely continue this session.
+ */
+ if (status == BGWH_POSTMASTER_DIED)
+ ereport(FATAL,
+ (errcode(ERRCODE_ADMIN_SHUTDOWN),
+ errmsg("postmaster exited during a parallel transaction")));
+
+ /* Release memory. */
+ pfree(pcxt->worker[i].bgwhandle);
+ pcxt->worker[i].bgwhandle = NULL;
+ }
+}
+
+/*
+ * Destroy a parallel context.
+ *
+ * If expecting a clean exit, the caller should use
+ * WaitForParallelWorkersToFinish() before calling this function.  When this
+ * function is invoked, any
+ * remaining workers are forcibly killed; the dynamic shared memory segment
+ * is unmapped; and we then wait (uninterruptibly) for the workers to exit.
+ */
+void
+DestroyParallelContext(ParallelContext *pcxt)
+{
+ int i;
+
+ /*
+ * Be careful about order of operations here! We remove the parallel
+ * context from the list before we do anything else; otherwise, if an
+ * error occurs during a subsequent step, we might try to nuke it again
+ * from AtEOXact_Parallel or AtEOSubXact_Parallel.
+ */
+ dlist_delete(&pcxt->node);
+
+ /* Kill each worker in turn, and forget their error queues. */
+ if (pcxt->worker != NULL)
+ {
+ for (i = 0; i < pcxt->nworkers_launched; ++i)
+ {
+ if (pcxt->worker[i].error_mqh != NULL)
+ {
+ TerminateBackgroundWorker(pcxt->worker[i].bgwhandle);
+
+ shm_mq_detach(pcxt->worker[i].error_mqh);
+ pcxt->worker[i].error_mqh = NULL;
+ }
+ }
+ }
+
+ /*
+ * If we have allocated a shared memory segment, detach it. This will
+ * implicitly detach the error queues, and any other shared memory queues,
+ * stored there.
+ */
+ if (pcxt->seg != NULL)
+ {
+ dsm_detach(pcxt->seg);
+ pcxt->seg = NULL;
+ }
+
+ /*
+ * If this parallel context is actually in backend-private memory rather
+ * than shared memory, free that memory instead.
+ */
+ if (pcxt->private_memory != NULL)
+ {
+ pfree(pcxt->private_memory);
+ pcxt->private_memory = NULL;
+ }
+
+ /*
+ * We can't finish transaction commit or abort until all of the workers
+ * have exited. This means, in particular, that we can't respond to
+ * interrupts at this stage.
+ */
+ HOLD_INTERRUPTS();
+ WaitForParallelWorkersToExit(pcxt);
+ RESUME_INTERRUPTS();
+
+ /* Free the worker array itself. */
+ if (pcxt->worker != NULL)
+ {
+ pfree(pcxt->worker);
+ pcxt->worker = NULL;
+ }
+
+ /* Free memory. */
+ pfree(pcxt->library_name);
+ pfree(pcxt->function_name);
+ pfree(pcxt);
+}
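+
+/*
+ * For orientation, a minimal sketch of the leader-side life cycle of a
+ * parallel operation, using the entry points defined in this file (the
+ * library name "my_lib" and entry point "my_worker_main" are hypothetical):
+ *
+ *		EnterParallelMode();
+ *		pcxt = CreateParallelContext("my_lib", "my_worker_main", nworkers);
+ *		... estimate sizes, then InitializeParallelDSM(pcxt) ...
+ *		LaunchParallelWorkers(pcxt);
+ *		... leader participates in the work ...
+ *		WaitForParallelWorkersToFinish(pcxt);
+ *		DestroyParallelContext(pcxt);
+ *		ExitParallelMode();
+ */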
+
+/*
+ * Are there any parallel contexts currently active?
+ */
+bool
+ParallelContextActive(void)
+{
+ return !dlist_is_empty(&pcxt_list);
+}
+
+/*
+ * Handle receipt of an interrupt indicating a parallel worker message.
+ *
+ * Note: this is called within a signal handler! All we can do is set
+ * a flag that will cause the next CHECK_FOR_INTERRUPTS() to invoke
+ * HandleParallelMessages().
+ */
+void
+HandleParallelMessageInterrupt(void)
+{
+ InterruptPending = true;
+ ParallelMessagePending = true;
+ SetLatch(MyLatch);
+}
+
+/*
+ * Handle any queued protocol messages received from parallel workers.
+ */
+void
+HandleParallelMessages(void)
+{
+ dlist_iter iter;
+ MemoryContext oldcontext;
+
+ static MemoryContext hpm_context = NULL;
+
+ /*
+ * This is invoked from ProcessInterrupts(), and since some of the
+ * functions it calls contain CHECK_FOR_INTERRUPTS(), there is a potential
+ * for recursive calls if more signals are received while this runs. It's
+ * unclear that recursive entry would be safe, and it doesn't seem useful
+ * even if it is safe, so let's block interrupts until done.
+ */
+ HOLD_INTERRUPTS();
+
+ /*
+ * Moreover, CurrentMemoryContext might be pointing almost anywhere. We
+ * don't want to risk leaking data into long-lived contexts, so let's do
+ * our work here in a private context that we can reset on each use.
+ */
+ if (hpm_context == NULL) /* first time through? */
+ hpm_context = AllocSetContextCreate(TopMemoryContext,
+ "HandleParallelMessages",
+ ALLOCSET_DEFAULT_SIZES);
+ else
+ MemoryContextReset(hpm_context);
+
+ oldcontext = MemoryContextSwitchTo(hpm_context);
+
+ /* OK to process messages. Reset the flag saying there are more to do. */
+ ParallelMessagePending = false;
+
+ dlist_foreach(iter, &pcxt_list)
+ {
+ ParallelContext *pcxt;
+ int i;
+
+ pcxt = dlist_container(ParallelContext, node, iter.cur);
+ if (pcxt->worker == NULL)
+ continue;
+
+ for (i = 0; i < pcxt->nworkers_launched; ++i)
+ {
+ /*
+ * Read as many messages as we can from each worker, but stop when
+ * either (1) the worker's error queue goes away, which can happen
+ * if we receive a Terminate message from the worker; or (2) no
+ * more messages can be read from the worker without blocking.
+ */
+ while (pcxt->worker[i].error_mqh != NULL)
+ {
+ shm_mq_result res;
+ Size nbytes;
+ void *data;
+
+ res = shm_mq_receive(pcxt->worker[i].error_mqh, &nbytes,
+ &data, true);
+ if (res == SHM_MQ_WOULD_BLOCK)
+ break;
+ else if (res == SHM_MQ_SUCCESS)
+ {
+ StringInfoData msg;
+
+ initStringInfo(&msg);
+ appendBinaryStringInfo(&msg, data, nbytes);
+ HandleParallelMessage(pcxt, i, &msg);
+ pfree(msg.data);
+ }
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("lost connection to parallel worker")));
+ }
+ }
+ }
+
+ MemoryContextSwitchTo(oldcontext);
+
+ /* Might as well clear the context on our way out */
+ MemoryContextReset(hpm_context);
+
+ RESUME_INTERRUPTS();
+}
+
+/*
+ * Handle a single protocol message received from a single parallel worker.
+ */
+static void
+HandleParallelMessage(ParallelContext *pcxt, int i, StringInfo msg)
+{
+ char msgtype;
+
+ if (pcxt->known_attached_workers != NULL &&
+ !pcxt->known_attached_workers[i])
+ {
+ pcxt->known_attached_workers[i] = true;
+ pcxt->nknown_attached_workers++;
+ }
+
+ msgtype = pq_getmsgbyte(msg);
+
+ switch (msgtype)
+ {
+ case 'K': /* BackendKeyData */
+ {
+ int32 pid = pq_getmsgint(msg, 4);
+
+ (void) pq_getmsgint(msg, 4); /* discard cancel key */
+ (void) pq_getmsgend(msg);
+ pcxt->worker[i].pid = pid;
+ break;
+ }
+
+ case 'E': /* ErrorResponse */
+ case 'N': /* NoticeResponse */
+ {
+ ErrorData edata;
+ ErrorContextCallback *save_error_context_stack;
+
+ /* Parse ErrorResponse or NoticeResponse. */
+ pq_parse_errornotice(msg, &edata);
+
+ /* Death of a worker isn't enough justification for suicide. */
+ edata.elevel = Min(edata.elevel, ERROR);
+
+ /*
+ * If desired, add a context line to show that this is a
+ * message propagated from a parallel worker. Otherwise, it
+ * can sometimes be confusing to understand what actually
+ * happened. (We don't do this in FORCE_PARALLEL_REGRESS mode
+ * because it causes test-result instability depending on
+ * whether a parallel worker is actually used or not.)
+ */
+ if (force_parallel_mode != FORCE_PARALLEL_REGRESS)
+ {
+ if (edata.context)
+ edata.context = psprintf("%s\n%s", edata.context,
+ _("parallel worker"));
+ else
+ edata.context = pstrdup(_("parallel worker"));
+ }
+
+ /*
+ * Context beyond that should use the error context callbacks
+ * that were in effect when the ParallelContext was created,
+ * not the current ones.
+ */
+ save_error_context_stack = error_context_stack;
+ error_context_stack = pcxt->error_context_stack;
+
+ /* Rethrow error or print notice. */
+ ThrowErrorData(&edata);
+
+ /* Not an error, so restore previous context stack. */
+ error_context_stack = save_error_context_stack;
+
+ break;
+ }
+
+ case 'A': /* NotifyResponse */
+ {
+ /* Propagate NotifyResponse. */
+ int32 pid;
+ const char *channel;
+ const char *payload;
+
+ pid = pq_getmsgint(msg, 4);
+ channel = pq_getmsgrawstring(msg);
+ payload = pq_getmsgrawstring(msg);
+ pq_endmessage(msg);
+
+ NotifyMyFrontEnd(channel, payload, pid);
+
+ break;
+ }
+
+ case 'X': /* Terminate, indicating clean exit */
+ {
+ shm_mq_detach(pcxt->worker[i].error_mqh);
+ pcxt->worker[i].error_mqh = NULL;
+ break;
+ }
+
+ default:
+ {
+ elog(ERROR, "unrecognized message type received from parallel worker: %c (message length %d bytes)",
+ msgtype, msg->len);
+ }
+ }
+}
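+
+/*
+ * Summary of the worker-to-leader message types handled above, which reuse
+ * a subset of the frontend/backend protocol:
+ *
+ *		'K'	BackendKeyData: reports the worker's PID (cancel key discarded)
+ *		'E'	ErrorResponse:  rethrown in the leader, capped at ERROR
+ *		'N'	NoticeResponse: printed by the leader
+ *		'A'	NotifyResponse: forwarded to the leader's frontend
+ *		'X'	Terminate:      clean exit; the worker's error queue is detached
+ */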
+
+/*
+ * End-of-subtransaction cleanup for parallel contexts.
+ *
+ * Currently, it's forbidden to enter or leave a subtransaction while
+ * parallel mode is in effect, so we could just blow away everything. But
+ * we may want to relax that restriction in the future, so this code
+ * contemplates that there may be multiple subtransaction IDs in pcxt_list.
+ */
+void
+AtEOSubXact_Parallel(bool isCommit, SubTransactionId mySubId)
+{
+ while (!dlist_is_empty(&pcxt_list))
+ {
+ ParallelContext *pcxt;
+
+ pcxt = dlist_head_element(ParallelContext, node, &pcxt_list);
+ if (pcxt->subid != mySubId)
+ break;
+ if (isCommit)
+ elog(WARNING, "leaked parallel context");
+ DestroyParallelContext(pcxt);
+ }
+}
+
+/*
+ * End-of-transaction cleanup for parallel contexts.
+ */
+void
+AtEOXact_Parallel(bool isCommit)
+{
+ while (!dlist_is_empty(&pcxt_list))
+ {
+ ParallelContext *pcxt;
+
+ pcxt = dlist_head_element(ParallelContext, node, &pcxt_list);
+ if (isCommit)
+ elog(WARNING, "leaked parallel context");
+ DestroyParallelContext(pcxt);
+ }
+}
+
+/*
+ * Main entrypoint for parallel workers.
+ */
+void
+ParallelWorkerMain(Datum main_arg)
+{
+ dsm_segment *seg;
+ shm_toc *toc;
+ FixedParallelState *fps;
+ char *error_queue_space;
+ shm_mq *mq;
+ shm_mq_handle *mqh;
+ char *libraryspace;
+ char *entrypointstate;
+ char *library_name;
+ char *function_name;
+ parallel_worker_main_type entrypt;
+ char *gucspace;
+ char *combocidspace;
+ char *tsnapspace;
+ char *asnapspace;
+ char *tstatespace;
+ char *pendingsyncsspace;
+ char *reindexspace;
+ char *relmapperspace;
+ char *uncommittedenumsspace;
+ StringInfoData msgbuf;
+ char *session_dsm_handle_space;
+ Snapshot tsnapshot;
+ Snapshot asnapshot;
+
+ /* Set flag to indicate that we're initializing a parallel worker. */
+ InitializingParallelWorker = true;
+
+ /* Establish signal handlers. */
+ pqsignal(SIGTERM, die);
+ BackgroundWorkerUnblockSignals();
+
+ /* Determine and set our parallel worker number. */
+ Assert(ParallelWorkerNumber == -1);
+ memcpy(&ParallelWorkerNumber, MyBgworkerEntry->bgw_extra, sizeof(int));
+
+ /* Set up a memory context to work in, just for cleanliness. */
+ CurrentMemoryContext = AllocSetContextCreate(TopMemoryContext,
+ "Parallel worker",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /*
+ * Attach to the dynamic shared memory segment for the parallel query, and
+ * find its table of contents.
+ *
+ * Note: at this point, we have not created any ResourceOwner in this
+ * process. This will result in our DSM mapping surviving until process
+ * exit, which is fine. If there were a ResourceOwner, it would acquire
+ * ownership of the mapping, but we have no need for that.
+ */
+ seg = dsm_attach(DatumGetUInt32(main_arg));
+ if (seg == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("could not map dynamic shared memory segment")));
+ toc = shm_toc_attach(PARALLEL_MAGIC, dsm_segment_address(seg));
+ if (toc == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("invalid magic number in dynamic shared memory segment")));
+
+ /* Look up fixed parallel state. */
+ fps = shm_toc_lookup(toc, PARALLEL_KEY_FIXED, false);
+ MyFixedParallelState = fps;
+
+ /* Arrange to signal the leader if we exit. */
+ ParallelLeaderPid = fps->parallel_leader_pid;
+ ParallelLeaderBackendId = fps->parallel_leader_backend_id;
+ before_shmem_exit(ParallelWorkerShutdown, PointerGetDatum(seg));
+
+ /*
+ * Now we can find and attach to the error queue provided for us. That's
+ * good, because until we do that, any errors that happen here will not be
+ * reported back to the process that requested that this worker be
+ * launched.
+ */
+ error_queue_space = shm_toc_lookup(toc, PARALLEL_KEY_ERROR_QUEUE, false);
+ mq = (shm_mq *) (error_queue_space +
+ ParallelWorkerNumber * PARALLEL_ERROR_QUEUE_SIZE);
+ shm_mq_set_sender(mq, MyProc);
+ mqh = shm_mq_attach(mq, seg, NULL);
+ pq_redirect_to_shm_mq(seg, mqh);
+ pq_set_parallel_leader(fps->parallel_leader_pid,
+ fps->parallel_leader_backend_id);
+
+ /*
+ * Send a BackendKeyData message to the process that initiated parallelism
+ * so that it has access to our PID before it receives any other messages
+ * from us. Our cancel key is sent, too, since that's the way the
+ * protocol message is defined, but it won't actually be used for anything
+ * in this case.
+ */
+ pq_beginmessage(&msgbuf, 'K');
+ pq_sendint32(&msgbuf, (int32) MyProcPid);
+ pq_sendint32(&msgbuf, (int32) MyCancelKey);
+ pq_endmessage(&msgbuf);
+
+ /*
+ * Hooray! Primary initialization is complete. Now, we need to set up our
+ * backend-local state to match the original backend.
+ */
+
+ /*
+ * Join locking group. We must do this before anything that could try to
+ * acquire a heavyweight lock, because any heavyweight locks acquired to
+ * this point could block either directly against the parallel group
+ * leader or against some process which in turn waits for a lock that
+ * conflicts with the parallel group leader, causing an undetected
+ * deadlock. (If we can't join the lock group, the leader has gone away,
+ * so just exit quietly.)
+ */
+ if (!BecomeLockGroupMember(fps->parallel_leader_pgproc,
+ fps->parallel_leader_pid))
+ return;
+
+ /*
+ * Restore transaction and statement start-time timestamps. This must
+ * happen before anything that would start a transaction, else asserts in
+ * xact.c will fire.
+ */
+ SetParallelStartTimestamps(fps->xact_ts, fps->stmt_ts);
+
+ /*
+ * Identify the entry point to be called. In theory this could result in
+ * loading an additional library, though most likely the entry point is in
+ * the core backend or in a library we just loaded.
+ */
+ entrypointstate = shm_toc_lookup(toc, PARALLEL_KEY_ENTRYPOINT, false);
+ library_name = entrypointstate;
+ function_name = entrypointstate + strlen(library_name) + 1;
+
+ entrypt = LookupParallelWorkerFunction(library_name, function_name);
+
+ /* Restore database connection. */
+ BackgroundWorkerInitializeConnectionByOid(fps->database_id,
+ fps->authenticated_user_id,
+ 0);
+
+ /*
+ * Set the client encoding to the database encoding, since that is what
+ * the leader will expect.
+ */
+ SetClientEncoding(GetDatabaseEncoding());
+
+ /*
+ * Load libraries that were loaded by original backend. We want to do
+ * this before restoring GUCs, because the libraries might define custom
+ * variables.
+ */
+ libraryspace = shm_toc_lookup(toc, PARALLEL_KEY_LIBRARY, false);
+ StartTransactionCommand();
+ RestoreLibraryState(libraryspace);
+
+ /* Restore GUC values from launching backend. */
+ gucspace = shm_toc_lookup(toc, PARALLEL_KEY_GUC, false);
+ RestoreGUCState(gucspace);
+ CommitTransactionCommand();
+
+ /* Crank up a transaction state appropriate to a parallel worker. */
+ tstatespace = shm_toc_lookup(toc, PARALLEL_KEY_TRANSACTION_STATE, false);
+ StartParallelWorkerTransaction(tstatespace);
+
+ /* Restore combo CID state. */
+ combocidspace = shm_toc_lookup(toc, PARALLEL_KEY_COMBO_CID, false);
+ RestoreComboCIDState(combocidspace);
+
+ /* Attach to the per-session DSM segment and contained objects. */
+ session_dsm_handle_space =
+ shm_toc_lookup(toc, PARALLEL_KEY_SESSION_DSM, false);
+ AttachSession(*(dsm_handle *) session_dsm_handle_space);
+
+ /*
+ * If the transaction isolation level is REPEATABLE READ or SERIALIZABLE,
+ * the leader has serialized the transaction snapshot and we must restore
+ * it. At lower isolation levels, there is no transaction-lifetime
+ * snapshot, but we need TransactionXmin to get set to a value which is
+ * less than or equal to the xmin of every snapshot that will be used by
+ * this worker. The easiest way to accomplish that is to install the
+ * active snapshot as the transaction snapshot. Code running in this
+ * parallel worker might take new snapshots via GetTransactionSnapshot()
+ * or GetLatestSnapshot(), but it shouldn't have any way of acquiring a
+ * snapshot older than the active snapshot.
+ */
+ asnapspace = shm_toc_lookup(toc, PARALLEL_KEY_ACTIVE_SNAPSHOT, false);
+ tsnapspace = shm_toc_lookup(toc, PARALLEL_KEY_TRANSACTION_SNAPSHOT, true);
+ asnapshot = RestoreSnapshot(asnapspace);
+ tsnapshot = tsnapspace ? RestoreSnapshot(tsnapspace) : asnapshot;
+ RestoreTransactionSnapshot(tsnapshot,
+ fps->parallel_leader_pgproc);
+ PushActiveSnapshot(asnapshot);
+
+ /*
+ * We've changed which tuples we can see, and must therefore invalidate
+ * system caches.
+ */
+ InvalidateSystemCaches();
+
+ /*
+ * Restore current role id. Skip verifying whether session user is
+ * allowed to become this role and blindly restore the leader's state for
+ * current role.
+ */
+ SetCurrentRoleId(fps->outer_user_id, fps->is_superuser);
+
+ /* Restore user ID and security context. */
+ SetUserIdAndSecContext(fps->current_user_id, fps->sec_context);
+
+ /* Restore temp-namespace state to ensure search path matches leader's. */
+ SetTempNamespaceState(fps->temp_namespace_id,
+ fps->temp_toast_namespace_id);
+
+ /* Restore pending syncs. */
+ pendingsyncsspace = shm_toc_lookup(toc, PARALLEL_KEY_PENDING_SYNCS,
+ false);
+ RestorePendingSyncs(pendingsyncsspace);
+
+ /* Restore reindex state. */
+ reindexspace = shm_toc_lookup(toc, PARALLEL_KEY_REINDEX_STATE, false);
+ RestoreReindexState(reindexspace);
+
+ /* Restore relmapper state. */
+ relmapperspace = shm_toc_lookup(toc, PARALLEL_KEY_RELMAPPER_STATE, false);
+ RestoreRelationMap(relmapperspace);
+
+ /* Restore uncommitted enums. */
+ uncommittedenumsspace = shm_toc_lookup(toc, PARALLEL_KEY_UNCOMMITTEDENUMS,
+ false);
+ RestoreUncommittedEnums(uncommittedenumsspace);
+
+ /* Attach to the leader's serializable transaction, if SERIALIZABLE. */
+ AttachSerializableXact(fps->serializable_xact_handle);
+
+ /*
+ * We've initialized all of our state now; nothing should change
+ * hereafter.
+ */
+ InitializingParallelWorker = false;
+ EnterParallelMode();
+
+ /*
+ * Time to do the real work: invoke the caller-supplied code.
+ */
+ entrypt(seg, toc);
+
+ /* Must exit parallel mode to pop active snapshot. */
+ ExitParallelMode();
+
+ /* Must pop active snapshot so snapmgr.c doesn't complain. */
+ PopActiveSnapshot();
+
+ /* Shut down the parallel-worker transaction. */
+ EndParallelWorkerTransaction();
+
+ /* Detach from the per-session DSM segment. */
+ DetachSession();
+
+ /* Report success. */
+ pq_putmessage('X', NULL, 0);
+}
+
+/*
+ * Update shared memory with the ending location of the last WAL record we
+ * wrote, if it's greater than the value already stored there.
+ */
+void
+ParallelWorkerReportLastRecEnd(XLogRecPtr last_xlog_end)
+{
+ FixedParallelState *fps = MyFixedParallelState;
+
+ Assert(fps != NULL);
+ SpinLockAcquire(&fps->mutex);
+ if (fps->last_xlog_end < last_xlog_end)
+ fps->last_xlog_end = last_xlog_end;
+ SpinLockRelease(&fps->mutex);
+}
+
+/*
+ * Make sure the leader tries to read from our error queue one more time.
+ * This guards against the case where we exit uncleanly without sending an
+ * ErrorResponse to the leader, for example because some code calls proc_exit
+ * directly.
+ *
+ * Also explicitly detach from dsm segment so that subsystems using
+ * on_dsm_detach() have a chance to send stats before the stats subsystem is
+ * shut down as part of a before_shmem_exit() hook.
+ *
+ * One might think this could instead be solved by carefully ordering the
+ * attaching to dsm segments, so that the pgstats segments get detached later
+ * than the parallel query one.  That turns out not to work, because the
+ * stats hash might need to grow, which can cause new segments to be
+ * allocated, and those would then be detached earlier.
+ */
+static void
+ParallelWorkerShutdown(int code, Datum arg)
+{
+ SendProcSignal(ParallelLeaderPid,
+ PROCSIG_PARALLEL_MESSAGE,
+ ParallelLeaderBackendId);
+
+ dsm_detach((dsm_segment *) DatumGetPointer(arg));
+}
+
+/*
+ * Look up (and possibly load) a parallel worker entry point function.
+ *
+ * For functions contained in the core code, we use library name "postgres"
+ * and consult the InternalParallelWorkers array. External functions are
+ * looked up, and loaded if necessary, using load_external_function().
+ *
+ * The point of this is to pass function names as strings across process
+ * boundaries. We can't pass actual function addresses because of the
+ * possibility that the function has been loaded at a different address
+ * in a different process. This is obviously a hazard for functions in
+ * loadable libraries, but it can happen even for functions in the core code
+ * on platforms using EXEC_BACKEND (e.g., Windows).
+ *
+ * At some point it might be worthwhile to get rid of InternalParallelWorkers[]
+ * in favor of applying load_external_function() for core functions too;
+ * but that raises portability issues that are not worth addressing now.
+ */
+static parallel_worker_main_type
+LookupParallelWorkerFunction(const char *libraryname, const char *funcname)
+{
+ /*
+ * If the function is to be loaded from postgres itself, search the
+ * InternalParallelWorkers array.
+ */
+ if (strcmp(libraryname, "postgres") == 0)
+ {
+ int i;
+
+ for (i = 0; i < lengthof(InternalParallelWorkers); i++)
+ {
+ if (strcmp(InternalParallelWorkers[i].fn_name, funcname) == 0)
+ return InternalParallelWorkers[i].fn_addr;
+ }
+
+ /* We can only reach this by programming error. */
+ elog(ERROR, "internal function \"%s\" not found", funcname);
+ }
+
+ /* Otherwise load from external library. */
+ return (parallel_worker_main_type)
+ load_external_function(libraryname, funcname, true, NULL);
+}
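+
+/*
+ * As an illustration (not part of the original file), an extension-supplied
+ * entry point must match parallel_worker_main_type and is identified to the
+ * leader by library and function name; "my_lib", "my_worker_main", and
+ * MY_SHM_KEY are hypothetical:
+ *
+ *		void
+ *		my_worker_main(dsm_segment *seg, shm_toc *toc)
+ *		{
+ *			void   *mystate = shm_toc_lookup(toc, MY_SHM_KEY, false);
+ *
+ *			... do this worker's share of the computation ...
+ *		}
+ *
+ *		pcxt = CreateParallelContext("my_lib", "my_worker_main", nworkers);
+ */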
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
new file mode 100644
index 0000000..6bb4de3
--- /dev/null
+++ b/src/backend/access/transam/rmgr.c
@@ -0,0 +1,161 @@
+/*
+ * rmgr.c
+ *
+ * Resource managers definition
+ *
+ * src/backend/access/transam/rmgr.c
+ */
+#include "postgres.h"
+
+#include "access/brin_xlog.h"
+#include "access/clog.h"
+#include "access/commit_ts.h"
+#include "access/generic_xlog.h"
+#include "access/ginxlog.h"
+#include "access/gistxlog.h"
+#include "access/hash_xlog.h"
+#include "access/heapam_xlog.h"
+#include "access/multixact.h"
+#include "access/nbtxlog.h"
+#include "access/spgxlog.h"
+#include "access/xact.h"
+#include "access/xlog_internal.h"
+#include "catalog/storage_xlog.h"
+#include "commands/dbcommands_xlog.h"
+#include "commands/sequence.h"
+#include "commands/tablespace.h"
+#include "fmgr.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "replication/decode.h"
+#include "replication/message.h"
+#include "replication/origin.h"
+#include "storage/standby.h"
+#include "utils/builtins.h"
+#include "utils/relmapper.h"
+
+/* must be kept in sync with RmgrData definition in xlog_internal.h */
+#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode) \
+ { name, redo, desc, identify, startup, cleanup, mask, decode },
+
+RmgrData RmgrTable[RM_MAX_ID + 1] = {
+#include "access/rmgrlist.h"
+};
+
+/*
+ * Start up all resource managers.
+ */
+void
+RmgrStartup(void)
+{
+ for (int rmid = 0; rmid <= RM_MAX_ID; rmid++)
+ {
+ if (!RmgrIdExists(rmid))
+ continue;
+
+ if (RmgrTable[rmid].rm_startup != NULL)
+ RmgrTable[rmid].rm_startup();
+ }
+}
+
+/*
+ * Clean up all resource managers.
+ */
+void
+RmgrCleanup(void)
+{
+ for (int rmid = 0; rmid <= RM_MAX_ID; rmid++)
+ {
+ if (!RmgrIdExists(rmid))
+ continue;
+
+ if (RmgrTable[rmid].rm_cleanup != NULL)
+ RmgrTable[rmid].rm_cleanup();
+ }
+}
+
+/*
+ * Emit ERROR when we encounter a record with an RmgrId we don't
+ * recognize.
+ */
+void
+RmgrNotFound(RmgrId rmid)
+{
+ ereport(ERROR, (errmsg("resource manager with ID %d not registered", rmid),
+ errhint("Include the extension module that implements this resource manager in shared_preload_libraries.")));
+}
+
+/*
+ * Register a new custom WAL resource manager.
+ *
+ * Resource manager IDs must be globally unique across all extensions. Refer
+ * to https://wiki.postgresql.org/wiki/CustomWALResourceManagers to reserve a
+ * unique RmgrId for your extension, to avoid conflicts with other extension
+ * developers. During development, use RM_EXPERIMENTAL_ID to avoid needlessly
+ * reserving a new ID.
+ */
+void
+RegisterCustomRmgr(RmgrId rmid, RmgrData *rmgr)
+{
+ if (rmgr->rm_name == NULL || strlen(rmgr->rm_name) == 0)
+ ereport(ERROR, (errmsg("custom resource manager name is invalid"),
+ errhint("Provide a non-empty name for the custom resource manager.")));
+
+ if (!RmgrIdIsCustom(rmid))
+ ereport(ERROR, (errmsg("custom resource manager ID %d is out of range", rmid),
+ errhint("Provide a custom resource manager ID between %d and %d.",
+ RM_MIN_CUSTOM_ID, RM_MAX_CUSTOM_ID)));
+
+ if (!process_shared_preload_libraries_in_progress)
+ ereport(ERROR,
+ (errmsg("failed to register custom resource manager \"%s\" with ID %d", rmgr->rm_name, rmid),
+ errdetail("Custom resource manager must be registered while initializing modules in shared_preload_libraries.")));
+
+ if (RmgrTable[rmid].rm_name != NULL)
+ ereport(ERROR,
+ (errmsg("failed to register custom resource manager \"%s\" with ID %d", rmgr->rm_name, rmid),
+ errdetail("Custom resource manager \"%s\" already registered with the same ID.",
+ RmgrTable[rmid].rm_name)));
+
+ /* check for existing rmgr with the same name */
+ for (int existing_rmid = 0; existing_rmid <= RM_MAX_ID; existing_rmid++)
+ {
+ if (!RmgrIdExists(existing_rmid))
+ continue;
+
+ if (!pg_strcasecmp(RmgrTable[existing_rmid].rm_name, rmgr->rm_name))
+ ereport(ERROR,
+ (errmsg("failed to register custom resource manager \"%s\" with ID %d", rmgr->rm_name, rmid),
+ errdetail("Existing resource manager with ID %d has the same name.", existing_rmid)));
+ }
+
+ /* register it */
+ RmgrTable[rmid] = *rmgr;
+ ereport(LOG,
+ (errmsg("registered custom resource manager \"%s\" with ID %d",
+ rmgr->rm_name, rmid)));
+}
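+
+/*
+ * A minimal sketch of registration from an extension's _PG_init(), which
+ * must run via shared_preload_libraries; the rmgr name and callbacks shown
+ * are hypothetical:
+ *
+ *		static RmgrData my_rmgr = {
+ *			.rm_name = "my_rmgr",
+ *			.rm_redo = my_rmgr_redo,
+ *			.rm_desc = my_rmgr_desc,
+ *			.rm_identify = my_rmgr_identify
+ *		};
+ *
+ *		void
+ *		_PG_init(void)
+ *		{
+ *			RegisterCustomRmgr(RM_EXPERIMENTAL_ID, &my_rmgr);
+ *		}
+ */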
+
+/* SQL SRF showing loaded resource managers */
+Datum
+pg_get_wal_resource_managers(PG_FUNCTION_ARGS)
+{
+#define PG_GET_RESOURCE_MANAGERS_COLS 3
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ Datum values[PG_GET_RESOURCE_MANAGERS_COLS];
+ bool nulls[PG_GET_RESOURCE_MANAGERS_COLS] = {0};
+
+ InitMaterializedSRF(fcinfo, 0);
+
+ for (int rmid = 0; rmid <= RM_MAX_ID; rmid++)
+ {
+ if (!RmgrIdExists(rmid))
+ continue;
+ values[0] = Int32GetDatum(rmid);
+ values[1] = CStringGetTextDatum(GetRmgr(rmid).rm_name);
+ values[2] = BoolGetDatum(RmgrIdIsBuiltin(rmid));
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
+ }
+
+ return (Datum) 0;
+}
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
new file mode 100644
index 0000000..af57fe9
--- /dev/null
+++ b/src/backend/access/transam/slru.c
@@ -0,0 +1,1615 @@
+/*-------------------------------------------------------------------------
+ *
+ * slru.c
+ * Simple LRU buffering for transaction status logfiles
+ *
+ * We use a simple least-recently-used scheme to manage a pool of page
+ * buffers. Under ordinary circumstances we expect that write
+ * traffic will occur mostly to the latest page (and to the just-prior
+ * page, soon after a page transition). Read traffic will probably touch
+ * a larger span of pages, but in any case a fairly small number of page
+ * buffers should be sufficient. So, we just search the buffers using plain
+ * linear search; there's no need for a hashtable or anything fancy.
+ * The management algorithm is straight LRU except that we will never swap
+ * out the latest page (since we know it's going to be hit again eventually).
+ *
+ * We use a control LWLock to protect the shared data structures, plus
+ * per-buffer LWLocks that synchronize I/O for each buffer. The control lock
+ * must be held to examine or modify any shared state. A process that is
+ * reading in or writing out a page buffer does not hold the control lock,
+ * only the per-buffer lock for the buffer it is working on.
+ *
+ * "Holding the control lock" means exclusive lock in all cases except for
+ * SimpleLruReadPage_ReadOnly(); see comments for SlruRecentlyUsed() for
+ * the implications of that.
+ *
+ * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively
+ * before releasing the control lock. The per-buffer lock is released after
+ * completing the I/O, re-acquiring the control lock, and updating the shared
+ * state. (Deadlock is not possible here, because we never try to initiate
+ * I/O when someone else is already doing I/O on the same buffer.)
+ * To wait for I/O to complete, release the control lock, acquire the
+ * per-buffer lock in shared mode, immediately release the per-buffer lock,
+ * reacquire the control lock, and then recheck state (since arbitrary things
+ * could have happened while we didn't have the lock).
+ *
+ * As with the regular buffer manager, it is possible for another process
+ * to re-dirty a page that is currently being written out. This is handled
+ * by re-setting the page's page_dirty flag.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/slru.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "access/slru.h"
+#include "access/transam.h"
+#include "access/xlog.h"
+#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/fd.h"
+#include "storage/shmem.h"
+
+#define SlruFileName(ctl, path, seg) \
+ snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg)
+
+/*
+ * During SimpleLruWriteAll(), we will usually not need to write more than one
+ * or two physical files, but we may need to write several pages per file. We
+ * can consolidate the I/O requests by leaving files open until control returns
+ * to SimpleLruWriteAll(). This data structure remembers which files are open.
+ */
+#define MAX_WRITEALL_BUFFERS 16
+
+typedef struct SlruWriteAllData
+{
+ int num_files; /* # files actually open */
+ int fd[MAX_WRITEALL_BUFFERS]; /* their FD's */
+ int segno[MAX_WRITEALL_BUFFERS]; /* their log seg#s */
+} SlruWriteAllData;
+
+typedef struct SlruWriteAllData *SlruWriteAll;
+
+/*
+ * Populate a file tag describing a segment file. We only use the segment
+ * number, since we can derive everything else we need by having separate
+ * sync handler functions for clog, multixact etc.
+ */
+#define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \
+( \
+ memset(&(a), 0, sizeof(FileTag)), \
+ (a).handler = (xx_handler), \
+ (a).segno = (xx_segno) \
+)
+
+/*
+ * Macro to mark a buffer slot "most recently used". Note multiple evaluation
+ * of arguments!
+ *
+ * The reason for the if-test is that there are often many consecutive
+ * accesses to the same page (particularly the latest page). By suppressing
+ * useless increments of cur_lru_count, we reduce the probability that old
+ * pages' counts will "wrap around" and make them appear recently used.
+ *
+ * We allow this code to be executed concurrently by multiple processes within
+ * SimpleLruReadPage_ReadOnly(). As long as int reads and writes are atomic,
+ * this should not cause any completely-bogus values to enter the computation.
+ * However, it is possible for either cur_lru_count or individual
+ * page_lru_count entries to be "reset" to lower values than they should have,
+ * in case a process is delayed while it executes this macro. With care in
+ * SlruSelectLRUPage(), this does little harm, and in any case the absolute
+ * worst possible consequence is a nonoptimal choice of page to evict. The
+ * gain from allowing concurrent reads of SLRU pages seems worth it.
+ */
+#define SlruRecentlyUsed(shared, slotno) \
+ do { \
+ int new_lru_count = (shared)->cur_lru_count; \
+ if (new_lru_count != (shared)->page_lru_count[slotno]) { \
+ (shared)->cur_lru_count = ++new_lru_count; \
+ (shared)->page_lru_count[slotno] = new_lru_count; \
+ } \
+ } while (0)
+
+/* Saved info for SlruReportIOError */
+typedef enum
+{
+ SLRU_OPEN_FAILED,
+ SLRU_SEEK_FAILED,
+ SLRU_READ_FAILED,
+ SLRU_WRITE_FAILED,
+ SLRU_FSYNC_FAILED,
+ SLRU_CLOSE_FAILED
+} SlruErrorCause;
+
+static SlruErrorCause slru_errcause;
+static int slru_errno;
+
+
+static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
+static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
+static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata);
+static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno);
+static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno,
+ SlruWriteAll fdata);
+static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid);
+static int SlruSelectLRUPage(SlruCtl ctl, int pageno);
+
+static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename,
+ int segpage, void *data);
+static void SlruInternalDeleteSegment(SlruCtl ctl, int segno);
+
+/*
+ * Initialization of shared memory
+ */
+
+Size
+SimpleLruShmemSize(int nslots, int nlsns)
+{
+ Size sz;
+
+ /* we assume nslots isn't so large as to risk overflow */
+ sz = MAXALIGN(sizeof(SlruSharedData));
+ sz += MAXALIGN(nslots * sizeof(char *)); /* page_buffer[] */
+ sz += MAXALIGN(nslots * sizeof(SlruPageStatus)); /* page_status[] */
+ sz += MAXALIGN(nslots * sizeof(bool)); /* page_dirty[] */
+ sz += MAXALIGN(nslots * sizeof(int)); /* page_number[] */
+ sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */
+ sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */
+
+ if (nlsns > 0)
+ sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */
+
+ return BUFFERALIGN(sz) + BLCKSZ * nslots;
+}
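+
+/*
+ * SLRU clients report this size from their own shmem-size functions; for
+ * example, clog.c does essentially this:
+ *
+ *		Size
+ *		CLOGShmemSize(void)
+ *		{
+ *			return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE);
+ *		}
+ */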
+
+/*
+ * Initialize, or attach to, a simple LRU cache in shared memory.
+ *
+ * ctl: address of local (unshared) control structure.
+ * name: name of SLRU. (This is user-visible, pick with care!)
+ * nslots: number of page slots to use.
+ * nlsns: number of LSN groups per page (set to zero if not relevant).
+ * ctllock: LWLock to use to control access to the shared control structure.
+ * subdir: PGDATA-relative subdirectory that will contain the files.
+ * tranche_id: LWLock tranche ID to use for the SLRU's per-buffer LWLocks.
+ * sync_handler: which set of functions to use to handle sync requests
+ */
+void
+SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
+ LWLock *ctllock, const char *subdir, int tranche_id,
+ SyncRequestHandler sync_handler)
+{
+ SlruShared shared;
+ bool found;
+
+ shared = (SlruShared) ShmemInitStruct(name,
+ SimpleLruShmemSize(nslots, nlsns),
+ &found);
+
+ if (!IsUnderPostmaster)
+ {
+ /* Initialize locks and shared memory area */
+ char *ptr;
+ Size offset;
+ int slotno;
+
+ Assert(!found);
+
+ memset(shared, 0, sizeof(SlruSharedData));
+
+ shared->ControlLock = ctllock;
+
+ shared->num_slots = nslots;
+ shared->lsn_groups_per_page = nlsns;
+
+ shared->cur_lru_count = 0;
+
+ /* shared->latest_page_number will be set later */
+
+ shared->slru_stats_idx = pgstat_get_slru_index(name);
+
+ ptr = (char *) shared;
+ offset = MAXALIGN(sizeof(SlruSharedData));
+ shared->page_buffer = (char **) (ptr + offset);
+ offset += MAXALIGN(nslots * sizeof(char *));
+ shared->page_status = (SlruPageStatus *) (ptr + offset);
+ offset += MAXALIGN(nslots * sizeof(SlruPageStatus));
+ shared->page_dirty = (bool *) (ptr + offset);
+ offset += MAXALIGN(nslots * sizeof(bool));
+ shared->page_number = (int *) (ptr + offset);
+ offset += MAXALIGN(nslots * sizeof(int));
+ shared->page_lru_count = (int *) (ptr + offset);
+ offset += MAXALIGN(nslots * sizeof(int));
+
+ /* Initialize LWLocks */
+ shared->buffer_locks = (LWLockPadded *) (ptr + offset);
+ offset += MAXALIGN(nslots * sizeof(LWLockPadded));
+
+ if (nlsns > 0)
+ {
+ shared->group_lsn = (XLogRecPtr *) (ptr + offset);
+ offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
+ }
+
+ ptr += BUFFERALIGN(offset);
+ for (slotno = 0; slotno < nslots; slotno++)
+ {
+ LWLockInitialize(&shared->buffer_locks[slotno].lock,
+ tranche_id);
+
+ shared->page_buffer[slotno] = ptr;
+ shared->page_status[slotno] = SLRU_PAGE_EMPTY;
+ shared->page_dirty[slotno] = false;
+ shared->page_lru_count[slotno] = 0;
+ ptr += BLCKSZ;
+ }
+
+ /* Should fit to estimated shmem size */
+ Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
+ }
+ else
+ Assert(found);
+
+ /*
+ * Initialize the unshared control struct, including directory path. We
+ * assume caller set PagePrecedes.
+ */
+ ctl->shared = shared;
+ ctl->sync_handler = sync_handler;
+ strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir));
+}
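+
+/*
+ * As an illustration, clog.c initializes its SLRU essentially like this
+ * (paraphrased; the buffer count is elided here):
+ *
+ *		SimpleLruInit(XactCtl, "Xact", nbuffers, CLOG_LSNS_PER_PAGE,
+ *					  XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER,
+ *					  SYNC_HANDLER_CLOG);
+ */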
+
+/*
+ * Initialize (or reinitialize) a page to zeroes.
+ *
+ * The page is not actually written, just set up in shared memory.
+ * The slot number of the new page is returned.
+ *
+ * Control lock must be held at entry, and will be held at exit.
+ */
+int
+SimpleLruZeroPage(SlruCtl ctl, int pageno)
+{
+ SlruShared shared = ctl->shared;
+ int slotno;
+
+ /* Find a suitable buffer slot for the page */
+ slotno = SlruSelectLRUPage(ctl, pageno);
+ Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
+ (shared->page_status[slotno] == SLRU_PAGE_VALID &&
+ !shared->page_dirty[slotno]) ||
+ shared->page_number[slotno] == pageno);
+
+ /* Mark the slot as containing this page */
+ shared->page_number[slotno] = pageno;
+ shared->page_status[slotno] = SLRU_PAGE_VALID;
+ shared->page_dirty[slotno] = true;
+ SlruRecentlyUsed(shared, slotno);
+
+ /* Set the buffer to zeroes */
+ MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+
+ /* Set the LSNs for this new page to zero */
+ SimpleLruZeroLSNs(ctl, slotno);
+
+ /* Assume this page is now the latest active page */
+ shared->latest_page_number = pageno;
+
+ /* update the stats counter of zeroed pages */
+ pgstat_count_slru_page_zeroed(shared->slru_stats_idx);
+
+ return slotno;
+}
+
+/*
+ * Zero all the LSNs we store for this slru page.
+ *
+ * This should be called each time we create a new page, and each time we read
+ * in a page from disk into an existing buffer. (Such an old page cannot
+ * have any interesting LSNs, since we'd have flushed them before writing
+ * the page in the first place.)
+ *
+ * This assumes that InvalidXLogRecPtr is bitwise-all-0.
+ */
+static void
+SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
+{
+ SlruShared shared = ctl->shared;
+
+ if (shared->lsn_groups_per_page > 0)
+ MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
+ shared->lsn_groups_per_page * sizeof(XLogRecPtr));
+}
+
+/*
+ * Wait for any active I/O on a page slot to finish. (This does not
+ * guarantee that new I/O hasn't been started before we return, though.
+ * In fact the slot might not even contain the same page anymore.)
+ *
+ * Control lock must be held at entry, and will be held at exit.
+ */
+static void
+SimpleLruWaitIO(SlruCtl ctl, int slotno)
+{
+ SlruShared shared = ctl->shared;
+
+ /* See notes at top of file */
+ LWLockRelease(shared->ControlLock);
+ LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED);
+ LWLockRelease(&shared->buffer_locks[slotno].lock);
+ LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
+
+ /*
+ * If the slot is still in an io-in-progress state, then either someone
+ * already started a new I/O on the slot, or a previous I/O failed and
+ * neglected to reset the page state. That shouldn't happen, really, but
+ * it seems worth a few extra cycles to check and recover from it. We can
+ * cheaply test for failure by seeing if the buffer lock is still held (we
+ * assume that transaction abort would release the lock).
+ */
+ if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
+ shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
+ {
+ if (LWLockConditionalAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED))
+ {
+ /* indeed, the I/O must have failed */
+ if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
+ shared->page_status[slotno] = SLRU_PAGE_EMPTY;
+ else /* write_in_progress */
+ {
+ shared->page_status[slotno] = SLRU_PAGE_VALID;
+ shared->page_dirty[slotno] = true;
+ }
+ LWLockRelease(&shared->buffer_locks[slotno].lock);
+ }
+ }
+}
+
+/*
+ * Find a page in a shared buffer, reading it in if necessary.
+ * The page number must correspond to an already-initialized page.
+ *
+ * If write_ok is true then it is OK to return a page that is in
+ * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure
+ * that modification of the page is safe. If write_ok is false then we
+ * will not return the page until it is not undergoing active I/O.
+ *
+ * The passed-in xid is used only for error reporting, and may be
+ * InvalidTransactionId if no specific xid is associated with the action.
+ *
+ * Return value is the shared-buffer slot number now holding the page.
+ * The buffer's LRU access info is updated.
+ *
+ * Control lock must be held at entry, and will be held at exit.
+ */
+int
+SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
+ TransactionId xid)
+{
+ SlruShared shared = ctl->shared;
+
+ /* Outer loop handles restart if we must wait for someone else's I/O */
+ for (;;)
+ {
+ int slotno;
+ bool ok;
+
+ /* See if page already is in memory; if not, pick victim slot */
+ slotno = SlruSelectLRUPage(ctl, pageno);
+
+ /* Did we find the page in memory? */
+ if (shared->page_number[slotno] == pageno &&
+ shared->page_status[slotno] != SLRU_PAGE_EMPTY)
+ {
+ /*
+ * If page is still being read in, we must wait for I/O. Likewise
+ * if the page is being written and the caller said that's not OK.
+ */
+ if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
+ (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
+ !write_ok))
+ {
+ SimpleLruWaitIO(ctl, slotno);
+ /* Now we must recheck state from the top */
+ continue;
+ }
+ /* Otherwise, it's ready to use */
+ SlruRecentlyUsed(shared, slotno);
+
+ /* update the stats counter of pages found in the SLRU */
+ pgstat_count_slru_page_hit(shared->slru_stats_idx);
+
+ return slotno;
+ }
+
+ /* We found no match; assert we selected a freeable slot */
+ Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
+ (shared->page_status[slotno] == SLRU_PAGE_VALID &&
+ !shared->page_dirty[slotno]));
+
+ /* Mark the slot read-busy */
+ shared->page_number[slotno] = pageno;
+ shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS;
+ shared->page_dirty[slotno] = false;
+
+ /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
+ LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
+
+ /* Release control lock while doing I/O */
+ LWLockRelease(shared->ControlLock);
+
+ /* Do the read */
+ ok = SlruPhysicalReadPage(ctl, pageno, slotno);
+
+ /* Set the LSNs for this newly read-in page to zero */
+ SimpleLruZeroLSNs(ctl, slotno);
+
+ /* Re-acquire control lock and update page state */
+ LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
+
+ Assert(shared->page_number[slotno] == pageno &&
+ shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS &&
+ !shared->page_dirty[slotno]);
+
+ shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY;
+
+ LWLockRelease(&shared->buffer_locks[slotno].lock);
+
+ /* Now it's okay to ereport if we failed */
+ if (!ok)
+ SlruReportIOError(ctl, pageno, xid);
+
+ SlruRecentlyUsed(shared, slotno);
+
+ /* update the stats counter of pages not found in SLRU */
+ pgstat_count_slru_page_read(shared->slru_stats_idx);
+
+ return slotno;
+ }
+}
+
+/*
+ * Find a page in a shared buffer, reading it in if necessary.
+ * The page number must correspond to an already-initialized page.
+ * The caller must intend only read-only access to the page.
+ *
+ * The passed-in xid is used only for error reporting, and may be
+ * InvalidTransactionId if no specific xid is associated with the action.
+ *
+ * Return value is the shared-buffer slot number now holding the page.
+ * The buffer's LRU access info is updated.
+ *
+ * Control lock must NOT be held at entry, but will be held at exit.
+ * It is unspecified whether the lock will be shared or exclusive.
+ */
+int
+SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid)
+{
+ SlruShared shared = ctl->shared;
+ int slotno;
+
+ /* Try to find the page while holding only shared lock */
+ LWLockAcquire(shared->ControlLock, LW_SHARED);
+
+ /* See if page is already in a buffer */
+ for (slotno = 0; slotno < shared->num_slots; slotno++)
+ {
+ if (shared->page_number[slotno] == pageno &&
+ shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
+ shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
+ {
+ /* See comments for SlruRecentlyUsed macro */
+ SlruRecentlyUsed(shared, slotno);
+
+ /* update the stats counter of pages found in the SLRU */
+ pgstat_count_slru_page_hit(shared->slru_stats_idx);
+
+ return slotno;
+ }
+ }
+
+ /* No luck, so switch to normal exclusive lock and do regular read */
+ LWLockRelease(shared->ControlLock);
+ LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
+
+ return SimpleLruReadPage(ctl, pageno, true, xid);
+}
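+
+/*
+ * A sketch of the expected caller pattern (modeled on the transaction-status
+ * lookups in clog.c): the control lock is returned held, so the caller reads
+ * what it needs out of the page buffer and then releases the lock itself.
+ * "byteno" is a hypothetical offset within the page:
+ *
+ *		slotno = SimpleLruReadPage_ReadOnly(ctl, pageno, xid);
+ *		byteptr = ctl->shared->page_buffer[slotno] + byteno;
+ *		... copy out the status bits ...
+ *		LWLockRelease(ctl->shared->ControlLock);
+ */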
+
+/*
+ * Write a page from a shared buffer, if necessary.
+ * Does nothing if the specified slot is not dirty.
+ *
+ * NOTE: only one write attempt is made here. Hence, it is possible that
+ * the page is still dirty at exit (if someone else re-dirtied it during
+ * the write). However, we *do* attempt a fresh write even if the page
+ * is already being written; this is for checkpoints.
+ *
+ * Control lock must be held at entry, and will be held at exit.
+ */
+static void
+SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata)
+{
+ SlruShared shared = ctl->shared;
+ int pageno = shared->page_number[slotno];
+ bool ok;
+
+ /* If a write is in progress, wait for it to finish */
+ while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
+ shared->page_number[slotno] == pageno)
+ {
+ SimpleLruWaitIO(ctl, slotno);
+ }
+
+ /*
+ * Do nothing if page is not dirty, or if buffer no longer contains the
+ * same page we were called for.
+ */
+ if (!shared->page_dirty[slotno] ||
+ shared->page_status[slotno] != SLRU_PAGE_VALID ||
+ shared->page_number[slotno] != pageno)
+ return;
+
+ /*
+ * Mark the slot write-busy, and clear the dirtybit. After this point, a
+ * transaction status update on this page will mark it dirty again.
+ */
+ shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
+ shared->page_dirty[slotno] = false;
+
+ /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
+ LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
+
+ /* Release control lock while doing I/O */
+ LWLockRelease(shared->ControlLock);
+
+ /* Do the write */
+ ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);
+
+ /* If we failed, and we're in a flush, better close the files */
+ if (!ok && fdata)
+ {
+ int i;
+
+ for (i = 0; i < fdata->num_files; i++)
+ CloseTransientFile(fdata->fd[i]);
+ }
+
+ /* Re-acquire control lock and update page state */
+ LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
+
+ Assert(shared->page_number[slotno] == pageno &&
+ shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS);
+
+ /* If we failed to write, mark the page dirty again */
+ if (!ok)
+ shared->page_dirty[slotno] = true;
+
+ shared->page_status[slotno] = SLRU_PAGE_VALID;
+
+ LWLockRelease(&shared->buffer_locks[slotno].lock);
+
+ /* Now it's okay to ereport if we failed */
+ if (!ok)
+ SlruReportIOError(ctl, pageno, InvalidTransactionId);
+
+ /* If part of a checkpoint, count this as a buffer written. */
+ if (fdata)
+ CheckpointStats.ckpt_bufs_written++;
+}
+
+/*
+ * Wrapper around SlruInternalWritePage, for external callers.
+ * fdata is always passed as NULL here.
+ */
+void
+SimpleLruWritePage(SlruCtl ctl, int slotno)
+{
+ SlruInternalWritePage(ctl, slotno, NULL);
+}
+
+/*
+ * Return whether the given page exists on disk.
+ *
+ * A false return means that either the file does not exist, or that it's not
+ * large enough to contain the given page.
+ */
+bool
+SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno)
+{
+ int segno = pageno / SLRU_PAGES_PER_SEGMENT;
+ int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
+ int offset = rpageno * BLCKSZ;
+ char path[MAXPGPATH];
+ int fd;
+ bool result;
+ off_t endpos;
+
+ /* update the stats counter of checked pages */
+ pgstat_count_slru_page_exists(ctl->shared->slru_stats_idx);
+
+ SlruFileName(ctl, path, segno);
+
+ fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
+ if (fd < 0)
+ {
+ /* expected: file doesn't exist */
+ if (errno == ENOENT)
+ return false;
+
+ /* report error normally */
+ slru_errcause = SLRU_OPEN_FAILED;
+ slru_errno = errno;
+ SlruReportIOError(ctl, pageno, 0);
+ }
+
+ if ((endpos = lseek(fd, 0, SEEK_END)) < 0)
+ {
+ slru_errcause = SLRU_SEEK_FAILED;
+ slru_errno = errno;
+ SlruReportIOError(ctl, pageno, 0);
+ }
+
+ result = endpos >= (off_t) (offset + BLCKSZ);
+
+ if (CloseTransientFile(fd) != 0)
+ {
+ slru_errcause = SLRU_CLOSE_FAILED;
+ slru_errno = errno;
+ return false;
+ }
+
+ return result;
+}
+
+/*
+ * Physical read of a (previously existing) page into a buffer slot
+ *
+ * On failure, we cannot just ereport(ERROR) since caller has put state in
+ * shared memory that must be undone. So, we return false and save enough
+ * info in static variables to let SlruReportIOError make the report.
+ *
+ * For now, assume it's not worth keeping a file pointer open across
+ * read/write operations. We could cache one virtual file pointer ...
+ */
+static bool
+SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
+{
+ SlruShared shared = ctl->shared;
+ int segno = pageno / SLRU_PAGES_PER_SEGMENT;
+ int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
+ off_t offset = rpageno * BLCKSZ;
+ char path[MAXPGPATH];
+ int fd;
+
+ SlruFileName(ctl, path, segno);
+
+ /*
+ * In a crash-and-restart situation, it's possible for us to receive
+ * commands to set the commit status of transactions whose bits are in
+ * already-truncated segments of the commit log (see notes in
+ * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case
+ * where the file doesn't exist, and return zeroes instead.
+ */
+ fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
+ if (fd < 0)
+ {
+ if (errno != ENOENT || !InRecovery)
+ {
+ slru_errcause = SLRU_OPEN_FAILED;
+ slru_errno = errno;
+ return false;
+ }
+
+ ereport(LOG,
+ (errmsg("file \"%s\" doesn't exist, reading as zeroes",
+ path)));
+ MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ return true;
+ }
+
+ errno = 0;
+ pgstat_report_wait_start(WAIT_EVENT_SLRU_READ);
+ if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
+ {
+ pgstat_report_wait_end();
+ slru_errcause = SLRU_READ_FAILED;
+ slru_errno = errno;
+ CloseTransientFile(fd);
+ return false;
+ }
+ pgstat_report_wait_end();
+
+ if (CloseTransientFile(fd) != 0)
+ {
+ slru_errcause = SLRU_CLOSE_FAILED;
+ slru_errno = errno;
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Physical write of a page from a buffer slot
+ *
+ * On failure, we cannot just ereport(ERROR) since caller has put state in
+ * shared memory that must be undone. So, we return false and save enough
+ * info in static variables to let SlruReportIOError make the report.
+ *
+ * For now, assume it's not worth keeping a file pointer open across
+ * independent read/write operations. We do batch operations during
+ * SimpleLruWriteAll, though.
+ *
+ * fdata is NULL for a standalone write, pointer to open-file info during
+ * SimpleLruWriteAll.
+ */
+static bool
+SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruWriteAll fdata)
+{
+ SlruShared shared = ctl->shared;
+ int segno = pageno / SLRU_PAGES_PER_SEGMENT;
+ int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
+ off_t offset = rpageno * BLCKSZ;
+ char path[MAXPGPATH];
+ int fd = -1;
+
+ /* update the stats counter of written pages */
+ pgstat_count_slru_page_written(shared->slru_stats_idx);
+
+ /*
+ * Honor the write-WAL-before-data rule, if appropriate, so that we do not
+ * write out data before associated WAL records. This is the same action
+ * performed during FlushBuffer() in the main buffer manager.
+ */
+ if (shared->group_lsn != NULL)
+ {
+ /*
+ * We must determine the largest async-commit LSN for the page. This
+ * is a bit tedious, but since this entire function is a slow path
+ * anyway, it seems better to do this here than to maintain a per-page
+ * LSN variable (which'd need an extra comparison in the
+ * transaction-commit path).
+ */
+ XLogRecPtr max_lsn;
+ int lsnindex,
+ lsnoff;
+
+ lsnindex = slotno * shared->lsn_groups_per_page;
+ max_lsn = shared->group_lsn[lsnindex++];
+ for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
+ {
+ XLogRecPtr this_lsn = shared->group_lsn[lsnindex++];
+
+ if (max_lsn < this_lsn)
+ max_lsn = this_lsn;
+ }
+
+ if (!XLogRecPtrIsInvalid(max_lsn))
+ {
+ /*
+ * As noted above, elog(ERROR) is not acceptable here, so if
+ * XLogFlush were to fail, we must PANIC. This isn't much of a
+ * restriction because XLogFlush runs almost entirely inside a
+ * critical section anyway, but let's make sure.
+ */
+ START_CRIT_SECTION();
+ XLogFlush(max_lsn);
+ END_CRIT_SECTION();
+ }
+ }
+
+ /*
+ * During a WriteAll, we may already have the desired file open.
+ */
+ if (fdata)
+ {
+ int i;
+
+ for (i = 0; i < fdata->num_files; i++)
+ {
+ if (fdata->segno[i] == segno)
+ {
+ fd = fdata->fd[i];
+ break;
+ }
+ }
+ }
+
+ if (fd < 0)
+ {
+ /*
+ * If the file doesn't already exist, we should create it. It is
+ * possible for this to need to happen when writing a page that's not
+ * first in its segment; we assume the OS can cope with that. (Note:
+ * it might seem that it'd be okay to create files only when
+ * SimpleLruZeroPage is called for the first page of a segment.
+ * However, if after a crash and restart the REDO logic elects to
+ * replay the log from a checkpoint before the latest one, then it's
+ * possible that we will get commands to set transaction status of
+ * transactions that have already been truncated from the commit log.
+ * Easiest way to deal with that is to accept references to
+ * nonexistent files here and in SlruPhysicalReadPage.)
+ *
+ * Note: it is possible for more than one backend to be executing this
+ * code simultaneously for different pages of the same file. Hence,
+ * don't use O_EXCL or O_TRUNC or anything like that.
+ */
+ SlruFileName(ctl, path, segno);
+ fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY);
+ if (fd < 0)
+ {
+ slru_errcause = SLRU_OPEN_FAILED;
+ slru_errno = errno;
+ return false;
+ }
+
+ if (fdata)
+ {
+ if (fdata->num_files < MAX_WRITEALL_BUFFERS)
+ {
+ fdata->fd[fdata->num_files] = fd;
+ fdata->segno[fdata->num_files] = segno;
+ fdata->num_files++;
+ }
+ else
+ {
+ /*
+ * In the unlikely event that we exceed MAX_WRITEALL_BUFFERS,
+ * fall back to treating it as a standalone write.
+ */
+ fdata = NULL;
+ }
+ }
+ }
+
+ errno = 0;
+ pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
+ if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
+ {
+ pgstat_report_wait_end();
+ /* if write didn't set errno, assume problem is no disk space */
+ if (errno == 0)
+ errno = ENOSPC;
+ slru_errcause = SLRU_WRITE_FAILED;
+ slru_errno = errno;
+ if (!fdata)
+ CloseTransientFile(fd);
+ return false;
+ }
+ pgstat_report_wait_end();
+
+ /* Queue up a sync request for the checkpointer. */
+ if (ctl->sync_handler != SYNC_HANDLER_NONE)
+ {
+ FileTag tag;
+
+ INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
+ if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
+ {
+ /* No space to enqueue sync request. Do it synchronously. */
+ pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
+ if (pg_fsync(fd) != 0)
+ {
+ pgstat_report_wait_end();
+ slru_errcause = SLRU_FSYNC_FAILED;
+ slru_errno = errno;
+ CloseTransientFile(fd);
+ return false;
+ }
+ pgstat_report_wait_end();
+ }
+ }
+
+ /* Close file, unless part of flush request. */
+ if (!fdata)
+ {
+ if (CloseTransientFile(fd) != 0)
+ {
+ slru_errcause = SLRU_CLOSE_FAILED;
+ slru_errno = errno;
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * Issue the error message after failure of SlruPhysicalReadPage or
+ * SlruPhysicalWritePage. Call this after cleaning up shared-memory state.
+ */
+static void
+SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
+{
+ int segno = pageno / SLRU_PAGES_PER_SEGMENT;
+ int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
+ int offset = rpageno * BLCKSZ;
+ char path[MAXPGPATH];
+
+ SlruFileName(ctl, path, segno);
+ errno = slru_errno;
+ switch (slru_errcause)
+ {
+ case SLRU_OPEN_FAILED:
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not access status of transaction %u", xid),
+ errdetail("Could not open file \"%s\": %m.", path)));
+ break;
+ case SLRU_SEEK_FAILED:
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not access status of transaction %u", xid),
+ errdetail("Could not seek in file \"%s\" to offset %d: %m.",
+ path, offset)));
+ break;
+ case SLRU_READ_FAILED:
+ if (errno)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not access status of transaction %u", xid),
+ errdetail("Could not read from file \"%s\" at offset %d: %m.",
+ path, offset)));
+ else
+ ereport(ERROR,
+ (errmsg("could not access status of transaction %u", xid),
+ errdetail("Could not read from file \"%s\" at offset %d: read too few bytes.", path, offset)));
+ break;
+ case SLRU_WRITE_FAILED:
+ if (errno)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not access status of transaction %u", xid),
+ errdetail("Could not write to file \"%s\" at offset %d: %m.",
+ path, offset)));
+ else
+ ereport(ERROR,
+ (errmsg("could not access status of transaction %u", xid),
+ errdetail("Could not write to file \"%s\" at offset %d: wrote too few bytes.",
+ path, offset)));
+ break;
+ case SLRU_FSYNC_FAILED:
+ ereport(data_sync_elevel(ERROR),
+ (errcode_for_file_access(),
+ errmsg("could not access status of transaction %u", xid),
+ errdetail("Could not fsync file \"%s\": %m.",
+ path)));
+ break;
+ case SLRU_CLOSE_FAILED:
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not access status of transaction %u", xid),
+ errdetail("Could not close file \"%s\": %m.",
+ path)));
+ break;
+ default:
+ /* can't get here, we trust */
+ elog(ERROR, "unrecognized SimpleLru error cause: %d",
+ (int) slru_errcause);
+ break;
+ }
+}
+
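+/*
+ * Illustrative sketch (not part of the call graph above): the physical-I/O
+ * helpers follow the protocol "record the cause in slru_errcause/slru_errno
+ * and return false; the caller undoes its shared-memory state and only then
+ * reports".  A caller therefore looks roughly like this, with the cleanup
+ * step left abstract:
+ *
+ *		if (!SlruPhysicalReadPage(ctl, pageno, slotno))
+ *		{
+ *			... undo the in-progress state left in shared memory ...
+ *			SlruReportIOError(ctl, pageno, xid);
+ *		}
+ */
+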
+/*
+ * Select the slot to re-use when we need a free slot.
+ *
+ * The target page number is passed because we need to consider the
+ * possibility that some other process reads in the target page while
+ * we are doing I/O to free a slot. Hence, check or recheck to see if
+ * any slot already holds the target page, and return that slot if so.
+ * Thus, the returned slot is *either* a slot already holding the pageno
+ * (could be any state except EMPTY), *or* a freeable slot (state EMPTY
+ * or CLEAN).
+ *
+ * Control lock must be held at entry, and will be held at exit.
+ */
+static int
+SlruSelectLRUPage(SlruCtl ctl, int pageno)
+{
+ SlruShared shared = ctl->shared;
+
+ /* Outer loop handles restart after I/O */
+ for (;;)
+ {
+ int slotno;
+ int cur_count;
+ int bestvalidslot = 0; /* keep compiler quiet */
+ int best_valid_delta = -1;
+ int best_valid_page_number = 0; /* keep compiler quiet */
+ int bestinvalidslot = 0; /* keep compiler quiet */
+ int best_invalid_delta = -1;
+ int best_invalid_page_number = 0; /* keep compiler quiet */
+
+ /* See if page already has a buffer assigned */
+ for (slotno = 0; slotno < shared->num_slots; slotno++)
+ {
+ if (shared->page_number[slotno] == pageno &&
+ shared->page_status[slotno] != SLRU_PAGE_EMPTY)
+ return slotno;
+ }
+
+ /*
+ * If we find any EMPTY slot, just select that one. Else choose a
+ * victim page to replace. We normally take the least recently used
+ * valid page, but we will never take the slot containing
+ * latest_page_number, even if it appears least recently used. We
+ * will select a slot that is already I/O busy only if there is no
+ * other choice: a read-busy slot will not be least recently used once
+ * the read finishes, and waiting for an I/O on a write-busy slot is
+ * inferior to just picking some other slot. Testing shows the slot
+ * we pick instead will often be clean, allowing us to begin a read at
+ * once.
+ *
+ * Normally the page_lru_count values will all be different and so
+ * there will be a well-defined LRU page. But since we allow
+ * concurrent execution of SlruRecentlyUsed() within
+ * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
+ * acquire the same lru_count values. In that case we break ties by
+ * choosing the furthest-back page.
+ *
+ * Notice that this next line forcibly advances cur_lru_count to a
+ * value that is certainly beyond any value that will be in the
+ * page_lru_count array after the loop finishes. This ensures that
+ * the next execution of SlruRecentlyUsed will mark the page newly
+ * used, even if it's for a page that has the current counter value.
+ * That gets us back on the path to having good data when there are
+ * multiple pages with the same lru_count.
+ */
+ cur_count = (shared->cur_lru_count)++;
+ for (slotno = 0; slotno < shared->num_slots; slotno++)
+ {
+ int this_delta;
+ int this_page_number;
+
+ if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
+ return slotno;
+ this_delta = cur_count - shared->page_lru_count[slotno];
+ if (this_delta < 0)
+ {
+ /*
+ * Clean up in case shared updates have caused cur_count
+ * increments to get "lost". We back off the page counts,
+ * rather than trying to increase cur_count, to avoid any
+ * question of infinite loops or failure in the presence of
+ * wrapped-around counts.
+ */
+ shared->page_lru_count[slotno] = cur_count;
+ this_delta = 0;
+ }
+ this_page_number = shared->page_number[slotno];
+ if (this_page_number == shared->latest_page_number)
+ continue;
+ if (shared->page_status[slotno] == SLRU_PAGE_VALID)
+ {
+ if (this_delta > best_valid_delta ||
+ (this_delta == best_valid_delta &&
+ ctl->PagePrecedes(this_page_number,
+ best_valid_page_number)))
+ {
+ bestvalidslot = slotno;
+ best_valid_delta = this_delta;
+ best_valid_page_number = this_page_number;
+ }
+ }
+ else
+ {
+ if (this_delta > best_invalid_delta ||
+ (this_delta == best_invalid_delta &&
+ ctl->PagePrecedes(this_page_number,
+ best_invalid_page_number)))
+ {
+ bestinvalidslot = slotno;
+ best_invalid_delta = this_delta;
+ best_invalid_page_number = this_page_number;
+ }
+ }
+ }
+
+ /*
+ * If all pages (except possibly the latest one) are I/O busy, we'll
+ * have to wait for an I/O to complete and then retry. In that
+ * unhappy case, we choose to wait for the I/O on the least recently
+ * used slot, on the assumption that it was likely initiated first of
+ * all the I/Os in progress and may therefore finish first.
+ */
+ if (best_valid_delta < 0)
+ {
+ SimpleLruWaitIO(ctl, bestinvalidslot);
+ continue;
+ }
+
+ /*
+ * If the selected page is clean, we're set.
+ */
+ if (!shared->page_dirty[bestvalidslot])
+ return bestvalidslot;
+
+ /*
+ * Write the page.
+ */
+ SlruInternalWritePage(ctl, bestvalidslot, NULL);
+
+ /*
+ * Now loop back and try again. This is the easiest way of dealing
+ * with corner cases such as the victim page being re-dirtied while we
+ * wrote it.
+ */
+ }
+}
+
+/*
+ * Write dirty pages to disk during checkpoint or database shutdown. Flushing
+ * is deferred until the next call to ProcessSyncRequests(), though we do fsync
+ * the containing directory here to make sure that newly created directory
+ * entries are on disk.
+ */
+void
+SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
+{
+ SlruShared shared = ctl->shared;
+ SlruWriteAllData fdata;
+ int slotno;
+ int pageno = 0;
+ int i;
+ bool ok;
+
+ /* update the stats counter of flushes */
+ pgstat_count_slru_flush(shared->slru_stats_idx);
+
+ /*
+ * Find and write dirty pages
+ */
+ fdata.num_files = 0;
+
+ LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
+
+ for (slotno = 0; slotno < shared->num_slots; slotno++)
+ {
+ SlruInternalWritePage(ctl, slotno, &fdata);
+
+ /*
+ * In some places (e.g. checkpoints), we cannot assert that the slot
+ * is clean now, since another process might have re-dirtied it
+ * already. That's okay.
+ */
+ Assert(allow_redirtied ||
+ shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
+ (shared->page_status[slotno] == SLRU_PAGE_VALID &&
+ !shared->page_dirty[slotno]));
+ }
+
+ LWLockRelease(shared->ControlLock);
+
+ /*
+ * Now close any files that were open
+ */
+ ok = true;
+ for (i = 0; i < fdata.num_files; i++)
+ {
+ if (CloseTransientFile(fdata.fd[i]) != 0)
+ {
+ slru_errcause = SLRU_CLOSE_FAILED;
+ slru_errno = errno;
+ pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
+ ok = false;
+ }
+ }
+ if (!ok)
+ SlruReportIOError(ctl, pageno, InvalidTransactionId);
+
+ /* Ensure that directory entries for new files are on disk. */
+ if (ctl->sync_handler != SYNC_HANDLER_NONE)
+ fsync_fname(ctl->Dir, true);
+}
+
+/*
+ * Remove all segments before the one holding the passed page number
+ *
+ * All SLRUs prevent concurrent calls to this function, either with an LWLock
+ * or by calling it only as part of a checkpoint. Mutual exclusion must begin
+ * before computing cutoffPage. Mutual exclusion must end after any limit
+ * update that would permit other backends to write fresh data into the
+ * segment immediately preceding the one containing cutoffPage. Otherwise,
+ * when the SLRU is quite full, SimpleLruTruncate() might delete that segment
+ * after it has accrued freshly-written data.
+ */
+void
+SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
+{
+ SlruShared shared = ctl->shared;
+ int slotno;
+
+ /* update the stats counter of truncates */
+ pgstat_count_slru_truncate(shared->slru_stats_idx);
+
+ /*
+ * Scan shared memory and remove any pages preceding the cutoff page, to
+ * ensure we won't rewrite them later. (Since this is normally called in
+ * or just after a checkpoint, any dirty pages should have been flushed
+ * already ... we're just being extra careful here.)
+ */
+ LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
+
+restart:;
+
+ /*
+ * While we are holding the lock, make an important safety check: the
+ * current endpoint page must not be eligible for removal.
+ */
+ if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage))
+ {
+ LWLockRelease(shared->ControlLock);
+ ereport(LOG,
+ (errmsg("could not truncate directory \"%s\": apparent wraparound",
+ ctl->Dir)));
+ return;
+ }
+
+ for (slotno = 0; slotno < shared->num_slots; slotno++)
+ {
+ if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
+ continue;
+ if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage))
+ continue;
+
+ /*
+ * If page is clean, just change state to EMPTY (expected case).
+ */
+ if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
+ !shared->page_dirty[slotno])
+ {
+ shared->page_status[slotno] = SLRU_PAGE_EMPTY;
+ continue;
+ }
+
+ /*
+ * Hmm, we have (or may have) I/O operations acting on the page, so
+ * we've got to wait for them to finish and then start again. This is
+ * the same logic as in SlruSelectLRUPage. (XXX if page is dirty,
+ * wouldn't it be OK to just discard it without writing it?
+ * SlruMayDeleteSegment() uses a stricter qualification, so we might
+ * not delete this page in the end; even if we don't delete it, we
+ * won't have cause to read its data again. For now, keep the logic
+ * the same as it was.)
+ */
+ if (shared->page_status[slotno] == SLRU_PAGE_VALID)
+ SlruInternalWritePage(ctl, slotno, NULL);
+ else
+ SimpleLruWaitIO(ctl, slotno);
+ goto restart;
+ }
+
+ LWLockRelease(shared->ControlLock);
+
+ /* Now we can remove the old segment(s) */
+ (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage);
+}
+
+/*
+ * Delete an individual SLRU segment.
+ *
+ * NB: This does not touch the SLRU buffers themselves, callers have to ensure
+ * they either can't yet contain anything, or have already been cleaned out.
+ */
+static void
+SlruInternalDeleteSegment(SlruCtl ctl, int segno)
+{
+ char path[MAXPGPATH];
+
+ /* Forget any fsync requests queued for this segment. */
+ if (ctl->sync_handler != SYNC_HANDLER_NONE)
+ {
+ FileTag tag;
+
+ INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
+ RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true);
+ }
+
+ /* Unlink the file. */
+ SlruFileName(ctl, path, segno);
+ ereport(DEBUG2, (errmsg_internal("removing file \"%s\"", path)));
+ unlink(path);
+}
+
+/*
+ * Delete an individual SLRU segment, identified by the segment number.
+ */
+void
+SlruDeleteSegment(SlruCtl ctl, int segno)
+{
+ SlruShared shared = ctl->shared;
+ int slotno;
+ bool did_write;
+
+ /* Clean out any possibly existing references to the segment. */
+ LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
+restart:
+ did_write = false;
+ for (slotno = 0; slotno < shared->num_slots; slotno++)
+ {
+ int pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT;
+
+ if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
+ continue;
+
+ /* not the segment we're looking for */
+ if (pagesegno != segno)
+ continue;
+
+ /* If page is clean, just change state to EMPTY (expected case). */
+ if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
+ !shared->page_dirty[slotno])
+ {
+ shared->page_status[slotno] = SLRU_PAGE_EMPTY;
+ continue;
+ }
+
+ /* Same logic as SimpleLruTruncate() */
+ if (shared->page_status[slotno] == SLRU_PAGE_VALID)
+ SlruInternalWritePage(ctl, slotno, NULL);
+ else
+ SimpleLruWaitIO(ctl, slotno);
+
+ did_write = true;
+ }
+
+ /*
+ * Be extra careful and re-check. The IO functions release the control
+ * lock, so new pages could have been read in.
+ */
+ if (did_write)
+ goto restart;
+
+ SlruInternalDeleteSegment(ctl, segno);
+
+ LWLockRelease(shared->ControlLock);
+}
+
+/*
+ * Determine whether a segment is okay to delete.
+ *
+ * segpage is the first page of the segment, and cutoffPage is the oldest (in
+ * PagePrecedes order) page in the SLRU containing still-useful data. Since
+ * every core PagePrecedes callback implements "wrap around", check the
+ * segment's first and last pages:
+ *
+ * first<cutoff && last<cutoff: yes
+ * first<cutoff && last>=cutoff: no; cutoff falls inside this segment
+ * first>=cutoff && last<cutoff: no; wrap point falls inside this segment
+ * first>=cutoff && last>=cutoff: no; every page of this segment is too young
+ */
+static bool
+SlruMayDeleteSegment(SlruCtl ctl, int segpage, int cutoffPage)
+{
+ int seg_last_page = segpage + SLRU_PAGES_PER_SEGMENT - 1;
+
+ Assert(segpage % SLRU_PAGES_PER_SEGMENT == 0);
+
+ return (ctl->PagePrecedes(segpage, cutoffPage) &&
+ ctl->PagePrecedes(seg_last_page, cutoffPage));
+}
+
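+/*
+ * Worked example (a sketch with invented numbers, taking the usual 32 pages
+ * per segment): a segment whose first page is 64 has last page 95.  With
+ * cutoffPage = 80, the first test passes (64 precedes 80) but the second
+ * fails (95 does not precede 80), so the segment is kept: the cutoff falls
+ * inside it.  Only when both its first and last pages precede the cutoff is
+ * it safe to unlink.
+ */
+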
+#ifdef USE_ASSERT_CHECKING
+static void
+SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset)
+{
+ TransactionId lhs,
+ rhs;
+ int newestPage,
+ oldestPage;
+ TransactionId newestXact,
+ oldestXact;
+
+ /*
+ * Compare an XID pair having undefined order (see RFC 1982), a pair at
+ * "opposite ends" of the XID space. TransactionIdPrecedes() treats each
+ * as preceding the other. If RHS is oldestXact, LHS is the first XID we
+ * must not assign.
+ */
+ lhs = per_page + offset; /* skip first page to avoid non-normal XIDs */
+ rhs = lhs + (1U << 31);
+ Assert(TransactionIdPrecedes(lhs, rhs));
+ Assert(TransactionIdPrecedes(rhs, lhs));
+ Assert(!TransactionIdPrecedes(lhs - 1, rhs));
+ Assert(TransactionIdPrecedes(rhs, lhs - 1));
+ Assert(TransactionIdPrecedes(lhs + 1, rhs));
+ Assert(!TransactionIdPrecedes(rhs, lhs + 1));
+ Assert(!TransactionIdFollowsOrEquals(lhs, rhs));
+ Assert(!TransactionIdFollowsOrEquals(rhs, lhs));
+ Assert(!ctl->PagePrecedes(lhs / per_page, lhs / per_page));
+ Assert(!ctl->PagePrecedes(lhs / per_page, rhs / per_page));
+ Assert(!ctl->PagePrecedes(rhs / per_page, lhs / per_page));
+ Assert(!ctl->PagePrecedes((lhs - per_page) / per_page, rhs / per_page));
+ Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page));
+ Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page));
+ Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page)
+ || (1U << 31) % per_page != 0); /* See CommitTsPagePrecedes() */
+ Assert(ctl->PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page)
+ || (1U << 31) % per_page != 0);
+ Assert(ctl->PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page));
+ Assert(ctl->PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page));
+ Assert(!ctl->PagePrecedes(rhs / per_page, (lhs + per_page) / per_page));
+
+ /*
+ * GetNewTransactionId() has assigned the last XID it can safely use, and
+ * that XID is in the *LAST* page of the second segment. We must not
+ * delete that segment.
+ */
+ newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1;
+ newestXact = newestPage * per_page + offset;
+ Assert(newestXact / per_page == newestPage);
+ oldestXact = newestXact + 1;
+ oldestXact -= 1U << 31;
+ oldestPage = oldestXact / per_page;
+ Assert(!SlruMayDeleteSegment(ctl,
+ (newestPage -
+ newestPage % SLRU_PAGES_PER_SEGMENT),
+ oldestPage));
+
+ /*
+ * GetNewTransactionId() has assigned the last XID it can safely use, and
+ * that XID is in the *FIRST* page of the second segment. We must not
+ * delete that segment.
+ */
+ newestPage = SLRU_PAGES_PER_SEGMENT;
+ newestXact = newestPage * per_page + offset;
+ Assert(newestXact / per_page == newestPage);
+ oldestXact = newestXact + 1;
+ oldestXact -= 1U << 31;
+ oldestPage = oldestXact / per_page;
+ Assert(!SlruMayDeleteSegment(ctl,
+ (newestPage -
+ newestPage % SLRU_PAGES_PER_SEGMENT),
+ oldestPage));
+}
+
+/*
+ * Unit-test a PagePrecedes function.
+ *
+ * This assumes every uint32 >= FirstNormalTransactionId is a valid key. It
+ * assumes each value occupies a contiguous, fixed-size region of SLRU bytes.
+ * (MultiXactMemberCtl separates flags from XIDs. NotifyCtl has
+ * variable-length entries, no keys, and no random access. These unit tests
+ * do not apply to them.)
+ */
+void
+SlruPagePrecedesUnitTests(SlruCtl ctl, int per_page)
+{
+ /* Test first, middle and last entries of a page. */
+ SlruPagePrecedesTestOffset(ctl, per_page, 0);
+ SlruPagePrecedesTestOffset(ctl, per_page, per_page / 2);
+ SlruPagePrecedesTestOffset(ctl, per_page, per_page - 1);
+}
+#endif
+
+/*
+ * SlruScanDirectory callback
+ * This callback reports true if there's any segment wholly prior to the
+ * one containing the page passed as "data".
+ */
+bool
+SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int segpage, void *data)
+{
+ int cutoffPage = *(int *) data;
+
+ if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
+ return true; /* found one; don't iterate any more */
+
+ return false; /* keep going */
+}
+
+/*
+ * SlruScanDirectory callback.
+ * This callback deletes segments prior to the one passed in as "data".
+ */
+static bool
+SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data)
+{
+ int cutoffPage = *(int *) data;
+
+ if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
+ SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT);
+
+ return false; /* keep going */
+}
+
+/*
+ * SlruScanDirectory callback.
+ * This callback deletes all segments.
+ */
+bool
+SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, void *data)
+{
+ SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT);
+
+ return false; /* keep going */
+}
+
+/*
+ * Scan the SimpleLru directory and apply a callback to each file found in it.
+ *
+ * If the callback returns true, the scan is stopped. The last return value
+ * from the callback is returned.
+ *
+ * The callback receives the following arguments: 1. the SlruCtl struct for the
+ * slru being truncated; 2. the filename being considered; 3. the page number
+ * for the first page of that file; 4. a pointer to the opaque data given to us
+ * by the caller.
+ *
+ * Note that the ordering in which the directory is scanned is not guaranteed.
+ *
+ * Note that no locking is applied.
+ */
+bool
+SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
+{
+ bool retval = false;
+ DIR *cldir;
+ struct dirent *clde;
+ int segno;
+ int segpage;
+
+ cldir = AllocateDir(ctl->Dir);
+ while ((clde = ReadDir(cldir, ctl->Dir)) != NULL)
+ {
+ size_t len;
+
+ len = strlen(clde->d_name);
+
+ if ((len == 4 || len == 5 || len == 6) &&
+ strspn(clde->d_name, "0123456789ABCDEF") == len)
+ {
+ segno = (int) strtol(clde->d_name, NULL, 16);
+ segpage = segno * SLRU_PAGES_PER_SEGMENT;
+
+ elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s",
+ ctl->Dir, clde->d_name);
+ retval = callback(ctl, clde->d_name, segpage, data);
+ if (retval)
+ break;
+ }
+ }
+ FreeDir(cldir);
+
+ return retval;
+}
+
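+/*
+ * For orientation (an illustration, not new behavior): SlruFileName formats
+ * segment numbers with "%04X", so segment 18 becomes "0012", and very large
+ * segment numbers grow to five or six hex digits, which is why the filter
+ * above accepts names of length 4, 5, or 6.
+ */
+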
+/*
+ * Individual SLRUs (clog, ...) have to provide a sync.c handler function so
+ * that they can provide the correct "SlruCtl" (otherwise we don't know how to
+ * build the path), but they just forward to this common implementation that
+ * performs the fsync.
+ */
+int
+SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
+{
+ int fd;
+ int save_errno;
+ int result;
+
+ SlruFileName(ctl, path, ftag->segno);
+
+ fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
+ if (fd < 0)
+ return -1;
+
+ pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC);
+ result = pg_fsync(fd);
+ pgstat_report_wait_end();
+ save_errno = errno;
+
+ CloseTransientFile(fd);
+
+ errno = save_errno;
+ return result;
+}
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
new file mode 100644
index 0000000..66d3548
--- /dev/null
+++ b/src/backend/access/transam/subtrans.c
@@ -0,0 +1,374 @@
+/*-------------------------------------------------------------------------
+ *
+ * subtrans.c
+ * PostgreSQL subtransaction-log manager
+ *
+ * The pg_subtrans manager is a pg_xact-like manager that stores the parent
+ * transaction Id for each transaction. It is a fundamental part of the
+ * nested transactions implementation. A main transaction has a parent
+ * of InvalidTransactionId, and each subtransaction has its immediate parent.
+ * The tree can easily be walked from child to parent, but not in the
+ * opposite direction.
+ *
+ * This code is based on clog.c, but the robustness requirements
+ * are completely different from pg_xact, because we only need to remember
+ * pg_subtrans information for currently-open transactions. Thus, there is
+ * no need to preserve data over a crash and restart.
+ *
+ * There are no XLOG interactions since we do not care about preserving
+ * data across crashes. During database startup, we simply force the
+ * currently-active page of SUBTRANS to zeroes.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/subtrans.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/slru.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "pg_trace.h"
+#include "utils/snapmgr.h"
+
+
+/*
+ * Defines for SubTrans page sizes. A page is the same BLCKSZ as is used
+ * everywhere else in Postgres.
+ *
+ * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
+ * SubTrans page numbering also wraps around at
+ * 0xFFFFFFFF/SUBTRANS_XACTS_PER_PAGE, and segment numbering at
+ * 0xFFFFFFFF/SUBTRANS_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no
+ * explicit notice of that fact in this module, except when comparing segment
+ * and page numbers in TruncateSUBTRANS (see SubTransPagePrecedes) and zeroing
+ * them in StartupSUBTRANS.
+ */
+
+/* We need four bytes per xact */
+#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId))
+
+#define TransactionIdToPage(xid) ((xid) / (TransactionId) SUBTRANS_XACTS_PER_PAGE)
+#define TransactionIdToEntry(xid) ((xid) % (TransactionId) SUBTRANS_XACTS_PER_PAGE)
+
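+/*
+ * Worked example (a sketch, assuming the default BLCKSZ of 8192): four-byte
+ * TransactionIds give 8192 / 4 = 2048 entries per page, so xid 100000 maps
+ * to page 100000 / 2048 = 48 and entry 100000 % 2048 = 1696.
+ */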
+
+/*
+ * Link to shared-memory data structures for SUBTRANS control
+ */
+static SlruCtlData SubTransCtlData;
+
+#define SubTransCtl (&SubTransCtlData)
+
+
+static int ZeroSUBTRANSPage(int pageno);
+static bool SubTransPagePrecedes(int page1, int page2);
+
+
+/*
+ * Record the parent of a subtransaction in the subtrans log.
+ */
+void
+SubTransSetParent(TransactionId xid, TransactionId parent)
+{
+ int pageno = TransactionIdToPage(xid);
+ int entryno = TransactionIdToEntry(xid);
+ int slotno;
+ TransactionId *ptr;
+
+ Assert(TransactionIdIsValid(parent));
+ Assert(TransactionIdFollows(xid, parent));
+
+ LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE);
+
+ slotno = SimpleLruReadPage(SubTransCtl, pageno, true, xid);
+ ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr += entryno;
+
+ /*
+ * It's possible we'll try to set the parent xid multiple times but we
+ * shouldn't ever be changing the xid from one valid xid to another valid
+ * xid, which would corrupt the data structure.
+ */
+ if (*ptr != parent)
+ {
+ Assert(*ptr == InvalidTransactionId);
+ *ptr = parent;
+ SubTransCtl->shared->page_dirty[slotno] = true;
+ }
+
+ LWLockRelease(SubtransSLRULock);
+}
+
+/*
+ * Interrogate the parent of a transaction in the subtrans log.
+ */
+TransactionId
+SubTransGetParent(TransactionId xid)
+{
+ int pageno = TransactionIdToPage(xid);
+ int entryno = TransactionIdToEntry(xid);
+ int slotno;
+ TransactionId *ptr;
+ TransactionId parent;
+
+ /* Can't ask about stuff that might not be around anymore */
+ Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));
+
+ /* Bootstrap and frozen XIDs have no parent */
+ if (!TransactionIdIsNormal(xid))
+ return InvalidTransactionId;
+
+ /* lock is acquired by SimpleLruReadPage_ReadOnly */
+
+ slotno = SimpleLruReadPage_ReadOnly(SubTransCtl, pageno, xid);
+ ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr += entryno;
+
+ parent = *ptr;
+
+ LWLockRelease(SubtransSLRULock);
+
+ return parent;
+}
+
+/*
+ * SubTransGetTopmostTransaction
+ *
+ * Returns the topmost transaction of the given transaction id.
+ *
+ * Because we cannot look back further than TransactionXmin, it is possible
+ * that this function will lie and return an intermediate subtransaction ID
+ * instead of the true topmost parent ID. This is OK, because in practice
+ * we only care about detecting whether the topmost parent is still running
+ * or is part of a current snapshot's list of still-running transactions.
+ * Therefore, any XID before TransactionXmin is as good as any other.
+ */
+TransactionId
+SubTransGetTopmostTransaction(TransactionId xid)
+{
+ TransactionId parentXid = xid,
+ previousXid = xid;
+
+ /* Can't ask about stuff that might not be around anymore */
+ Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));
+
+ while (TransactionIdIsValid(parentXid))
+ {
+ previousXid = parentXid;
+ if (TransactionIdPrecedes(parentXid, TransactionXmin))
+ break;
+ parentXid = SubTransGetParent(parentXid);
+
+ /*
+ * By convention the parent xid gets allocated first, so should always
+ * precede the child xid. Anything else points to a corrupted data
+ * structure that could lead to an infinite loop, so exit.
+ */
+ if (!TransactionIdPrecedes(parentXid, previousXid))
+ elog(ERROR, "pg_subtrans contains invalid entry: xid %u points to parent xid %u",
+ previousXid, parentXid);
+ }
+
+ Assert(TransactionIdIsValid(previousXid));
+
+ return previousXid;
+}
+
+
+/*
+ * Initialization of shared memory for SUBTRANS
+ */
+Size
+SUBTRANSShmemSize(void)
+{
+ return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS, 0);
+}
+
+void
+SUBTRANSShmemInit(void)
+{
+ SubTransCtl->PagePrecedes = SubTransPagePrecedes;
+ SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0,
+ SubtransSLRULock, "pg_subtrans",
+ LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE);
+ SlruPagePrecedesUnitTests(SubTransCtl, SUBTRANS_XACTS_PER_PAGE);
+}
+
+/*
+ * This func must be called ONCE on system install. It creates
+ * the initial SUBTRANS segment. (The SUBTRANS directory is assumed to
+ * have been created by the initdb shell script, and SUBTRANSShmemInit
+ * must have been called already.)
+ *
+ * Note: it's not really necessary to create the initial segment now,
+ * since slru.c would create it on first write anyway. But we may as well
+ * do it to be sure the directory is set up correctly.
+ */
+void
+BootStrapSUBTRANS(void)
+{
+ int slotno;
+
+ LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE);
+
+ /* Create and zero the first page of the subtrans log */
+ slotno = ZeroSUBTRANSPage(0);
+
+ /* Make sure it's written out */
+ SimpleLruWritePage(SubTransCtl, slotno);
+ Assert(!SubTransCtl->shared->page_dirty[slotno]);
+
+ LWLockRelease(SubtransSLRULock);
+}
+
+/*
+ * Initialize (or reinitialize) a page of SUBTRANS to zeroes.
+ *
+ * The page is not actually written, just set up in shared memory.
+ * The slot number of the new page is returned.
+ *
+ * Control lock must be held at entry, and will be held at exit.
+ */
+static int
+ZeroSUBTRANSPage(int pageno)
+{
+ return SimpleLruZeroPage(SubTransCtl, pageno);
+}
+
+/*
+ * This must be called ONCE during postmaster or standalone-backend startup,
+ * after StartupXLOG has initialized ShmemVariableCache->nextXid.
+ *
+ * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid
+ * if there are none.
+ */
+void
+StartupSUBTRANS(TransactionId oldestActiveXID)
+{
+ FullTransactionId nextXid;
+ int startPage;
+ int endPage;
+
+ /*
+ * Since we don't expect pg_subtrans to be valid across crashes, we
+ * initialize the currently-active page(s) to zeroes during startup.
+ * Whenever we advance into a new page, ExtendSUBTRANS will likewise zero
+ * the new page without regard to whatever was previously on disk.
+ */
+ LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE);
+
+ startPage = TransactionIdToPage(oldestActiveXID);
+ nextXid = ShmemVariableCache->nextXid;
+ endPage = TransactionIdToPage(XidFromFullTransactionId(nextXid));
+
+ while (startPage != endPage)
+ {
+ (void) ZeroSUBTRANSPage(startPage);
+ startPage++;
+ /* must account for wraparound */
+ if (startPage > TransactionIdToPage(MaxTransactionId))
+ startPage = 0;
+ }
+ (void) ZeroSUBTRANSPage(startPage);
+
+ LWLockRelease(SubtransSLRULock);
+}
+
+/*
+ * Perform a checkpoint --- either during shutdown, or on-the-fly
+ */
+void
+CheckPointSUBTRANS(void)
+{
+ /*
+ * Write dirty SUBTRANS pages to disk
+ *
+ * This is not actually necessary from a correctness point of view. We do
+ * it merely to improve the odds that writing of dirty pages is done by
+ * the checkpoint process and not by backends.
+ */
+ TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(true);
+ SimpleLruWriteAll(SubTransCtl, true);
+ TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true);
+}
+
+
+/*
+ * Make sure that SUBTRANS has room for a newly-allocated XID.
+ *
+ * NB: this is called while holding XidGenLock. We want it to be very fast
+ * most of the time; even when it's not so fast, no actual I/O need happen
+ * unless we're forced to write out a dirty subtrans page to make room
+ * in shared memory.
+ */
+void
+ExtendSUBTRANS(TransactionId newestXact)
+{
+ int pageno;
+
+ /*
+ * No work except at first XID of a page. But beware: just after
+ * wraparound, the first XID of page zero is FirstNormalTransactionId.
+ */
+ if (TransactionIdToEntry(newestXact) != 0 &&
+ !TransactionIdEquals(newestXact, FirstNormalTransactionId))
+ return;
+
+ pageno = TransactionIdToPage(newestXact);
+
+ LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE);
+
+ /* Zero the page */
+ ZeroSUBTRANSPage(pageno);
+
+ LWLockRelease(SubtransSLRULock);
+}
+
+
+/*
+ * Remove all SUBTRANS segments before the one holding the passed transaction ID
+ *
+ * oldestXact is the oldest TransactionXmin of any running transaction. This
+ * is called only during checkpoint.
+ */
+void
+TruncateSUBTRANS(TransactionId oldestXact)
+{
+ int cutoffPage;
+
+ /*
+ * The cutoff point is the start of the segment containing oldestXact. We
+ * pass the *page* containing oldestXact to SimpleLruTruncate. We step
+ * back one transaction to avoid passing a cutoff page that hasn't been
+ * created yet in the rare case that oldestXact would be the first item on
+ * a page and oldestXact == next XID. In that case, if we didn't subtract
+ * one, we'd trigger SimpleLruTruncate's wraparound detection.
+ */
+ TransactionIdRetreat(oldestXact);
+ cutoffPage = TransactionIdToPage(oldestXact);
+
+ SimpleLruTruncate(SubTransCtl, cutoffPage);
+}
+
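+/*
+ * Worked example for the step-back above (a sketch, using 2048 xacts per
+ * page): if oldestXact is 204800, the first entry on page 100, and 204800 is
+ * also the next xid to assign, page 100 may not exist on disk yet; passing it
+ * as the cutoff could look like a wraparound to SimpleLruTruncate's safety
+ * check.  Retreating to 204799 yields page 99 instead.
+ */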
+
+/*
+ * Decide whether a SUBTRANS page number is "older" for truncation purposes.
+ * Analogous to CLOGPagePrecedes().
+ */
+static bool
+SubTransPagePrecedes(int page1, int page2)
+{
+ TransactionId xid1;
+ TransactionId xid2;
+
+ xid1 = ((TransactionId) page1) * SUBTRANS_XACTS_PER_PAGE;
+ xid1 += FirstNormalTransactionId + 1;
+ xid2 = ((TransactionId) page2) * SUBTRANS_XACTS_PER_PAGE;
+ xid2 += FirstNormalTransactionId + 1;
+
+ return (TransactionIdPrecedes(xid1, xid2) &&
+ TransactionIdPrecedes(xid1, xid2 + SUBTRANS_XACTS_PER_PAGE - 1));
+}
diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c
new file mode 100644
index 0000000..be21968
--- /dev/null
+++ b/src/backend/access/transam/timeline.c
@@ -0,0 +1,600 @@
+/*-------------------------------------------------------------------------
+ *
+ * timeline.c
+ * Functions for reading and writing timeline history files.
+ *
+ * A timeline history file lists the timeline changes of the timeline, in
+ * a simple text format. They are archived along with the WAL segments.
+ *
+ * The files are named like "<tli>.history". For example, if the database
+ * starts up and switches to timeline 5, the timeline history file would be
+ * called "00000005.history".
+ *
+ * Each line in the file represents a timeline switch:
+ *
+ * <parentTLI> <switchpoint> <reason>
+ *
+ * parentTLI ID of the parent timeline
+ * switchpoint XLogRecPtr of the WAL location where the switch happened
+ * reason human-readable explanation of why the timeline was changed
+ *
+ * The fields are separated by tabs. Lines beginning with # are comments, and
+ * are ignored. Empty lines are also ignored.
+ *
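+ * As a hypothetical illustration (the switchpoints here are invented), a
+ * "00000003.history" file could contain:
+ *
+ *	1	0/9A000060	no recovery target specified
+ *	2	0/9D0000F0	before 2000-01-01 00:00:00+00
+ *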
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/timeline.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "access/timeline.h"
+#include "access/xlog.h"
+#include "access/xlog_internal.h"
+#include "access/xlogarchive.h"
+#include "access/xlogdefs.h"
+#include "pgstat.h"
+#include "storage/fd.h"
+
+/*
+ * Copies all timeline history files with id's between 'begin' and 'end'
+ * from archive to pg_wal.
+ */
+void
+restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
+{
+ char path[MAXPGPATH];
+ char histfname[MAXFNAMELEN];
+ TimeLineID tli;
+
+ for (tli = begin; tli < end; tli++)
+ {
+ if (tli == 1)
+ continue;
+
+ TLHistoryFileName(histfname, tli);
+ if (RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false))
+ KeepFileRestoredFromArchive(path, histfname);
+ }
+}
+
+/*
+ * Try to read a timeline's history file.
+ *
+ * If successful, return the list of component TLIs (the given TLI followed by
+ * its ancestor TLIs). If we can't find the history file, assume that the
+ * timeline has no parents, and return a list of just the specified timeline
+ * ID.
+ */
+List *
+readTimeLineHistory(TimeLineID targetTLI)
+{
+ List *result;
+ char path[MAXPGPATH];
+ char histfname[MAXFNAMELEN];
+ FILE *fd;
+ TimeLineHistoryEntry *entry;
+ TimeLineID lasttli = 0;
+ XLogRecPtr prevend;
+ bool fromArchive = false;
+
+ /* Timeline 1 does not have a history file, so no need to check */
+ if (targetTLI == 1)
+ {
+ entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry));
+ entry->tli = targetTLI;
+ entry->begin = entry->end = InvalidXLogRecPtr;
+ return list_make1(entry);
+ }
+
+ if (ArchiveRecoveryRequested)
+ {
+ TLHistoryFileName(histfname, targetTLI);
+ fromArchive =
+ RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false);
+ }
+ else
+ TLHistoryFilePath(path, targetTLI);
+
+ fd = AllocateFile(path, "r");
+ if (fd == NULL)
+ {
+ if (errno != ENOENT)
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+ /* Not there, so assume no parents */
+ entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry));
+ entry->tli = targetTLI;
+ entry->begin = entry->end = InvalidXLogRecPtr;
+ return list_make1(entry);
+ }
+
+ result = NIL;
+
+ /*
+ * Parse the file...
+ */
+ prevend = InvalidXLogRecPtr;
+ for (;;)
+ {
+ char fline[MAXPGPATH];
+ char *res;
+ char *ptr;
+ TimeLineID tli;
+ uint32 switchpoint_hi;
+ uint32 switchpoint_lo;
+ int nfields;
+
+ pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_READ);
+ res = fgets(fline, sizeof(fline), fd);
+ pgstat_report_wait_end();
+ if (res == NULL)
+ {
+ if (ferror(fd))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m", path)));
+
+ break;
+ }
+
+ /* skip leading whitespace and check for # comment */
+ for (ptr = fline; *ptr; ptr++)
+ {
+ if (!isspace((unsigned char) *ptr))
+ break;
+ }
+ if (*ptr == '\0' || *ptr == '#')
+ continue;
+
+ nfields = sscanf(fline, "%u\t%X/%X", &tli, &switchpoint_hi, &switchpoint_lo);
+
+ if (nfields < 1)
+ {
+ /* expect a numeric timeline ID as first field of line */
+ ereport(FATAL,
+ (errmsg("syntax error in history file: %s", fline),
+ errhint("Expected a numeric timeline ID.")));
+ }
+ if (nfields != 3)
+ ereport(FATAL,
+ (errmsg("syntax error in history file: %s", fline),
+ errhint("Expected a write-ahead log switchpoint location.")));
+
+ if (result && tli <= lasttli)
+ ereport(FATAL,
+ (errmsg("invalid data in history file: %s", fline),
+ errhint("Timeline IDs must be in increasing sequence.")));
+
+ lasttli = tli;
+
+ entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry));
+ entry->tli = tli;
+ entry->begin = prevend;
+ entry->end = ((uint64) (switchpoint_hi)) << 32 | (uint64) switchpoint_lo;
+ prevend = entry->end;
+
+ /* Build list with newest item first */
+ result = lcons(entry, result);
+
+ /* we ignore the remainder of each line */
+ }
+
+ FreeFile(fd);
+
+ if (result && targetTLI <= lasttli)
+ ereport(FATAL,
+ (errmsg("invalid data in history file \"%s\"", path),
+ errhint("Timeline IDs must be less than child timeline's ID.")));
+
+ /*
+ * Create one more entry for the "tip" of the timeline, which has no entry
+ * in the history file.
+ */
+ entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry));
+ entry->tli = targetTLI;
+ entry->begin = prevend;
+ entry->end = InvalidXLogRecPtr;
+
+ result = lcons(entry, result);
+
+ /*
+ * If the history file was fetched from archive, save it in pg_wal for
+ * future reference.
+ */
+ if (fromArchive)
+ KeepFileRestoredFromArchive(path, histfname);
+
+ return result;
+}
+
+/*
+ * Probe whether a timeline history file exists for the given timeline ID
+ */
+bool
+existsTimeLineHistory(TimeLineID probeTLI)
+{
+ char path[MAXPGPATH];
+ char histfname[MAXFNAMELEN];
+ FILE *fd;
+
+ /* Timeline 1 does not have a history file, so no need to check */
+ if (probeTLI == 1)
+ return false;
+
+ if (ArchiveRecoveryRequested)
+ {
+ TLHistoryFileName(histfname, probeTLI);
+ RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false);
+ }
+ else
+ TLHistoryFilePath(path, probeTLI);
+
+ fd = AllocateFile(path, "r");
+ if (fd != NULL)
+ {
+ FreeFile(fd);
+ return true;
+ }
+ else
+ {
+ if (errno != ENOENT)
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+ return false;
+ }
+}
+
+/*
+ * Find the newest existing timeline, assuming that startTLI exists.
+ *
+ * Note: while this is somewhat heuristic, it does positively guarantee
+ * that (result + 1) is not a known timeline, and therefore it should
+ * be safe to assign that ID to a new timeline.
+ */
+TimeLineID
+findNewestTimeLine(TimeLineID startTLI)
+{
+ TimeLineID newestTLI;
+ TimeLineID probeTLI;
+
+ /*
+ * The algorithm is just to probe for the existence of timeline history
+ * files. XXX is it useful to allow gaps in the sequence?
+ */
+ newestTLI = startTLI;
+
+ for (probeTLI = startTLI + 1;; probeTLI++)
+ {
+ if (existsTimeLineHistory(probeTLI))
+ {
+ newestTLI = probeTLI; /* probeTLI exists */
+ }
+ else
+ {
+ /* doesn't exist, assume we're done */
+ break;
+ }
+ }
+
+ return newestTLI;
+}
+
+/*
+ * Create a new timeline history file.
+ *
+ * newTLI: ID of the new timeline
+ * parentTLI: ID of its immediate parent
+ * switchpoint: WAL location where the system switched to the new timeline
+ * reason: human-readable explanation of why the timeline was switched
+ *
+ * Currently this is only used at the end of recovery, and so there are no locking
+ * considerations. But we should be just as tense as XLogFileInit to avoid
+ * emplacing a bogus file.
+ */
+void
+writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
+ XLogRecPtr switchpoint, char *reason)
+{
+ char path[MAXPGPATH];
+ char tmppath[MAXPGPATH];
+ char histfname[MAXFNAMELEN];
+ char buffer[BLCKSZ];
+ int srcfd;
+ int fd;
+ int nbytes;
+
+ Assert(newTLI > parentTLI); /* else bad selection of newTLI */
+
+ /*
+ * Write into a temp file name.
+ */
+ snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
+
+ unlink(tmppath);
+
+ /* do not use get_sync_bit() here --- want to fsync only at end of fill */
+ fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL);
+ if (fd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create file \"%s\": %m", tmppath)));
+
+ /*
+ * If a history file exists for the parent, copy it verbatim
+ */
+ if (ArchiveRecoveryRequested)
+ {
+ TLHistoryFileName(histfname, parentTLI);
+ RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false);
+ }
+ else
+ TLHistoryFilePath(path, parentTLI);
+
+ srcfd = OpenTransientFile(path, O_RDONLY);
+ if (srcfd < 0)
+ {
+ if (errno != ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+ /* Not there, so assume parent has no parents */
+ }
+ else
+ {
+ for (;;)
+ {
+ errno = 0;
+ pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_READ);
+ nbytes = (int) read(srcfd, buffer, sizeof(buffer));
+ pgstat_report_wait_end();
+ if (nbytes < 0 || errno != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m", path)));
+ if (nbytes == 0)
+ break;
+ errno = 0;
+ pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_WRITE);
+ if ((int) write(fd, buffer, nbytes) != nbytes)
+ {
+ int save_errno = errno;
+
+ /*
+ * If we fail to make the file, delete it to release disk
+ * space
+ */
+ unlink(tmppath);
+
+ /*
+ * if write didn't set errno, assume problem is no disk space
+ */
+ errno = save_errno ? save_errno : ENOSPC;
+
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\": %m", tmppath)));
+ }
+ pgstat_report_wait_end();
+ }
+
+ if (CloseTransientFile(srcfd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", path)));
+ }
+
+ /*
+ * Append one line with the details of this timeline split.
+ *
+ * If we did have a parent file, insert an extra newline just in case the
+ * parent file failed to end with one.
+ */
+ snprintf(buffer, sizeof(buffer),
+ "%s%u\t%X/%X\t%s\n",
+ (srcfd < 0) ? "" : "\n",
+ parentTLI,
+ LSN_FORMAT_ARGS(switchpoint),
+ reason);
+
+ nbytes = strlen(buffer);
+ errno = 0;
+ pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_WRITE);
+ if ((int) write(fd, buffer, nbytes) != nbytes)
+ {
+ int save_errno = errno;
+
+ /*
+ * If we fail to make the file, delete it to release disk space
+ */
+ unlink(tmppath);
+ /* if write didn't set errno, assume problem is no disk space */
+ errno = save_errno ? save_errno : ENOSPC;
+
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\": %m", tmppath)));
+ }
+ pgstat_report_wait_end();
+
+ pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_SYNC);
+ if (pg_fsync(fd) != 0)
+ ereport(data_sync_elevel(ERROR),
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", tmppath)));
+ pgstat_report_wait_end();
+
+ if (CloseTransientFile(fd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", tmppath)));
+
+ /*
+ * Now move the completed history file into place with its final name.
+ */
+ TLHistoryFilePath(path, newTLI);
+
+ /*
+ * Perform the rename using link if available, paranoidly trying to avoid
+ * overwriting an existing file (there shouldn't be one).
+ */
+ durable_rename_excl(tmppath, path, ERROR);
+
+ /* The history file can be archived immediately. */
+ if (XLogArchivingActive())
+ {
+ TLHistoryFileName(histfname, newTLI);
+ XLogArchiveNotify(histfname);
+ }
+}
+
+/*
+ * Writes a history file for given timeline and contents.
+ *
+ * Currently this is only used in the walreceiver process, and so there are
+ * no locking considerations. But we should be just as tense as XLogFileInit
+ * to avoid emplacing a bogus file.
+ */
+void
+writeTimeLineHistoryFile(TimeLineID tli, char *content, int size)
+{
+ char path[MAXPGPATH];
+ char tmppath[MAXPGPATH];
+ int fd;
+
+ /*
+ * Write into a temp file name.
+ */
+ snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
+
+ unlink(tmppath);
+
+ /* do not use get_sync_bit() here --- want to fsync only at end of fill */
+ fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL);
+ if (fd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create file \"%s\": %m", tmppath)));
+
+ errno = 0;
+ pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_FILE_WRITE);
+ if ((int) write(fd, content, size) != size)
+ {
+ int save_errno = errno;
+
+ /*
+ * If we fail to make the file, delete it to release disk space
+ */
+ unlink(tmppath);
+ /* if write didn't set errno, assume problem is no disk space */
+ errno = save_errno ? save_errno : ENOSPC;
+
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\": %m", tmppath)));
+ }
+ pgstat_report_wait_end();
+
+ pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC);
+ if (pg_fsync(fd) != 0)
+ ereport(data_sync_elevel(ERROR),
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", tmppath)));
+ pgstat_report_wait_end();
+
+ if (CloseTransientFile(fd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", tmppath)));
+
+ /*
+ * Now move the completed history file into place with its final name.
+ */
+ TLHistoryFilePath(path, tli);
+
+ /*
+ * Perform the rename using link if available, paranoidly trying to avoid
+ * overwriting an existing file (there shouldn't be one).
+ */
+ durable_rename_excl(tmppath, path, ERROR);
+}
+
+/*
+ * Returns true if 'expectedTLEs' contains a timeline with id 'tli'
+ */
+bool
+tliInHistory(TimeLineID tli, List *expectedTLEs)
+{
+ ListCell *cell;
+
+ foreach(cell, expectedTLEs)
+ {
+ if (((TimeLineHistoryEntry *) lfirst(cell))->tli == tli)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Returns the ID of the timeline in use at a particular point in time, in
+ * the given timeline history.
+ */
+TimeLineID
+tliOfPointInHistory(XLogRecPtr ptr, List *history)
+{
+ ListCell *cell;
+
+ foreach(cell, history)
+ {
+ TimeLineHistoryEntry *tle = (TimeLineHistoryEntry *) lfirst(cell);
+
+ if ((XLogRecPtrIsInvalid(tle->begin) || tle->begin <= ptr) &&
+ (XLogRecPtrIsInvalid(tle->end) || ptr < tle->end))
+ {
+ /* found it */
+ return tle->tli;
+ }
+ }
+
+ /* shouldn't happen. */
+ elog(ERROR, "timeline history was not contiguous");
+ return 0; /* keep compiler quiet */
+}
+
+/*
+ * Returns the point in history where we branched off the given timeline,
+ * and the timeline we branched to (*nextTLI). Returns InvalidXLogRecPtr if
+ * the timeline is current, ie. we have not branched off from it, and throws
+ * an error if the timeline is not part of this server's history.
+ */
+XLogRecPtr
+tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI)
+{
+ ListCell *cell;
+
+ if (nextTLI)
+ *nextTLI = 0;
+ foreach(cell, history)
+ {
+ TimeLineHistoryEntry *tle = (TimeLineHistoryEntry *) lfirst(cell);
+
+ if (tle->tli == tli)
+ return tle->end;
+ if (nextTLI)
+ *nextTLI = tle->tli;
+ }
+
+ ereport(ERROR,
+ (errmsg("requested timeline %u is not in this server's history",
+ tli)));
+ return InvalidXLogRecPtr; /* keep compiler quiet */
+}
diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c
new file mode 100644
index 0000000..5865810
--- /dev/null
+++ b/src/backend/access/transam/transam.c
@@ -0,0 +1,398 @@
+/*-------------------------------------------------------------------------
+ *
+ * transam.c
+ * postgres transaction (commit) log interface routines
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/transam/transam.c
+ *
+ * NOTES
+ * This file contains the high level access-method interface to the
+ * transaction system.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/clog.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "utils/snapmgr.h"
+
+/*
+ * Single-item cache for results of TransactionLogFetch. It's worth having
+ * such a cache because we frequently find ourselves repeatedly checking the
+ * same XID, for example when scanning a table just after a bulk insert,
+ * update, or delete.
+ */
+static TransactionId cachedFetchXid = InvalidTransactionId;
+static XidStatus cachedFetchXidStatus;
+static XLogRecPtr cachedCommitLSN;
+
+/* Local functions */
+static XidStatus TransactionLogFetch(TransactionId transactionId);
+
+
+/* ----------------------------------------------------------------
+ * Postgres log access method interface
+ *
+ * TransactionLogFetch
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * TransactionLogFetch --- fetch commit status of specified transaction id
+ */
+static XidStatus
+TransactionLogFetch(TransactionId transactionId)
+{
+ XidStatus xidstatus;
+ XLogRecPtr xidlsn;
+
+ /*
+ * Before going to the commit log manager, check our single item cache to
+ * see if we didn't just check the transaction status a moment ago.
+ */
+ if (TransactionIdEquals(transactionId, cachedFetchXid))
+ return cachedFetchXidStatus;
+
+ /*
+ * Also, check to see if the transaction ID is a permanent one.
+ */
+ if (!TransactionIdIsNormal(transactionId))
+ {
+ if (TransactionIdEquals(transactionId, BootstrapTransactionId))
+ return TRANSACTION_STATUS_COMMITTED;
+ if (TransactionIdEquals(transactionId, FrozenTransactionId))
+ return TRANSACTION_STATUS_COMMITTED;
+ return TRANSACTION_STATUS_ABORTED;
+ }
+
+ /*
+ * Get the transaction status.
+ */
+ xidstatus = TransactionIdGetStatus(transactionId, &xidlsn);
+
+ /*
+ * Cache it, but DO NOT cache status for unfinished or sub-committed
+ * transactions! We only cache status that is guaranteed not to change.
+ */
+ if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS &&
+ xidstatus != TRANSACTION_STATUS_SUB_COMMITTED)
+ {
+ cachedFetchXid = transactionId;
+ cachedFetchXidStatus = xidstatus;
+ cachedCommitLSN = xidlsn;
+ }
+
+ return xidstatus;
+}
+
+/* ----------------------------------------------------------------
+ * Interface functions
+ *
+ * TransactionIdDidCommit
+ * TransactionIdDidAbort
+ * ========
+ * these functions test the transaction status of
+ * a specified transaction id.
+ *
+ * TransactionIdCommitTree
+ * TransactionIdAsyncCommitTree
+ * TransactionIdAbortTree
+ * ========
+ * these functions set the transaction status of the specified
+ * transaction tree.
+ *
+ * See also TransactionIdIsInProgress, which once was in this module
+ * but now lives in procarray.c.
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * TransactionIdDidCommit
+ * True iff transaction associated with the identifier did commit.
+ *
+ * Note:
+ * Assumes transaction identifier is valid and exists in clog.
+ */
+bool /* true if given transaction committed */
+TransactionIdDidCommit(TransactionId transactionId)
+{
+ XidStatus xidstatus;
+
+ xidstatus = TransactionLogFetch(transactionId);
+
+ /*
+ * If it's marked committed, it's committed.
+ */
+ if (xidstatus == TRANSACTION_STATUS_COMMITTED)
+ return true;
+
+ /*
+ * If it's marked subcommitted, we have to check the parent recursively.
+ * However, if it's older than TransactionXmin, we can't look at
+ * pg_subtrans; instead assume that the parent crashed without cleaning up
+ * its children.
+ *
+ * Originally we Assert'ed that the result of SubTransGetParent was not
+ * zero. However with the introduction of prepared transactions, there can
+ * be a window just after database startup where we do not have complete
+ * knowledge in pg_subtrans of the transactions after TransactionXmin.
+ * StartupSUBTRANS() has ensured that any missing information will be
+ * zeroed. Since this case should not happen under normal conditions, it
+ * seems reasonable to emit a WARNING for it.
+ */
+ if (xidstatus == TRANSACTION_STATUS_SUB_COMMITTED)
+ {
+ TransactionId parentXid;
+
+ if (TransactionIdPrecedes(transactionId, TransactionXmin))
+ return false;
+ parentXid = SubTransGetParent(transactionId);
+ if (!TransactionIdIsValid(parentXid))
+ {
+ elog(WARNING, "no pg_subtrans entry for subcommitted XID %u",
+ transactionId);
+ return false;
+ }
+ return TransactionIdDidCommit(parentXid);
+ }
+
+ /*
+ * It's not committed.
+ */
+ return false;
+}
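+
+/*
+ * To illustrate the sub-commit path above with hypothetical XIDs: if
+ * subtransaction 1005 is marked SUB_COMMITTED, its fate is whatever
+ * happened to its parent, so we look the parent (say, 1000) up in
+ * pg_subtrans and recurse on it, ultimately returning the status of the
+ * toplevel transaction. The TransactionXmin guard is what makes
+ * consulting pg_subtrans safe at all: entries older than that horizon
+ * may already have been truncated away.
+ */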
+
+/*
+ * TransactionIdDidAbort
+ * True iff transaction associated with the identifier did abort.
+ *
+ * Note:
+ * Assumes transaction identifier is valid and exists in clog.
+ */
+bool /* true if given transaction aborted */
+TransactionIdDidAbort(TransactionId transactionId)
+{
+ XidStatus xidstatus;
+
+ xidstatus = TransactionLogFetch(transactionId);
+
+ /*
+ * If it's marked aborted, it's aborted.
+ */
+ if (xidstatus == TRANSACTION_STATUS_ABORTED)
+ return true;
+
+ /*
+ * If it's marked subcommitted, we have to check the parent recursively.
+ * However, if it's older than TransactionXmin, we can't look at
+ * pg_subtrans; instead assume that the parent crashed without cleaning up
+ * its children.
+ */
+ if (xidstatus == TRANSACTION_STATUS_SUB_COMMITTED)
+ {
+ TransactionId parentXid;
+
+ if (TransactionIdPrecedes(transactionId, TransactionXmin))
+ return true;
+ parentXid = SubTransGetParent(transactionId);
+ if (!TransactionIdIsValid(parentXid))
+ {
+ /* see notes in TransactionIdDidCommit */
+ elog(WARNING, "no pg_subtrans entry for subcommitted XID %u",
+ transactionId);
+ return true;
+ }
+ return TransactionIdDidAbort(parentXid);
+ }
+
+ /*
+ * It's not aborted.
+ */
+ return false;
+}
+
+/*
+ * TransactionIdCommitTree
+ * Marks the given transaction and children as committed
+ *
+ * "xid" is a toplevel transaction commit, and the xids array contains its
+ * committed subtransactions.
+ *
+ * This commit operation is not guaranteed to be atomic, but if not, subxids
+ * are correctly marked subcommit first.
+ */
+void
+TransactionIdCommitTree(TransactionId xid, int nxids, TransactionId *xids)
+{
+ TransactionIdSetTreeStatus(xid, nxids, xids,
+ TRANSACTION_STATUS_COMMITTED,
+ InvalidXLogRecPtr);
+}
+
+/*
+ * TransactionIdAsyncCommitTree
+ * Same as above, but for async commits. The commit record LSN is needed.
+ */
+void
+TransactionIdAsyncCommitTree(TransactionId xid, int nxids, TransactionId *xids,
+ XLogRecPtr lsn)
+{
+ TransactionIdSetTreeStatus(xid, nxids, xids,
+ TRANSACTION_STATUS_COMMITTED, lsn);
+}
+
+/*
+ * TransactionIdAbortTree
+ * Marks the given transaction and children as aborted.
+ *
+ * "xid" is a toplevel transaction commit, and the xids array contains its
+ * committed subtransactions.
+ *
+ * We don't need to worry about the non-atomic behavior, since any onlookers
+ * will consider all the xacts as not-yet-committed anyway.
+ */
+void
+TransactionIdAbortTree(TransactionId xid, int nxids, TransactionId *xids)
+{
+ TransactionIdSetTreeStatus(xid, nxids, xids,
+ TRANSACTION_STATUS_ABORTED, InvalidXLogRecPtr);
+}
+
+/*
+ * TransactionIdPrecedes --- is id1 logically < id2?
+ */
+bool
+TransactionIdPrecedes(TransactionId id1, TransactionId id2)
+{
+ /*
+ * If either ID is a permanent XID then we can just do unsigned
+ * comparison. If both are normal, do a modulo-2^32 comparison.
+ */
+ int32 diff;
+
+ if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2))
+ return (id1 < id2);
+
+ diff = (int32) (id1 - id2);
+ return (diff < 0);
+}
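+
+/*
+ * A worked example of the modulo-2^32 comparison above (XID values are
+ * hypothetical): with id1 = 100 and id2 = 4294967290 (2^32 - 6), the
+ * unsigned difference id1 - id2 wraps to 106, which as an int32 is
+ * positive, so id1 logically follows id2 even though it is numerically
+ * smaller. This treats the normal-XID space as a circle; it gives the
+ * right answer as long as the two XIDs are less than 2^31 transactions
+ * apart, which anti-wraparound vacuuming guarantees.
+ */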
+
+/*
+ * TransactionIdPrecedesOrEquals --- is id1 logically <= id2?
+ */
+bool
+TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2)
+{
+ int32 diff;
+
+ if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2))
+ return (id1 <= id2);
+
+ diff = (int32) (id1 - id2);
+ return (diff <= 0);
+}
+
+/*
+ * TransactionIdFollows --- is id1 logically > id2?
+ */
+bool
+TransactionIdFollows(TransactionId id1, TransactionId id2)
+{
+ int32 diff;
+
+ if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2))
+ return (id1 > id2);
+
+ diff = (int32) (id1 - id2);
+ return (diff > 0);
+}
+
+/*
+ * TransactionIdFollowsOrEquals --- is id1 logically >= id2?
+ */
+bool
+TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2)
+{
+ int32 diff;
+
+ if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2))
+ return (id1 >= id2);
+
+ diff = (int32) (id1 - id2);
+ return (diff >= 0);
+}
+
+
+/*
+ * TransactionIdLatest --- get latest XID among a main xact and its children
+ */
+TransactionId
+TransactionIdLatest(TransactionId mainxid,
+ int nxids, const TransactionId *xids)
+{
+ TransactionId result;
+
+ /*
+ * In practice it is highly likely that the xids[] array is sorted, and so
+ * we could save some cycles by just taking the last child XID, but this
+ * probably isn't so performance-critical that it's worth depending on
+ * that assumption. But just to show we're not totally stupid, scan the
+ * array back-to-front to avoid useless assignments.
+ */
+ result = mainxid;
+ while (--nxids >= 0)
+ {
+ if (TransactionIdPrecedes(result, xids[nxids]))
+ result = xids[nxids];
+ }
+ return result;
+}
+
+
+/*
+ * TransactionIdGetCommitLSN
+ *
+ * This function returns an LSN that is late enough to be able
+ * to guarantee that if we flush up to the LSN returned then we
+ * will have flushed the transaction's commit record to disk.
+ *
+ * The result is not necessarily the exact LSN of the transaction's
+ * commit record! For example, for long-past transactions (those whose
+ * clog pages already migrated to disk), we'll return InvalidXLogRecPtr.
+ * Also, because we group transactions on the same clog page to conserve
+ * storage, we might return the LSN of a later transaction that falls into
+ * the same group.
+ */
+XLogRecPtr
+TransactionIdGetCommitLSN(TransactionId xid)
+{
+ XLogRecPtr result;
+
+ /*
+ * Currently, all uses of this function are for xids that were just
+ * reported to be committed by TransactionLogFetch, so we expect that
+ * checking TransactionLogFetch's cache will usually succeed and avoid an
+ * extra trip to shared memory.
+ */
+ if (TransactionIdEquals(xid, cachedFetchXid))
+ return cachedCommitLSN;
+
+ /* Special XIDs are always known committed */
+ if (!TransactionIdIsNormal(xid))
+ return InvalidXLogRecPtr;
+
+ /*
+ * Get the transaction status.
+ */
+ (void) TransactionIdGetStatus(xid, &result);
+
+ return result;
+}
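+
+/*
+ * A sketch of the intended usage pattern for asynchronous commits (not a
+ * verbatim caller): before acting on a "committed" status that must be
+ * durable -- for instance, before setting a commit hint bit on a
+ * permanent page -- one can test
+ *
+ *     XLogNeedsFlush(TransactionIdGetCommitLSN(xid))
+ *
+ * and skip or defer the action while the commit record might not yet be
+ * on disk.
+ */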
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
new file mode 100644
index 0000000..5293c69
--- /dev/null
+++ b/src/backend/access/transam/twophase.c
@@ -0,0 +1,2662 @@
+/*-------------------------------------------------------------------------
+ *
+ * twophase.c
+ * Two-phase commit support functions.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/transam/twophase.c
+ *
+ * NOTES
+ * Each global transaction is associated with a global transaction
+ * identifier (GID). The client assigns a GID to a postgres
+ * transaction with the PREPARE TRANSACTION command.
+ *
+ * We keep all active global transactions in a shared memory array.
+ * When the PREPARE TRANSACTION command is issued, the GID is
+ * reserved for the transaction in the array. This is done before
+ * a WAL entry is made, because the reservation checks for duplicate
+ * GIDs and aborts the transaction if there already is a global
+ * transaction in prepared state with the same GID.
+ *
+ * A global transaction (gxact) also has a dummy PGPROC; this is what keeps
+ * the XID considered running by TransactionIdIsInProgress. It is also
+ * convenient as a PGPROC to hook the gxact's locks to.
+ *
+ * Information to recover prepared transactions in case of crash is
+ * now stored in WAL for the common case. In some cases there will be
+ * an extended period between preparing a GXACT and commit/abort, in
+ * which case we need to separately record prepared transaction data
+ * in permanent storage. This includes locking information, pending
+ * notifications etc. All that state information is written to the
+ * per-transaction state file in the pg_twophase directory.
+ * All prepared transactions will be written prior to shutdown.
+ *
+ * The lifecycle of the state data is as follows:
+ *
+ * * On PREPARE TRANSACTION, the backend writes state data only to the WAL
+ *   and stores a pointer to the start of the WAL record in
+ *   gxact->prepare_start_lsn.
+ * * If COMMIT occurs before the next checkpoint, the backend reads the
+ *   data back from WAL using prepare_start_lsn.
+ * * At checkpoint time, state data is copied to files in the pg_twophase
+ *   directory and fsynced.
+ * * If COMMIT happens after that checkpoint, the backend reads the state
+ *   data from the files instead.
+ *
+ * During replay and replication, TwoPhaseState also holds information
+ * about active prepared transactions that haven't been moved to disk yet.
+ *
+ * Replay of twophase records happens by the following rules:
+ *
+ * * At the beginning of recovery, pg_twophase is scanned once, filling
+ * TwoPhaseState with entries marked with gxact->inredo and
+ *       gxact->ondisk. Two-phase file data older than the XID horizon of
+ *       the redo position is discarded.
+ * * On PREPARE redo, the transaction is added to TwoPhaseState->prepXacts.
+ * gxact->inredo is set to true for such entries.
+ * * On Checkpoint we iterate through TwoPhaseState->prepXacts entries
+ * that have gxact->inredo set and are behind the redo_horizon. We
+ * save them to disk and then switch gxact->ondisk to true.
+ * * On COMMIT/ABORT we delete the entry from TwoPhaseState->prepXacts.
+ * If gxact->ondisk is true, the corresponding entry from the disk
+ * is additionally deleted.
+ * * RecoverPreparedTransactions(), StandbyRecoverPreparedTransactions()
+ * and PrescanPreparedTransactions() have been modified to go through
+ * gxact->inredo entries that have not made it to disk.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "access/commit_ts.h"
+#include "access/htup_details.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/twophase_rmgr.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "access/xloginsert.h"
+#include "access/xlogreader.h"
+#include "access/xlogrecovery.h"
+#include "access/xlogutils.h"
+#include "catalog/pg_type.h"
+#include "catalog/storage.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "replication/origin.h"
+#include "replication/syncrep.h"
+#include "replication/walsender.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/md.h"
+#include "storage/predicate.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/sinvaladt.h"
+#include "storage/smgr.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+#include "utils/timestamp.h"
+
+/*
+ * Directory where Two-phase commit files reside within PGDATA
+ */
+#define TWOPHASE_DIR "pg_twophase"
+
+/* GUC variable, can't be changed after startup */
+int max_prepared_xacts = 0;
+
+/*
+ * This struct describes one global transaction that is in prepared state
+ * or attempting to become prepared.
+ *
+ * The lifecycle of a global transaction is:
+ *
+ * 1. After checking that the requested GID is not in use, set up an entry in
+ * the TwoPhaseState->prepXacts array with the correct GID and valid = false,
+ * and mark it as locked by my backend.
+ *
+ * 2. After successfully completing prepare, set valid = true and enter the
+ * referenced PGPROC into the global ProcArray.
+ *
+ * 3. To begin COMMIT PREPARED or ROLLBACK PREPARED, check that the entry is
+ * valid and not locked, then mark the entry as locked by storing my current
+ * backend ID into locking_backend. This prevents concurrent attempts to
+ * commit or rollback the same prepared xact.
+ *
+ * 4. On completion of COMMIT PREPARED or ROLLBACK PREPARED, remove the entry
+ * from the ProcArray and the TwoPhaseState->prepXacts array and return it to
+ * the freelist.
+ *
+ * Note that if the preparing transaction fails between steps 1 and 2, the
+ * entry must be removed so that the GID and the GlobalTransaction struct
+ * can be reused. See AtAbort_Twophase().
+ *
+ * typedef struct GlobalTransactionData *GlobalTransaction appears in
+ * twophase.h
+ */
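+
+/*
+ * In SQL terms (a hypothetical session): PREPARE TRANSACTION 'foo'
+ * performs steps 1 and 2, leaving a valid, unlocked entry behind; a later
+ * COMMIT PREPARED 'foo' or ROLLBACK PREPARED 'foo', possibly issued from
+ * a different backend, performs steps 3 and 4. If the preparing backend
+ * errors out between steps 1 and 2, AtAbort_Twophase() returns the entry
+ * to the freelist and the GID 'foo' becomes available again.
+ */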
+
+typedef struct GlobalTransactionData
+{
+ GlobalTransaction next; /* list link for free list */
+ int pgprocno; /* ID of associated dummy PGPROC */
+ BackendId dummyBackendId; /* similar to backend id for backends */
+ TimestampTz prepared_at; /* time of preparation */
+
+ /*
+ * Note that we need to keep track of two LSNs for each GXACT. We keep
+ * track of the start LSN because this is the address we must use to read
+ * state data back from WAL when committing a prepared GXACT. We keep
+ * track of the end LSN because that is the LSN we need to wait for prior
+ * to commit.
+ */
+ XLogRecPtr prepare_start_lsn; /* XLOG offset of prepare record start */
+ XLogRecPtr prepare_end_lsn; /* XLOG offset of prepare record end */
+ TransactionId xid; /* The GXACT id */
+
+ Oid owner; /* ID of user that executed the xact */
+ BackendId locking_backend; /* backend currently working on the xact */
+ bool valid; /* true if PGPROC entry is in proc array */
+ bool ondisk; /* true if prepare state file is on disk */
+ bool inredo; /* true if entry was added via xlog_redo */
+ char gid[GIDSIZE]; /* The GID assigned to the prepared xact */
+} GlobalTransactionData;
+
+/*
+ * Two Phase Commit shared state. Access to this struct is protected
+ * by TwoPhaseStateLock.
+ */
+typedef struct TwoPhaseStateData
+{
+ /* Head of linked list of free GlobalTransactionData structs */
+ GlobalTransaction freeGXacts;
+
+ /* Number of valid prepXacts entries. */
+ int numPrepXacts;
+
+ /* There are max_prepared_xacts items in this array */
+ GlobalTransaction prepXacts[FLEXIBLE_ARRAY_MEMBER];
+} TwoPhaseStateData;
+
+static TwoPhaseStateData *TwoPhaseState;
+
+/*
+ * Global transaction entry currently locked by us, if any. Note that any
+ * access to the entry pointed to by this variable must be protected by
+ * TwoPhaseStateLock, though obviously the pointer itself doesn't need to be
+ * (since it's just local memory).
+ */
+static GlobalTransaction MyLockedGxact = NULL;
+
+static bool twophaseExitRegistered = false;
+
+static void RecordTransactionCommitPrepared(TransactionId xid,
+ int nchildren,
+ TransactionId *children,
+ int nrels,
+ RelFileNode *rels,
+ int nstats,
+ xl_xact_stats_item *stats,
+ int ninvalmsgs,
+ SharedInvalidationMessage *invalmsgs,
+ bool initfileinval,
+ const char *gid);
+static void RecordTransactionAbortPrepared(TransactionId xid,
+ int nchildren,
+ TransactionId *children,
+ int nrels,
+ RelFileNode *rels,
+ int nstats,
+ xl_xact_stats_item *stats,
+ const char *gid);
+static void ProcessRecords(char *bufptr, TransactionId xid,
+ const TwoPhaseCallback callbacks[]);
+static void RemoveGXact(GlobalTransaction gxact);
+
+static void XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len);
+static char *ProcessTwoPhaseBuffer(TransactionId xid,
+ XLogRecPtr prepare_start_lsn,
+ bool fromdisk, bool setParent, bool setNextXid);
+static void MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid,
+ const char *gid, TimestampTz prepared_at, Oid owner,
+ Oid databaseid);
+static void RemoveTwoPhaseFile(TransactionId xid, bool giveWarning);
+static void RecreateTwoPhaseFile(TransactionId xid, void *content, int len);
+
+/*
+ * Initialization of shared memory
+ */
+Size
+TwoPhaseShmemSize(void)
+{
+ Size size;
+
+ /* Need the fixed struct, the array of pointers, and the GTD structs */
+ size = offsetof(TwoPhaseStateData, prepXacts);
+ size = add_size(size, mul_size(max_prepared_xacts,
+ sizeof(GlobalTransaction)));
+ size = MAXALIGN(size);
+ size = add_size(size, mul_size(max_prepared_xacts,
+ sizeof(GlobalTransactionData)));
+
+ return size;
+}
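+
+/*
+ * As a sketch of the layout this arithmetic implies (counts are
+ * hypothetical): with max_prepared_xacts = 4 on a 64-bit build, we size
+ * the fixed TwoPhaseStateData header, then 4 pointer-sized prepXacts
+ * slots, then -- after MAXALIGN padding -- 4 GlobalTransactionData
+ * structs. TwoPhaseShmemInit() below repeats exactly this arithmetic to
+ * find the start of the struct array.
+ */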
+
+void
+TwoPhaseShmemInit(void)
+{
+ bool found;
+
+ TwoPhaseState = ShmemInitStruct("Prepared Transaction Table",
+ TwoPhaseShmemSize(),
+ &found);
+ if (!IsUnderPostmaster)
+ {
+ GlobalTransaction gxacts;
+ int i;
+
+ Assert(!found);
+ TwoPhaseState->freeGXacts = NULL;
+ TwoPhaseState->numPrepXacts = 0;
+
+ /*
+ * Initialize the linked list of free GlobalTransactionData structs
+ */
+ gxacts = (GlobalTransaction)
+ ((char *) TwoPhaseState +
+ MAXALIGN(offsetof(TwoPhaseStateData, prepXacts) +
+ sizeof(GlobalTransaction) * max_prepared_xacts));
+ for (i = 0; i < max_prepared_xacts; i++)
+ {
+ /* insert into linked list */
+ gxacts[i].next = TwoPhaseState->freeGXacts;
+ TwoPhaseState->freeGXacts = &gxacts[i];
+
+ /* associate it with a PGPROC assigned by InitProcGlobal */
+ gxacts[i].pgprocno = PreparedXactProcs[i].pgprocno;
+
+ /*
+ * Assign a unique ID for each dummy proc, so that the range of
+ * dummy backend IDs immediately follows the range of normal
+ * backend IDs. We don't dare to assign a real backend ID to dummy
+ * procs, because prepared transactions don't take part in cache
+ * invalidation like a real backend ID would imply, but having a
+ * unique ID for them is nevertheless handy. This arrangement
+ * allows you to allocate an array of size (MaxBackends +
+ * max_prepared_xacts + 1), and have a slot for every backend and
+ * prepared transaction. Currently multixact.c uses that
+ * technique.
+ */
+ gxacts[i].dummyBackendId = MaxBackends + 1 + i;
+ }
+ }
+ else
+ Assert(found);
+}
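+
+/*
+ * For example (settings are hypothetical): with MaxBackends = 100 and
+ * max_prepared_xacts = 5, the dummy backend IDs assigned above run from
+ * 101 to 105, so an array of size (MaxBackends + max_prepared_xacts + 1)
+ * indexed by backend ID has a distinct slot for every real backend and
+ * every prepared transaction.
+ */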
+
+/*
+ * Exit hook to unlock the global transaction entry we're working on.
+ */
+static void
+AtProcExit_Twophase(int code, Datum arg)
+{
+ /* same logic as abort */
+ AtAbort_Twophase();
+}
+
+/*
+ * Abort hook to unlock the global transaction entry we're working on.
+ */
+void
+AtAbort_Twophase(void)
+{
+ if (MyLockedGxact == NULL)
+ return;
+
+ /*
+ * What to do with the locked global transaction entry? If we were in the
+ * process of preparing the transaction, but haven't written the WAL
+ * record and state file yet, the transaction must not be considered as
+ * prepared. Likewise, if we are in the process of finishing an
+ * already-prepared transaction, and fail after having already written the
+ * 2nd phase commit or rollback record to the WAL, the transaction should
+ * not be considered as prepared anymore. In those cases, just remove the
+ * entry from shared memory.
+ *
+ * Otherwise, the entry must be left in place so that the transaction can
+ * be finished later, so just unlock it.
+ *
+ * If we abort during prepare, after having written the WAL record, we
+ * might not have transferred all locks and other state to the prepared
+ * transaction yet. Likewise, if we abort during commit or rollback,
+ * after having written the WAL record, we might not have released all the
+ * resources held by the transaction yet. In those cases, the in-memory
+ * state can be wrong, but it's too late to back out.
+ */
+ LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+ if (!MyLockedGxact->valid)
+ RemoveGXact(MyLockedGxact);
+ else
+ MyLockedGxact->locking_backend = InvalidBackendId;
+ LWLockRelease(TwoPhaseStateLock);
+
+ MyLockedGxact = NULL;
+}
+
+/*
+ * This is called after we have finished transferring state to the prepared
+ * PGPROC entry.
+ */
+void
+PostPrepare_Twophase(void)
+{
+ LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+ MyLockedGxact->locking_backend = InvalidBackendId;
+ LWLockRelease(TwoPhaseStateLock);
+
+ MyLockedGxact = NULL;
+}
+
+
+/*
+ * MarkAsPreparing
+ * Reserve the GID for the given transaction.
+ */
+GlobalTransaction
+MarkAsPreparing(TransactionId xid, const char *gid,
+ TimestampTz prepared_at, Oid owner, Oid databaseid)
+{
+ GlobalTransaction gxact;
+ int i;
+
+ if (strlen(gid) >= GIDSIZE)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("transaction identifier \"%s\" is too long",
+ gid)));
+
+ /* fail immediately if feature is disabled */
+ if (max_prepared_xacts == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("prepared transactions are disabled"),
+ errhint("Set max_prepared_transactions to a nonzero value.")));
+
+ /* on first call, register the exit hook */
+ if (!twophaseExitRegistered)
+ {
+ before_shmem_exit(AtProcExit_Twophase, 0);
+ twophaseExitRegistered = true;
+ }
+
+ LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+
+ /* Check for conflicting GID */
+ for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
+ {
+ gxact = TwoPhaseState->prepXacts[i];
+ if (strcmp(gxact->gid, gid) == 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_DUPLICATE_OBJECT),
+ errmsg("transaction identifier \"%s\" is already in use",
+ gid)));
+ }
+ }
+
+ /* Get a free gxact from the freelist */
+ if (TwoPhaseState->freeGXacts == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("maximum number of prepared transactions reached"),
+ errhint("Increase max_prepared_transactions (currently %d).",
+ max_prepared_xacts)));
+ gxact = TwoPhaseState->freeGXacts;
+ TwoPhaseState->freeGXacts = gxact->next;
+
+ MarkAsPreparingGuts(gxact, xid, gid, prepared_at, owner, databaseid);
+
+ gxact->ondisk = false;
+
+ /* And insert it into the active array */
+ Assert(TwoPhaseState->numPrepXacts < max_prepared_xacts);
+ TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts++] = gxact;
+
+ LWLockRelease(TwoPhaseStateLock);
+
+ return gxact;
+}
+
+/*
+ * MarkAsPreparingGuts
+ *
+ * Fill in the given gxact struct and its dummy PGPROC with the state of a
+ * transaction being prepared.
+ * NOTE: this is also used when reloading a gxact after a crash; so avoid
+ * assuming that we can use very much backend context.
+ *
+ * Note: This function should be called with appropriate locks held.
+ */
+static void
+MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid,
+ TimestampTz prepared_at, Oid owner, Oid databaseid)
+{
+ PGPROC *proc;
+ int i;
+
+ Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
+
+ Assert(gxact != NULL);
+ proc = &ProcGlobal->allProcs[gxact->pgprocno];
+
+ /* Initialize the PGPROC entry */
+ MemSet(proc, 0, sizeof(PGPROC));
+ proc->pgprocno = gxact->pgprocno;
+ SHMQueueElemInit(&(proc->links));
+ proc->waitStatus = PROC_WAIT_STATUS_OK;
+ if (LocalTransactionIdIsValid(MyProc->lxid))
+ {
+ /* clone VXID, for TwoPhaseGetXidByVirtualXID() to find */
+ proc->lxid = MyProc->lxid;
+ proc->backendId = MyBackendId;
+ }
+ else
+ {
+ Assert(AmStartupProcess() || !IsPostmasterEnvironment);
+ /* GetLockConflicts() uses this to specify a wait on the XID */
+ proc->lxid = xid;
+ proc->backendId = InvalidBackendId;
+ }
+ proc->xid = xid;
+ Assert(proc->xmin == InvalidTransactionId);
+ proc->delayChkptFlags = 0;
+ proc->statusFlags = 0;
+ proc->pid = 0;
+ proc->databaseId = databaseid;
+ proc->roleId = owner;
+ proc->tempNamespaceId = InvalidOid;
+ proc->isBackgroundWorker = false;
+ proc->lwWaiting = false;
+ proc->lwWaitMode = 0;
+ proc->waitLock = NULL;
+ proc->waitProcLock = NULL;
+ pg_atomic_init_u64(&proc->waitStart, 0);
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ SHMQueueInit(&(proc->myProcLocks[i]));
+ /* subxid data must be filled later by GXactLoadSubxactData */
+ proc->subxidStatus.overflowed = false;
+ proc->subxidStatus.count = 0;
+
+ gxact->prepared_at = prepared_at;
+ gxact->xid = xid;
+ gxact->owner = owner;
+ gxact->locking_backend = MyBackendId;
+ gxact->valid = false;
+ gxact->inredo = false;
+ strcpy(gxact->gid, gid);
+
+ /*
+ * Remember that we have this GlobalTransaction entry locked for us. If we
+ * abort after this, we must release it.
+ */
+ MyLockedGxact = gxact;
+}
+
+/*
+ * GXactLoadSubxactData
+ *
+ * If the transaction being persisted had any subtransactions, this must
+ * be called before MarkAsPrepared() to load information into the dummy
+ * PGPROC.
+ */
+static void
+GXactLoadSubxactData(GlobalTransaction gxact, int nsubxacts,
+ TransactionId *children)
+{
+ PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno];
+
+ /* We need no extra lock since the GXACT isn't valid yet */
+ if (nsubxacts > PGPROC_MAX_CACHED_SUBXIDS)
+ {
+ proc->subxidStatus.overflowed = true;
+ nsubxacts = PGPROC_MAX_CACHED_SUBXIDS;
+ }
+ if (nsubxacts > 0)
+ {
+ memcpy(proc->subxids.xids, children,
+ nsubxacts * sizeof(TransactionId));
+ proc->subxidStatus.count = nsubxacts;
+ }
+}
+
+/*
+ * MarkAsPrepared
+ * Mark the GXACT as fully valid, and enter it into the global ProcArray.
+ *
+ * lock_held indicates whether caller already holds TwoPhaseStateLock.
+ */
+static void
+MarkAsPrepared(GlobalTransaction gxact, bool lock_held)
+{
+ /* Lock here may be overkill, but I'm not convinced of that ... */
+ if (!lock_held)
+ LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+ Assert(!gxact->valid);
+ gxact->valid = true;
+ if (!lock_held)
+ LWLockRelease(TwoPhaseStateLock);
+
+ /*
+ * Put it into the global ProcArray so TransactionIdIsInProgress considers
+ * the XID as still running.
+ */
+ ProcArrayAdd(&ProcGlobal->allProcs[gxact->pgprocno]);
+}
+
+/*
+ * LockGXact
+ * Locate the prepared transaction identified by GID and mark it busy
+ * for COMMIT PREPARED or ROLLBACK PREPARED.
+ */
+static GlobalTransaction
+LockGXact(const char *gid, Oid user)
+{
+ int i;
+
+ /* on first call, register the exit hook */
+ if (!twophaseExitRegistered)
+ {
+ before_shmem_exit(AtProcExit_Twophase, 0);
+ twophaseExitRegistered = true;
+ }
+
+ LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+
+ for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
+ {
+ GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
+ PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno];
+
+ /* Ignore not-yet-valid GIDs */
+ if (!gxact->valid)
+ continue;
+ if (strcmp(gxact->gid, gid) != 0)
+ continue;
+
+ /* Found it, but has someone else got it locked? */
+ if (gxact->locking_backend != InvalidBackendId)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("prepared transaction with identifier \"%s\" is busy",
+ gid)));
+
+ if (user != gxact->owner && !superuser_arg(user))
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("permission denied to finish prepared transaction"),
+ errhint("Must be superuser or the user that prepared the transaction.")));
+
+ /*
+ * Note: it probably would be possible to allow committing from
+ * another database; but at the moment NOTIFY is known not to work and
+ * there may be some other issues as well. Hence disallow until
+ * someone gets motivated to make it work.
+ */
+ if (MyDatabaseId != proc->databaseId)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("prepared transaction belongs to another database"),
+ errhint("Connect to the database where the transaction was prepared to finish it.")));
+
+ /* OK for me to lock it */
+ gxact->locking_backend = MyBackendId;
+ MyLockedGxact = gxact;
+
+ LWLockRelease(TwoPhaseStateLock);
+
+ return gxact;
+ }
+
+ LWLockRelease(TwoPhaseStateLock);
+
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("prepared transaction with identifier \"%s\" does not exist",
+ gid)));
+
+ /* NOTREACHED */
+ return NULL;
+}
+
+/*
+ * RemoveGXact
+ * Remove the prepared transaction from the shared memory array.
+ *
+ * NB: caller should have already removed it from ProcArray
+ */
+static void
+RemoveGXact(GlobalTransaction gxact)
+{
+ int i;
+
+ Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
+
+ for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
+ {
+ if (gxact == TwoPhaseState->prepXacts[i])
+ {
+ /* remove from the active array */
+ TwoPhaseState->numPrepXacts--;
+ TwoPhaseState->prepXacts[i] = TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts];
+
+ /* and put it back in the freelist */
+ gxact->next = TwoPhaseState->freeGXacts;
+ TwoPhaseState->freeGXacts = gxact;
+
+ return;
+ }
+ }
+
+ elog(ERROR, "failed to find %p in GlobalTransaction array", gxact);
+}
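+
+/*
+ * (The swap-with-last removal above keeps prepXacts densely packed, so
+ * every scan can simply run from 0 to numPrepXacts - 1; the order of the
+ * entries carries no meaning.)
+ */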
+
+/*
+ * Returns an array of all prepared transactions for the user-level
+ * function pg_prepared_xact.
+ *
+ * The returned array and all its elements are copies of internal data
+ * structures, to minimize the time we need to hold the TwoPhaseStateLock.
+ *
+ * WARNING -- we return even those transactions that are not fully prepared
+ * yet. The caller should filter them out if they are not wanted.
+ *
+ * The returned array is palloc'd.
+ */
+static int
+GetPreparedTransactionList(GlobalTransaction *gxacts)
+{
+ GlobalTransaction array;
+ int num;
+ int i;
+
+ LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
+
+ if (TwoPhaseState->numPrepXacts == 0)
+ {
+ LWLockRelease(TwoPhaseStateLock);
+
+ *gxacts = NULL;
+ return 0;
+ }
+
+ num = TwoPhaseState->numPrepXacts;
+ array = (GlobalTransaction) palloc(sizeof(GlobalTransactionData) * num);
+ *gxacts = array;
+ for (i = 0; i < num; i++)
+ memcpy(array + i, TwoPhaseState->prepXacts[i],
+ sizeof(GlobalTransactionData));
+
+ LWLockRelease(TwoPhaseStateLock);
+
+ return num;
+}
+
+
+/* Working status for pg_prepared_xact */
+typedef struct
+{
+ GlobalTransaction array;
+ int ngxacts;
+ int currIdx;
+} Working_State;
+
+/*
+ * pg_prepared_xact
+ * Produce a view with one row per prepared transaction.
+ *
+ * This function is here so we don't have to export the
+ * GlobalTransactionData struct definition.
+ */
+Datum
+pg_prepared_xact(PG_FUNCTION_ARGS)
+{
+ FuncCallContext *funcctx;
+ Working_State *status;
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ TupleDesc tupdesc;
+ MemoryContext oldcontext;
+
+ /* create a function context for cross-call persistence */
+ funcctx = SRF_FIRSTCALL_INIT();
+
+ /*
+ * Switch to memory context appropriate for multiple function calls
+ */
+ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+ /* build tupdesc for result tuples */
+ /* this had better match pg_prepared_xacts view in system_views.sql */
+ tupdesc = CreateTemplateTupleDesc(5);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 1, "transaction",
+ XIDOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 2, "gid",
+ TEXTOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 3, "prepared",
+ TIMESTAMPTZOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 4, "ownerid",
+ OIDOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 5, "dbid",
+ OIDOID, -1, 0);
+
+ funcctx->tuple_desc = BlessTupleDesc(tupdesc);
+
+ /*
+ * Collect all the 2PC status information that we will format and send
+ * out as a result set.
+ */
+ status = (Working_State *) palloc(sizeof(Working_State));
+ funcctx->user_fctx = (void *) status;
+
+ status->ngxacts = GetPreparedTransactionList(&status->array);
+ status->currIdx = 0;
+
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ funcctx = SRF_PERCALL_SETUP();
+ status = (Working_State *) funcctx->user_fctx;
+
+ while (status->array != NULL && status->currIdx < status->ngxacts)
+ {
+ GlobalTransaction gxact = &status->array[status->currIdx++];
+ PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno];
+ Datum values[5];
+ bool nulls[5];
+ HeapTuple tuple;
+ Datum result;
+
+ if (!gxact->valid)
+ continue;
+
+ /*
+ * Form tuple with appropriate data.
+ */
+ MemSet(values, 0, sizeof(values));
+ MemSet(nulls, 0, sizeof(nulls));
+
+ values[0] = TransactionIdGetDatum(proc->xid);
+ values[1] = CStringGetTextDatum(gxact->gid);
+ values[2] = TimestampTzGetDatum(gxact->prepared_at);
+ values[3] = ObjectIdGetDatum(gxact->owner);
+ values[4] = ObjectIdGetDatum(proc->databaseId);
+
+ tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
+ result = HeapTupleGetDatum(tuple);
+ SRF_RETURN_NEXT(funcctx, result);
+ }
+
+ SRF_RETURN_DONE(funcctx);
+}
+
+/*
+ * TwoPhaseGetGXact
+ * Get the GlobalTransaction struct for a prepared transaction
+ * specified by XID
+ *
+ * If lock_held is set to true, TwoPhaseStateLock will not be taken, so the
+ * caller had better hold it.
+ */
+static GlobalTransaction
+TwoPhaseGetGXact(TransactionId xid, bool lock_held)
+{
+ GlobalTransaction result = NULL;
+ int i;
+
+ static TransactionId cached_xid = InvalidTransactionId;
+ static GlobalTransaction cached_gxact = NULL;
+
+ Assert(!lock_held || LWLockHeldByMe(TwoPhaseStateLock));
+
+ /*
+ * During recovery, COMMIT PREPARED, or ROLLBACK PREPARED, we'll be called
+ * repeatedly for the same XID. We can save work with a simple cache.
+ */
+ if (xid == cached_xid)
+ return cached_gxact;
+
+ if (!lock_held)
+ LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
+
+ for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
+ {
+ GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
+
+ if (gxact->xid == xid)
+ {
+ result = gxact;
+ break;
+ }
+ }
+
+ if (!lock_held)
+ LWLockRelease(TwoPhaseStateLock);
+
+ if (result == NULL) /* should not happen */
+ elog(ERROR, "failed to find GlobalTransaction for xid %u", xid);
+
+ cached_xid = xid;
+ cached_gxact = result;
+
+ return result;
+}
+
+/*
+ * TwoPhaseGetXidByVirtualXID
+ * Lookup VXID among xacts prepared since last startup.
+ *
+ * (This won't find recovered xacts.) If more than one matches, return any
+ * and set "have_more" to true. To witness multiple matches, a single
+ * BackendId must consume 2^32 LXIDs, with no intervening database restart.
+ */
+TransactionId
+TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid,
+ bool *have_more)
+{
+ int i;
+ TransactionId result = InvalidTransactionId;
+
+ Assert(VirtualTransactionIdIsValid(vxid));
+ LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
+
+ for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
+ {
+ GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
+ PGPROC *proc;
+ VirtualTransactionId proc_vxid;
+
+ if (!gxact->valid)
+ continue;
+ proc = &ProcGlobal->allProcs[gxact->pgprocno];
+ GET_VXID_FROM_PGPROC(proc_vxid, *proc);
+ if (VirtualTransactionIdEquals(vxid, proc_vxid))
+ {
+ /* Startup process sets proc->backendId to InvalidBackendId. */
+ Assert(!gxact->inredo);
+
+ if (result != InvalidTransactionId)
+ {
+ *have_more = true;
+ break;
+ }
+ result = gxact->xid;
+ }
+ }
+
+ LWLockRelease(TwoPhaseStateLock);
+
+ return result;
+}
+
+/*
+ * TwoPhaseGetDummyBackendId
+ * Get the dummy backend ID for the prepared transaction specified by XID
+ *
+ * Dummy backend IDs are similar to real backend IDs of real backends.
+ * They start at MaxBackends + 1, and are unique across all currently active
+ * real backends and prepared transactions. If lock_held is set to true,
+ * TwoPhaseStateLock will not be taken, so the caller had better hold it.
+ */
+BackendId
+TwoPhaseGetDummyBackendId(TransactionId xid, bool lock_held)
+{
+ GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held);
+
+ return gxact->dummyBackendId;
+}
+
+/*
+ * TwoPhaseGetDummyProc
+ * Get the PGPROC that represents a prepared transaction specified by XID
+ *
+ * If lock_held is set to true, TwoPhaseStateLock will not be taken, so the
+ * caller had better hold it.
+ */
+PGPROC *
+TwoPhaseGetDummyProc(TransactionId xid, bool lock_held)
+{
+ GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held);
+
+ return &ProcGlobal->allProcs[gxact->pgprocno];
+}
+
+/************************************************************************/
+/* State file support */
+/************************************************************************/
+
+#define TwoPhaseFilePath(path, xid) \
+ snprintf(path, MAXPGPATH, TWOPHASE_DIR "/%08X", xid)
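+
+/*
+ * For instance, a hypothetical XID of 1234567 (0x12D687) maps to the
+ * state file path "pg_twophase/0012D687": the file name is just the XID
+ * as zero-padded uppercase hex, which lets recovery translate directory
+ * entries back into XIDs when it scans pg_twophase.
+ */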
+
+/*
+ * 2PC state file format:
+ *
+ * 1. TwoPhaseFileHeader
+ * 2. TransactionId[] (subtransactions)
+ * 3. RelFileNode[] (files to be deleted at commit)
+ * 4. RelFileNode[] (files to be deleted at abort)
+ * 5. SharedInvalidationMessage[] (inval messages to be sent at commit)
+ * 6. TwoPhaseRecordOnDisk
+ * 7. ...
+ * 8. TwoPhaseRecordOnDisk (end sentinel, rmid == TWOPHASE_RM_END_ID)
+ * 9. checksum (CRC-32C)
+ *
+ * Each segment except the final checksum is MAXALIGN'd.
+ */
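+
+/*
+ * A reader locates each section by accumulating MAXALIGN'd offsets. As a
+ * sketch with hypothetical counts (gidlen = 11, nsubxacts = 2, 8-byte
+ * MAXALIGN): the subtransaction array starts at
+ * MAXALIGN(sizeof(TwoPhaseFileHeader)) + MAXALIGN(11), and the
+ * commit-rels array follows another MAXALIGN(2 * sizeof(TransactionId))
+ * bytes after that. FinishPreparedTransaction() walks the buffer in
+ * exactly this way.
+ */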
+
+/*
+ * Header for a 2PC state file
+ */
+#define TWOPHASE_MAGIC 0x57F94534 /* format identifier */
+
+typedef xl_xact_prepare TwoPhaseFileHeader;
+
+/*
+ * Header for each record in a state file
+ *
+ * NOTE: len counts only the rmgr data, not the TwoPhaseRecordOnDisk header.
+ * The rmgr data will be stored starting on a MAXALIGN boundary.
+ */
+typedef struct TwoPhaseRecordOnDisk
+{
+ uint32 len; /* length of rmgr data */
+ TwoPhaseRmgrId rmid; /* resource manager for this record */
+ uint16 info; /* flag bits for use by rmgr */
+} TwoPhaseRecordOnDisk;
+
+/*
+ * During prepare, the state file is assembled in memory before writing it
+ * to WAL and the actual state file. We use a chain of StateFileChunk blocks
+ * for that.
+ */
+typedef struct StateFileChunk
+{
+ char *data;
+ uint32 len;
+ struct StateFileChunk *next;
+} StateFileChunk;
+
+static struct xllist
+{
+ StateFileChunk *head; /* first data block in the chain */
+ StateFileChunk *tail; /* last block in chain */
+ uint32 num_chunks;
+ uint32 bytes_free; /* free bytes left in tail block */
+ uint32 total_len; /* total data bytes in chain */
+} records;
+
+
+/*
+ * Append a block of data to records data structure.
+ *
+ * NB: each block is padded to a MAXALIGN multiple. This must be
+ * accounted for when the file is later read!
+ *
+ * The data is copied, so the caller is free to modify it afterwards.
+ */
+static void
+save_state_data(const void *data, uint32 len)
+{
+ uint32 padlen = MAXALIGN(len);
+
+ if (padlen > records.bytes_free)
+ {
+ records.tail->next = palloc0(sizeof(StateFileChunk));
+ records.tail = records.tail->next;
+ records.tail->len = 0;
+ records.tail->next = NULL;
+ records.num_chunks++;
+
+ records.bytes_free = Max(padlen, 512);
+ records.tail->data = palloc(records.bytes_free);
+ }
+
+ memcpy(((char *) records.tail->data) + records.tail->len, data, len);
+ records.tail->len += padlen;
+ records.bytes_free -= padlen;
+ records.total_len += padlen;
+}
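+
+/*
+ * Padding example (length is hypothetical): saving a 13-byte record on a
+ * build with 8-byte MAXALIGN advances the tail by MAXALIGN(13) = 16
+ * bytes, so the next record again starts on a MAXALIGN boundary. The
+ * three trailing pad bytes are never read back; readers skip them by
+ * advancing in MAXALIGN'd steps.
+ */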
+
+/*
+ * Start preparing a state file.
+ *
+ * Initializes data structure and inserts the 2PC file header record.
+ */
+void
+StartPrepare(GlobalTransaction gxact)
+{
+ PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno];
+ TransactionId xid = gxact->xid;
+ TwoPhaseFileHeader hdr;
+ TransactionId *children;
+ RelFileNode *commitrels;
+ RelFileNode *abortrels;
+ xl_xact_stats_item *abortstats = NULL;
+ xl_xact_stats_item *commitstats = NULL;
+ SharedInvalidationMessage *invalmsgs;
+
+ /* Initialize linked list */
+ records.head = palloc0(sizeof(StateFileChunk));
+ records.head->len = 0;
+ records.head->next = NULL;
+
+ records.bytes_free = Max(sizeof(TwoPhaseFileHeader), 512);
+ records.head->data = palloc(records.bytes_free);
+
+ records.tail = records.head;
+ records.num_chunks = 1;
+
+ records.total_len = 0;
+
+ /* Create header */
+ hdr.magic = TWOPHASE_MAGIC;
+ hdr.total_len = 0; /* EndPrepare will fill this in */
+ hdr.xid = xid;
+ hdr.database = proc->databaseId;
+ hdr.prepared_at = gxact->prepared_at;
+ hdr.owner = gxact->owner;
+ hdr.nsubxacts = xactGetCommittedChildren(&children);
+ hdr.ncommitrels = smgrGetPendingDeletes(true, &commitrels);
+ hdr.nabortrels = smgrGetPendingDeletes(false, &abortrels);
+ hdr.ncommitstats =
+ pgstat_get_transactional_drops(true, &commitstats);
+ hdr.nabortstats =
+ pgstat_get_transactional_drops(false, &abortstats);
+ hdr.ninvalmsgs = xactGetCommittedInvalidationMessages(&invalmsgs,
+ &hdr.initfileinval);
+ hdr.gidlen = strlen(gxact->gid) + 1; /* Include '\0' */
+ /* EndPrepare will fill the origin data, if necessary */
+ hdr.origin_lsn = InvalidXLogRecPtr;
+ hdr.origin_timestamp = 0;
+
+ save_state_data(&hdr, sizeof(TwoPhaseFileHeader));
+ save_state_data(gxact->gid, hdr.gidlen);
+
+ /*
+ * Add the additional info about subxacts, deletable files and cache
+ * invalidation messages.
+ */
+ if (hdr.nsubxacts > 0)
+ {
+ save_state_data(children, hdr.nsubxacts * sizeof(TransactionId));
+ /* While we have the child-xact data, stuff it in the gxact too */
+ GXactLoadSubxactData(gxact, hdr.nsubxacts, children);
+ }
+ if (hdr.ncommitrels > 0)
+ {
+ save_state_data(commitrels, hdr.ncommitrels * sizeof(RelFileNode));
+ pfree(commitrels);
+ }
+ if (hdr.nabortrels > 0)
+ {
+ save_state_data(abortrels, hdr.nabortrels * sizeof(RelFileNode));
+ pfree(abortrels);
+ }
+ if (hdr.ncommitstats > 0)
+ {
+ save_state_data(commitstats,
+ hdr.ncommitstats * sizeof(xl_xact_stats_item));
+ pfree(commitstats);
+ }
+ if (hdr.nabortstats > 0)
+ {
+ save_state_data(abortstats,
+ hdr.nabortstats * sizeof(xl_xact_stats_item));
+ pfree(abortstats);
+ }
+ if (hdr.ninvalmsgs > 0)
+ {
+ save_state_data(invalmsgs,
+ hdr.ninvalmsgs * sizeof(SharedInvalidationMessage));
+ pfree(invalmsgs);
+ }
+}
+
+/*
+ * Finish preparing state data and writing it to WAL.
+ */
+void
+EndPrepare(GlobalTransaction gxact)
+{
+ TwoPhaseFileHeader *hdr;
+ StateFileChunk *record;
+ bool replorigin;
+
+ /* Add the end sentinel to the list of 2PC records */
+ RegisterTwoPhaseRecord(TWOPHASE_RM_END_ID, 0,
+ NULL, 0);
+
+ /* Go back and fill in total_len in the file header record */
+ hdr = (TwoPhaseFileHeader *) records.head->data;
+ Assert(hdr->magic == TWOPHASE_MAGIC);
+ hdr->total_len = records.total_len + sizeof(pg_crc32c);
+
+ replorigin = (replorigin_session_origin != InvalidRepOriginId &&
+ replorigin_session_origin != DoNotReplicateId);
+
+ if (replorigin)
+ {
+ hdr->origin_lsn = replorigin_session_origin_lsn;
+ hdr->origin_timestamp = replorigin_session_origin_timestamp;
+ }
+
+ /*
+ * If the data size exceeds MaxAllocSize, we won't be able to read it in
+ * ReadTwoPhaseFile. Check for that now, rather than fail in the case
+ * where we write data to file and then re-read at commit time.
+ */
+ if (hdr->total_len > MaxAllocSize)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("two-phase state file maximum length exceeded")));
+
+ /*
+ * Now writing 2PC state data to WAL. We let the WAL's CRC protection
+ * cover us, so no need to calculate a separate CRC.
+ *
+ * We have to set DELAY_CHKPT_START here, too; otherwise a checkpoint
+ * starting immediately after the WAL record is inserted could complete
+ * without fsync'ing our state file. (This is essentially the same kind
+ * of race condition as the COMMIT-to-clog-write case that
+ * RecordTransactionCommit uses DELAY_CHKPT_START for; see notes there.)
+ *
+ * We save the PREPARE record's location in the gxact for later use by
+ * CheckPointTwoPhase.
+ */
+ XLogEnsureRecordSpace(0, records.num_chunks);
+
+ START_CRIT_SECTION();
+
+ Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
+ MyProc->delayChkptFlags |= DELAY_CHKPT_START;
+
+ XLogBeginInsert();
+ for (record = records.head; record != NULL; record = record->next)
+ XLogRegisterData(record->data, record->len);
+
+ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
+
+ gxact->prepare_end_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE);
+
+ if (replorigin)
+ {
+ /* Move LSNs forward for this replication origin */
+ replorigin_session_advance(replorigin_session_origin_lsn,
+ gxact->prepare_end_lsn);
+ }
+
+ XLogFlush(gxact->prepare_end_lsn);
+
+ /* If we crash now, we have prepared: WAL replay will fix things */
+
+ /* Store record's start location to read that later on Commit */
+ gxact->prepare_start_lsn = ProcLastRecPtr;
+
+ /*
+ * Mark the prepared transaction as valid. As soon as xact.c marks MyProc
+ * as not running our XID (which it will do immediately after this
+ * function returns), others can commit/rollback the xact.
+ *
+ * NB: a side effect of this is to make a dummy ProcArray entry for the
+ * prepared XID. This must happen before we clear the XID from MyProc /
+ * ProcGlobal->xids[], else there is a window where the XID is not running
+ * according to TransactionIdIsInProgress, and onlookers would be entitled
+ * to assume the xact crashed. Instead we have a window where the same
+ * XID appears twice in ProcArray, which is OK.
+ */
+ MarkAsPrepared(gxact, false);
+
+ /*
+ * Now we can mark ourselves as out of the commit critical section: a
+ * checkpoint starting after this will certainly see the gxact as a
+ * candidate for fsyncing.
+ */
+ MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+
+ /*
+ * Remember that we have this GlobalTransaction entry locked for us. If
+ * we crash after this point, it's too late to abort, but we must unlock
+ * it so that the prepared transaction can be committed or rolled back.
+ */
+ MyLockedGxact = gxact;
+
+ END_CRIT_SECTION();
+
+ /*
+ * Wait for synchronous replication, if required.
+ *
+ * Note that at this stage we have marked the prepare, but still show as
+ * running in the procarray (twice!) and continue to hold locks.
+ */
+ SyncRepWaitForLSN(gxact->prepare_end_lsn, false);
+
+ records.tail = records.head = NULL;
+ records.num_chunks = 0;
+}
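+
+/*
+ * To see why DELAY_CHKPT_START matters above, consider a hypothetical
+ * interleaving without it: we insert the PREPARE WAL record; a checkpoint
+ * then starts and completes with its redo pointer past that record,
+ * fsyncing no state file because the gxact is not yet valid; the server
+ * crashes before MarkAsPrepared() runs. Replay would begin after the
+ * PREPARE record, and the prepared transaction would be lost. Holding
+ * DELAY_CHKPT_START across that window makes any concurrent checkpoint
+ * wait until the gxact is visible as an fsync candidate.
+ */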
+
+/*
+ * Register a 2PC record to be written to state file.
+ */
+void
+RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info,
+ const void *data, uint32 len)
+{
+ TwoPhaseRecordOnDisk record;
+
+ record.rmid = rmid;
+ record.info = info;
+ record.len = len;
+ save_state_data(&record, sizeof(TwoPhaseRecordOnDisk));
+ if (len > 0)
+ save_state_data(data, len);
+}
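+
+/*
+ * A typical caller is a resource manager's at-prepare hook. As a minimal
+ * sketch (mirroring what lock.c does for each held lock; the local
+ * variable is hypothetical):
+ *
+ *     TwoPhaseLockRecord record = { locktag, lockmode };
+ *
+ *     RegisterTwoPhaseRecord(TWOPHASE_RM_LOCK_ID, 0,
+ *                            &record, sizeof(TwoPhaseLockRecord));
+ *
+ * The payload is copied into the state data and replayed at COMMIT
+ * PREPARED / ROLLBACK PREPARED time through the callback tables that
+ * ProcessRecords() consults below.
+ */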
+
+
+/*
+ * Read and validate the state file for xid.
+ *
+ * If it looks OK (has a valid magic number and CRC), return the palloc'd
+ * contents of the file; issue an error if the data is corrupted. If
+ * missing_ok is true, a missing file is not an error: return NULL instead.
+ * That case can legitimately arise during recovery.
+ */
+static char *
+ReadTwoPhaseFile(TransactionId xid, bool missing_ok)
+{
+ char path[MAXPGPATH];
+ char *buf;
+ TwoPhaseFileHeader *hdr;
+ int fd;
+ struct stat stat;
+ uint32 crc_offset;
+ pg_crc32c calc_crc,
+ file_crc;
+ int r;
+
+ TwoPhaseFilePath(path, xid);
+
+ fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
+ if (fd < 0)
+ {
+ if (missing_ok && errno == ENOENT)
+ return NULL;
+
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+ }
+
+ /*
+ * Check file length. We can determine a lower bound pretty easily. We
+ * set an upper bound to avoid palloc() failure on a corrupt file, though
+ * we can't guarantee that we won't get an out of memory error anyway,
+ * even on a valid file.
+ */
+ if (fstat(fd, &stat))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m", path)));
+
+ if (stat.st_size < (MAXALIGN(sizeof(TwoPhaseFileHeader)) +
+ MAXALIGN(sizeof(TwoPhaseRecordOnDisk)) +
+ sizeof(pg_crc32c)) ||
+ stat.st_size > MaxAllocSize)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg_plural("incorrect size of file \"%s\": %lld byte",
+ "incorrect size of file \"%s\": %lld bytes",
+ (long long int) stat.st_size, path,
+ (long long int) stat.st_size)));
+
+ crc_offset = stat.st_size - sizeof(pg_crc32c);
+ if (crc_offset != MAXALIGN(crc_offset))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("incorrect alignment of CRC offset for file \"%s\"",
+ path)));
+
+ /*
+ * OK, slurp in the file.
+ */
+ buf = (char *) palloc(stat.st_size);
+
+ pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_READ);
+ r = read(fd, buf, stat.st_size);
+ if (r != stat.st_size)
+ {
+ if (r < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m", path)));
+ else
+ ereport(ERROR,
+ (errmsg("could not read file \"%s\": read %d of %lld",
+ path, r, (long long int) stat.st_size)));
+ }
+
+ pgstat_report_wait_end();
+
+ if (CloseTransientFile(fd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", path)));
+
+ hdr = (TwoPhaseFileHeader *) buf;
+ if (hdr->magic != TWOPHASE_MAGIC)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("invalid magic number stored in file \"%s\"",
+ path)));
+
+ if (hdr->total_len != stat.st_size)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("invalid size stored in file \"%s\"",
+ path)));
+
+ INIT_CRC32C(calc_crc);
+ COMP_CRC32C(calc_crc, buf, crc_offset);
+ FIN_CRC32C(calc_crc);
+
+ file_crc = *((pg_crc32c *) (buf + crc_offset));
+
+ if (!EQ_CRC32C(calc_crc, file_crc))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("calculated CRC checksum does not match value stored in file \"%s\"",
+ path)));
+
+ return buf;
+}
+
+
+/*
+ * Reads 2PC data from WAL. At checkpoint time this data is moved to
+ * twophase files, after which ReadTwoPhaseFile should be used instead.
+ *
+ * Note that this function can access WAL during normal operation, much as
+ * a walsender or logical decoding would.
+ */
+static void
+XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len)
+{
+ XLogRecord *record;
+ XLogReaderState *xlogreader;
+ char *errormsg;
+
+ xlogreader = XLogReaderAllocate(wal_segment_size, NULL,
+ XL_ROUTINE(.page_read = &read_local_xlog_page,
+ .segment_open = &wal_segment_open,
+ .segment_close = &wal_segment_close),
+ NULL);
+ if (!xlogreader)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed while allocating a WAL reading processor.")));
+
+ XLogBeginRead(xlogreader, lsn);
+ record = XLogReadRecord(xlogreader, &errormsg);
+
+ if (record == NULL)
+ {
+ if (errormsg)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read two-phase state from WAL at %X/%X: %s",
+ LSN_FORMAT_ARGS(lsn), errormsg)));
+ else
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read two-phase state from WAL at %X/%X",
+ LSN_FORMAT_ARGS(lsn))));
+ }
+
+ if (XLogRecGetRmid(xlogreader) != RM_XACT_ID ||
+ (XLogRecGetInfo(xlogreader) & XLOG_XACT_OPMASK) != XLOG_XACT_PREPARE)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("expected two-phase state data is not present in WAL at %X/%X",
+ LSN_FORMAT_ARGS(lsn))));
+
+ if (len != NULL)
+ *len = XLogRecGetDataLen(xlogreader);
+
+ *buf = palloc(sizeof(char) * XLogRecGetDataLen(xlogreader));
+ memcpy(*buf, XLogRecGetData(xlogreader), sizeof(char) * XLogRecGetDataLen(xlogreader));
+
+ XLogReaderFree(xlogreader);
+}
+
+
+/*
+ * Confirms an xid is prepared, during recovery
+ */
+bool
+StandbyTransactionIdIsPrepared(TransactionId xid)
+{
+ char *buf;
+ TwoPhaseFileHeader *hdr;
+ bool result;
+
+ Assert(TransactionIdIsValid(xid));
+
+ if (max_prepared_xacts <= 0)
+ return false; /* nothing to do */
+
+ /* Read and validate file */
+ buf = ReadTwoPhaseFile(xid, true);
+ if (buf == NULL)
+ return false;
+
+ /* Check header also */
+ hdr = (TwoPhaseFileHeader *) buf;
+ result = TransactionIdEquals(hdr->xid, xid);
+ pfree(buf);
+
+ return result;
+}
+
+/*
+ * FinishPreparedTransaction: execute COMMIT PREPARED or ROLLBACK PREPARED
+ */
+void
+FinishPreparedTransaction(const char *gid, bool isCommit)
+{
+ GlobalTransaction gxact;
+ PGPROC *proc;
+ TransactionId xid;
+ char *buf;
+ char *bufptr;
+ TwoPhaseFileHeader *hdr;
+ TransactionId latestXid;
+ TransactionId *children;
+ RelFileNode *commitrels;
+ RelFileNode *abortrels;
+ RelFileNode *delrels;
+ int ndelrels;
+ xl_xact_stats_item *commitstats;
+ xl_xact_stats_item *abortstats;
+ SharedInvalidationMessage *invalmsgs;
+
+ /*
+ * Validate the GID, and lock the GXACT to ensure that two backends do not
+ * try to commit the same GID at once.
+ */
+ gxact = LockGXact(gid, GetUserId());
+ proc = &ProcGlobal->allProcs[gxact->pgprocno];
+ xid = gxact->xid;
+
+ /*
+ * Read and validate 2PC state data. The state data is typically still in
+ * WAL if the prepare LSN is after the last checkpoint record; otherwise a
+ * checkpoint has already moved it to a state file on disk.
+ */
+ if (gxact->ondisk)
+ buf = ReadTwoPhaseFile(xid, false);
+ else
+ XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL);
+
+
+ /*
+ * Disassemble the header area
+ */
+ hdr = (TwoPhaseFileHeader *) buf;
+ Assert(TransactionIdEquals(hdr->xid, xid));
+ bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
+ bufptr += MAXALIGN(hdr->gidlen);
+ children = (TransactionId *) bufptr;
+ bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId));
+ commitrels = (RelFileNode *) bufptr;
+ bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode));
+ abortrels = (RelFileNode *) bufptr;
+ bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode));
+ commitstats = (xl_xact_stats_item *) bufptr;
+ bufptr += MAXALIGN(hdr->ncommitstats * sizeof(xl_xact_stats_item));
+ abortstats = (xl_xact_stats_item *) bufptr;
+ bufptr += MAXALIGN(hdr->nabortstats * sizeof(xl_xact_stats_item));
+ invalmsgs = (SharedInvalidationMessage *) bufptr;
+ bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage));
+
+ /* compute latestXid among all children */
+ latestXid = TransactionIdLatest(xid, hdr->nsubxacts, children);
+
+ /* Prevent cancel/die interrupt while cleaning up */
+ HOLD_INTERRUPTS();
+
+ /*
+ * The order of operations here is critical: make the XLOG entry for
+ * commit or abort, then mark the transaction committed or aborted in
+ * pg_xact, then remove its PGPROC from the global ProcArray (which means
+ * TransactionIdIsInProgress will stop saying the prepared xact is in
+ * progress), then run the post-commit or post-abort callbacks. The
+ * callbacks will release the locks the transaction held.
+ */
+ if (isCommit)
+ RecordTransactionCommitPrepared(xid,
+ hdr->nsubxacts, children,
+ hdr->ncommitrels, commitrels,
+ hdr->ncommitstats,
+ commitstats,
+ hdr->ninvalmsgs, invalmsgs,
+ hdr->initfileinval, gid);
+ else
+ RecordTransactionAbortPrepared(xid,
+ hdr->nsubxacts, children,
+ hdr->nabortrels, abortrels,
+ hdr->nabortstats,
+ abortstats,
+ gid);
+
+ ProcArrayRemove(proc, latestXid);
+
+ /*
+ * In case we fail while running the callbacks, mark the gxact invalid so
+ * no one else will try to commit/rollback, and so it will be recycled if
+ * we fail after this point. It is still locked by our backend so it
+ * won't go away yet.
+ *
+ * (We assume it's safe to do this without taking TwoPhaseStateLock.)
+ */
+ gxact->valid = false;
+
+ /*
+ * We have to remove any files that were supposed to be dropped. For
+ * consistency with the regular xact.c code paths, must do this before
+ * releasing locks, so do it before running the callbacks.
+ *
+ * NB: this code knows that we couldn't be dropping any temp rels ...
+ */
+ if (isCommit)
+ {
+ delrels = commitrels;
+ ndelrels = hdr->ncommitrels;
+ }
+ else
+ {
+ delrels = abortrels;
+ ndelrels = hdr->nabortrels;
+ }
+
+ /* Make sure files supposed to be dropped are dropped */
+ DropRelationFiles(delrels, ndelrels, false);
+
+ if (isCommit)
+ pgstat_execute_transactional_drops(hdr->ncommitstats, commitstats, false);
+ else
+ pgstat_execute_transactional_drops(hdr->nabortstats, abortstats, false);
+
+ /*
+ * Handle cache invalidation messages.
+ *
+ * Relcache init file invalidation requires processing both before and
+ * after we send the SI messages, only when committing. See
+ * AtEOXact_Inval().
+ */
+ if (isCommit)
+ {
+ if (hdr->initfileinval)
+ RelationCacheInitFilePreInvalidate();
+ SendSharedInvalidMessages(invalmsgs, hdr->ninvalmsgs);
+ if (hdr->initfileinval)
+ RelationCacheInitFilePostInvalidate();
+ }
+
+ /*
+ * Acquire the two-phase lock. We want to work on the two-phase callbacks
+ * while holding it to avoid potential conflicts with other transactions
+ * attempting to use the same GID, so the lock is released once the shared
+ * memory state is cleared.
+ */
+ LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+
+ /* And now do the callbacks */
+ if (isCommit)
+ ProcessRecords(bufptr, xid, twophase_postcommit_callbacks);
+ else
+ ProcessRecords(bufptr, xid, twophase_postabort_callbacks);
+
+ PredicateLockTwoPhaseFinish(xid, isCommit);
+
+ /* Clear shared memory state */
+ RemoveGXact(gxact);
+
+ /*
+ * Release the lock as all callbacks are called and shared memory cleanup
+ * is done.
+ */
+ LWLockRelease(TwoPhaseStateLock);
+
+ /* Count the prepared xact as committed or aborted */
+ AtEOXact_PgStat(isCommit, false);
+
+ /*
+ * And now we can clean up any files we may have left.
+ */
+ if (gxact->ondisk)
+ RemoveTwoPhaseFile(xid, true);
+
+ MyLockedGxact = NULL;
+
+ RESUME_INTERRUPTS();
+
+ pfree(buf);
+}
+
+/*
+ * Scan 2PC state data in memory and call the indicated callbacks for each 2PC record.
+ */
+static void
+ProcessRecords(char *bufptr, TransactionId xid,
+ const TwoPhaseCallback callbacks[])
+{
+ for (;;)
+ {
+ TwoPhaseRecordOnDisk *record = (TwoPhaseRecordOnDisk *) bufptr;
+
+ Assert(record->rmid <= TWOPHASE_RM_MAX_ID);
+ if (record->rmid == TWOPHASE_RM_END_ID)
+ break;
+
+ bufptr += MAXALIGN(sizeof(TwoPhaseRecordOnDisk));
+
+ if (callbacks[record->rmid] != NULL)
+ callbacks[record->rmid] (xid, record->info,
+ (void *) bufptr, record->len);
+
+ bufptr += MAXALIGN(record->len);
+ }
+}
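+
+/*
+ * For orientation, the record stream walked above has this shape (a recap
+ * of the loop, not an authoritative format definition):
+ *
+ *		TwoPhaseRecordOnDisk header (MAXALIGN'd)
+ *		payload of record->len bytes (MAXALIGN'd)
+ *		... repeated ...
+ *		terminating header with rmid == TWOPHASE_RM_END_ID
+ *
+ * Each callback receives the xid, the record's info flags, a pointer to the
+ * payload, and the payload length, so a resource manager can reconstruct
+ * whatever it serialized at PREPARE time.
+ */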
+
+/*
+ * Remove the 2PC file for the specified XID.
+ *
+ * If giveWarning is false, do not complain about file-not-present;
+ * this is an expected case during WAL replay.
+ */
+static void
+RemoveTwoPhaseFile(TransactionId xid, bool giveWarning)
+{
+ char path[MAXPGPATH];
+
+ TwoPhaseFilePath(path, xid);
+ if (unlink(path))
+ if (errno != ENOENT || giveWarning)
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m", path)));
+}
+
+/*
+ * Recreates a state file. This is used in WAL replay and during
+ * checkpoint creation.
+ *
+ * Note: content and len don't include CRC.
+ */
+static void
+RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
+{
+ char path[MAXPGPATH];
+ pg_crc32c statefile_crc;
+ int fd;
+
+ /* Recompute CRC */
+ INIT_CRC32C(statefile_crc);
+ COMP_CRC32C(statefile_crc, content, len);
+ FIN_CRC32C(statefile_crc);
+
+ TwoPhaseFilePath(path, xid);
+
+ fd = OpenTransientFile(path,
+ O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY);
+ if (fd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not recreate file \"%s\": %m", path)));
+
+ /* Write content and CRC */
+ errno = 0;
+ pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_WRITE);
+ if (write(fd, content, len) != len)
+ {
+ /* if write didn't set errno, assume problem is no disk space */
+ if (errno == 0)
+ errno = ENOSPC;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write file \"%s\": %m", path)));
+ }
+ if (write(fd, &statefile_crc, sizeof(pg_crc32c)) != sizeof(pg_crc32c))
+ {
+ /* if write didn't set errno, assume problem is no disk space */
+ if (errno == 0)
+ errno = ENOSPC;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write file \"%s\": %m", path)));
+ }
+ pgstat_report_wait_end();
+
+ /*
+ * We must fsync the file because the end-of-replay checkpoint will not do
+ * so, there being no GXACT in shared memory yet to tell it to.
+ */
+ pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_SYNC);
+ if (pg_fsync(fd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", path)));
+ pgstat_report_wait_end();
+
+ if (CloseTransientFile(fd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", path)));
+}
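+
+/*
+ * Note that the file layout produced above is simply the serialized state
+ * data followed by a pg_crc32c trailer over that data; ReadTwoPhaseFile()
+ * is expected to recompute the CRC and verify it against this trailer when
+ * the file is read back.
+ */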
+
+/*
+ * CheckPointTwoPhase -- handle 2PC component of checkpointing.
+ *
+ * We must fsync the state file of any GXACT that is valid or has been
+ * generated during redo and has a PREPARE LSN <= the checkpoint's redo
+ * horizon. (If the gxact isn't valid yet, has not been generated in
+ * redo, or has a later LSN, this checkpoint is not responsible for
+ * fsyncing it.)
+ *
+ * This is deliberately run as late as possible in the checkpoint sequence,
+ * because GXACTs ordinarily have short lifespans, and so it is quite
+ * possible that GXACTs that were valid at checkpoint start will no longer
+ * exist if we wait a little bit. With typical checkpoint settings this
+ * will be about 3 minutes for an online checkpoint, so as a result we
+ * expect that there will be no GXACTs that need to be copied to disk.
+ *
+ * If a GXACT remains valid across multiple checkpoints, it will already
+ * be on disk so we don't bother to repeat that write.
+ */
+void
+CheckPointTwoPhase(XLogRecPtr redo_horizon)
+{
+ int i;
+ int serialized_xacts = 0;
+
+ if (max_prepared_xacts <= 0)
+ return; /* nothing to do */
+
+ TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_START();
+
+ /*
+ * We are expecting there to be zero GXACTs that need to be copied to
+ * disk, so we perform all I/O while holding TwoPhaseStateLock for
+ * simplicity. This prevents any new xacts from preparing while this
+ * occurs, which shouldn't be a problem since the presence of long-lived
+ * prepared xacts indicates the transaction manager isn't active.
+ *
+ * It would also be possible to move the I/O out of the lock, but then on
+ * every error we would have to check whether somebody committed our
+ * transaction in a different backend. Let's leave that optimization for
+ * the future, if somebody spots that this place causes a bottleneck.
+ *
+ * Note that it isn't possible for there to be a GXACT with a
+ * prepare_end_lsn set prior to the last checkpoint yet is marked invalid,
+ * because of the efforts with delayChkptFlags.
+ */
+ LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
+ for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
+ {
+ /*
+ * Note that we are using gxact, not PGPROC, so this works in recovery
+ * also
+ */
+ GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
+
+ if ((gxact->valid || gxact->inredo) &&
+ !gxact->ondisk &&
+ gxact->prepare_end_lsn <= redo_horizon)
+ {
+ char *buf;
+ int len;
+
+ XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, &len);
+ RecreateTwoPhaseFile(gxact->xid, buf, len);
+ gxact->ondisk = true;
+ gxact->prepare_start_lsn = InvalidXLogRecPtr;
+ gxact->prepare_end_lsn = InvalidXLogRecPtr;
+ pfree(buf);
+ serialized_xacts++;
+ }
+ }
+ LWLockRelease(TwoPhaseStateLock);
+
+ /*
+ * Unconditionally flush the parent directory to make any information
+ * durable on disk. Two-phase files could have been removed, and those
+ * removals need to be made persistent, as well as any files newly
+ * created since the last checkpoint.
+ */
+ fsync_fname(TWOPHASE_DIR, true);
+
+ TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_DONE();
+
+ if (log_checkpoints && serialized_xacts > 0)
+ ereport(LOG,
+ (errmsg_plural("%u two-phase state file was written "
+ "for a long-running prepared transaction",
+ "%u two-phase state files were written "
+ "for long-running prepared transactions",
+ serialized_xacts,
+ serialized_xacts)));
+}
+
+/*
+ * restoreTwoPhaseData
+ *
+ * Scan pg_twophase and fill TwoPhaseState depending on the on-disk data.
+ * This is called once at the beginning of recovery, saving any extra
+ * lookups in the future. Two-phase files that are newer than the
+ * minimum XID horizon are discarded on the way.
+ */
+void
+restoreTwoPhaseData(void)
+{
+ DIR *cldir;
+ struct dirent *clde;
+
+ LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+ cldir = AllocateDir(TWOPHASE_DIR);
+ while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL)
+ {
+ if (strlen(clde->d_name) == 8 &&
+ strspn(clde->d_name, "0123456789ABCDEF") == 8)
+ {
+ TransactionId xid;
+ char *buf;
+
+ xid = (TransactionId) strtoul(clde->d_name, NULL, 16);
+
+ buf = ProcessTwoPhaseBuffer(xid, InvalidXLogRecPtr,
+ true, false, false);
+ if (buf == NULL)
+ continue;
+
+ PrepareRedoAdd(buf, InvalidXLogRecPtr,
+ InvalidXLogRecPtr, InvalidRepOriginId);
+ }
+ }
+ LWLockRelease(TwoPhaseStateLock);
+ FreeDir(cldir);
+}
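+
+/*
+ * As an example of the name filtering above: state files are named using
+ * exactly eight upper-case hex digits of the XID, so the prepared
+ * transaction with xid 671531 (0x000A3F2B) would be restored from
+ * pg_twophase/000A3F2B, while any file not matching that pattern is
+ * silently ignored.
+ */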
+
+/*
+ * PrescanPreparedTransactions
+ *
+ * Scan the shared memory entries of TwoPhaseState and determine the range
+ * of valid XIDs present. This is run during database startup, after we
+ * have completed reading WAL. ShmemVariableCache->nextXid has been set to
+ * one more than the highest XID for which evidence exists in WAL.
+ *
+ * We throw away any prepared xacts with main XID beyond nextXid --- if any
+ * are present, it suggests that the DBA has done a PITR recovery to an
+ * earlier point in time without cleaning out pg_twophase. We dare not
+ * try to recover such prepared xacts since they likely depend on database
+ * state that doesn't exist now.
+ *
+ * However, we will advance nextXid beyond any subxact XIDs belonging to
+ * valid prepared xacts. We need to do this since subxact commit doesn't
+ * write a WAL entry, and so there might be no evidence in WAL of those
+ * subxact XIDs.
+ *
+ * On corrupted two-phase files, fail immediately. Keeping broken entries
+ * around and letting replay continue would cause harm to the system, and
+ * a new backup should be rolled in instead.
+ *
+ * Our other responsibility is to determine and return the oldest valid XID
+ * among the prepared xacts (if none, return ShmemVariableCache->nextXid).
+ * This is needed to synchronize pg_subtrans startup properly.
+ *
+ * If xids_p and nxids_p are not NULL, a pointer to a palloc'd array of all
+ * top-level xids is stored in *xids_p. The number of entries in the array
+ * is returned in *nxids_p.
+ */
+TransactionId
+PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
+{
+ FullTransactionId nextXid = ShmemVariableCache->nextXid;
+ TransactionId origNextXid = XidFromFullTransactionId(nextXid);
+ TransactionId result = origNextXid;
+ TransactionId *xids = NULL;
+ int nxids = 0;
+ int allocsize = 0;
+ int i;
+
+ LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+ for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
+ {
+ TransactionId xid;
+ char *buf;
+ GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
+
+ Assert(gxact->inredo);
+
+ xid = gxact->xid;
+
+ buf = ProcessTwoPhaseBuffer(xid,
+ gxact->prepare_start_lsn,
+ gxact->ondisk, false, true);
+
+ if (buf == NULL)
+ continue;
+
+ /*
+ * OK, we think this file is valid. Incorporate xid into the
+ * running-minimum result.
+ */
+ if (TransactionIdPrecedes(xid, result))
+ result = xid;
+
+ if (xids_p)
+ {
+ if (nxids == allocsize)
+ {
+ if (nxids == 0)
+ {
+ allocsize = 10;
+ xids = palloc(allocsize * sizeof(TransactionId));
+ }
+ else
+ {
+ allocsize = allocsize * 2;
+ xids = repalloc(xids, allocsize * sizeof(TransactionId));
+ }
+ }
+ xids[nxids++] = xid;
+ }
+
+ pfree(buf);
+ }
+ LWLockRelease(TwoPhaseStateLock);
+
+ if (xids_p)
+ {
+ *xids_p = xids;
+ *nxids_p = nxids;
+ }
+
+ return result;
+}
+
+/*
+ * StandbyRecoverPreparedTransactions
+ *
+ * Scan the shared memory entries of TwoPhaseState and setup all the required
+ * information to allow standby queries to treat prepared transactions as still
+ * active.
+ *
+ * This is never called at the end of recovery - we use
+ * RecoverPreparedTransactions() at that point.
+ *
+ * The lack of calls to SubTransSetParent() here is by design;
+ * those calls are made by RecoverPreparedTransactions() at the end of recovery
+ * for those xacts that need this.
+ */
+void
+StandbyRecoverPreparedTransactions(void)
+{
+ int i;
+
+ LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+ for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
+ {
+ TransactionId xid;
+ char *buf;
+ GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
+
+ Assert(gxact->inredo);
+
+ xid = gxact->xid;
+
+ buf = ProcessTwoPhaseBuffer(xid,
+ gxact->prepare_start_lsn,
+ gxact->ondisk, false, false);
+ if (buf != NULL)
+ pfree(buf);
+ }
+ LWLockRelease(TwoPhaseStateLock);
+}
+
+/*
+ * RecoverPreparedTransactions
+ *
+ * Scan the shared memory entries of TwoPhaseState and reload the state for
+ * each prepared transaction (reacquire locks, etc).
+ *
+ * This is run at the end of recovery, but before we allow backends to write
+ * WAL.
+ *
+ * At the end of recovery the way we take snapshots will change. We now need
+ * to mark all running transactions with their full SubTransSetParent() info
+ * to allow normal snapshots to work correctly if snapshots overflow.
+ * We do this here because by definition prepared transactions are the only
+ * type of write transaction still running, so this is necessary and
+ * complete.
+ */
+void
+RecoverPreparedTransactions(void)
+{
+ int i;
+
+ LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+ for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
+ {
+ TransactionId xid;
+ char *buf;
+ GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
+ char *bufptr;
+ TwoPhaseFileHeader *hdr;
+ TransactionId *subxids;
+ const char *gid;
+
+ xid = gxact->xid;
+
+ /*
+ * Reconstruct subtrans state for the transaction --- needed because
+ * pg_subtrans is not preserved over a restart. Note that we are
+ * linking all the subtransactions directly to the top-level XID;
+ * there may originally have been a more complex hierarchy, but
+ * there's no need to restore that exactly. It's possible that
+ * SubTransSetParent has been set before, if the prepared transaction
+ * generated xid assignment records.
+ */
+ buf = ProcessTwoPhaseBuffer(xid,
+ gxact->prepare_start_lsn,
+ gxact->ondisk, true, false);
+ if (buf == NULL)
+ continue;
+
+ ereport(LOG,
+ (errmsg("recovering prepared transaction %u from shared memory", xid)));
+
+ hdr = (TwoPhaseFileHeader *) buf;
+ Assert(TransactionIdEquals(hdr->xid, xid));
+ bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
+ gid = (const char *) bufptr;
+ bufptr += MAXALIGN(hdr->gidlen);
+ subxids = (TransactionId *) bufptr;
+ bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId));
+ bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode));
+ bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode));
+ bufptr += MAXALIGN(hdr->ncommitstats * sizeof(xl_xact_stats_item));
+ bufptr += MAXALIGN(hdr->nabortstats * sizeof(xl_xact_stats_item));
+ bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage));
+
+ /*
+ * Recreate its GXACT and dummy PGPROC. Note that the gxact already has
+ * a shmem entry, having been added during redo.
+ */
+ MarkAsPreparingGuts(gxact, xid, gid,
+ hdr->prepared_at,
+ hdr->owner, hdr->database);
+
+ /* recovered, so reset the flag for entries generated by redo */
+ gxact->inredo = false;
+
+ GXactLoadSubxactData(gxact, hdr->nsubxacts, subxids);
+ MarkAsPrepared(gxact, true);
+
+ LWLockRelease(TwoPhaseStateLock);
+
+ /*
+ * Recover other state (notably locks) using resource managers.
+ */
+ ProcessRecords(bufptr, xid, twophase_recover_callbacks);
+
+ /*
+ * Release locks held by the standby process after we process each
+ * prepared transaction. As a result, we don't need too many
+ * additional locks at any one time.
+ */
+ if (InHotStandby)
+ StandbyReleaseLockTree(xid, hdr->nsubxacts, subxids);
+
+ /*
+ * We're done with recovering this transaction. Clear MyLockedGxact,
+ * like we do in PrepareTransaction() during normal operation.
+ */
+ PostPrepare_Twophase();
+
+ pfree(buf);
+
+ LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+ }
+
+ LWLockRelease(TwoPhaseStateLock);
+}
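+
+/*
+ * For reference, the pointer arithmetic above implies the following layout
+ * for the two-phase state data (each section MAXALIGN'd; this recaps the
+ * parsing code rather than defining the format):
+ *
+ *		TwoPhaseFileHeader
+ *		GID string (hdr->gidlen bytes)
+ *		subxact XIDs (hdr->nsubxacts TransactionIds)
+ *		commit rels (hdr->ncommitrels RelFileNodes)
+ *		abort rels (hdr->nabortrels RelFileNodes)
+ *		commit stats (hdr->ncommitstats xl_xact_stats_items)
+ *		abort stats (hdr->nabortstats xl_xact_stats_items)
+ *		invalidation messages (hdr->ninvalmsgs SharedInvalidationMessages)
+ *		resource-manager-specific 2PC records, consumed by ProcessRecords()
+ */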
+
+/*
+ * ProcessTwoPhaseBuffer
+ *
+ * Given a transaction id, read its two-phase state data either from disk
+ * or directly from WAL, via the record pointer provided in "prepare_start_lsn".
+ *
+ * If setParent is true, set up subtransaction parent linkages.
+ *
+ * If setNextXid is true, set ShmemVariableCache->nextXid to the newest
+ * value scanned.
+ */
+static char *
+ProcessTwoPhaseBuffer(TransactionId xid,
+ XLogRecPtr prepare_start_lsn,
+ bool fromdisk,
+ bool setParent, bool setNextXid)
+{
+ FullTransactionId nextXid = ShmemVariableCache->nextXid;
+ TransactionId origNextXid = XidFromFullTransactionId(nextXid);
+ TransactionId *subxids;
+ char *buf;
+ TwoPhaseFileHeader *hdr;
+ int i;
+
+ Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
+
+ if (!fromdisk)
+ Assert(prepare_start_lsn != InvalidXLogRecPtr);
+
+ /* Already processed? */
+ if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
+ {
+ if (fromdisk)
+ {
+ ereport(WARNING,
+ (errmsg("removing stale two-phase state file for transaction %u",
+ xid)));
+ RemoveTwoPhaseFile(xid, true);
+ }
+ else
+ {
+ ereport(WARNING,
+ (errmsg("removing stale two-phase state from memory for transaction %u",
+ xid)));
+ PrepareRedoRemove(xid, true);
+ }
+ return NULL;
+ }
+
+ /* Reject XID if too new */
+ if (TransactionIdFollowsOrEquals(xid, origNextXid))
+ {
+ if (fromdisk)
+ {
+ ereport(WARNING,
+ (errmsg("removing future two-phase state file for transaction %u",
+ xid)));
+ RemoveTwoPhaseFile(xid, true);
+ }
+ else
+ {
+ ereport(WARNING,
+ (errmsg("removing future two-phase state from memory for transaction %u",
+ xid)));
+ PrepareRedoRemove(xid, true);
+ }
+ return NULL;
+ }
+
+ if (fromdisk)
+ {
+ /* Read and validate file */
+ buf = ReadTwoPhaseFile(xid, false);
+ }
+ else
+ {
+ /* Read xlog data */
+ XlogReadTwoPhaseData(prepare_start_lsn, &buf, NULL);
+ }
+
+ /* Deconstruct header */
+ hdr = (TwoPhaseFileHeader *) buf;
+ if (!TransactionIdEquals(hdr->xid, xid))
+ {
+ if (fromdisk)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted two-phase state file for transaction %u",
+ xid)));
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted two-phase state in memory for transaction %u",
+ xid)));
+ }
+
+ /*
+ * Examine subtransaction XIDs ... they should all follow main XID, and
+ * they may force us to advance nextXid.
+ */
+ subxids = (TransactionId *) (buf +
+ MAXALIGN(sizeof(TwoPhaseFileHeader)) +
+ MAXALIGN(hdr->gidlen));
+ for (i = 0; i < hdr->nsubxacts; i++)
+ {
+ TransactionId subxid = subxids[i];
+
+ Assert(TransactionIdFollows(subxid, xid));
+
+ /* update nextXid if needed */
+ if (setNextXid)
+ AdvanceNextFullTransactionIdPastXid(subxid);
+
+ if (setParent)
+ SubTransSetParent(subxid, xid);
+ }
+
+ return buf;
+}
+
+
+/*
+ * RecordTransactionCommitPrepared
+ *
+ * This is basically the same as RecordTransactionCommit (q.v. if you change
+ * this function): in particular, we must set DELAY_CHKPT_START to avoid a
+ * race condition.
+ *
+ * We know the transaction made at least one XLOG entry (its PREPARE),
+ * so it is never possible to optimize out the commit record.
+ */
+static void
+RecordTransactionCommitPrepared(TransactionId xid,
+ int nchildren,
+ TransactionId *children,
+ int nrels,
+ RelFileNode *rels,
+ int nstats,
+ xl_xact_stats_item *stats,
+ int ninvalmsgs,
+ SharedInvalidationMessage *invalmsgs,
+ bool initfileinval,
+ const char *gid)
+{
+ XLogRecPtr recptr;
+ TimestampTz committs = GetCurrentTimestamp();
+ bool replorigin;
+
+ /*
+ * Are we using the replication origins feature? Or, in other words, are
+ * we replaying remote actions?
+ */
+ replorigin = (replorigin_session_origin != InvalidRepOriginId &&
+ replorigin_session_origin != DoNotReplicateId);
+
+ START_CRIT_SECTION();
+
+ /* See notes in RecordTransactionCommit */
+ Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
+ MyProc->delayChkptFlags |= DELAY_CHKPT_START;
+
+ /*
+ * Emit the XLOG commit record. Note that we mark 2PC commits as
+ * potentially having AccessExclusiveLocks since we don't know whether or
+ * not they do.
+ */
+ recptr = XactLogCommitRecord(committs,
+ nchildren, children, nrels, rels,
+ nstats, stats,
+ ninvalmsgs, invalmsgs,
+ initfileinval,
+ MyXactFlags | XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK,
+ xid, gid);
+
+ if (replorigin)
+ /* Move LSNs forward for this replication origin */
+ replorigin_session_advance(replorigin_session_origin_lsn,
+ XactLastRecEnd);
+
+ /*
+ * Record commit timestamp. The value comes from the plain commit
+ * timestamp if replorigin is not enabled; otherwise, replorigin has
+ * already set a value for us in replorigin_session_origin_timestamp.
+ *
+ * We don't need to WAL-log anything here, as the commit record written
+ * above already contains the data.
+ */
+ if (!replorigin || replorigin_session_origin_timestamp == 0)
+ replorigin_session_origin_timestamp = committs;
+
+ TransactionTreeSetCommitTsData(xid, nchildren, children,
+ replorigin_session_origin_timestamp,
+ replorigin_session_origin);
+
+ /*
+ * We don't currently try to sleep before flush here ... nor is there any
+ * support for async commit of a prepared xact (the very idea is probably
+ * a contradiction)
+ */
+
+ /* Flush XLOG to disk */
+ XLogFlush(recptr);
+
+ /* Mark the transaction committed in pg_xact */
+ TransactionIdCommitTree(xid, nchildren, children);
+
+ /* Checkpoint can proceed now */
+ MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+
+ END_CRIT_SECTION();
+
+ /*
+ * Wait for synchronous replication, if required.
+ *
+ * Note that at this stage we have marked clog, but still show as running
+ * in the procarray and continue to hold locks.
+ */
+ SyncRepWaitForLSN(recptr, true);
+}
+
+/*
+ * RecordTransactionAbortPrepared
+ *
+ * This is basically the same as RecordTransactionAbort.
+ *
+ * We know the transaction made at least one XLOG entry (its PREPARE),
+ * so it is never possible to optimize out the abort record.
+ */
+static void
+RecordTransactionAbortPrepared(TransactionId xid,
+ int nchildren,
+ TransactionId *children,
+ int nrels,
+ RelFileNode *rels,
+ int nstats,
+ xl_xact_stats_item *stats,
+ const char *gid)
+{
+ XLogRecPtr recptr;
+ bool replorigin;
+
+ /*
+ * Are we using the replication origins feature? Or, in other words, are
+ * we replaying remote actions?
+ */
+ replorigin = (replorigin_session_origin != InvalidRepOriginId &&
+ replorigin_session_origin != DoNotReplicateId);
+
+ /*
+ * Catch the scenario where we aborted partway through
+ * RecordTransactionCommitPrepared ...
+ */
+ if (TransactionIdDidCommit(xid))
+ elog(PANIC, "cannot abort transaction %u, it was already committed",
+ xid);
+
+ START_CRIT_SECTION();
+
+ /*
+ * Emit the XLOG abort record. Note that we mark 2PC aborts as
+ * potentially having AccessExclusiveLocks since we don't know whether or
+ * not they do.
+ */
+ recptr = XactLogAbortRecord(GetCurrentTimestamp(),
+ nchildren, children,
+ nrels, rels,
+ nstats, stats,
+ MyXactFlags | XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK,
+ xid, gid);
+
+ if (replorigin)
+ /* Move LSNs forward for this replication origin */
+ replorigin_session_advance(replorigin_session_origin_lsn,
+ XactLastRecEnd);
+
+ /* Always flush, since we're about to remove the 2PC state file */
+ XLogFlush(recptr);
+
+ /*
+ * Mark the transaction aborted in clog. This is not absolutely necessary
+ * but we may as well do it while we are here.
+ */
+ TransactionIdAbortTree(xid, nchildren, children);
+
+ END_CRIT_SECTION();
+
+ /*
+ * Wait for synchronous replication, if required.
+ *
+ * Note that at this stage we have marked clog, but still show as running
+ * in the procarray and continue to hold locks.
+ */
+ SyncRepWaitForLSN(recptr, false);
+}
+
+/*
+ * PrepareRedoAdd
+ *
+ * Store pointers to the start/end of the WAL record along with the xid in
+ * a gxact entry in the shared-memory TwoPhaseState structure. If the
+ * caller specifies InvalidXLogRecPtr as the WAL location from which to
+ * fetch the two-phase data, the entry is marked as located on disk.
+ */
+void
+PrepareRedoAdd(char *buf, XLogRecPtr start_lsn,
+ XLogRecPtr end_lsn, RepOriginId origin_id)
+{
+ TwoPhaseFileHeader *hdr = (TwoPhaseFileHeader *) buf;
+ char *bufptr;
+ const char *gid;
+ GlobalTransaction gxact;
+
+ Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
+ Assert(RecoveryInProgress());
+
+ bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
+ gid = (const char *) bufptr;
+
+ /*
+ * Reserve the GID for the given transaction in the redo code path.
+ *
+ * This creates a gxact struct and puts it into the active array.
+ *
+ * In redo, this struct is mainly used to track PREPARE/COMMIT entries in
+ * shared memory. Hence, we only fill up the bare minimum contents here.
+ * The gxact also gets marked with gxact->inredo set to true to indicate
+ * that it was added in the redo phase.
+ */
+
+ /*
+ * In the event of a crash while a checkpoint was running, it may be
+ * possible that some two-phase data found its way to disk while its
+ * corresponding record needs to be replayed in the follow-up recovery. As
+ * the 2PC data was on disk, it has already been restored at the beginning
+ * of recovery with restoreTwoPhaseData(), so skip this record to avoid
+ * duplicates in TwoPhaseState. If a consistent state has been reached,
+ * the record is added to TwoPhaseState and it should have no
+ * corresponding file in pg_twophase.
+ */
+ if (!XLogRecPtrIsInvalid(start_lsn))
+ {
+ char path[MAXPGPATH];
+
+ TwoPhaseFilePath(path, hdr->xid);
+
+ if (access(path, F_OK) == 0)
+ {
+ ereport(reachedConsistency ? ERROR : WARNING,
+ (errmsg("could not recover two-phase state file for transaction %u",
+ hdr->xid),
+ errdetail("Two-phase state file has been found in WAL record %X/%X, but this transaction has already been restored from disk.",
+ LSN_FORMAT_ARGS(start_lsn))));
+ return;
+ }
+
+ if (errno != ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not access file \"%s\": %m", path)));
+ }
+
+ /* Get a free gxact from the freelist */
+ if (TwoPhaseState->freeGXacts == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("maximum number of prepared transactions reached"),
+ errhint("Increase max_prepared_transactions (currently %d).",
+ max_prepared_xacts)));
+ gxact = TwoPhaseState->freeGXacts;
+ TwoPhaseState->freeGXacts = gxact->next;
+
+ gxact->prepared_at = hdr->prepared_at;
+ gxact->prepare_start_lsn = start_lsn;
+ gxact->prepare_end_lsn = end_lsn;
+ gxact->xid = hdr->xid;
+ gxact->owner = hdr->owner;
+ gxact->locking_backend = InvalidBackendId;
+ gxact->valid = false;
+ gxact->ondisk = XLogRecPtrIsInvalid(start_lsn);
+ gxact->inredo = true; /* yes, added in redo */
+ strcpy(gxact->gid, gid);
+
+ /* And insert it into the active array */
+ Assert(TwoPhaseState->numPrepXacts < max_prepared_xacts);
+ TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts++] = gxact;
+
+ if (origin_id != InvalidRepOriginId)
+ {
+ /* recover apply progress */
+ replorigin_advance(origin_id, hdr->origin_lsn, end_lsn,
+ false /* backward */ , false /* WAL */ );
+ }
+
+ elog(DEBUG2, "added 2PC data in shared memory for transaction %u", gxact->xid);
+}
+
+/*
+ * PrepareRedoRemove
+ *
+ * Remove the corresponding gxact entry from TwoPhaseState. Also remove
+ * the 2PC file if a prepared transaction was saved via an earlier checkpoint.
+ *
+ * Caller must hold TwoPhaseStateLock in exclusive mode, because TwoPhaseState
+ * is updated.
+ */
+void
+PrepareRedoRemove(TransactionId xid, bool giveWarning)
+{
+ GlobalTransaction gxact = NULL;
+ int i;
+ bool found = false;
+
+ Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
+ Assert(RecoveryInProgress());
+
+ for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
+ {
+ gxact = TwoPhaseState->prepXacts[i];
+
+ if (gxact->xid == xid)
+ {
+ Assert(gxact->inredo);
+ found = true;
+ break;
+ }
+ }
+
+ /*
+ * Just leave if there is nothing, this is expected during WAL replay.
+ */
+ if (!found)
+ return;
+
+ /*
+ * And now we can clean up any files we may have left.
+ */
+ elog(DEBUG2, "removing 2PC data for transaction %u", xid);
+ if (gxact->ondisk)
+ RemoveTwoPhaseFile(xid, giveWarning);
+ RemoveGXact(gxact);
+}
+
+/*
+ * LookupGXact
+ * Check if the prepared transaction with the given GID, lsn and timestamp
+ * exists.
+ *
+ * Note that we always compare with the LSN where prepare ends because that is
+ * what is stored as origin_lsn in the 2PC file.
+ *
+ * This function is primarily used to check whether a prepared transaction
+ * received from the upstream (remote node) already exists. Checking only
+ * the GID is not sufficient, because a different prepared xact with the
+ * same GID can exist on the same node. So we also match the origin_lsn
+ * and origin_timestamp of the prepared xact, to avoid falsely matching a
+ * prepared xact that came from a different node.
+ */
+bool
+LookupGXact(const char *gid, XLogRecPtr prepare_end_lsn,
+ TimestampTz origin_prepare_timestamp)
+{
+ int i;
+ bool found = false;
+
+ LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
+ for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
+ {
+ GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
+
+ /* Ignore not-yet-valid GIDs. */
+ if (gxact->valid && strcmp(gxact->gid, gid) == 0)
+ {
+ char *buf;
+ TwoPhaseFileHeader *hdr;
+
+ /*
+ * We are not expecting collisions of GXACTs (same gid) between
+ * publisher and subscribers, so we perform all I/O while holding
+ * TwoPhaseStateLock for simplicity.
+ *
+ * To move the I/O out of the lock, we need to ensure that no
+ * other backend commits the prepared xact in the meantime. We can
+ * do this optimization if we encounter many collisions in GID
+ * between publisher and subscriber.
+ */
+ if (gxact->ondisk)
+ buf = ReadTwoPhaseFile(gxact->xid, false);
+ else
+ {
+ Assert(gxact->prepare_start_lsn);
+ XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL);
+ }
+
+ hdr = (TwoPhaseFileHeader *) buf;
+
+ if (hdr->origin_lsn == prepare_end_lsn &&
+ hdr->origin_timestamp == origin_prepare_timestamp)
+ {
+ found = true;
+ pfree(buf);
+ break;
+ }
+
+ pfree(buf);
+ }
+ }
+ LWLockRelease(TwoPhaseStateLock);
+ return found;
+}
diff --git a/src/backend/access/transam/twophase_rmgr.c b/src/backend/access/transam/twophase_rmgr.c
new file mode 100644
index 0000000..35a9b32
--- /dev/null
+++ b/src/backend/access/transam/twophase_rmgr.c
@@ -0,0 +1,58 @@
+/*-------------------------------------------------------------------------
+ *
+ * twophase_rmgr.c
+ * Two-phase-commit resource managers tables
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/transam/twophase_rmgr.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/multixact.h"
+#include "access/twophase_rmgr.h"
+#include "pgstat.h"
+#include "storage/lock.h"
+#include "storage/predicate.h"
+
+
+const TwoPhaseCallback twophase_recover_callbacks[TWOPHASE_RM_MAX_ID + 1] =
+{
+ NULL, /* END ID */
+ lock_twophase_recover, /* Lock */
+ NULL, /* pgstat */
+ multixact_twophase_recover, /* MultiXact */
+ predicatelock_twophase_recover /* PredicateLock */
+};
+
+const TwoPhaseCallback twophase_postcommit_callbacks[TWOPHASE_RM_MAX_ID + 1] =
+{
+ NULL, /* END ID */
+ lock_twophase_postcommit, /* Lock */
+ pgstat_twophase_postcommit, /* pgstat */
+ multixact_twophase_postcommit, /* MultiXact */
+ NULL /* PredicateLock */
+};
+
+const TwoPhaseCallback twophase_postabort_callbacks[TWOPHASE_RM_MAX_ID + 1] =
+{
+ NULL, /* END ID */
+ lock_twophase_postabort, /* Lock */
+ pgstat_twophase_postabort, /* pgstat */
+ multixact_twophase_postabort, /* MultiXact */
+ NULL /* PredicateLock */
+};
+
+const TwoPhaseCallback twophase_standby_recover_callbacks[TWOPHASE_RM_MAX_ID + 1] =
+{
+ NULL, /* END ID */
+ lock_twophase_standby_recover, /* Lock */
+ NULL, /* pgstat */
+ NULL, /* MultiXact */
+ NULL /* PredicateLock */
+};
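+
+/*
+ * These arrays are indexed by TwoPhaseRmgrId, so a dispatcher such as
+ * ProcessRecords() in twophase.c can invoke
+ * callbacks[record->rmid](xid, record->info, data, len) directly; a NULL
+ * entry means the resource manager has nothing to do for that event.
+ */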
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
new file mode 100644
index 0000000..748120a
--- /dev/null
+++ b/src/backend/access/transam/varsup.c
@@ -0,0 +1,678 @@
+/*-------------------------------------------------------------------------
+ *
+ * varsup.c
+ * postgres OID & XID variables support routines
+ *
+ * Copyright (c) 2000-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/access/transam/varsup.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/clog.h"
+#include "access/commit_ts.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "access/xlogutils.h"
+#include "commands/dbcommands.h"
+#include "miscadmin.h"
+#include "postmaster/autovacuum.h"
+#include "storage/pmsignal.h"
+#include "storage/proc.h"
+#include "utils/syscache.h"
+
+
+/* Number of OIDs to prefetch (preallocate) per XLOG write */
+#define VAR_OID_PREFETCH 8192
+
+/* pointer to "variable cache" in shared memory (set up by shmem.c) */
+VariableCache ShmemVariableCache = NULL;
+
+
+/*
+ * Allocate the next FullTransactionId for a new transaction or
+ * subtransaction.
+ *
+ * The new XID is also stored into MyProc->xid/ProcGlobal->xids[] before
+ * returning.
+ *
+ * Note: when this is called, we are actually already inside a valid
+ * transaction, since XIDs are now not allocated until the transaction
+ * does something. So it is safe to do a database lookup if we want to
+ * issue a warning about XID wrap.
+ */
+FullTransactionId
+GetNewTransactionId(bool isSubXact)
+{
+ FullTransactionId full_xid;
+ TransactionId xid;
+
+ /*
+ * Workers synchronize transaction state at the beginning of each parallel
+ * operation, so we can't account for new XIDs after that point.
+ */
+ if (IsInParallelMode())
+ elog(ERROR, "cannot assign TransactionIds during a parallel operation");
+
+ /*
+ * During bootstrap initialization, we return the special bootstrap
+ * transaction id.
+ */
+ if (IsBootstrapProcessingMode())
+ {
+ Assert(!isSubXact);
+ MyProc->xid = BootstrapTransactionId;
+ ProcGlobal->xids[MyProc->pgxactoff] = BootstrapTransactionId;
+ return FullTransactionIdFromEpochAndXid(0, BootstrapTransactionId);
+ }
+
+ /* safety check, we should never get this far in a HS standby */
+ if (RecoveryInProgress())
+ elog(ERROR, "cannot assign TransactionIds during recovery");
+
+ LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
+
+ full_xid = ShmemVariableCache->nextXid;
+ xid = XidFromFullTransactionId(full_xid);
+
+ /*----------
+ * Check to see if it's safe to assign another XID. This protects against
+ * catastrophic data loss due to XID wraparound. The basic rules are:
+ *
+ * If we're past xidVacLimit, start trying to force autovacuum cycles.
+ * If we're past xidWarnLimit, start issuing warnings.
+ * If we're past xidStopLimit, refuse to execute transactions, unless
+ * we are running in single-user mode (which gives an escape hatch
+ * to the DBA who somehow got past the earlier defenses).
+ *
+ * Note that this coding also appears in GetNewMultiXactId.
+ *----------
+ */
+ if (TransactionIdFollowsOrEquals(xid, ShmemVariableCache->xidVacLimit))
+ {
+ /*
+ * For safety's sake, we release XidGenLock while sending signals,
+ * warnings, etc. This is not so much because we care about
+ * preserving concurrency in this situation, as to avoid any
+ * possibility of deadlock while doing get_database_name(). First,
+ * copy all the shared values we'll need in this path.
+ */
+ TransactionId xidWarnLimit = ShmemVariableCache->xidWarnLimit;
+ TransactionId xidStopLimit = ShmemVariableCache->xidStopLimit;
+ TransactionId xidWrapLimit = ShmemVariableCache->xidWrapLimit;
+ Oid oldest_datoid = ShmemVariableCache->oldestXidDB;
+
+ LWLockRelease(XidGenLock);
+
+ /*
+ * To avoid swamping the postmaster with signals, we issue the autovac
+ * request only once per 64K transaction starts. This still gives
+ * plenty of chances before we get into real trouble.
+ */
+ if (IsUnderPostmaster && (xid % 65536) == 0)
+ SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
+
+ if (IsUnderPostmaster &&
+ TransactionIdFollowsOrEquals(xid, xidStopLimit))
+ {
+ char *oldest_datname = get_database_name(oldest_datoid);
+
+ /* complain even if that DB has disappeared */
+ if (oldest_datname)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("database is not accepting commands to avoid wraparound data loss in database \"%s\"",
+ oldest_datname),
+ errhint("Stop the postmaster and vacuum that database in single-user mode.\n"
+ "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("database is not accepting commands to avoid wraparound data loss in database with OID %u",
+ oldest_datoid),
+ errhint("Stop the postmaster and vacuum that database in single-user mode.\n"
+ "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
+ }
+ else if (TransactionIdFollowsOrEquals(xid, xidWarnLimit))
+ {
+ char *oldest_datname = get_database_name(oldest_datoid);
+
+ /* complain even if that DB has disappeared */
+ if (oldest_datname)
+ ereport(WARNING,
+ (errmsg("database \"%s\" must be vacuumed within %u transactions",
+ oldest_datname,
+ xidWrapLimit - xid),
+ errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
+ "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
+ else
+ ereport(WARNING,
+ (errmsg("database with OID %u must be vacuumed within %u transactions",
+ oldest_datoid,
+ xidWrapLimit - xid),
+ errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
+ "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
+ }
+
+ /* Re-acquire lock and start over */
+ LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
+ full_xid = ShmemVariableCache->nextXid;
+ xid = XidFromFullTransactionId(full_xid);
+ }
+
+ /*
+ * If we are allocating the first XID of a new page of the commit log,
+ * zero out that commit-log page before returning. We must do this while
+ * holding XidGenLock, else another xact could acquire and commit a later
+ * XID before we zero the page. Fortunately, a page of the commit log
+ * holds 32K or more transactions, so we don't have to do this very often.
+ *
+ * Extend pg_subtrans and pg_commit_ts too.
+ */
+ ExtendCLOG(xid);
+ ExtendCommitTs(xid);
+ ExtendSUBTRANS(xid);
+
+ /*
+ * Now advance the nextXid counter. This must not happen until after we
+ * have successfully completed ExtendCLOG() --- if that routine fails, we
+ * want the next incoming transaction to try it again. We cannot assign
+ * more XIDs until there is CLOG space for them.
+ */
+ FullTransactionIdAdvance(&ShmemVariableCache->nextXid);
+
+ /*
+ * We must store the new XID into the shared ProcArray before releasing
+ * XidGenLock. This ensures that every active XID older than
+ * latestCompletedXid is present in the ProcArray, which is essential for
+ * correct OldestXmin tracking; see src/backend/access/transam/README.
+ *
+ * Note that readers of ProcGlobal->xids/PGPROC->xid should be careful to
+ * fetch the value for each proc only once, rather than assume they can
+ * read a value multiple times and get the same answer each time. Note we
+ * are assuming that TransactionId and int fetch/store are atomic.
+ *
+ * The same comments apply to the subxact xid count and overflow fields.
+ *
+ * Use of a write barrier prevents dangerous code rearrangement in this
+ * function; other backends could otherwise e.g. be examining my subxids
+ * info concurrently, and we don't want them to see an invalid
+ * intermediate state, such as an incremented nxids before the array entry
+ * is filled.
+ *
+ * Other processes that read nxids should do so before reading xids
+ * elements with a pg_read_barrier() in between, so that they can be sure
+ * not to read an uninitialized array element; see
+ * src/backend/storage/lmgr/README.barrier.
+ *
+ * If there's no room to fit a subtransaction XID into PGPROC, set the
+ * cache-overflowed flag instead. This forces readers to look in
+ * pg_subtrans to map subtransaction XIDs up to top-level XIDs. There is a
+ * race-condition window, in that the new XID will not appear as running
+ * until its parent link has been placed into pg_subtrans. However, that
+ * will happen before anyone could possibly have a reason to inquire about
+ * the status of the XID, so it seems OK. (Snapshots taken during this
+ * window *will* include the parent XID, so they will deliver the correct
+ * answer later on when someone does have a reason to inquire.)
+ */
+ if (!isSubXact)
+ {
+ Assert(ProcGlobal->subxidStates[MyProc->pgxactoff].count == 0);
+ Assert(!ProcGlobal->subxidStates[MyProc->pgxactoff].overflowed);
+ Assert(MyProc->subxidStatus.count == 0);
+ Assert(!MyProc->subxidStatus.overflowed);
+
+ /* LWLockRelease acts as barrier */
+ MyProc->xid = xid;
+ ProcGlobal->xids[MyProc->pgxactoff] = xid;
+ }
+ else
+ {
+ XidCacheStatus *substat = &ProcGlobal->subxidStates[MyProc->pgxactoff];
+ int nxids = MyProc->subxidStatus.count;
+
+ Assert(substat->count == MyProc->subxidStatus.count);
+ Assert(substat->overflowed == MyProc->subxidStatus.overflowed);
+
+ if (nxids < PGPROC_MAX_CACHED_SUBXIDS)
+ {
+ MyProc->subxids.xids[nxids] = xid;
+ pg_write_barrier();
+ MyProc->subxidStatus.count = substat->count = nxids + 1;
+ }
+ else
+ MyProc->subxidStatus.overflowed = substat->overflowed = true;
+ }
+
+ LWLockRelease(XidGenLock);
+
+ return full_xid;
+}
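+
+/*
+ * A sketch of the reader-side pattern implied by the barrier discussion
+ * above (illustrative only; the real readers live in procarray.c):
+ *
+ *		int			nxids = subxidStates[i].count;
+ *
+ *		pg_read_barrier();
+ *		... examine the first nxids entries of the proc's subxids.xids ...
+ *
+ * Reading the count before the array elements, with a read barrier in
+ * between, pairs with the pg_write_barrier() above and guarantees that no
+ * uninitialized array element is examined.
+ */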
+
+/*
+ * Read nextXid but don't allocate it.
+ */
+FullTransactionId
+ReadNextFullTransactionId(void)
+{
+ FullTransactionId fullXid;
+
+ LWLockAcquire(XidGenLock, LW_SHARED);
+ fullXid = ShmemVariableCache->nextXid;
+ LWLockRelease(XidGenLock);
+
+ return fullXid;
+}
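+
+/*
+ * Illustrative use (a sketch, not part of this file's API): a caller
+ * wanting the 32-bit pieces can decompose the result with the accessors
+ * used elsewhere in this file, e.g.
+ *
+ *		FullTransactionId fxid = ReadNextFullTransactionId();
+ *		uint32		epoch = EpochFromFullTransactionId(fxid);
+ *		TransactionId xid = XidFromFullTransactionId(fxid);
+ */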
+
+/*
+ * Advance nextXid to the value after a given xid. The epoch is inferred.
+ * This must only be called during recovery or from two-phase start-up code.
+ */
+void
+AdvanceNextFullTransactionIdPastXid(TransactionId xid)
+{
+ FullTransactionId newNextFullXid;
+ TransactionId next_xid;
+ uint32 epoch;
+
+ /*
+ * It is safe to read nextXid without a lock, because this is only called
+ * from the startup process or single-process mode, meaning that no other
+ * process can modify it.
+ */
+ Assert(AmStartupProcess() || !IsUnderPostmaster);
+
+ /* Fast return if this isn't an xid high enough to move the needle. */
+ next_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+ if (!TransactionIdFollowsOrEquals(xid, next_xid))
+ return;
+
+ /*
+ * Compute the FullTransactionId that comes after the given xid. To do
+ * this, we preserve the existing epoch, but detect when we've wrapped
+ * into a new epoch. This is necessary because WAL records and 2PC state
+ * currently contain 32 bit xids. The wrap logic is safe in those cases
+ * because the span of active xids cannot exceed one epoch at any given
+ * point in the WAL stream.
+ */
+ TransactionIdAdvance(xid);
+ epoch = EpochFromFullTransactionId(ShmemVariableCache->nextXid);
+ if (unlikely(xid < next_xid))
+ ++epoch;
+ newNextFullXid = FullTransactionIdFromEpochAndXid(epoch, xid);
+
+ /*
+ * We still need to take a lock to modify the value when there are
+ * concurrent readers.
+ */
+ LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
+ ShmemVariableCache->nextXid = newNextFullXid;
+ LWLockRelease(XidGenLock);
+}
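+
+/*
+ * Worked example of the wrap detection above, with made-up numbers: if
+ * nextXid is epoch 5 / xid 0xFFFFFFF0 and we are asked to advance past xid
+ * 0xFFFFFFFF, TransactionIdAdvance() wraps the xid around to
+ * FirstNormalTransactionId (3).  Since 3 < 0xFFFFFFF0 in plain unsigned
+ * comparison, the epoch is bumped, and nextXid becomes epoch 6 / xid 3.
+ */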
+
+/*
+ * Advance the cluster-wide value for the oldest valid clog entry.
+ *
+ * We must acquire XactTruncationLock to advance the oldestClogXid. It's not
+ * necessary to hold the lock during the actual clog truncation, only when we
+ * advance the limit, as code looking up arbitrary xids is required to hold
+ * XactTruncationLock from when it tests oldestClogXid through to when it
+ * completes the clog lookup.
+ */
+void
+AdvanceOldestClogXid(TransactionId oldest_datfrozenxid)
+{
+ LWLockAcquire(XactTruncationLock, LW_EXCLUSIVE);
+ if (TransactionIdPrecedes(ShmemVariableCache->oldestClogXid,
+ oldest_datfrozenxid))
+ {
+ ShmemVariableCache->oldestClogXid = oldest_datfrozenxid;
+ }
+ LWLockRelease(XactTruncationLock);
+}
+
+/*
+ * Determine the last safe XID to allocate using the currently oldest
+ * datfrozenxid (ie, the oldest XID that might exist in any database
+ * of our cluster), and the OID of the (or a) database with that value.
+ */
+void
+SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid)
+{
+ TransactionId xidVacLimit;
+ TransactionId xidWarnLimit;
+ TransactionId xidStopLimit;
+ TransactionId xidWrapLimit;
+ TransactionId curXid;
+
+ Assert(TransactionIdIsNormal(oldest_datfrozenxid));
+
+ /*
+ * The place where we actually get into deep trouble is halfway around
+ * from the oldest potentially-existing XID. (This calculation is
+ * probably off by one or two counts, because the special XIDs reduce the
+ * size of the loop a little bit. But we throw in plenty of slop below,
+ * so it doesn't matter.)
+ */
+ xidWrapLimit = oldest_datfrozenxid + (MaxTransactionId >> 1);
+ if (xidWrapLimit < FirstNormalTransactionId)
+ xidWrapLimit += FirstNormalTransactionId;
+
+ /*
+ * We'll refuse to continue assigning XIDs in interactive mode once we get
+ * within 3M transactions of data loss. This leaves lots of room for the
+ * DBA to fool around fixing things in a standalone backend, while not
+ * being significant compared to total XID space. (VACUUM requires an XID
+ * if it truncates at wal_level!=minimal. "VACUUM (ANALYZE)", which a DBA
+ * might do by reflex, assigns an XID. Hence, we had better be sure
+ * there's lots of XIDs left...) Also, at default BLCKSZ, this leaves two
+ * completely-idle segments. In the event of edge-case bugs involving
+ * page or segment arithmetic, idle segments render the bugs unreachable
+ * outside of single-user mode.
+ */
+ xidStopLimit = xidWrapLimit - 3000000;
+ if (xidStopLimit < FirstNormalTransactionId)
+ xidStopLimit -= FirstNormalTransactionId;
+
+ /*
+ * We'll start complaining loudly when we get within 40M transactions of
+ * data loss. This is kind of arbitrary, but if you let your gas gauge
+ * get down to 2% of full, would you be looking for the next gas station?
+ * We need to be fairly liberal about this number because there are lots
+ * of scenarios where most transactions are done by automatic clients that
+ * won't pay attention to warnings. (No, we're not gonna make this
+ * configurable. If you know enough to configure it, you know enough to
+ * not get in this kind of trouble in the first place.)
+ */
+ xidWarnLimit = xidWrapLimit - 40000000;
+ if (xidWarnLimit < FirstNormalTransactionId)
+ xidWarnLimit -= FirstNormalTransactionId;
+
+ /*
+ * We'll start trying to force autovacuums when oldest_datfrozenxid gets
+ * to be more than autovacuum_freeze_max_age transactions old.
+ *
+ * Note: guc.c ensures that autovacuum_freeze_max_age is in a sane range,
+ * so that xidVacLimit will be well before xidWarnLimit.
+ *
+ * Note: autovacuum_freeze_max_age is a PGC_POSTMASTER parameter so that
+ * we don't have to worry about dealing with on-the-fly changes in its
+ * value. It doesn't look practical to update shared state from a GUC
+ * assign hook (too many processes would try to execute the hook,
+ * resulting in race conditions as well as crashes of those not connected
+ * to shared memory). Perhaps this can be improved someday. See also
+ * SetMultiXactIdLimit.
+ */
+ xidVacLimit = oldest_datfrozenxid + autovacuum_freeze_max_age;
+ if (xidVacLimit < FirstNormalTransactionId)
+ xidVacLimit += FirstNormalTransactionId;
+
+ /* Grab lock for just long enough to set the new limit values */
+ LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
+ ShmemVariableCache->oldestXid = oldest_datfrozenxid;
+ ShmemVariableCache->xidVacLimit = xidVacLimit;
+ ShmemVariableCache->xidWarnLimit = xidWarnLimit;
+ ShmemVariableCache->xidStopLimit = xidStopLimit;
+ ShmemVariableCache->xidWrapLimit = xidWrapLimit;
+ ShmemVariableCache->oldestXidDB = oldest_datoid;
+ curXid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+ LWLockRelease(XidGenLock);
+
+ /* Log the info */
+ ereport(DEBUG1,
+ (errmsg_internal("transaction ID wrap limit is %u, limited by database with OID %u",
+ xidWrapLimit, oldest_datoid)));
+
+ /*
+ * If past the autovacuum force point, immediately signal an autovac
+ * request. The reason for this is that autovac only processes one
+ * database per invocation. Once it's finished cleaning up the oldest
+ * database, it'll call here, and we'll signal the postmaster to start
+ * another iteration immediately if there are still any old databases.
+ */
+ if (TransactionIdFollowsOrEquals(curXid, xidVacLimit) &&
+ IsUnderPostmaster && !InRecovery)
+ SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
+
+ /* Give an immediate warning if past the wrap warn point */
+ if (TransactionIdFollowsOrEquals(curXid, xidWarnLimit) && !InRecovery)
+ {
+ char *oldest_datname;
+
+ /*
+ * We can be called when not inside a transaction, for example during
+ * StartupXLOG(). In such a case we cannot do database access, so we
+ * must just report the oldest DB's OID.
+ *
+ * Note: it's also possible that get_database_name fails and returns
+ * NULL, for example because the database just got dropped. We'll
+ * still warn, even though the warning might now be unnecessary.
+ */
+ if (IsTransactionState())
+ oldest_datname = get_database_name(oldest_datoid);
+ else
+ oldest_datname = NULL;
+
+ if (oldest_datname)
+ ereport(WARNING,
+ (errmsg("database \"%s\" must be vacuumed within %u transactions",
+ oldest_datname,
+ xidWrapLimit - curXid),
+ errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
+ "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
+ else
+ ereport(WARNING,
+ (errmsg("database with OID %u must be vacuumed within %u transactions",
+ oldest_datoid,
+ xidWrapLimit - curXid),
+ errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
+ "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
+ }
+}
+
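+/*
+ * A worked example of the arithmetic above, with made-up inputs: given
+ * oldest_datfrozenxid = 1000 and autovacuum_freeze_max_age = 200000000,
+ * xidWrapLimit = 1000 + (MaxTransactionId >> 1) = 2147484647, xidStopLimit
+ * is 3 million before that (2144484647), xidWarnLimit is 40 million before
+ * that (2107484647), and xidVacLimit = 200001000.  None of these wrap past
+ * FirstNormalTransactionId, so the adjustment branches are not taken.
+ */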
+
+/*
+ * ForceTransactionIdLimitUpdate -- does the XID wrap-limit data need updating?
+ *
+ * We primarily check whether oldestXidDB is valid. The cases we have in
+ * mind are that that database was dropped, or the field was reset to zero
+ * by pg_resetwal. In either case we should force recalculation of the
+ * wrap limit. Also do it if oldestXid is old enough to be forcing
+ * autovacuums or other actions; this ensures we update our state as soon
+ * as possible once extra overhead is being incurred.
+ */
+bool
+ForceTransactionIdLimitUpdate(void)
+{
+ TransactionId nextXid;
+ TransactionId xidVacLimit;
+ TransactionId oldestXid;
+ Oid oldestXidDB;
+
+ /* Locking is probably not really necessary, but let's be careful */
+ LWLockAcquire(XidGenLock, LW_SHARED);
+ nextXid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+ xidVacLimit = ShmemVariableCache->xidVacLimit;
+ oldestXid = ShmemVariableCache->oldestXid;
+ oldestXidDB = ShmemVariableCache->oldestXidDB;
+ LWLockRelease(XidGenLock);
+
+ if (!TransactionIdIsNormal(oldestXid))
+ return true; /* shouldn't happen, but just in case */
+ if (!TransactionIdIsValid(xidVacLimit))
+ return true; /* this shouldn't happen anymore either */
+ if (TransactionIdFollowsOrEquals(nextXid, xidVacLimit))
+ return true; /* past xidVacLimit, don't delay updating */
+ if (!SearchSysCacheExists1(DATABASEOID, ObjectIdGetDatum(oldestXidDB)))
+ return true; /* could happen, per comments above */
+ return false;
+}
+
+
+/*
+ * GetNewObjectId -- allocate a new OID
+ *
+ * OIDs are generated by a cluster-wide counter. Since they are only 32 bits
+ * wide, counter wraparound will occur eventually, and therefore it is unwise
+ * to assume they are unique unless precautions are taken to make them so.
+ * Hence, this routine should generally not be used directly. The only direct
+ * callers should be GetNewOidWithIndex() and GetNewRelFileNode() in
+ * catalog/catalog.c.
+ */
+Oid
+GetNewObjectId(void)
+{
+ Oid result;
+
+ /* safety check, we should never get this far in a HS standby */
+ if (RecoveryInProgress())
+ elog(ERROR, "cannot assign OIDs during recovery");
+
+ LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
+
+ /*
+ * Check for wraparound of the OID counter. We *must* not return 0
+ * (InvalidOid), and in normal operation we mustn't return anything below
+ * FirstNormalObjectId since that range is reserved for initdb (see
+ * IsCatalogRelationOid()). Note we are relying on unsigned comparison.
+ *
+ * During initdb, we start the OID generator at FirstGenbkiObjectId, so we
+ * only wrap if before that point when in bootstrap or standalone mode.
+ * The first time through this routine after normal postmaster start, the
+ * counter will be forced up to FirstNormalObjectId. This mechanism
+ * leaves the OIDs between FirstGenbkiObjectId and FirstNormalObjectId
+ * available for automatic assignment during initdb, while ensuring they
+ * will never conflict with user-assigned OIDs.
+ */
+ if (ShmemVariableCache->nextOid < ((Oid) FirstNormalObjectId))
+ {
+ if (IsPostmasterEnvironment)
+ {
+ /* wraparound, or first post-initdb assignment, in normal mode */
+ ShmemVariableCache->nextOid = FirstNormalObjectId;
+ ShmemVariableCache->oidCount = 0;
+ }
+ else
+ {
+ /* we may be bootstrapping, so don't enforce the full range */
+ if (ShmemVariableCache->nextOid < ((Oid) FirstGenbkiObjectId))
+ {
+ /* wraparound in standalone mode (unlikely but possible) */
+ ShmemVariableCache->nextOid = FirstNormalObjectId;
+ ShmemVariableCache->oidCount = 0;
+ }
+ }
+ }
+
+ /* If we have run out of logged-for-use OIDs, we must log more */
+ if (ShmemVariableCache->oidCount == 0)
+ {
+ XLogPutNextOid(ShmemVariableCache->nextOid + VAR_OID_PREFETCH);
+ ShmemVariableCache->oidCount = VAR_OID_PREFETCH;
+ }
+
+ result = ShmemVariableCache->nextOid;
+
+ (ShmemVariableCache->nextOid)++;
+ (ShmemVariableCache->oidCount)--;
+
+ LWLockRelease(OidGenLock);
+
+ return result;
+}
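+
+/*
+ * Worked example of the prefetch accounting above: if nextOid is 20000 and
+ * oidCount is 0, XLogPutNextOid(28192) is logged and the next 8192 calls
+ * hand out OIDs 20000..28191 with no further WAL traffic.  After a crash,
+ * replay of that record restarts the counter at 28192, skipping (never
+ * reusing) any assignments that were not individually logged.
+ */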
+
+/*
+ * SetNextObjectId
+ *
+ * This may only be called during initdb; it advances the OID counter
+ * to the specified value.
+ */
+static void
+SetNextObjectId(Oid nextOid)
+{
+ /* Safety check, this is only allowable during initdb */
+ if (IsPostmasterEnvironment)
+ elog(ERROR, "cannot advance OID counter anymore");
+
+ /* Taking the lock is, therefore, just pro forma; but do it anyway */
+ LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
+
+ if (ShmemVariableCache->nextOid > nextOid)
+ elog(ERROR, "too late to advance OID counter to %u, it is now %u",
+ nextOid, ShmemVariableCache->nextOid);
+
+ ShmemVariableCache->nextOid = nextOid;
+ ShmemVariableCache->oidCount = 0;
+
+ LWLockRelease(OidGenLock);
+}
+
+/*
+ * StopGeneratingPinnedObjectIds
+ *
+ * This is called once during initdb to force the OID counter up to
+ * FirstUnpinnedObjectId. This supports letting initdb's post-bootstrap
+ * processing create some pinned objects early on. Once it's done doing
+ * so, it calls this (via pg_stop_making_pinned_objects()) so that the
+ * remaining objects it makes will be considered un-pinned.
+ */
+void
+StopGeneratingPinnedObjectIds(void)
+{
+ SetNextObjectId(FirstUnpinnedObjectId);
+}
+
+
+#ifdef USE_ASSERT_CHECKING
+
+/*
+ * Assert that xid is between [oldestXid, nextXid], which is the range we
+ * expect XIDs coming from tables etc to be in.
+ *
+ * As ShmemVariableCache->oldestXid could change just after this call without
+ * further precautions, and as a wrapped-around xid could again fall within
+ * the valid range, this assertion can only detect if something is definitely
+ * wrong, but not establish correctness.
+ *
+ * This intentionally does not expose a return value, to avoid code being
+ * introduced that depends on the return value.
+ */
+void
+AssertTransactionIdInAllowableRange(TransactionId xid)
+{
+ TransactionId oldest_xid;
+ TransactionId next_xid;
+
+ Assert(TransactionIdIsValid(xid));
+
+ /* we may see bootstrap / frozen */
+ if (!TransactionIdIsNormal(xid))
+ return;
+
+ /*
+ * We can't acquire XidGenLock, as this may be called with XidGenLock
+ * already held (or with other locks that don't allow XidGenLock to be
+ * nested). That's ok for our purposes though, since we already rely on
+ * 32bit reads to be atomic. While nextXid is 64 bit, we only look at the
+ * lower 32bit, so a skewed read doesn't hurt.
+ *
+ * There's no increased danger of falling outside [oldest, next] by
+ * accessing them without a lock. xid needs to have been created with
+ * GetNewTransactionId() in the originating session, and the locks there
+ * pair with the memory barrier below. We do however accept xid to be <=
+ * to next_xid, instead of just <, as xid could be from the procarray,
+ * before we see the updated nextXid value.
+ */
+ pg_memory_barrier();
+ oldest_xid = ShmemVariableCache->oldestXid;
+ next_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+
+ Assert(TransactionIdFollowsOrEquals(xid, oldest_xid) ||
+ TransactionIdPrecedesOrEquals(xid, next_xid));
+}
+#endif
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
new file mode 100644
index 0000000..e0c7ad1
--- /dev/null
+++ b/src/backend/access/transam/xact.c
@@ -0,0 +1,6249 @@
+/*-------------------------------------------------------------------------
+ *
+ * xact.c
+ * top level transaction system support routines
+ *
+ * See src/backend/access/transam/README for more information.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/transam/xact.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <time.h>
+#include <unistd.h>
+
+#include "access/commit_ts.h"
+#include "access/multixact.h"
+#include "access/parallel.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "access/xloginsert.h"
+#include "access/xlogrecovery.h"
+#include "access/xlogutils.h"
+#include "catalog/index.h"
+#include "catalog/namespace.h"
+#include "catalog/pg_enum.h"
+#include "catalog/storage.h"
+#include "commands/async.h"
+#include "commands/tablecmds.h"
+#include "commands/trigger.h"
+#include "common/pg_prng.h"
+#include "executor/spi.h"
+#include "libpq/be-fsstubs.h"
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "replication/logical.h"
+#include "replication/logicallauncher.h"
+#include "replication/origin.h"
+#include "replication/snapbuild.h"
+#include "replication/syncrep.h"
+#include "replication/walsender.h"
+#include "storage/condition_variable.h"
+#include "storage/fd.h"
+#include "storage/lmgr.h"
+#include "storage/md.h"
+#include "storage/predicate.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/sinvaladt.h"
+#include "storage/smgr.h"
+#include "utils/builtins.h"
+#include "utils/catcache.h"
+#include "utils/combocid.h"
+#include "utils/guc.h"
+#include "utils/inval.h"
+#include "utils/memutils.h"
+#include "utils/relmapper.h"
+#include "utils/snapmgr.h"
+#include "utils/timeout.h"
+#include "utils/timestamp.h"
+
+/*
+ * User-tweakable parameters
+ */
+int DefaultXactIsoLevel = XACT_READ_COMMITTED;
+int XactIsoLevel;
+
+bool DefaultXactReadOnly = false;
+bool XactReadOnly;
+
+bool DefaultXactDeferrable = false;
+bool XactDeferrable;
+
+int synchronous_commit = SYNCHRONOUS_COMMIT_ON;
+
+/*
+ * CheckXidAlive is a xid value pointing to a possibly ongoing (sub)
+ * transaction. Currently, it is used in logical decoding. It's possible
+ * that such transactions can get aborted while the decoding is ongoing in
+ * which case we skip decoding that particular transaction. To ensure that we
+ * check whether the CheckXidAlive is aborted after fetching the tuple from
+ * system tables. We also ensure that during logical decoding we never
+ * directly access the tableam or heap APIs because we are checking for the
+ * concurrent aborts only in systable_* APIs.
+ */
+TransactionId CheckXidAlive = InvalidTransactionId;
+bool bsysscan = false;
+
+/*
+ * When running as a parallel worker, we place only a single
+ * TransactionStateData on the parallel worker's state stack, and the XID
+ * reflected there will be that of the *innermost* currently-active
+ * subtransaction in the backend that initiated parallelism. However,
+ * GetTopTransactionId() and TransactionIdIsCurrentTransactionId()
+ * need to return the same answers in the parallel worker as they would have
+ * in the user backend, so we need some additional bookkeeping.
+ *
+ * XactTopFullTransactionId stores the XID of our toplevel transaction, which
+ * will be the same as TopTransactionStateData.fullTransactionId in an
+ * ordinary backend; but in a parallel backend, which does not have the entire
+ * transaction state, it will instead be copied from the backend that started
+ * the parallel operation.
+ *
+ * nParallelCurrentXids will be 0 and ParallelCurrentXids NULL in an ordinary
+ * backend, but in a parallel backend, nParallelCurrentXids will contain the
+ * number of XIDs that need to be considered current, and ParallelCurrentXids
+ * will contain the XIDs themselves. This includes all XIDs that were current
+ * or sub-committed in the parent at the time the parallel operation began.
+ * The XIDs are stored sorted in numerical order (not logical order) to make
+ * lookups as fast as possible.
+ */
+static FullTransactionId XactTopFullTransactionId = {InvalidTransactionId};
+static int nParallelCurrentXids = 0;
+static TransactionId *ParallelCurrentXids;
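+
+/*
+ * Hedged illustration (not upstream code): because ParallelCurrentXids is
+ * sorted numerically, a worker can test whether an XID is current with a
+ * plain binary search, as TransactionIdIsCurrentTransactionId does below:
+ *
+ *     int low = 0, high = nParallelCurrentXids - 1;
+ *     while (low <= high)
+ *     {
+ *         int middle = low + (high - low) / 2;
+ *         if (ParallelCurrentXids[middle] == xid)
+ *             return true;
+ *         else if (ParallelCurrentXids[middle] < xid)
+ *             low = middle + 1;
+ *         else
+ *             high = middle - 1;
+ *     }
+ *     return false;
+ */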
+
+/*
+ * Miscellaneous flag bits recording events that occur in the top-level
+ * transaction.  These flags are persisted only in MyXactFlags and serve as
+ * reminders to do certain things later in the transaction.  The variable is
+ * globally accessible, so flags can be set from anywhere in the code that
+ * needs to record them.
+ */
+int MyXactFlags;
+
+/*
+ * transaction states - transaction state from server perspective
+ */
+typedef enum TransState
+{
+ TRANS_DEFAULT, /* idle */
+ TRANS_START, /* transaction starting */
+ TRANS_INPROGRESS, /* inside a valid transaction */
+ TRANS_COMMIT, /* commit in progress */
+ TRANS_ABORT, /* abort in progress */
+ TRANS_PREPARE /* prepare in progress */
+} TransState;
+
+/*
+ * transaction block states - transaction state of client queries
+ *
+ * Note: the subtransaction states are used only for non-topmost
+ * transactions; the others appear only in the topmost transaction.
+ */
+typedef enum TBlockState
+{
+ /* not-in-transaction-block states */
+ TBLOCK_DEFAULT, /* idle */
+ TBLOCK_STARTED, /* running single-query transaction */
+
+ /* transaction block states */
+ TBLOCK_BEGIN, /* starting transaction block */
+ TBLOCK_INPROGRESS, /* live transaction */
+ TBLOCK_IMPLICIT_INPROGRESS, /* live transaction after implicit BEGIN */
+ TBLOCK_PARALLEL_INPROGRESS, /* live transaction inside parallel worker */
+ TBLOCK_END, /* COMMIT received */
+ TBLOCK_ABORT, /* failed xact, awaiting ROLLBACK */
+ TBLOCK_ABORT_END, /* failed xact, ROLLBACK received */
+ TBLOCK_ABORT_PENDING, /* live xact, ROLLBACK received */
+ TBLOCK_PREPARE, /* live xact, PREPARE received */
+
+ /* subtransaction states */
+ TBLOCK_SUBBEGIN, /* starting a subtransaction */
+ TBLOCK_SUBINPROGRESS, /* live subtransaction */
+ TBLOCK_SUBRELEASE, /* RELEASE received */
+ TBLOCK_SUBCOMMIT, /* COMMIT received while TBLOCK_SUBINPROGRESS */
+ TBLOCK_SUBABORT, /* failed subxact, awaiting ROLLBACK */
+ TBLOCK_SUBABORT_END, /* failed subxact, ROLLBACK received */
+ TBLOCK_SUBABORT_PENDING, /* live subxact, ROLLBACK received */
+ TBLOCK_SUBRESTART, /* live subxact, ROLLBACK TO received */
+ TBLOCK_SUBABORT_RESTART /* failed subxact, ROLLBACK TO received */
+} TBlockState;
+
+/*
+ * transaction state structure
+ */
+typedef struct TransactionStateData
+{
+ FullTransactionId fullTransactionId; /* my FullTransactionId */
+ SubTransactionId subTransactionId; /* my subxact ID */
+ char *name; /* savepoint name, if any */
+ int savepointLevel; /* savepoint level */
+ TransState state; /* low-level state */
+ TBlockState blockState; /* high-level state */
+ int nestingLevel; /* transaction nesting depth */
+ int gucNestLevel; /* GUC context nesting depth */
+ MemoryContext curTransactionContext; /* my xact-lifetime context */
+ ResourceOwner curTransactionOwner; /* my query resources */
+ TransactionId *childXids; /* subcommitted child XIDs, in XID order */
+ int nChildXids; /* # of subcommitted child XIDs */
+ int maxChildXids; /* allocated size of childXids[] */
+ Oid prevUser; /* previous CurrentUserId setting */
+ int prevSecContext; /* previous SecurityRestrictionContext */
+ bool prevXactReadOnly; /* entry-time xact r/o state */
+ bool startedInRecovery; /* did we start in recovery? */
+ bool didLogXid; /* has xid been included in WAL record? */
+ int parallelModeLevel; /* Enter/ExitParallelMode counter */
+ bool chain; /* start a new block after this one */
+ bool topXidLogged; /* for a subxact: is top-level XID logged? */
+ struct TransactionStateData *parent; /* back link to parent */
+} TransactionStateData;
+
+typedef TransactionStateData *TransactionState;
+
+/*
+ * Serialized representation used to transmit transaction state to parallel
+ * workers through shared memory.
+ */
+typedef struct SerializedTransactionState
+{
+ int xactIsoLevel;
+ bool xactDeferrable;
+ FullTransactionId topFullTransactionId;
+ FullTransactionId currentFullTransactionId;
+ CommandId currentCommandId;
+ int nParallelCurrentXids;
+ TransactionId parallelCurrentXids[FLEXIBLE_ARRAY_MEMBER];
+} SerializedTransactionState;
+
+/* The size of SerializedTransactionState, not including the final array. */
+#define SerializedTransactionStateHeaderSize \
+ offsetof(SerializedTransactionState, parallelCurrentXids)
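+
+/*
+ * For illustration (an assumption about the serialization code elsewhere in
+ * this file): the space needed to serialize the transaction state is the
+ * fixed header plus one TransactionId per XID that must be considered
+ * current, e.g.
+ *
+ *     Size needed = SerializedTransactionStateHeaderSize +
+ *                   nxids * sizeof(TransactionId);
+ */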
+
+/*
+ * CurrentTransactionState always points to the current transaction state
+ * block. It will point to TopTransactionStateData when not in a
+ * transaction at all, or when in a top-level transaction.
+ */
+static TransactionStateData TopTransactionStateData = {
+ .state = TRANS_DEFAULT,
+ .blockState = TBLOCK_DEFAULT,
+ .topXidLogged = false,
+};
+
+/*
+ * unreportedXids holds XIDs of all subtransactions that have not yet been
+ * reported in an XLOG_XACT_ASSIGNMENT record.
+ */
+static int nUnreportedXids;
+static TransactionId unreportedXids[PGPROC_MAX_CACHED_SUBXIDS];
+
+static TransactionState CurrentTransactionState = &TopTransactionStateData;
+
+/*
+ * The subtransaction ID and command ID assignment counters are global
+ * to a whole transaction, so we do not keep them in the state stack.
+ */
+static SubTransactionId currentSubTransactionId;
+static CommandId currentCommandId;
+static bool currentCommandIdUsed;
+
+/*
+ * xactStartTimestamp is the value of transaction_timestamp().
+ * stmtStartTimestamp is the value of statement_timestamp().
+ * xactStopTimestamp is the time at which we log a commit or abort WAL record.
+ * These do not change as we enter and exit subtransactions, so we don't
+ * keep them inside the TransactionState stack.
+ */
+static TimestampTz xactStartTimestamp;
+static TimestampTz stmtStartTimestamp;
+static TimestampTz xactStopTimestamp;
+
+/*
+ * GID to be used for preparing the current transaction. This is also
+ * global to a whole transaction, so we don't keep it in the state stack.
+ */
+static char *prepareGID;
+
+/*
+ * Some commands want to force synchronous commit.
+ */
+static bool forceSyncCommit = false;
+
+/* Flag for logging statements in a transaction. */
+bool xact_is_sampled = false;
+
+/*
+ * Private context for transaction-abort work --- we reserve space for this
+ * at startup to ensure that AbortTransaction and AbortSubTransaction can work
+ * when we've run out of memory.
+ */
+static MemoryContext TransactionAbortContext = NULL;
+
+/*
+ * List of add-on start- and end-of-xact callbacks
+ */
+typedef struct XactCallbackItem
+{
+ struct XactCallbackItem *next;
+ XactCallback callback;
+ void *arg;
+} XactCallbackItem;
+
+static XactCallbackItem *Xact_callbacks = NULL;
+
+/*
+ * List of add-on start- and end-of-subxact callbacks
+ */
+typedef struct SubXactCallbackItem
+{
+ struct SubXactCallbackItem *next;
+ SubXactCallback callback;
+ void *arg;
+} SubXactCallbackItem;
+
+static SubXactCallbackItem *SubXact_callbacks = NULL;
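+
+/*
+ * Illustrative sketch (not part of the original source): extensions
+ * typically populate these lists through RegisterXactCallback() and
+ * RegisterSubXactCallback(), declared in access/xact.h, e.g.
+ *
+ *     static void
+ *     my_xact_callback(XactEvent event, void *arg)
+ *     {
+ *         if (event == XACT_EVENT_COMMIT)
+ *             elog(DEBUG1, "top-level transaction committed");
+ *     }
+ *
+ *     RegisterXactCallback(my_xact_callback, NULL);
+ */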
+
+
+/* local function prototypes */
+static void AssignTransactionId(TransactionState s);
+static void AbortTransaction(void);
+static void AtAbort_Memory(void);
+static void AtCleanup_Memory(void);
+static void AtAbort_ResourceOwner(void);
+static void AtCCI_LocalCache(void);
+static void AtCommit_Memory(void);
+static void AtStart_Cache(void);
+static void AtStart_Memory(void);
+static void AtStart_ResourceOwner(void);
+static void CallXactCallbacks(XactEvent event);
+static void CallSubXactCallbacks(SubXactEvent event,
+ SubTransactionId mySubid,
+ SubTransactionId parentSubid);
+static void CleanupTransaction(void);
+static void CheckTransactionBlock(bool isTopLevel, bool throwError,
+ const char *stmtType);
+static void CommitTransaction(void);
+static TransactionId RecordTransactionAbort(bool isSubXact);
+static void StartTransaction(void);
+
+static void StartSubTransaction(void);
+static void CommitSubTransaction(void);
+static void AbortSubTransaction(void);
+static void CleanupSubTransaction(void);
+static void PushTransaction(void);
+static void PopTransaction(void);
+
+static void AtSubAbort_Memory(void);
+static void AtSubCleanup_Memory(void);
+static void AtSubAbort_ResourceOwner(void);
+static void AtSubCommit_Memory(void);
+static void AtSubStart_Memory(void);
+static void AtSubStart_ResourceOwner(void);
+
+static void ShowTransactionState(const char *str);
+static void ShowTransactionStateRec(const char *str, TransactionState state);
+static const char *BlockStateAsString(TBlockState blockState);
+static const char *TransStateAsString(TransState state);
+
+
+/* ----------------------------------------------------------------
+ * transaction state accessors
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * IsTransactionState
+ *
+ * This returns true if we are inside a valid transaction; that is,
+ * it is safe to initiate database access, take heavyweight locks, etc.
+ */
+bool
+IsTransactionState(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ /*
+ * TRANS_DEFAULT and TRANS_ABORT are obviously unsafe states. However, we
+ * also reject the startup/shutdown states TRANS_START, TRANS_COMMIT, and
+ * TRANS_PREPARE, since it might be too soon or too late within those
+ * transition states to do anything interesting. Hence, the only "valid"
+ * state is TRANS_INPROGRESS.
+ */
+ return (s->state == TRANS_INPROGRESS);
+}
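+
+/*
+ * Usage sketch (illustrative, not upstream code): callers about to touch
+ * the catalogs typically assert this first, e.g.
+ *
+ *     Assert(IsTransactionState());
+ *     tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relid));
+ */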
+
+/*
+ * IsAbortedTransactionBlockState
+ *
+ * This returns true if we are within an aborted transaction block.
+ */
+bool
+IsAbortedTransactionBlockState(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ if (s->blockState == TBLOCK_ABORT ||
+ s->blockState == TBLOCK_SUBABORT)
+ return true;
+
+ return false;
+}
+
+
+/*
+ * GetTopTransactionId
+ *
+ * This will return the XID of the main transaction, assigning one if
+ * it's not yet set. Be careful to call this only inside a valid xact.
+ */
+TransactionId
+GetTopTransactionId(void)
+{
+ if (!FullTransactionIdIsValid(XactTopFullTransactionId))
+ AssignTransactionId(&TopTransactionStateData);
+ return XidFromFullTransactionId(XactTopFullTransactionId);
+}
+
+/*
+ * GetTopTransactionIdIfAny
+ *
+ * This will return the XID of the main transaction, if one is assigned.
+ * It will return InvalidTransactionId if we are not currently inside a
+ * transaction, or inside a transaction that hasn't yet been assigned an XID.
+ */
+TransactionId
+GetTopTransactionIdIfAny(void)
+{
+ return XidFromFullTransactionId(XactTopFullTransactionId);
+}
+
+/*
+ * GetCurrentTransactionId
+ *
+ * This will return the XID of the current transaction (main or sub
+ * transaction), assigning one if it's not yet set. Be careful to call this
+ * only inside a valid xact.
+ */
+TransactionId
+GetCurrentTransactionId(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ if (!FullTransactionIdIsValid(s->fullTransactionId))
+ AssignTransactionId(s);
+ return XidFromFullTransactionId(s->fullTransactionId);
+}
+
+/*
+ * GetCurrentTransactionIdIfAny
+ *
+ * This will return the XID of the current sub xact, if one is assigned.
+ * It will return InvalidTransactionId if we are not currently inside a
+ * transaction, or inside a transaction that hasn't been assigned an XID yet.
+ */
+TransactionId
+GetCurrentTransactionIdIfAny(void)
+{
+ return XidFromFullTransactionId(CurrentTransactionState->fullTransactionId);
+}
+
+/*
+ * GetTopFullTransactionId
+ *
+ * This will return the FullTransactionId of the main transaction, assigning
+ * one if it's not yet set. Be careful to call this only inside a valid xact.
+ */
+FullTransactionId
+GetTopFullTransactionId(void)
+{
+ if (!FullTransactionIdIsValid(XactTopFullTransactionId))
+ AssignTransactionId(&TopTransactionStateData);
+ return XactTopFullTransactionId;
+}
+
+/*
+ * GetTopFullTransactionIdIfAny
+ *
+ * This will return the FullTransactionId of the main transaction, if one is
+ * assigned. It will return InvalidFullTransactionId if we are not currently
+ * inside a transaction, or inside a transaction that hasn't yet been assigned
+ * one.
+ */
+FullTransactionId
+GetTopFullTransactionIdIfAny(void)
+{
+ return XactTopFullTransactionId;
+}
+
+/*
+ * GetCurrentFullTransactionId
+ *
+ * This will return the FullTransactionId of the current transaction (main or
+ * sub transaction), assigning one if it's not yet set. Be careful to call
+ * this only inside a valid xact.
+ */
+FullTransactionId
+GetCurrentFullTransactionId(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ if (!FullTransactionIdIsValid(s->fullTransactionId))
+ AssignTransactionId(s);
+ return s->fullTransactionId;
+}
+
+/*
+ * GetCurrentFullTransactionIdIfAny
+ *
+ * This will return the FullTransactionId of the current sub xact, if one is
+ * assigned. It will return InvalidFullTransactionId if we are not currently
+ * inside a transaction, or inside a transaction that hasn't been assigned one
+ * yet.
+ */
+FullTransactionId
+GetCurrentFullTransactionIdIfAny(void)
+{
+ return CurrentTransactionState->fullTransactionId;
+}
+
+/*
+ * MarkCurrentTransactionIdLoggedIfAny
+ *
+ * Remember that the current xid, if one is assigned, has now been WAL-logged.
+ */
+void
+MarkCurrentTransactionIdLoggedIfAny(void)
+{
+ if (FullTransactionIdIsValid(CurrentTransactionState->fullTransactionId))
+ CurrentTransactionState->didLogXid = true;
+}
+
+/*
+ * IsSubxactTopXidLogPending
+ *
+ * This is used to decide whether we need to WAL log the top-level XID for
+ * an operation in a subtransaction.  We require that for logical decoding;
+ * see LogicalDecodingProcessRecord.
+ *
+ * This returns true if wal_level >= logical and we are inside a valid
+ * subtransaction, for which the assignment was not yet written to any WAL
+ * record.
+ */
+bool
+IsSubxactTopXidLogPending(void)
+{
+ /* check whether it is already logged */
+ if (CurrentTransactionState->topXidLogged)
+ return false;
+
+ /* wal_level has to be logical */
+ if (!XLogLogicalInfoActive())
+ return false;
+
+ /* we need to be in a transaction state */
+ if (!IsTransactionState())
+ return false;
+
+ /* it has to be a subtransaction */
+ if (!IsSubTransaction())
+ return false;
+
+ /* the subtransaction has to have a XID assigned */
+ if (!TransactionIdIsValid(GetCurrentTransactionIdIfAny()))
+ return false;
+
+ return true;
+}
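+
+/*
+ * Hedged sketch of the intended call pattern (an assumption based on the
+ * comment above): WAL insertion code can include the top-level XID in a
+ * subtransaction's next record roughly as
+ *
+ *     if (IsSubxactTopXidLogPending())
+ *     {
+ *         ... include GetTopTransactionIdIfAny() in the record ...
+ *         MarkSubxactTopXidLogged();
+ *     }
+ */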
+
+/*
+ * MarkSubxactTopXidLogged
+ *
+ * Remember that the top transaction id for the current subtransaction is WAL
+ * logged now.
+ */
+void
+MarkSubxactTopXidLogged(void)
+{
+ Assert(IsSubxactTopXidLogPending());
+
+ CurrentTransactionState->topXidLogged = true;
+}
+
+/*
+ * GetStableLatestTransactionId
+ *
+ * Get the transaction's XID if it has one, else read the next-to-be-assigned
+ * XID. Once we have a value, return that same value for the remainder of the
+ * current transaction. This is meant to provide the reference point for the
+ * age(xid) function, but might be useful for other maintenance tasks as well.
+ */
+TransactionId
+GetStableLatestTransactionId(void)
+{
+ static LocalTransactionId lxid = InvalidLocalTransactionId;
+ static TransactionId stablexid = InvalidTransactionId;
+
+ if (lxid != MyProc->lxid)
+ {
+ lxid = MyProc->lxid;
+ stablexid = GetTopTransactionIdIfAny();
+ if (!TransactionIdIsValid(stablexid))
+ stablexid = ReadNextTransactionId();
+ }
+
+ Assert(TransactionIdIsValid(stablexid));
+
+ return stablexid;
+}
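+
+/*
+ * Example, for illustration only: the SQL age() function computes its
+ * result against the XID returned here, so repeated calls such as
+ *
+ *     SELECT age(relfrozenxid) FROM pg_class WHERE oid = 'pg_class'::regclass;
+ *
+ * see a stable reference point for the rest of the transaction.
+ */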
+
+/*
+ * AssignTransactionId
+ *
+ * Assigns a new permanent FullTransactionId to the given TransactionState.
+ * We do not assign XIDs to transactions until/unless this is called.
+ * Also, any parent TransactionStates that don't yet have XIDs are assigned
+ * one; this maintains the invariant that a child transaction has an XID
+ * following its parent's.
+ */
+static void
+AssignTransactionId(TransactionState s)
+{
+ bool isSubXact = (s->parent != NULL);
+ ResourceOwner currentOwner;
+ bool log_unknown_top = false;
+
+ /* Assert that caller didn't screw up */
+ Assert(!FullTransactionIdIsValid(s->fullTransactionId));
+ Assert(s->state == TRANS_INPROGRESS);
+
+ /*
+ * Workers synchronize transaction state at the beginning of each parallel
+ * operation, so we can't account for new XIDs at this point.
+ */
+ if (IsInParallelMode() || IsParallelWorker())
+ elog(ERROR, "cannot assign XIDs during a parallel operation");
+
+ /*
+ * Ensure parent(s) have XIDs, so that a child always has an XID later
+ * than its parent. Mustn't recurse here, or we might get a stack
+ * overflow if we're at the bottom of a huge stack of subtransactions none
+ * of which have XIDs yet.
+ */
+ if (isSubXact && !FullTransactionIdIsValid(s->parent->fullTransactionId))
+ {
+ TransactionState p = s->parent;
+ TransactionState *parents;
+ size_t parentOffset = 0;
+
+ parents = palloc(sizeof(TransactionState) * s->nestingLevel);
+ while (p != NULL && !FullTransactionIdIsValid(p->fullTransactionId))
+ {
+ parents[parentOffset++] = p;
+ p = p->parent;
+ }
+
+ /*
+ * This is technically a recursive call, but the recursion will never
+ * be more than one layer deep.
+ */
+ while (parentOffset != 0)
+ AssignTransactionId(parents[--parentOffset]);
+
+ pfree(parents);
+ }
+
+ /*
+ * When wal_level=logical, guarantee that a subtransaction's xid can only
+ * be seen in the WAL stream if its toplevel xid has been logged before.
+ * If necessary we log an xact_assignment record with fewer than
+ * PGPROC_MAX_CACHED_SUBXIDS subxids.  Note that it is fine if didLogXid
+ * isn't set
+ * for a transaction even though it appears in a WAL record, we just might
+ * superfluously log something. That can happen when an xid is included
+ * somewhere inside a wal record, but not in XLogRecord->xl_xid, like in
+ * xl_standby_locks.
+ */
+ if (isSubXact && XLogLogicalInfoActive() &&
+ !TopTransactionStateData.didLogXid)
+ log_unknown_top = true;
+
+ /*
+ * Generate a new FullTransactionId and record its xid in PG_PROC and
+ * pg_subtrans.
+ *
+ * NB: we must make the subtrans entry BEFORE the Xid appears anywhere in
+ * shared storage other than PG_PROC; because if there's no room for it in
+ * PG_PROC, the subtrans entry is needed to ensure that other backends see
+ * the Xid as "running". See GetNewTransactionId.
+ */
+ s->fullTransactionId = GetNewTransactionId(isSubXact);
+ if (!isSubXact)
+ XactTopFullTransactionId = s->fullTransactionId;
+
+ if (isSubXact)
+ SubTransSetParent(XidFromFullTransactionId(s->fullTransactionId),
+ XidFromFullTransactionId(s->parent->fullTransactionId));
+
+ /*
+ * If it's a top-level transaction, the predicate locking system needs to
+ * be told about it too.
+ */
+ if (!isSubXact)
+ RegisterPredicateLockingXid(XidFromFullTransactionId(s->fullTransactionId));
+
+ /*
+ * Acquire lock on the transaction XID. (We assume this cannot block.) We
+ * have to ensure that the lock is assigned to the transaction's own
+ * ResourceOwner.
+ */
+ currentOwner = CurrentResourceOwner;
+ CurrentResourceOwner = s->curTransactionOwner;
+
+ XactLockTableInsert(XidFromFullTransactionId(s->fullTransactionId));
+
+ CurrentResourceOwner = currentOwner;
+
+ /*
+ * Every PGPROC_MAX_CACHED_SUBXIDS assigned transaction ids within each
+ * top-level transaction we issue a WAL record for the assignment. We
+ * include the top-level xid and all the subxids that have not yet been
+ * reported using XLOG_XACT_ASSIGNMENT records.
+ *
+ * This is required to limit the amount of shared memory required in a hot
+ * standby server to keep track of in-progress XIDs. See notes for
+ * RecordKnownAssignedTransactionIds().
+ *
+ * We don't keep track of the immediate parent of each subxid, only the
+ * top-level transaction that each subxact belongs to. This is correct in
+ * recovery only because aborted subtransactions are separately WAL
+ * logged.
+ *
+ * This is correct even for the case where several levels above us didn't
+ * have an xid assigned as we recursed up to them beforehand.
+ */
+ if (isSubXact && XLogStandbyInfoActive())
+ {
+ unreportedXids[nUnreportedXids] = XidFromFullTransactionId(s->fullTransactionId);
+ nUnreportedXids++;
+
+ /*
+ * ensure this test matches the similar one in
+ * RecoverPreparedTransactions()
+ */
+ if (nUnreportedXids >= PGPROC_MAX_CACHED_SUBXIDS ||
+ log_unknown_top)
+ {
+ xl_xact_assignment xlrec;
+
+ /*
+ * xtop is always set by now because we recurse up transaction
+ * stack to the highest unassigned xid and then come back down
+ */
+ xlrec.xtop = GetTopTransactionId();
+ Assert(TransactionIdIsValid(xlrec.xtop));
+ xlrec.nsubxacts = nUnreportedXids;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, MinSizeOfXactAssignment);
+ XLogRegisterData((char *) unreportedXids,
+ nUnreportedXids * sizeof(TransactionId));
+
+ (void) XLogInsert(RM_XACT_ID, XLOG_XACT_ASSIGNMENT);
+
+ nUnreportedXids = 0;
+ /* mark top, not current xact as having been logged */
+ TopTransactionStateData.didLogXid = true;
+ }
+ }
+}
+
+/*
+ * GetCurrentSubTransactionId
+ */
+SubTransactionId
+GetCurrentSubTransactionId(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ return s->subTransactionId;
+}
+
+/*
+ * SubTransactionIsActive
+ *
+ * Test if the specified subxact ID is still active. Note caller is
+ * responsible for checking whether this ID is relevant to the current xact.
+ */
+bool
+SubTransactionIsActive(SubTransactionId subxid)
+{
+ TransactionState s;
+
+ for (s = CurrentTransactionState; s != NULL; s = s->parent)
+ {
+ if (s->state == TRANS_ABORT)
+ continue;
+ if (s->subTransactionId == subxid)
+ return true;
+ }
+ return false;
+}
+
+
+/*
+ * GetCurrentCommandId
+ *
+ * "used" must be true if the caller intends to use the command ID to mark
+ * inserted/updated/deleted tuples. false means the ID is being fetched
+ * for read-only purposes (ie, as a snapshot validity cutoff). See
+ * CommandCounterIncrement() for discussion.
+ */
+CommandId
+GetCurrentCommandId(bool used)
+{
+ /* this is global to a transaction, not subtransaction-local */
+ if (used)
+ {
+ /*
+ * Forbid setting currentCommandIdUsed in a parallel worker, because
+ * we have no provision for communicating this back to the leader. We
+ * could relax this restriction when currentCommandIdUsed was already
+ * true at the start of the parallel operation.
+ */
+ Assert(!IsParallelWorker());
+ currentCommandIdUsed = true;
+ }
+ return currentCommandId;
+}
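+
+/*
+ * Caller sketch (illustrative): read-only consumers pass false, while code
+ * that will stamp inserted/updated/deleted tuples passes true, e.g.
+ *
+ *     CommandId snap_cid = GetCurrentCommandId(false);   (snapshot cutoff)
+ *     CommandId dml_cid = GetCurrentCommandId(true);     (marks tuples)
+ */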
+
+/*
+ * SetParallelStartTimestamps
+ *
+ * In a parallel worker, we should inherit the parent transaction's
+ * timestamps rather than setting our own. The parallel worker
+ * infrastructure must call this to provide those values before
+ * calling StartTransaction() or SetCurrentStatementStartTimestamp().
+ */
+void
+SetParallelStartTimestamps(TimestampTz xact_ts, TimestampTz stmt_ts)
+{
+ Assert(IsParallelWorker());
+ xactStartTimestamp = xact_ts;
+ stmtStartTimestamp = stmt_ts;
+}
+
+/*
+ * GetCurrentTransactionStartTimestamp
+ */
+TimestampTz
+GetCurrentTransactionStartTimestamp(void)
+{
+ return xactStartTimestamp;
+}
+
+/*
+ * GetCurrentStatementStartTimestamp
+ */
+TimestampTz
+GetCurrentStatementStartTimestamp(void)
+{
+ return stmtStartTimestamp;
+}
+
+/*
+ * GetCurrentTransactionStopTimestamp
+ *
+ * We return current time if the transaction stop time hasn't been set
+ * (which can happen if we decide we don't need to log an XLOG record).
+ */
+TimestampTz
+GetCurrentTransactionStopTimestamp(void)
+{
+ if (xactStopTimestamp != 0)
+ return xactStopTimestamp;
+ return GetCurrentTimestamp();
+}
+
+/*
+ * SetCurrentStatementStartTimestamp
+ *
+ * In a parallel worker, this should already have been provided by a call
+ * to SetParallelStartTimestamps().
+ */
+void
+SetCurrentStatementStartTimestamp(void)
+{
+ if (!IsParallelWorker())
+ stmtStartTimestamp = GetCurrentTimestamp();
+ else
+ Assert(stmtStartTimestamp != 0);
+}
+
+/*
+ * SetCurrentTransactionStopTimestamp
+ */
+static inline void
+SetCurrentTransactionStopTimestamp(void)
+{
+ xactStopTimestamp = GetCurrentTimestamp();
+}
+
+/*
+ * GetCurrentTransactionNestLevel
+ *
+ * Note: this will return zero when not inside any transaction, one when
+ * inside a top-level transaction, etc.
+ */
+int
+GetCurrentTransactionNestLevel(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ return s->nestingLevel;
+}
+
+
+/*
+ * TransactionIdIsCurrentTransactionId
+ */
+bool
+TransactionIdIsCurrentTransactionId(TransactionId xid)
+{
+ TransactionState s;
+
+ /*
+ * We always say that BootstrapTransactionId is "not my transaction ID"
+ * even when it is (ie, during bootstrap). Along with the fact that
+ * transam.c always treats BootstrapTransactionId as already committed,
+ * this causes the heapam_visibility.c routines to see all tuples as
+ * committed, which is what we need during bootstrap. (Bootstrap mode
+ * only inserts tuples, it never updates or deletes them, so all tuples
+ * can be presumed good immediately.)
+ *
+ * Likewise, InvalidTransactionId and FrozenTransactionId are certainly
+ * not my transaction ID, so we can just return "false" immediately for
+ * any non-normal XID.
+ */
+ if (!TransactionIdIsNormal(xid))
+ return false;
+
+ if (TransactionIdEquals(xid, GetTopTransactionIdIfAny()))
+ return true;
+
+ /*
+ * In parallel workers, the XIDs we must consider as current are stored in
+ * ParallelCurrentXids rather than the transaction-state stack. Note that
+ * the XIDs in this array are sorted numerically rather than according to
+ * TransactionIdPrecedes order.
+ */
+ if (nParallelCurrentXids > 0)
+ {
+ int low,
+ high;
+
+ low = 0;
+ high = nParallelCurrentXids - 1;
+ while (low <= high)
+ {
+ int middle;
+ TransactionId probe;
+
+ middle = low + (high - low) / 2;
+ probe = ParallelCurrentXids[middle];
+ if (probe == xid)
+ return true;
+ else if (probe < xid)
+ low = middle + 1;
+ else
+ high = middle - 1;
+ }
+ return false;
+ }
+
+ /*
+ * We will return true for the Xid of the current subtransaction, any of
+ * its subcommitted children, any of its parents, or any of their
+ * previously subcommitted children. However, a transaction being aborted
+ * is no longer "current", even though it may still have an entry on the
+ * state stack.
+ */
+ for (s = CurrentTransactionState; s != NULL; s = s->parent)
+ {
+ int low,
+ high;
+
+ if (s->state == TRANS_ABORT)
+ continue;
+ if (!FullTransactionIdIsValid(s->fullTransactionId))
+ continue; /* it can't have any child XIDs either */
+ if (TransactionIdEquals(xid, XidFromFullTransactionId(s->fullTransactionId)))
+ return true;
+ /* As the childXids array is ordered, we can use binary search */
+ low = 0;
+ high = s->nChildXids - 1;
+ while (low <= high)
+ {
+ int middle;
+ TransactionId probe;
+
+ middle = low + (high - low) / 2;
+ probe = s->childXids[middle];
+ if (TransactionIdEquals(probe, xid))
+ return true;
+ else if (TransactionIdPrecedes(probe, xid))
+ low = middle + 1;
+ else
+ high = middle - 1;
+ }
+ }
+
+ return false;
+}
+
+/*
+ * TransactionStartedDuringRecovery
+ *
+ * Returns true if the current transaction started while recovery was still
+ * in progress. Recovery might have ended since so RecoveryInProgress() might
+ * return false already.
+ */
+bool
+TransactionStartedDuringRecovery(void)
+{
+ return CurrentTransactionState->startedInRecovery;
+}
+
+/*
+ * EnterParallelMode
+ */
+void
+EnterParallelMode(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ Assert(s->parallelModeLevel >= 0);
+
+ ++s->parallelModeLevel;
+}
+
+/*
+ * ExitParallelMode
+ */
+void
+ExitParallelMode(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ Assert(s->parallelModeLevel > 0);
+ Assert(s->parallelModeLevel > 1 || !ParallelContextActive());
+
+ --s->parallelModeLevel;
+}
+
+/*
+ * IsInParallelMode
+ *
+ * Are we in a parallel operation, as either the leader or a worker? Check
+ * this to prohibit operations that change backend-local state expected to
+ * match across all workers. Mere caches usually don't require such a
+ * restriction. State modified in a strict push/pop fashion, such as the
+ * active snapshot stack, is often fine.
+ */
+bool
+IsInParallelMode(void)
+{
+ return CurrentTransactionState->parallelModeLevel != 0;
+}
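+
+/*
+ * Guard sketch (an assumption, mirroring the error style used elsewhere in
+ * this file): operations that would change state expected to match across
+ * workers typically check
+ *
+ *     if (IsInParallelMode())
+ *         elog(ERROR, "cannot perform this operation during a parallel operation");
+ */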
+
+/*
+ * CommandCounterIncrement
+ */
+void
+CommandCounterIncrement(void)
+{
+ /*
+ * If the current value of the command counter hasn't been "used" to mark
+ * tuples, we need not increment it, since there's no need to distinguish
+ * a read-only command from others. This helps postpone command counter
+ * overflow, and keeps no-op CommandCounterIncrement operations cheap.
+ */
+ if (currentCommandIdUsed)
+ {
+ /*
+ * Workers synchronize transaction state at the beginning of each
+ * parallel operation, so we can't account for new commands after that
+ * point.
+ */
+ if (IsInParallelMode() || IsParallelWorker())
+ elog(ERROR, "cannot start commands during a parallel operation");
+
+ currentCommandId += 1;
+ if (currentCommandId == InvalidCommandId)
+ {
+ currentCommandId -= 1;
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot have more than 2^32-2 commands in a transaction")));
+ }
+ currentCommandIdUsed = false;
+
+ /* Propagate new command ID into static snapshots */
+ SnapshotSetCommandId(currentCommandId);
+
+ /*
+ * Make any catalog changes done by the just-completed command visible
+ * in the local syscache. We obviously don't need to do this after a
+ * read-only command. (But see hacks in inval.c to make real sure we
+ * don't think a command that queued inval messages was read-only.)
+ */
+ AtCCI_LocalCache();
+ }
+}
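+
+/*
+ * Usage sketch (illustrative, not upstream code): C code that makes a
+ * catalog change and must see it later in the same transaction calls this
+ * in between, e.g.
+ *
+ *     CatalogTupleInsert(rel, tup);
+ *     CommandCounterIncrement();      (new row now visible to this xact)
+ */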
+
+/*
+ * ForceSyncCommit
+ *
+ * Interface routine to allow commands to force a synchronous commit of the
+ * current top-level transaction. Currently, two-phase commit does not
+ * persist and restore this variable. So long as all callers use
+ * PreventInTransactionBlock(), that omission has no consequences.
+ */
+void
+ForceSyncCommit(void)
+{
+ forceSyncCommit = true;
+}
+
+
+/* ----------------------------------------------------------------
+ * StartTransaction stuff
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * AtStart_Cache
+ */
+static void
+AtStart_Cache(void)
+{
+ AcceptInvalidationMessages();
+}
+
+/*
+ * AtStart_Memory
+ */
+static void
+AtStart_Memory(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ /*
+ * If this is the first time through, create a private context for
+ * AbortTransaction to work in. By reserving some space now, we can
+ * insulate AbortTransaction from out-of-memory scenarios. Like
+ * ErrorContext, we set it up with slow growth rate and a nonzero minimum
+ * size, so that space will be reserved immediately.
+ */
+ if (TransactionAbortContext == NULL)
+ TransactionAbortContext =
+ AllocSetContextCreate(TopMemoryContext,
+ "TransactionAbortContext",
+ 32 * 1024,
+ 32 * 1024,
+ 32 * 1024);
+
+ /*
+ * We shouldn't have a transaction context already.
+ */
+ Assert(TopTransactionContext == NULL);
+
+ /*
+ * Create a toplevel context for the transaction.
+ */
+ TopTransactionContext =
+ AllocSetContextCreate(TopMemoryContext,
+ "TopTransactionContext",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /*
+ * In a top-level transaction, CurTransactionContext is the same as
+ * TopTransactionContext.
+ */
+ CurTransactionContext = TopTransactionContext;
+ s->curTransactionContext = CurTransactionContext;
+
+ /* Make the CurTransactionContext active. */
+ MemoryContextSwitchTo(CurTransactionContext);
+}
+
+/*
+ * AtStart_ResourceOwner
+ */
+static void
+AtStart_ResourceOwner(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ /*
+ * We shouldn't have a transaction resource owner already.
+ */
+ Assert(TopTransactionResourceOwner == NULL);
+
+ /*
+ * Create a toplevel resource owner for the transaction.
+ */
+ s->curTransactionOwner = ResourceOwnerCreate(NULL, "TopTransaction");
+
+ TopTransactionResourceOwner = s->curTransactionOwner;
+ CurTransactionResourceOwner = s->curTransactionOwner;
+ CurrentResourceOwner = s->curTransactionOwner;
+}
+
+/* ----------------------------------------------------------------
+ * StartSubTransaction stuff
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * AtSubStart_Memory
+ */
+static void
+AtSubStart_Memory(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ Assert(CurTransactionContext != NULL);
+
+ /*
+ * Create a CurTransactionContext, which will be used to hold data that
+ * survives subtransaction commit but disappears on subtransaction abort.
+ * We make it a child of the immediate parent's CurTransactionContext.
+ */
+ CurTransactionContext = AllocSetContextCreate(CurTransactionContext,
+ "CurTransactionContext",
+ ALLOCSET_DEFAULT_SIZES);
+ s->curTransactionContext = CurTransactionContext;
+
+ /* Make the CurTransactionContext active. */
+ MemoryContextSwitchTo(CurTransactionContext);
+}
+
+/*
+ * AtSubStart_ResourceOwner
+ */
+static void
+AtSubStart_ResourceOwner(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ Assert(s->parent != NULL);
+
+ /*
+ * Create a resource owner for the subtransaction. We make it a child of
+ * the immediate parent's resource owner.
+ */
+ s->curTransactionOwner =
+ ResourceOwnerCreate(s->parent->curTransactionOwner,
+ "SubTransaction");
+
+ CurTransactionResourceOwner = s->curTransactionOwner;
+ CurrentResourceOwner = s->curTransactionOwner;
+}
+
+/* ----------------------------------------------------------------
+ * CommitTransaction stuff
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * RecordTransactionCommit
+ *
+ * Returns latest XID among xact and its children, or InvalidTransactionId
+ * if the xact has no XID. (We compute that here just because it's easier.)
+ *
+ * If you change this function, see RecordTransactionCommitPrepared also.
+ */
+static TransactionId
+RecordTransactionCommit(void)
+{
+ TransactionId xid = GetTopTransactionIdIfAny();
+ bool markXidCommitted = TransactionIdIsValid(xid);
+ TransactionId latestXid = InvalidTransactionId;
+ int nrels;
+ RelFileNode *rels;
+ int nchildren;
+ TransactionId *children;
+ int ndroppedstats = 0;
+ xl_xact_stats_item *droppedstats = NULL;
+ int nmsgs = 0;
+ SharedInvalidationMessage *invalMessages = NULL;
+ bool RelcacheInitFileInval = false;
+ bool wrote_xlog;
+
+ /*
+ * Log pending invalidations for logical decoding of in-progress
+ * transactions.  Normally for DDL we log this at each command end;
+ * however, in certain cases where we directly update a system table
+ * without a transaction block, the invalidations are not logged until
+ * this point.
+ */
+ if (XLogLogicalInfoActive())
+ LogLogicalInvalidations();
+
+ /* Get data needed for commit record */
+ nrels = smgrGetPendingDeletes(true, &rels);
+ nchildren = xactGetCommittedChildren(&children);
+ ndroppedstats = pgstat_get_transactional_drops(true, &droppedstats);
+ if (XLogStandbyInfoActive())
+ nmsgs = xactGetCommittedInvalidationMessages(&invalMessages,
+ &RelcacheInitFileInval);
+ wrote_xlog = (XactLastRecEnd != 0);
+
+ /*
+ * If we haven't been assigned an XID yet, we neither can, nor do we want
+ * to write a COMMIT record.
+ */
+ if (!markXidCommitted)
+ {
+ /*
+ * We expect that every RelationDropStorage is followed by a catalog
+ * update, and hence XID assignment, so we shouldn't get here with any
+ * pending deletes. Same is true for dropping stats.
+ *
+ * Use a real test not just an Assert to check this, since it's a bit
+ * fragile.
+ */
+ if (nrels != 0 || ndroppedstats != 0)
+ elog(ERROR, "cannot commit a transaction that deleted files but has no xid");
+
+ /* Can't have child XIDs either; AssignTransactionId enforces this */
+ Assert(nchildren == 0);
+
+ /*
+ * Transactions without an assigned xid can contain invalidation
+ * messages (e.g. explicit relcache invalidations or catcache
+ * invalidations for inplace updates); standbys need to process those.
+ * We can't emit a commit record without an xid, and we don't want to
+ * force assigning an xid, because that'd be problematic for e.g.
+ * vacuum.  Hence we emit a bespoke record for the invalidations.  We
+ * don't use that record when a commit record is emitted, so that the
+ * invalidations still happen synchronously with commits (besides not
+ * wanting to emit more WAL records than necessary).
+ */
+ if (nmsgs != 0)
+ {
+ LogStandbyInvalidations(nmsgs, invalMessages,
+ RelcacheInitFileInval);
+ wrote_xlog = true; /* not strictly necessary */
+ }
+
+ /*
+ * If we didn't create XLOG entries, we're done here; otherwise we
+ * should trigger flushing those entries the same as a commit record
+ * would. This will primarily happen for HOT pruning and the like; we
+ * want these to be flushed to disk in due time.
+ */
+ if (!wrote_xlog)
+ goto cleanup;
+ }
+ else
+ {
+ bool replorigin;
+
+ /*
+ * Are we using the replication origins feature? Or, in other words,
+ * are we replaying remote actions?
+ */
+ replorigin = (replorigin_session_origin != InvalidRepOriginId &&
+ replorigin_session_origin != DoNotReplicateId);
+
+ /*
+ * Begin commit critical section and insert the commit XLOG record.
+ */
+ /* Tell bufmgr and smgr to prepare for commit */
+ BufmgrCommit();
+
+ /*
+ * Mark ourselves as within our "commit critical section". This
+ * forces any concurrent checkpoint to wait until we've updated
+ * pg_xact. Without this, it is possible for the checkpoint to set
+ * REDO after the XLOG record but fail to flush the pg_xact update to
+ * disk, leading to loss of the transaction commit if the system
+ * crashes a little later.
+ *
+ * Note: we could, but don't bother to, set this flag in
+ * RecordTransactionAbort. That's because loss of a transaction abort
+ * is noncritical; the presumption would be that it aborted, anyway.
+ *
+ * It's safe to change the delayChkptFlags flag of our own backend
+ * without holding the ProcArrayLock, since we're the only one
+ * modifying it. This makes checkpoint's determination of which xacts
+ * are delaying the checkpoint a bit fuzzy, but it doesn't matter.
+ */
+ Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
+ START_CRIT_SECTION();
+ MyProc->delayChkptFlags |= DELAY_CHKPT_START;
+
+ SetCurrentTransactionStopTimestamp();
+
+ XactLogCommitRecord(xactStopTimestamp,
+ nchildren, children, nrels, rels,
+ ndroppedstats, droppedstats,
+ nmsgs, invalMessages,
+ RelcacheInitFileInval,
+ MyXactFlags,
+ InvalidTransactionId, NULL /* plain commit */ );
+
+ if (replorigin)
+ /* Move LSNs forward for this replication origin */
+ replorigin_session_advance(replorigin_session_origin_lsn,
+ XactLastRecEnd);
+
+ /*
+ * Record commit timestamp. The value comes from plain commit
+ * timestamp if there's no replication origin; otherwise, the
+ * timestamp was already set in replorigin_session_origin_timestamp by
+ * replication.
+ *
+ * We don't need to WAL-log anything here, as the commit record
+ * written above already contains the data.
+ */
+
+ if (!replorigin || replorigin_session_origin_timestamp == 0)
+ replorigin_session_origin_timestamp = xactStopTimestamp;
+
+ TransactionTreeSetCommitTsData(xid, nchildren, children,
+ replorigin_session_origin_timestamp,
+ replorigin_session_origin);
+ }
+
+ /*
+ * Check if we want to commit asynchronously. We can allow the XLOG flush
+ * to happen asynchronously if synchronous_commit=off, or if the current
+ * transaction has not performed any WAL-logged operation or didn't assign
+ * an xid. The transaction can end up not writing any WAL, even if it has
+ * an xid, if it only wrote to temporary and/or unlogged tables. It can
+ * end up having written WAL without an xid if it did HOT pruning. In
+ * case of a crash, the loss of such a transaction will be irrelevant;
+ * temp tables will be lost anyway, unlogged tables will be truncated and
+ * HOT pruning will be done again later. (Given the foregoing, you might
+ * think that it would be unnecessary to emit the XLOG record at all in
+ * this case, but we don't currently try to do that. It would certainly
+ * cause problems at least in Hot Standby mode, where the
+ * KnownAssignedXids machinery requires tracking every XID assignment. It
+ * might be OK to skip it only when wal_level < replica, but for now we
+ * don't.)
+ *
+ * However, if we're doing cleanup of any non-temp rels or committing any
+ * command that wanted to force sync commit, then we must flush XLOG
+ * immediately. (We must not allow asynchronous commit if there are any
+ * non-temp tables to be deleted, because we might delete the files before
+ * the COMMIT record is flushed to disk. We do allow asynchronous commit
+ * if all to-be-deleted tables are temporary though, since they are lost
+ * anyway if we crash.)
+ */
+ if ((wrote_xlog && markXidCommitted &&
+ synchronous_commit > SYNCHRONOUS_COMMIT_OFF) ||
+ forceSyncCommit || nrels > 0)
+ {
+ XLogFlush(XactLastRecEnd);
+
+ /*
+ * Now we may update the CLOG, if we wrote a COMMIT record above
+ */
+ if (markXidCommitted)
+ TransactionIdCommitTree(xid, nchildren, children);
+ }
+ else
+ {
+ /*
+ * Asynchronous commit case:
+ *
+ * This enables possible committed transaction loss in the case of a
+ * postmaster crash because WAL buffers are left unwritten. Ideally we
+ * could issue the WAL write without the fsync, but some
+ * wal_sync_methods do not allow separate write/fsync.
+ *
+ * Report the latest async commit LSN, so that the WAL writer knows to
+ * flush this commit.
+ */
+ XLogSetAsyncXactLSN(XactLastRecEnd);
+
+ /*
+ * We must not immediately update the CLOG, since we didn't flush the
+ * XLOG. Instead, we store the LSN up to which the XLOG must be
+ * flushed before the CLOG may be updated.
+ */
+ if (markXidCommitted)
+ TransactionIdAsyncCommitTree(xid, nchildren, children, XactLastRecEnd);
+ }
+
+ /*
+ * If we entered a commit critical section, leave it now, and let
+ * checkpoints proceed.
+ */
+ if (markXidCommitted)
+ {
+ MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+ END_CRIT_SECTION();
+ }
+
+ /* Compute latestXid while we have the child XIDs handy */
+ latestXid = TransactionIdLatest(xid, nchildren, children);
+
+ /*
+ * Wait for synchronous replication, if required.  Similar to the decision
+ * above about committing asynchronously, we only want to wait if this
+ * backend assigned an xid and wrote WAL.  No need to wait if an xid
+ * was assigned due to temporary/unlogged tables or due to HOT pruning.
+ *
+ * Note that at this stage we have marked clog, but still show as running
+ * in the procarray and continue to hold locks.
+ */
+ if (wrote_xlog && markXidCommitted)
+ SyncRepWaitForLSN(XactLastRecEnd, true);
+
+ /* remember end of last commit record */
+ XactLastCommitEnd = XactLastRecEnd;
+
+ /* Reset XactLastRecEnd until the next transaction writes something */
+ XactLastRecEnd = 0;
+cleanup:
+ /* Clean up local data */
+ if (rels)
+ pfree(rels);
+ if (ndroppedstats)
+ pfree(droppedstats);
+
+ return latestXid;
+}
+
+
+/*
+ * AtCCI_LocalCache
+ */
+static void
+AtCCI_LocalCache(void)
+{
+ /*
+ * Make any pending relation map changes visible. We must do this before
+ * processing local sinval messages, so that the map changes will get
+ * reflected into the relcache when relcache invals are processed.
+ */
+ AtCCI_RelationMap();
+
+ /*
+ * Make catalog changes visible to me for the next command.
+ */
+ CommandEndInvalidationMessages();
+}
+
+/*
+ * AtCommit_Memory
+ */
+static void
+AtCommit_Memory(void)
+{
+ /*
+ * Now that we're "out" of a transaction, have the system allocate things
+ * in the top memory context instead of per-transaction contexts.
+ */
+ MemoryContextSwitchTo(TopMemoryContext);
+
+ /*
+ * Release all transaction-local memory.
+ */
+ Assert(TopTransactionContext != NULL);
+ MemoryContextDelete(TopTransactionContext);
+ TopTransactionContext = NULL;
+ CurTransactionContext = NULL;
+ CurrentTransactionState->curTransactionContext = NULL;
+}
+
+/* ----------------------------------------------------------------
+ * CommitSubTransaction stuff
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * AtSubCommit_Memory
+ */
+static void
+AtSubCommit_Memory(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ Assert(s->parent != NULL);
+
+ /* Return to parent transaction level's memory context. */
+ CurTransactionContext = s->parent->curTransactionContext;
+ MemoryContextSwitchTo(CurTransactionContext);
+
+ /*
+ * Ordinarily we cannot throw away the child's CurTransactionContext,
+ * since the data it contains will be needed at upper commit. However, if
+ * there isn't actually anything in it, we can throw it away. This avoids
+ * a small memory leak in the common case of "trivial" subxacts.
+ */
+ if (MemoryContextIsEmpty(s->curTransactionContext))
+ {
+ MemoryContextDelete(s->curTransactionContext);
+ s->curTransactionContext = NULL;
+ }
+}
+
+/*
+ * AtSubCommit_childXids
+ *
+ * Pass my own XID and my child XIDs up to my parent as committed children.
+ */
+static void
+AtSubCommit_childXids(void)
+{
+ TransactionState s = CurrentTransactionState;
+ int new_nChildXids;
+
+ Assert(s->parent != NULL);
+
+ /*
+ * The parent childXids array will need to hold my XID and all my
+ * childXids, in addition to the XIDs already there.
+ */
+ new_nChildXids = s->parent->nChildXids + s->nChildXids + 1;
+
+ /* Allocate or enlarge the parent array if necessary */
+ if (s->parent->maxChildXids < new_nChildXids)
+ {
+ int new_maxChildXids;
+ TransactionId *new_childXids;
+
+ /*
+ * Make it 2x what's needed right now, to avoid having to enlarge it
+ * repeatedly. But we can't go above MaxAllocSize. (The latter limit
+ * is what ensures that we don't need to worry about integer overflow
+ * here or in the calculation of new_nChildXids.)
+ */
+ new_maxChildXids = Min(new_nChildXids * 2,
+ (int) (MaxAllocSize / sizeof(TransactionId)));
+
+ if (new_maxChildXids < new_nChildXids)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("maximum number of committed subtransactions (%d) exceeded",
+ (int) (MaxAllocSize / sizeof(TransactionId)))));
+
+ /*
+ * We keep the child-XID arrays in TopTransactionContext; this avoids
+ * setting up child-transaction contexts for what might be just a few
+ * bytes of grandchild XIDs.
+ */
+ if (s->parent->childXids == NULL)
+ new_childXids =
+ MemoryContextAlloc(TopTransactionContext,
+ new_maxChildXids * sizeof(TransactionId));
+ else
+ new_childXids = repalloc(s->parent->childXids,
+ new_maxChildXids * sizeof(TransactionId));
+
+ s->parent->childXids = new_childXids;
+ s->parent->maxChildXids = new_maxChildXids;
+ }
+
+ /*
+ * Copy all my XIDs to parent's array.
+ *
+ * Note: We rely on the fact that the XID of a child always follows that
+ * of its parent. By copying the XID of this subtransaction before the
+ * XIDs of its children, we ensure that the array stays ordered. Likewise,
+ * all XIDs already in the array belong to subtransactions started and
+ * subcommitted before us, so their XIDs must precede ours.
+ */
+ s->parent->childXids[s->parent->nChildXids] = XidFromFullTransactionId(s->fullTransactionId);
+
+ if (s->nChildXids > 0)
+ memcpy(&s->parent->childXids[s->parent->nChildXids + 1],
+ s->childXids,
+ s->nChildXids * sizeof(TransactionId));
+
+ s->parent->nChildXids = new_nChildXids;
+
+ /* Release child's array to avoid leakage */
+ if (s->childXids != NULL)
+ pfree(s->childXids);
+ /* We must reset these to avoid double-free if fail later in commit */
+ s->childXids = NULL;
+ s->nChildXids = 0;
+ s->maxChildXids = 0;
+}
+
+/* ----------------------------------------------------------------
+ * AbortTransaction stuff
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * RecordTransactionAbort
+ *
+ * Returns latest XID among xact and its children, or InvalidTransactionId
+ * if the xact has no XID. (We compute that here just because it's easier.)
+ */
+static TransactionId
+RecordTransactionAbort(bool isSubXact)
+{
+ TransactionId xid = GetCurrentTransactionIdIfAny();
+ TransactionId latestXid;
+ int nrels;
+ RelFileNode *rels;
+ int ndroppedstats = 0;
+ xl_xact_stats_item *droppedstats = NULL;
+ int nchildren;
+ TransactionId *children;
+ TimestampTz xact_time;
+
+ /*
+ * If we haven't been assigned an XID, nobody will care whether we aborted
+ * or not. Hence, we're done in that case. It does not matter if we have
+ * rels to delete (note that this routine is not responsible for actually
+ * deleting 'em). We cannot have any child XIDs, either.
+ */
+ if (!TransactionIdIsValid(xid))
+ {
+ /* Reset XactLastRecEnd until the next transaction writes something */
+ if (!isSubXact)
+ XactLastRecEnd = 0;
+ return InvalidTransactionId;
+ }
+
+ /*
+ * We have a valid XID, so we should write an ABORT record for it.
+ *
+ * We do not flush XLOG to disk here, since the default assumption after a
+ * crash would be that we aborted, anyway. For the same reason, we don't
+ * need to worry about interlocking against checkpoint start.
+ */
+
+ /*
+ * Check that we haven't aborted halfway through RecordTransactionCommit.
+ */
+ if (TransactionIdDidCommit(xid))
+ elog(PANIC, "cannot abort transaction %u, it was already committed",
+ xid);
+
+ /* Fetch the data we need for the abort record */
+ nrels = smgrGetPendingDeletes(false, &rels);
+ nchildren = xactGetCommittedChildren(&children);
+ ndroppedstats = pgstat_get_transactional_drops(false, &droppedstats);
+
+ /* XXX do we really need a critical section here? */
+ START_CRIT_SECTION();
+
+ /* Write the ABORT record */
+ if (isSubXact)
+ xact_time = GetCurrentTimestamp();
+ else
+ {
+ SetCurrentTransactionStopTimestamp();
+ xact_time = xactStopTimestamp;
+ }
+
+ XactLogAbortRecord(xact_time,
+ nchildren, children,
+ nrels, rels,
+ ndroppedstats, droppedstats,
+ MyXactFlags, InvalidTransactionId,
+ NULL);
+
+ /*
+ * Report the latest async abort LSN, so that the WAL writer knows to
+ * flush this abort. There's nothing to be gained by delaying this, since
+ * WALWriter may as well do this when it can. This is important with
+ * streaming replication because if we don't flush WAL regularly we will
+ * find that large aborts leave us with a long backlog for when commits
+ * occur after the abort, increasing our window of data loss should
+ * problems occur at that point.
+ */
+ if (!isSubXact)
+ XLogSetAsyncXactLSN(XactLastRecEnd);
+
+ /*
+ * Mark the transaction aborted in clog. This is not absolutely necessary
+ * but we may as well do it while we are here; also, in the subxact case
+ * it is helpful because XactLockTableWait makes use of it to avoid
+ * waiting for already-aborted subtransactions. It is OK to do it without
+ * having flushed the ABORT record to disk, because in event of a crash
+ * we'd be assumed to have aborted anyway.
+ */
+ TransactionIdAbortTree(xid, nchildren, children);
+
+ END_CRIT_SECTION();
+
+ /* Compute latestXid while we have the child XIDs handy */
+ latestXid = TransactionIdLatest(xid, nchildren, children);
+
+ /*
+ * If we're aborting a subtransaction, we can immediately remove failed
+ * XIDs from PGPROC's cache of running child XIDs. We do that here for
+ * subxacts, because we already have the child XID array at hand. For
+ * main xacts, the equivalent happens just after this function returns.
+ */
+ if (isSubXact)
+ XidCacheRemoveRunningXids(xid, nchildren, children, latestXid);
+
+ /* Reset XactLastRecEnd until the next transaction writes something */
+ if (!isSubXact)
+ XactLastRecEnd = 0;
+
+ /* And clean up local data */
+ if (rels)
+ pfree(rels);
+ if (ndroppedstats)
+ pfree(droppedstats);
+
+ return latestXid;
+}
+
+/*
+ * AtAbort_Memory
+ */
+static void
+AtAbort_Memory(void)
+{
+ /*
+ * Switch into TransactionAbortContext, which should have some free space
+ * even if nothing else does. We'll work in this context until we've
+ * finished cleaning up.
+ *
+ * It is barely possible to get here when we've not been able to create
+ * TransactionAbortContext yet; if so use TopMemoryContext.
+ */
+ if (TransactionAbortContext != NULL)
+ MemoryContextSwitchTo(TransactionAbortContext);
+ else
+ MemoryContextSwitchTo(TopMemoryContext);
+}
+
+/*
+ * AtSubAbort_Memory
+ */
+static void
+AtSubAbort_Memory(void)
+{
+ Assert(TransactionAbortContext != NULL);
+
+ MemoryContextSwitchTo(TransactionAbortContext);
+}
+
+
+/*
+ * AtAbort_ResourceOwner
+ */
+static void
+AtAbort_ResourceOwner(void)
+{
+ /*
+ * Make sure we have a valid ResourceOwner, if possible (else it will be
+ * NULL, which is OK)
+ */
+ CurrentResourceOwner = TopTransactionResourceOwner;
+}
+
+/*
+ * AtSubAbort_ResourceOwner
+ */
+static void
+AtSubAbort_ResourceOwner(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ /* Make sure we have a valid ResourceOwner */
+ CurrentResourceOwner = s->curTransactionOwner;
+}
+
+
+/*
+ * AtSubAbort_childXids
+ */
+static void
+AtSubAbort_childXids(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ /*
+ * We keep the child-XID arrays in TopTransactionContext (see
+ * AtSubCommit_childXids). This means we'd better free the array
+ * explicitly at abort to avoid leakage.
+ */
+ if (s->childXids != NULL)
+ pfree(s->childXids);
+ s->childXids = NULL;
+ s->nChildXids = 0;
+ s->maxChildXids = 0;
+
+ /*
+ * We could prune the unreportedXids array here, but we don't bother. That
+ * would potentially reduce the number of XLOG_XACT_ASSIGNMENT records, but
+ * it would likely add CPU time to the more common paths, so we
+ * choose not to do that.
+ */
+}
+
+/* ----------------------------------------------------------------
+ * CleanupTransaction stuff
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * AtCleanup_Memory
+ */
+static void
+AtCleanup_Memory(void)
+{
+ Assert(CurrentTransactionState->parent == NULL);
+
+ /*
+ * Now that we're "out" of a transaction, have the system allocate things
+ * in the top memory context instead of per-transaction contexts.
+ */
+ MemoryContextSwitchTo(TopMemoryContext);
+
+ /*
+ * Clear the special abort context for next time.
+ */
+ if (TransactionAbortContext != NULL)
+ MemoryContextResetAndDeleteChildren(TransactionAbortContext);
+
+ /*
+ * Release all transaction-local memory.
+ */
+ if (TopTransactionContext != NULL)
+ MemoryContextDelete(TopTransactionContext);
+ TopTransactionContext = NULL;
+ CurTransactionContext = NULL;
+ CurrentTransactionState->curTransactionContext = NULL;
+}
+
+
+/* ----------------------------------------------------------------
+ * CleanupSubTransaction stuff
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * AtSubCleanup_Memory
+ */
+static void
+AtSubCleanup_Memory(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ Assert(s->parent != NULL);
+
+ /* Make sure we're not in an about-to-be-deleted context */
+ MemoryContextSwitchTo(s->parent->curTransactionContext);
+ CurTransactionContext = s->parent->curTransactionContext;
+
+ /*
+ * Clear the special abort context for next time.
+ */
+ if (TransactionAbortContext != NULL)
+ MemoryContextResetAndDeleteChildren(TransactionAbortContext);
+
+ /*
+ * Delete the subxact's local memory contexts. Its CurTransactionContext
+ * can go too (note this also kills the CurTransactionContexts of any
+ * children of the subxact).
+ */
+ if (s->curTransactionContext)
+ MemoryContextDelete(s->curTransactionContext);
+ s->curTransactionContext = NULL;
+}
+
+/* ----------------------------------------------------------------
+ * interface routines
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * StartTransaction
+ */
+static void
+StartTransaction(void)
+{
+ TransactionState s;
+ VirtualTransactionId vxid;
+
+ /*
+ * Let's just make sure the state stack is empty
+ */
+ s = &TopTransactionStateData;
+ CurrentTransactionState = s;
+
+ Assert(!FullTransactionIdIsValid(XactTopFullTransactionId));
+
+ /* check the current transaction state */
+ Assert(s->state == TRANS_DEFAULT);
+
+ /*
+ * Set the current transaction state information appropriately during
+ * start processing. Note that once the transaction status is switched
+ * this process cannot fail until the user ID and the security context
+ * flags are fetched below.
+ */
+ s->state = TRANS_START;
+ s->fullTransactionId = InvalidFullTransactionId; /* until assigned */
+
+ /* Determine if statements are logged in this transaction */
+ xact_is_sampled = log_xact_sample_rate != 0 &&
+ (log_xact_sample_rate == 1 ||
+ pg_prng_double(&pg_global_prng_state) <= log_xact_sample_rate);
+
+ /*
+ * initialize current transaction state fields
+ *
+ * note: prevXactReadOnly is not used at the outermost level
+ */
+ s->nestingLevel = 1;
+ s->gucNestLevel = 1;
+ s->childXids = NULL;
+ s->nChildXids = 0;
+ s->maxChildXids = 0;
+
+ /*
+ * Once the current user ID and the security context flags are fetched,
+ * both will be properly reset even if transaction startup fails.
+ */
+ GetUserIdAndSecContext(&s->prevUser, &s->prevSecContext);
+
+ /* SecurityRestrictionContext should never be set outside a transaction */
+ Assert(s->prevSecContext == 0);
+
+ /*
+ * Make sure we've reset xact state variables
+ *
+ * If recovery is still in progress, mark this transaction as read-only.
+ * We have lower level defences in XLogInsert and elsewhere to stop us
+ * from modifying data during recovery, but this gives the normal
+ * indication to the user that the transaction is read-only.
+ */
+ if (RecoveryInProgress())
+ {
+ s->startedInRecovery = true;
+ XactReadOnly = true;
+ }
+ else
+ {
+ s->startedInRecovery = false;
+ XactReadOnly = DefaultXactReadOnly;
+ }
+ XactDeferrable = DefaultXactDeferrable;
+ XactIsoLevel = DefaultXactIsoLevel;
+ forceSyncCommit = false;
+ MyXactFlags = 0;
+
+ /*
+ * reinitialize within-transaction counters
+ */
+ s->subTransactionId = TopSubTransactionId;
+ currentSubTransactionId = TopSubTransactionId;
+ currentCommandId = FirstCommandId;
+ currentCommandIdUsed = false;
+
+ /*
+ * initialize reported xid accounting
+ */
+ nUnreportedXids = 0;
+ s->didLogXid = false;
+
+ /*
+ * must initialize resource-management stuff first
+ */
+ AtStart_Memory();
+ AtStart_ResourceOwner();
+
+ /*
+ * Assign a new LocalTransactionId, and combine it with the backendId to
+ * form a virtual transaction id.
+ */
+ vxid.backendId = MyBackendId;
+ vxid.localTransactionId = GetNextLocalTransactionId();
+
+ /*
+ * Lock the virtual transaction id before we announce it in the proc array
+ */
+ VirtualXactLockTableInsert(vxid);
+
+ /*
+ * Advertise it in the proc array. We assume assignment of
+ * localTransactionId is atomic, and the backendId should be set already.
+ */
+ Assert(MyProc->backendId == vxid.backendId);
+ MyProc->lxid = vxid.localTransactionId;
+
+ TRACE_POSTGRESQL_TRANSACTION_START(vxid.localTransactionId);
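+
+ /*
+ * (For observability: this vxid is what pg_locks reports in its
+ * "virtualtransaction" column, formatted as backendId/localXid,
+ * e.g. "3/42"; values illustrative.)
+ */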
+
+ /*
+ * set transaction_timestamp() (a/k/a now()). Normally, we want this to
+ * be the same as the first command's statement_timestamp(), so don't do a
+ * fresh GetCurrentTimestamp() call (which'd be expensive anyway). But
+ * for transactions started inside procedures (i.e., nonatomic SPI
+ * contexts), we do need to advance the timestamp. Also, in a parallel
+ * worker, the timestamp should already have been provided by a call to
+ * SetParallelStartTimestamps().
+ */
+ if (!IsParallelWorker())
+ {
+ if (!SPI_inside_nonatomic_context())
+ xactStartTimestamp = stmtStartTimestamp;
+ else
+ xactStartTimestamp = GetCurrentTimestamp();
+ }
+ else
+ Assert(xactStartTimestamp != 0);
+ pgstat_report_xact_timestamp(xactStartTimestamp);
+ /* Mark xactStopTimestamp as unset. */
+ xactStopTimestamp = 0;
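+
+ /*
+ * Illustrative consequence (SQL; assumes each statement arrives as
+ * its own protocol message):
+ *
+ * SELECT transaction_timestamp() = statement_timestamp(); -- true
+ * BEGIN;
+ * SELECT transaction_timestamp() = statement_timestamp(); -- false,
+ * since the transaction began with the earlier BEGIN message
+ */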
+
+ /*
+ * initialize other subsystems for new transaction
+ */
+ AtStart_GUC();
+ AtStart_Cache();
+ AfterTriggerBeginXact();
+
+ /*
+ * done with start processing, set current transaction state to "in
+ * progress"
+ */
+ s->state = TRANS_INPROGRESS;
+
+ ShowTransactionState("StartTransaction");
+}
+
+
+/*
+ * CommitTransaction
+ *
+ * NB: if you change this routine, better look at PrepareTransaction too!
+ */
+static void
+CommitTransaction(void)
+{
+ TransactionState s = CurrentTransactionState;
+ TransactionId latestXid;
+ bool is_parallel_worker;
+
+ is_parallel_worker = (s->blockState == TBLOCK_PARALLEL_INPROGRESS);
+
+ /* Enforce parallel mode restrictions during parallel worker commit. */
+ if (is_parallel_worker)
+ EnterParallelMode();
+
+ ShowTransactionState("CommitTransaction");
+
+ /*
+ * check the current transaction state
+ */
+ if (s->state != TRANS_INPROGRESS)
+ elog(WARNING, "CommitTransaction while in %s state",
+ TransStateAsString(s->state));
+ Assert(s->parent == NULL);
+
+ /*
+ * Do pre-commit processing that involves calling user-defined code, such
+ * as triggers. SECURITY_RESTRICTED_OPERATION contexts must not queue an
+ * action that would run here, because that would bypass the sandbox.
+ * Since closing cursors could queue trigger actions, triggers could open
+ * cursors, etc, we have to keep looping until there's nothing left to do.
+ */
+ for (;;)
+ {
+ /*
+ * Fire all currently pending deferred triggers.
+ */
+ AfterTriggerFireDeferred();
+
+ /*
+ * Close open portals (converting holdable ones into static portals).
+ * If there weren't any, we are done ... otherwise loop back to check
+ * if they queued deferred triggers. Lather, rinse, repeat.
+ */
+ if (!PreCommit_Portals(false))
+ break;
+ }
+
+ /*
+ * The remaining actions cannot call any user-defined code, so it's safe
+ * to start shutting down within-transaction services. But note that most
+ * of this stuff could still throw an error, which would switch us into
+ * the transaction-abort path.
+ */
+
+ CallXactCallbacks(is_parallel_worker ? XACT_EVENT_PARALLEL_PRE_COMMIT
+ : XACT_EVENT_PRE_COMMIT);
+
+ /* If we might have parallel workers, clean them up now. */
+ if (IsInParallelMode())
+ AtEOXact_Parallel(true);
+
+ /* Shut down the deferred-trigger manager */
+ AfterTriggerEndXact(true);
+
+ /*
+ * Let ON COMMIT management do its thing (must happen after closing
+ * cursors, to avoid dangling-reference problems)
+ */
+ PreCommit_on_commit_actions();
+
+ /*
+ * Synchronize files that are created and not WAL-logged during this
+ * transaction. This must happen before AtEOXact_RelationMap(), so that we
+ * don't see committed-but-broken files after a crash.
+ */
+ smgrDoPendingSyncs(true, is_parallel_worker);
+
+ /* close large objects before lower-level cleanup */
+ AtEOXact_LargeObject(true);
+
+ /*
+ * Insert notifications sent by NOTIFY commands into the queue. This
+ * should be late in the pre-commit sequence to minimize time spent
+ * holding the notify-insertion lock. However, this could result in
+ * creating a snapshot, so we must do it before serializable cleanup.
+ */
+ PreCommit_Notify();
+
+ /*
+ * Mark serializable transaction as complete for predicate locking
+ * purposes. This should be done as late as we can put it and still allow
+ * errors to be raised for failure patterns found at commit. This is not
+ * appropriate in a parallel worker however, because we aren't committing
+ * the leader's transaction and its serializable state will live on.
+ */
+ if (!is_parallel_worker)
+ PreCommit_CheckForSerializationFailure();
+
+ /* Prevent cancel/die interrupt while cleaning up */
+ HOLD_INTERRUPTS();
+
+ /* Commit updates to the relation map --- do this as late as possible */
+ AtEOXact_RelationMap(true, is_parallel_worker);
+
+ /*
+ * set the current transaction state information appropriately during
+ * commit processing
+ */
+ s->state = TRANS_COMMIT;
+ s->parallelModeLevel = 0;
+
+ if (!is_parallel_worker)
+ {
+ /*
+ * We need to mark our XIDs as committed in pg_xact. This is where we
+ * durably commit.
+ */
+ latestXid = RecordTransactionCommit();
+ }
+ else
+ {
+ /*
+ * We must not mark our XID committed; the parallel leader is
+ * responsible for that.
+ */
+ latestXid = InvalidTransactionId;
+
+ /*
+ * Make sure the leader will know about any WAL we wrote before it
+ * commits.
+ */
+ ParallelWorkerReportLastRecEnd(XactLastRecEnd);
+ }
+
+ TRACE_POSTGRESQL_TRANSACTION_COMMIT(MyProc->lxid);
+
+ /*
+ * Let others know about no transaction in progress by me. Note that this
+ * must be done _before_ releasing locks we hold and _after_
+ * RecordTransactionCommit.
+ */
+ ProcArrayEndTransaction(MyProc, latestXid);
+
+ /*
+ * This is all post-commit cleanup. Note that if an error is raised here,
+ * it's too late to abort the transaction. This should be just
+ * noncritical resource releasing.
+ *
+ * The ordering of operations is not entirely random. The idea is:
+ * release resources visible to other backends (eg, files, buffer pins);
+ * then release locks; then release backend-local resources. We want to
+ * release locks at the point where any backend waiting for us will see
+ * our transaction as being fully cleaned up.
+ *
+ * Resources that can be associated with individual queries are handled by
+ * the ResourceOwner mechanism. The other calls here are for backend-wide
+ * state.
+ */
+
+ CallXactCallbacks(is_parallel_worker ? XACT_EVENT_PARALLEL_COMMIT
+ : XACT_EVENT_COMMIT);
+
+ ResourceOwnerRelease(TopTransactionResourceOwner,
+ RESOURCE_RELEASE_BEFORE_LOCKS,
+ true, true);
+
+ /* Check we've released all buffer pins */
+ AtEOXact_Buffers(true);
+
+ /* Clean up the relation cache */
+ AtEOXact_RelationCache(true);
+
+ /*
+ * Make catalog changes visible to all backends. This has to happen after
+ * relcache references are dropped (see comments for
+ * AtEOXact_RelationCache), but before locks are released (if anyone is
+ * waiting for lock on a relation we've modified, we want them to know
+ * about the catalog change before they start using the relation).
+ */
+ AtEOXact_Inval(true);
+
+ AtEOXact_MultiXact();
+
+ ResourceOwnerRelease(TopTransactionResourceOwner,
+ RESOURCE_RELEASE_LOCKS,
+ true, true);
+ ResourceOwnerRelease(TopTransactionResourceOwner,
+ RESOURCE_RELEASE_AFTER_LOCKS,
+ true, true);
+
+ /*
+ * Likewise, dropping of files deleted during the transaction is best done
+ * after releasing relcache and buffer pins. (This is not strictly
+ * necessary during commit, since such pins should have been released
+ * already, but this ordering is definitely critical during abort.) Since
+ * this may take many seconds, also delay until after releasing locks.
+ * Other backends will observe the attendant catalog changes and not
+ * attempt to access affected files.
+ */
+ smgrDoPendingDeletes(true);
+
+ /*
+ * Send out notification signals to other backends (and do other
+ * post-commit NOTIFY cleanup). This must not happen until after our
+ * transaction is fully done from the viewpoint of other backends.
+ */
+ AtCommit_Notify();
+
+ /*
+ * Everything after this should be purely internal-to-this-backend
+ * cleanup.
+ */
+ AtEOXact_GUC(true, 1);
+ AtEOXact_SPI(true);
+ AtEOXact_Enum();
+ AtEOXact_on_commit_actions(true);
+ AtEOXact_Namespace(true, is_parallel_worker);
+ AtEOXact_SMgr();
+ AtEOXact_Files(true);
+ AtEOXact_ComboCid();
+ AtEOXact_HashTables(true);
+ AtEOXact_PgStat(true, is_parallel_worker);
+ AtEOXact_Snapshot(true, false);
+ AtEOXact_ApplyLauncher(true);
+ pgstat_report_xact_timestamp(0);
+
+ CurrentResourceOwner = NULL;
+ ResourceOwnerDelete(TopTransactionResourceOwner);
+ s->curTransactionOwner = NULL;
+ CurTransactionResourceOwner = NULL;
+ TopTransactionResourceOwner = NULL;
+
+ AtCommit_Memory();
+
+ s->fullTransactionId = InvalidFullTransactionId;
+ s->subTransactionId = InvalidSubTransactionId;
+ s->nestingLevel = 0;
+ s->gucNestLevel = 0;
+ s->childXids = NULL;
+ s->nChildXids = 0;
+ s->maxChildXids = 0;
+
+ XactTopFullTransactionId = InvalidFullTransactionId;
+ nParallelCurrentXids = 0;
+
+ /*
+ * done with commit processing, set current transaction state back to
+ * default
+ */
+ s->state = TRANS_DEFAULT;
+
+ RESUME_INTERRUPTS();
+}
+
+
+/*
+ * PrepareTransaction
+ *
+ * NB: if you change this routine, better look at CommitTransaction too!
+ */
+static void
+PrepareTransaction(void)
+{
+ TransactionState s = CurrentTransactionState;
+ TransactionId xid = GetCurrentTransactionId();
+ GlobalTransaction gxact;
+ TimestampTz prepared_at;
+
+ Assert(!IsInParallelMode());
+
+ ShowTransactionState("PrepareTransaction");
+
+ /*
+ * check the current transaction state
+ */
+ if (s->state != TRANS_INPROGRESS)
+ elog(WARNING, "PrepareTransaction while in %s state",
+ TransStateAsString(s->state));
+ Assert(s->parent == NULL);
+
+ /*
+ * Do pre-commit processing that involves calling user-defined code, such
+ * as triggers. Since closing cursors could queue trigger actions,
+ * triggers could open cursors, etc, we have to keep looping until there's
+ * nothing left to do.
+ */
+ for (;;)
+ {
+ /*
+ * Fire all currently pending deferred triggers.
+ */
+ AfterTriggerFireDeferred();
+
+ /*
+ * Close open portals (converting holdable ones into static portals).
+ * If there weren't any, we are done ... otherwise loop back to check
+ * if they queued deferred triggers. Lather, rinse, repeat.
+ */
+ if (!PreCommit_Portals(true))
+ break;
+ }
+
+ CallXactCallbacks(XACT_EVENT_PRE_PREPARE);
+
+ /*
+ * The remaining actions cannot call any user-defined code, so it's safe
+ * to start shutting down within-transaction services. But note that most
+ * of this stuff could still throw an error, which would switch us into
+ * the transaction-abort path.
+ */
+
+ /* Shut down the deferred-trigger manager */
+ AfterTriggerEndXact(true);
+
+ /*
+ * Let ON COMMIT management do its thing (must happen after closing
+ * cursors, to avoid dangling-reference problems)
+ */
+ PreCommit_on_commit_actions();
+
+ /*
+ * Synchronize files that are created and not WAL-logged during this
+ * transaction. This must happen before EndPrepare(), so that we don't see
+ * committed-but-broken files after a crash and COMMIT PREPARED.
+ */
+ smgrDoPendingSyncs(true, false);
+
+ /* close large objects before lower-level cleanup */
+ AtEOXact_LargeObject(true);
+
+ /* NOTIFY requires no work at this point */
+
+ /*
+ * Mark serializable transaction as complete for predicate locking
+ * purposes. This should be done as late as we can put it and still allow
+ * errors to be raised for failure patterns found at commit.
+ */
+ PreCommit_CheckForSerializationFailure();
+
+ /*
+ * Don't allow PREPARE TRANSACTION if we've accessed a temporary table in
+ * this transaction. Having the prepared xact hold locks on another
+ * backend's temp table seems a bad idea --- for instance it would prevent
+ * the backend from exiting. There are other problems too, such as how to
+ * clean up the source backend's local buffers and ON COMMIT state if the
+ * prepared xact includes a DROP of a temp table.
+ *
+ * Other object types, like functions, operators or extensions, share the
+ * same restriction: they should not be created, locked or dropped either,
+ * as that can mess up this session or even a follow-up session trying to
+ * use the same temporary namespace.
+ *
+ * We must check this after executing any ON COMMIT actions, because they
+ * might still access a temp relation.
+ *
+ * XXX In principle this could be relaxed to allow some useful special
+ * cases, such as a temp table created and dropped all within the
+ * transaction. That seems to require much more bookkeeping though.
+ */
+ if ((MyXactFlags & XACT_FLAGS_ACCESSEDTEMPNAMESPACE))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot PREPARE a transaction that has operated on temporary objects")));
+
+ /*
+ * Likewise, don't allow PREPARE after pg_export_snapshot. This could be
+ * supported if we added cleanup logic to twophase.c, but for now it
+ * doesn't seem worth the trouble.
+ */
+ if (XactHasExportedSnapshots())
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot PREPARE a transaction that has exported snapshots")));
+
+ /* Prevent cancel/die interrupt while cleaning up */
+ HOLD_INTERRUPTS();
+
+ /*
+ * set the current transaction state information appropriately during
+ * prepare processing
+ */
+ s->state = TRANS_PREPARE;
+
+ prepared_at = GetCurrentTimestamp();
+
+ /* Tell bufmgr and smgr to prepare for commit */
+ BufmgrCommit();
+
+ /*
+ * Reserve the GID for this transaction. This could fail if the requested
+ * GID is invalid or already in use.
+ */
+ gxact = MarkAsPreparing(xid, prepareGID, prepared_at,
+ GetUserId(), MyDatabaseId);
+ prepareGID = NULL;
+
+ /*
+ * Collect data for the 2PC state file. Note that in general, no actual
+ * state change should happen in the called modules during this step,
+ * since it's still possible to fail before commit, and in that case we
+ * want transaction abort to be able to clean up. (In particular, the
+ * AtPrepare routines may error out if they find cases they cannot
+ * handle.) State cleanup should happen in the PostPrepare routines
+ * below. However, some modules can go ahead and clear state here because
+ * they wouldn't do anything with it during abort anyway.
+ *
+ * Note: because the 2PC state file records will be replayed in the same
+ * order they are made, the order of these calls has to match the order in
+ * which we want things to happen during COMMIT PREPARED or ROLLBACK
+ * PREPARED; in particular, pay attention to whether things should happen
+ * before or after releasing the transaction's locks.
+ */
+ StartPrepare(gxact);
+
+ AtPrepare_Notify();
+ AtPrepare_Locks();
+ AtPrepare_PredicateLocks();
+ AtPrepare_PgStat();
+ AtPrepare_MultiXact();
+ AtPrepare_RelationMap();
+
+ /*
+ * Here is where we really truly prepare.
+ *
+ * We have to record transaction prepares even if we didn't make any
+ * updates, because the transaction manager might get confused if we lose
+ * a global transaction.
+ */
+ EndPrepare(gxact);
+
+ /*
+ * Now we clean up backend-internal state and release internal resources.
+ */
+
+ /* Reset XactLastRecEnd until the next transaction writes something */
+ XactLastRecEnd = 0;
+
+ /*
+ * Transfer our locks to a dummy PGPROC. This has to be done before
+ * ProcArrayClearTransaction(). Otherwise, a GetLockConflicts() would
+ * conclude "xact already committed or aborted" for our locks.
+ */
+ PostPrepare_Locks(xid);
+
+ /*
+ * Let others know about no transaction in progress by me. This has to be
+ * done *after* the prepared transaction has been marked valid, else
+ * someone may think it is unlocked and recyclable.
+ */
+ ProcArrayClearTransaction(MyProc);
+
+ /*
+ * In normal commit-processing, this is all non-critical post-transaction
+ * cleanup. When the transaction is prepared, however, it's important
+ * that the locks and other per-backend resources are transferred to the
+ * prepared transaction's PGPROC entry. Note that if an error is raised
+ * here, it's too late to abort the transaction. XXX: This probably should
+ * be in a critical section, to force a PANIC if any of this fails, but
+ * that cure could be worse than the disease.
+ */
+
+ CallXactCallbacks(XACT_EVENT_PREPARE);
+
+ ResourceOwnerRelease(TopTransactionResourceOwner,
+ RESOURCE_RELEASE_BEFORE_LOCKS,
+ true, true);
+
+ /* Check we've released all buffer pins */
+ AtEOXact_Buffers(true);
+
+ /* Clean up the relation cache */
+ AtEOXact_RelationCache(true);
+
+ /* notify doesn't need a postprepare call */
+
+ PostPrepare_PgStat();
+
+ PostPrepare_Inval();
+
+ PostPrepare_smgr();
+
+ PostPrepare_MultiXact(xid);
+
+ PostPrepare_PredicateLocks(xid);
+
+ ResourceOwnerRelease(TopTransactionResourceOwner,
+ RESOURCE_RELEASE_LOCKS,
+ true, true);
+ ResourceOwnerRelease(TopTransactionResourceOwner,
+ RESOURCE_RELEASE_AFTER_LOCKS,
+ true, true);
+
+ /*
+ * Allow another backend to finish the transaction. After
+ * PostPrepare_Twophase(), the transaction is completely detached from our
+ * backend. The rest is just non-critical cleanup of backend-local state.
+ */
+ PostPrepare_Twophase();
+
+ /* PREPARE acts the same as COMMIT as far as GUC is concerned */
+ AtEOXact_GUC(true, 1);
+ AtEOXact_SPI(true);
+ AtEOXact_Enum();
+ AtEOXact_on_commit_actions(true);
+ AtEOXact_Namespace(true, false);
+ AtEOXact_SMgr();
+ AtEOXact_Files(true);
+ AtEOXact_ComboCid();
+ AtEOXact_HashTables(true);
+ /* don't call AtEOXact_PgStat here; we fixed pgstat state above */
+ AtEOXact_Snapshot(true, true);
+ pgstat_report_xact_timestamp(0);
+
+ CurrentResourceOwner = NULL;
+ ResourceOwnerDelete(TopTransactionResourceOwner);
+ s->curTransactionOwner = NULL;
+ CurTransactionResourceOwner = NULL;
+ TopTransactionResourceOwner = NULL;
+
+ AtCommit_Memory();
+
+ s->fullTransactionId = InvalidFullTransactionId;
+ s->subTransactionId = InvalidSubTransactionId;
+ s->nestingLevel = 0;
+ s->gucNestLevel = 0;
+ s->childXids = NULL;
+ s->nChildXids = 0;
+ s->maxChildXids = 0;
+
+ XactTopFullTransactionId = InvalidFullTransactionId;
+ nParallelCurrentXids = 0;
+
+ /*
+ * done with 1st phase commit processing, set current transaction state
+ * back to default
+ */
+ s->state = TRANS_DEFAULT;
+
+ RESUME_INTERRUPTS();
+}
+
+
+/*
+ * AbortTransaction
+ */
+static void
+AbortTransaction(void)
+{
+ TransactionState s = CurrentTransactionState;
+ TransactionId latestXid;
+ bool is_parallel_worker;
+
+ /* Prevent cancel/die interrupt while cleaning up */
+ HOLD_INTERRUPTS();
+
+ /* Make sure we have a valid memory context and resource owner */
+ AtAbort_Memory();
+ AtAbort_ResourceOwner();
+
+ /*
+ * Release any LW locks we might be holding as quickly as possible.
+ * (Regular locks, however, must be held till we finish aborting.)
+ * Releasing LW locks is critical since we might try to grab them again
+ * while cleaning up!
+ */
+ LWLockReleaseAll();
+
+ /* Clear wait information and command progress indicator */
+ pgstat_report_wait_end();
+ pgstat_progress_end_command();
+
+ /* Clean up buffer I/O and buffer context locks, too */
+ AbortBufferIO();
+ UnlockBuffers();
+
+ /* Reset WAL record construction state */
+ XLogResetInsertion();
+
+ /* Cancel condition variable sleep */
+ ConditionVariableCancelSleep();
+
+ /*
+ * Also clean up any open wait for lock, since the lock manager will choke
+ * if we try to wait for another lock before doing this.
+ */
+ LockErrorCleanup();
+
+ /*
+ * If any timeout events are still active, make sure the timeout interrupt
+ * is scheduled. This covers possible loss of a timeout interrupt due to
+ * longjmp'ing out of the SIGINT handler (see notes in handle_sig_alarm).
+ * We delay this till after LockErrorCleanup so that we don't uselessly
+ * reschedule lock or deadlock check timeouts.
+ */
+ reschedule_timeouts();
+
+ /*
+ * Re-enable signals, in case we got here by longjmp'ing out of a signal
+ * handler. We do this fairly early in the sequence so that the timeout
+ * infrastructure will be functional if needed while aborting.
+ */
+ PG_SETMASK(&UnBlockSig);
+
+ /*
+ * check the current transaction state
+ */
+ is_parallel_worker = (s->blockState == TBLOCK_PARALLEL_INPROGRESS);
+ if (s->state != TRANS_INPROGRESS && s->state != TRANS_PREPARE)
+ elog(WARNING, "AbortTransaction while in %s state",
+ TransStateAsString(s->state));
+ Assert(s->parent == NULL);
+
+ /*
+ * set the current transaction state information appropriately during the
+ * abort processing
+ */
+ s->state = TRANS_ABORT;
+
+ /*
+ * Reset user ID which might have been changed transiently. We need this
+ * to clean up in case control escaped out of a SECURITY DEFINER function
+ * or other local change of CurrentUserId; therefore, the prior value of
+ * SecurityRestrictionContext also needs to be restored.
+ *
+ * (Note: it is not necessary to restore session authorization or role
+ * settings here because those can only be changed via GUC, and GUC will
+ * take care of rolling them back if need be.)
+ */
+ SetUserIdAndSecContext(s->prevUser, s->prevSecContext);
+
+ /* Forget about any active REINDEX. */
+ ResetReindexState(s->nestingLevel);
+
+ /* Reset logical streaming state. */
+ ResetLogicalStreamingState();
+
+ /* Reset snapshot export state. */
+ SnapBuildResetExportedSnapshotState();
+
+ /* If in parallel mode, clean up workers and exit parallel mode. */
+ if (IsInParallelMode())
+ {
+ AtEOXact_Parallel(false);
+ s->parallelModeLevel = 0;
+ }
+
+ /*
+ * do abort processing
+ */
+ AfterTriggerEndXact(false); /* 'false' means it's abort */
+ AtAbort_Portals();
+ smgrDoPendingSyncs(false, is_parallel_worker);
+ AtEOXact_LargeObject(false);
+ AtAbort_Notify();
+ AtEOXact_RelationMap(false, is_parallel_worker);
+ AtAbort_Twophase();
+
+ /*
+ * Advertise the fact that we aborted in pg_xact (assuming that we got as
+ * far as assigning an XID to advertise). But if we're inside a parallel
+ * worker, skip this; the user backend must be the one to write the abort
+ * record.
+ */
+ if (!is_parallel_worker)
+ latestXid = RecordTransactionAbort(false);
+ else
+ {
+ latestXid = InvalidTransactionId;
+
+ /*
+ * Since the parallel leader won't get our value of XactLastRecEnd in
+ * this case, we nudge the WAL writer ourselves. See the related
+ * comments in RecordTransactionAbort for why this matters.
+ */
+ XLogSetAsyncXactLSN(XactLastRecEnd);
+ }
+
+ TRACE_POSTGRESQL_TRANSACTION_ABORT(MyProc->lxid);
+
+ /*
+ * Let others know about no transaction in progress by me. Note that this
+ * must be done _before_ releasing locks we hold and _after_
+ * RecordTransactionAbort.
+ */
+ ProcArrayEndTransaction(MyProc, latestXid);
+
+ /*
+ * Post-abort cleanup. See notes in CommitTransaction() concerning
+ * ordering. We can skip all of it if the transaction failed before
+ * creating a resource owner.
+ */
+ if (TopTransactionResourceOwner != NULL)
+ {
+ if (is_parallel_worker)
+ CallXactCallbacks(XACT_EVENT_PARALLEL_ABORT);
+ else
+ CallXactCallbacks(XACT_EVENT_ABORT);
+
+ ResourceOwnerRelease(TopTransactionResourceOwner,
+ RESOURCE_RELEASE_BEFORE_LOCKS,
+ false, true);
+ AtEOXact_Buffers(false);
+ AtEOXact_RelationCache(false);
+ AtEOXact_Inval(false);
+ AtEOXact_MultiXact();
+ ResourceOwnerRelease(TopTransactionResourceOwner,
+ RESOURCE_RELEASE_LOCKS,
+ false, true);
+ ResourceOwnerRelease(TopTransactionResourceOwner,
+ RESOURCE_RELEASE_AFTER_LOCKS,
+ false, true);
+ smgrDoPendingDeletes(false);
+
+ AtEOXact_GUC(false, 1);
+ AtEOXact_SPI(false);
+ AtEOXact_Enum();
+ AtEOXact_on_commit_actions(false);
+ AtEOXact_Namespace(false, is_parallel_worker);
+ AtEOXact_SMgr();
+ AtEOXact_Files(false);
+ AtEOXact_ComboCid();
+ AtEOXact_HashTables(false);
+ AtEOXact_PgStat(false, is_parallel_worker);
+ AtEOXact_ApplyLauncher(false);
+ pgstat_report_xact_timestamp(0);
+ }
+
+ /*
+ * State remains TRANS_ABORT until CleanupTransaction().
+ */
+ RESUME_INTERRUPTS();
+}
+
+/*
+ * CleanupTransaction
+ */
+static void
+CleanupTransaction(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ /*
+ * State should still be TRANS_ABORT from AbortTransaction().
+ */
+ if (s->state != TRANS_ABORT)
+ elog(FATAL, "CleanupTransaction: unexpected state %s",
+ TransStateAsString(s->state));
+
+ /*
+ * do abort cleanup processing
+ */
+ AtCleanup_Portals(); /* now safe to release portal memory */
+ AtEOXact_Snapshot(false, true); /* and release the transaction's snapshots */
+
+ CurrentResourceOwner = NULL; /* and resource owner */
+ if (TopTransactionResourceOwner)
+ ResourceOwnerDelete(TopTransactionResourceOwner);
+ s->curTransactionOwner = NULL;
+ CurTransactionResourceOwner = NULL;
+ TopTransactionResourceOwner = NULL;
+
+ AtCleanup_Memory(); /* and transaction memory */
+
+ s->fullTransactionId = InvalidFullTransactionId;
+ s->subTransactionId = InvalidSubTransactionId;
+ s->nestingLevel = 0;
+ s->gucNestLevel = 0;
+ s->childXids = NULL;
+ s->nChildXids = 0;
+ s->maxChildXids = 0;
+ s->parallelModeLevel = 0;
+
+ XactTopFullTransactionId = InvalidFullTransactionId;
+ nParallelCurrentXids = 0;
+
+ /*
+ * done with abort processing, set current transaction state back to
+ * default
+ */
+ s->state = TRANS_DEFAULT;
+}
+
+/*
+ * StartTransactionCommand
+ */
+void
+StartTransactionCommand(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ switch (s->blockState)
+ {
+ /*
+ * if we aren't in a transaction block, we just do our usual start
+ * transaction.
+ */
+ case TBLOCK_DEFAULT:
+ StartTransaction();
+ s->blockState = TBLOCK_STARTED;
+ break;
+
+ /*
+ * We are somewhere in a transaction block or subtransaction and
+ * about to start a new command. For now we do nothing, but
+ * someday we may do command-local resource initialization. (Note
+ * that any needed CommandCounterIncrement was done by the
+ * previous CommitTransactionCommand.)
+ */
+ case TBLOCK_INPROGRESS:
+ case TBLOCK_IMPLICIT_INPROGRESS:
+ case TBLOCK_SUBINPROGRESS:
+ break;
+
+ /*
+ * Here we are in a failed transaction block (one of the commands
+ * caused an abort) so we do nothing but remain in the abort
+ * state. Eventually we will get a ROLLBACK command which will
+ * get us out of this state. (It is up to other code to ensure
+ * that no commands other than ROLLBACK will be processed in these
+ * states.)
+ */
+ case TBLOCK_ABORT:
+ case TBLOCK_SUBABORT:
+ break;
+
+ /* These cases are invalid. */
+ case TBLOCK_STARTED:
+ case TBLOCK_BEGIN:
+ case TBLOCK_PARALLEL_INPROGRESS:
+ case TBLOCK_SUBBEGIN:
+ case TBLOCK_END:
+ case TBLOCK_SUBRELEASE:
+ case TBLOCK_SUBCOMMIT:
+ case TBLOCK_ABORT_END:
+ case TBLOCK_SUBABORT_END:
+ case TBLOCK_ABORT_PENDING:
+ case TBLOCK_SUBABORT_PENDING:
+ case TBLOCK_SUBRESTART:
+ case TBLOCK_SUBABORT_RESTART:
+ case TBLOCK_PREPARE:
+ elog(ERROR, "StartTransactionCommand: unexpected state %s",
+ BlockStateAsString(s->blockState));
+ break;
+ }
+
+ /*
+ * We must switch to CurTransactionContext before returning. This is
+ * already done if we called StartTransaction, otherwise not.
+ */
+ Assert(CurTransactionContext != NULL);
+ MemoryContextSwitchTo(CurTransactionContext);
+}
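+
+/*
+ * For orientation, a sketch (not a complete call graph): for a single
+ * statement outside any transaction block, postgres.c brackets execution
+ * roughly as
+ *
+ * StartTransactionCommand(); -- TBLOCK_DEFAULT -> TBLOCK_STARTED
+ * ... execute the statement ...
+ * CommitTransactionCommand(); -- TBLOCK_STARTED -> TBLOCK_DEFAULT
+ *
+ * whereas inside an explicit block the same pair merely brackets each
+ * statement and does CommandCounterIncrement().
+ */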
+
+
+/*
+ * Simple system for saving and restoring transaction characteristics
+ * (isolation level, read only, deferrable). We need this for transaction
+ * chaining, so that we can set the characteristics of the new transaction to
+ * be the same as the previous one. (We need something like this because the
+ * GUC system resets the characteristics at transaction end, so for example
+ * just skipping the reset in StartTransaction() won't work.)
+ */
+void
+SaveTransactionCharacteristics(SavedTransactionCharacteristics *s)
+{
+ s->save_XactIsoLevel = XactIsoLevel;
+ s->save_XactReadOnly = XactReadOnly;
+ s->save_XactDeferrable = XactDeferrable;
+}
+
+void
+RestoreTransactionCharacteristics(const SavedTransactionCharacteristics *s)
+{
+ XactIsoLevel = s->save_XactIsoLevel;
+ XactReadOnly = s->save_XactReadOnly;
+ XactDeferrable = s->save_XactDeferrable;
+}
+
+
+/*
+ * CommitTransactionCommand
+ */
+void
+CommitTransactionCommand(void)
+{
+ TransactionState s = CurrentTransactionState;
+ SavedTransactionCharacteristics savetc;
+
+ /* Must save in case we need to restore below */
+ SaveTransactionCharacteristics(&savetc);
+
+ switch (s->blockState)
+ {
+ /*
+ * These shouldn't happen. TBLOCK_DEFAULT means the previous
+ * StartTransactionCommand didn't set the STARTED state
+ * appropriately, while TBLOCK_PARALLEL_INPROGRESS should be ended
+ * by EndParallelWorkerTransaction(), not this function.
+ */
+ case TBLOCK_DEFAULT:
+ case TBLOCK_PARALLEL_INPROGRESS:
+ elog(FATAL, "CommitTransactionCommand: unexpected state %s",
+ BlockStateAsString(s->blockState));
+ break;
+
+ /*
+ * If we aren't in a transaction block, just do our usual
+ * transaction commit, and return to the idle state.
+ */
+ case TBLOCK_STARTED:
+ CommitTransaction();
+ s->blockState = TBLOCK_DEFAULT;
+ break;
+
+ /*
+ * We are completing a "BEGIN TRANSACTION" command, so we change
+ * to the "transaction block in progress" state and return. (We
+ * assume the BEGIN did nothing to the database, so we need no
+ * CommandCounterIncrement.)
+ */
+ case TBLOCK_BEGIN:
+ s->blockState = TBLOCK_INPROGRESS;
+ break;
+
+ /*
+ * This is the case when we have finished executing a command
+ * someplace within a transaction block. We increment the command
+ * counter and return.
+ */
+ case TBLOCK_INPROGRESS:
+ case TBLOCK_IMPLICIT_INPROGRESS:
+ case TBLOCK_SUBINPROGRESS:
+ CommandCounterIncrement();
+ break;
+
+ /*
+ * We are completing a "COMMIT" command. Do it and return to the
+ * idle state.
+ */
+ case TBLOCK_END:
+ CommitTransaction();
+ s->blockState = TBLOCK_DEFAULT;
+ if (s->chain)
+ {
+ StartTransaction();
+ s->blockState = TBLOCK_INPROGRESS;
+ s->chain = false;
+ RestoreTransactionCharacteristics(&savetc);
+ }
+ break;
+
+ /*
+ * Here we are in the middle of a transaction block but one of the
+ * commands caused an abort so we do nothing but remain in the
+ * abort state. Eventually we will get a ROLLBACK command.
+ */
+ case TBLOCK_ABORT:
+ case TBLOCK_SUBABORT:
+ break;
+
+ /*
+ * Here we were in an aborted transaction block and we just got
+ * the ROLLBACK command from the user, so clean up the
+ * already-aborted transaction and return to the idle state.
+ */
+ case TBLOCK_ABORT_END:
+ CleanupTransaction();
+ s->blockState = TBLOCK_DEFAULT;
+ if (s->chain)
+ {
+ StartTransaction();
+ s->blockState = TBLOCK_INPROGRESS;
+ s->chain = false;
+ RestoreTransactionCharacteristics(&savetc);
+ }
+ break;
+
+ /*
+ * Here we were in a perfectly good transaction block but the user
+ * told us to ROLLBACK anyway. We have to abort the transaction
+ * and then clean up.
+ */
+ case TBLOCK_ABORT_PENDING:
+ AbortTransaction();
+ CleanupTransaction();
+ s->blockState = TBLOCK_DEFAULT;
+ if (s->chain)
+ {
+ StartTransaction();
+ s->blockState = TBLOCK_INPROGRESS;
+ s->chain = false;
+ RestoreTransactionCharacteristics(&savetc);
+ }
+ break;
+
+ /*
+ * We are completing a "PREPARE TRANSACTION" command. Do it and
+ * return to the idle state.
+ */
+ case TBLOCK_PREPARE:
+ PrepareTransaction();
+ s->blockState = TBLOCK_DEFAULT;
+ break;
+
+ /*
+ * We were just issued a SAVEPOINT inside a transaction block.
+ * Start a subtransaction. (DefineSavepoint already did
+ * PushTransaction, so as to have someplace to put the SUBBEGIN
+ * state.)
+ */
+ case TBLOCK_SUBBEGIN:
+ StartSubTransaction();
+ s->blockState = TBLOCK_SUBINPROGRESS;
+ break;
+
+ /*
+ * We were issued a RELEASE command, so we end the current
+ * subtransaction and return to the parent transaction. The parent
+ * might be ended too, so repeat till we find an INPROGRESS
+ * transaction or subtransaction.
+ */
+ case TBLOCK_SUBRELEASE:
+ do
+ {
+ CommitSubTransaction();
+ s = CurrentTransactionState; /* changed by pop */
+ } while (s->blockState == TBLOCK_SUBRELEASE);
+
+ Assert(s->blockState == TBLOCK_INPROGRESS ||
+ s->blockState == TBLOCK_SUBINPROGRESS);
+ break;
+
+ /*
+ * We were issued a COMMIT, so we end the current subtransaction
+ * hierarchy and perform final commit. We do this by rolling up
+ * any subtransactions into their parent, which leads to O(N^2)
+ * operations with respect to resource owners - this isn't that
+ * bad until we approach thousands of savepoints, but it is
+ * necessary for correctness should AFTER triggers create new
+ * resource owners.
+ */
+ case TBLOCK_SUBCOMMIT:
+ do
+ {
+ CommitSubTransaction();
+ s = CurrentTransactionState; /* changed by pop */
+ } while (s->blockState == TBLOCK_SUBCOMMIT);
+ /* If we had a COMMIT command, finish off the main xact too */
+ if (s->blockState == TBLOCK_END)
+ {
+ Assert(s->parent == NULL);
+ CommitTransaction();
+ s->blockState = TBLOCK_DEFAULT;
+ if (s->chain)
+ {
+ StartTransaction();
+ s->blockState = TBLOCK_INPROGRESS;
+ s->chain = false;
+ RestoreTransactionCharacteristics(&savetc);
+ }
+ }
+ else if (s->blockState == TBLOCK_PREPARE)
+ {
+ Assert(s->parent == NULL);
+ PrepareTransaction();
+ s->blockState = TBLOCK_DEFAULT;
+ }
+ else
+ elog(ERROR, "CommitTransactionCommand: unexpected state %s",
+ BlockStateAsString(s->blockState));
+ break;
+
+ /*
+ * The current already-failed subtransaction is ending due to a
+ * ROLLBACK or ROLLBACK TO command, so pop it and recursively
+ * examine the parent (which could be in any of several states).
+ */
+ case TBLOCK_SUBABORT_END:
+ CleanupSubTransaction();
+ CommitTransactionCommand();
+ break;
+
+ /*
+ * As above, but it's not dead yet, so abort first.
+ */
+ case TBLOCK_SUBABORT_PENDING:
+ AbortSubTransaction();
+ CleanupSubTransaction();
+ CommitTransactionCommand();
+ break;
+
+ /*
+ * The current subtransaction is the target of a ROLLBACK TO
+ * command. Abort and pop it, then start a new subtransaction
+ * with the same name.
+ */
+ case TBLOCK_SUBRESTART:
+ {
+ char *name;
+ int savepointLevel;
+
+ /* save name and keep Cleanup from freeing it */
+ name = s->name;
+ s->name = NULL;
+ savepointLevel = s->savepointLevel;
+
+ AbortSubTransaction();
+ CleanupSubTransaction();
+
+ DefineSavepoint(NULL);
+ s = CurrentTransactionState; /* changed by push */
+ s->name = name;
+ s->savepointLevel = savepointLevel;
+
+ /* This is the same as TBLOCK_SUBBEGIN case */
+ AssertState(s->blockState == TBLOCK_SUBBEGIN);
+ StartSubTransaction();
+ s->blockState = TBLOCK_SUBINPROGRESS;
+ }
+ break;
+
+ /*
+ * Same as above, but the subtransaction had already failed, so we
+ * don't need AbortSubTransaction.
+ */
+ case TBLOCK_SUBABORT_RESTART:
+ {
+ char *name;
+ int savepointLevel;
+
+ /* save name and keep Cleanup from freeing it */
+ name = s->name;
+ s->name = NULL;
+ savepointLevel = s->savepointLevel;
+
+ CleanupSubTransaction();
+
+ DefineSavepoint(NULL);
+ s = CurrentTransactionState; /* changed by push */
+ s->name = name;
+ s->savepointLevel = savepointLevel;
+
+ /* This is the same as TBLOCK_SUBBEGIN case */
+ AssertState(s->blockState == TBLOCK_SUBBEGIN);
+ StartSubTransaction();
+ s->blockState = TBLOCK_SUBINPROGRESS;
+ }
+ break;
+ }
+}
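+
+/*
+ * Illustrative effect of the s->chain handling above (SQL):
+ *
+ * BEGIN ISOLATION LEVEL REPEATABLE READ;
+ * ...
+ * COMMIT AND CHAIN; -- commits, then immediately starts a new
+ * transaction that inherits the isolation level (and the read-only and
+ * deferrable settings) via Save/RestoreTransactionCharacteristics
+ */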
+
+/*
+ * AbortCurrentTransaction
+ */
+void
+AbortCurrentTransaction(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ switch (s->blockState)
+ {
+ case TBLOCK_DEFAULT:
+ if (s->state == TRANS_DEFAULT)
+ {
+ /* we are idle, so nothing to do */
+ }
+ else
+ {
+ /*
+ * We can get here after an error during transaction start
+ * (state will be TRANS_START). Need to clean up the
+ * incompletely started transaction. First, adjust the
+ * low-level state to suppress warning message from
+ * AbortTransaction.
+ */
+ if (s->state == TRANS_START)
+ s->state = TRANS_INPROGRESS;
+ AbortTransaction();
+ CleanupTransaction();
+ }
+ break;
+
+ /*
+ * If we aren't in a transaction block, we just do the basic abort
+ * and cleanup of the transaction. For this purpose, we treat an implicit
+ * transaction block as if it were a simple statement.
+ */
+ case TBLOCK_STARTED:
+ case TBLOCK_IMPLICIT_INPROGRESS:
+ AbortTransaction();
+ CleanupTransaction();
+ s->blockState = TBLOCK_DEFAULT;
+ break;
+
+ /*
+ * If we are in TBLOCK_BEGIN it means something screwed up right
+ * after reading "BEGIN TRANSACTION". We assume that the user
+ * will interpret the error as meaning the BEGIN failed to get them
+ * into a transaction block, so we should abort and return to idle
+ * state.
+ */
+ case TBLOCK_BEGIN:
+ AbortTransaction();
+ CleanupTransaction();
+ s->blockState = TBLOCK_DEFAULT;
+ break;
+
+ /*
+ * We are somewhere in a transaction block and we've gotten a
+ * failure, so we abort the transaction and set up the persistent
+ * ABORT state. We will stay in ABORT until we get a ROLLBACK.
+ */
+ case TBLOCK_INPROGRESS:
+ case TBLOCK_PARALLEL_INPROGRESS:
+ AbortTransaction();
+ s->blockState = TBLOCK_ABORT;
+ /* CleanupTransaction happens when we exit TBLOCK_ABORT_END */
+ break;
+
+ /*
+ * Here, we failed while trying to COMMIT. Clean up the
+ * transaction and return to idle state (we do not want to stay in
+ * the transaction).
+ */
+ case TBLOCK_END:
+ AbortTransaction();
+ CleanupTransaction();
+ s->blockState = TBLOCK_DEFAULT;
+ break;
+
+ /*
+ * Here, we are already in an aborted transaction state and are
+ * waiting for a ROLLBACK, but for some reason we failed again! So
+ * we just remain in the abort state.
+ */
+ case TBLOCK_ABORT:
+ case TBLOCK_SUBABORT:
+ break;
+
+ /*
+ * We are in a failed transaction and we got the ROLLBACK command.
+ * We have already aborted, we just need to cleanup and go to idle
+ * state.
+ */
+ case TBLOCK_ABORT_END:
+ CleanupTransaction();
+ s->blockState = TBLOCK_DEFAULT;
+ break;
+
+ /*
+ * We are in a live transaction and we got a ROLLBACK command.
+ * Abort, cleanup, go to idle state.
+ */
+ case TBLOCK_ABORT_PENDING:
+ AbortTransaction();
+ CleanupTransaction();
+ s->blockState = TBLOCK_DEFAULT;
+ break;
+
+ /*
+ * Here, we failed while trying to PREPARE. Clean up the
+ * transaction and return to idle state (we do not want to stay in
+ * the transaction).
+ */
+ case TBLOCK_PREPARE:
+ AbortTransaction();
+ CleanupTransaction();
+ s->blockState = TBLOCK_DEFAULT;
+ break;
+
+ /*
+ * We got an error inside a subtransaction. Abort just the
+ * subtransaction, and go to the persistent SUBABORT state until
+ * we get ROLLBACK.
+ */
+ case TBLOCK_SUBINPROGRESS:
+ AbortSubTransaction();
+ s->blockState = TBLOCK_SUBABORT;
+ break;
+
+ /*
+ * If we failed while trying to create a subtransaction, clean up
+ * the broken subtransaction and abort the parent. The same
+ * applies if we get a failure while ending a subtransaction.
+ */
+ case TBLOCK_SUBBEGIN:
+ case TBLOCK_SUBRELEASE:
+ case TBLOCK_SUBCOMMIT:
+ case TBLOCK_SUBABORT_PENDING:
+ case TBLOCK_SUBRESTART:
+ AbortSubTransaction();
+ CleanupSubTransaction();
+ AbortCurrentTransaction();
+ break;
+
+ /*
+ * Same as above, except the Abort() was already done.
+ */
+ case TBLOCK_SUBABORT_END:
+ case TBLOCK_SUBABORT_RESTART:
+ CleanupSubTransaction();
+ AbortCurrentTransaction();
+ break;
+ }
+}
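+
+/*
+ * Illustrative state sequence (SQL) for the TBLOCK_SUBINPROGRESS ->
+ * TBLOCK_SUBABORT path above:
+ *
+ * BEGIN; -- TBLOCK_INPROGRESS
+ * SAVEPOINT sp; -- TBLOCK_SUBINPROGRESS
+ * SELECT 1/0; -- error: subxact aborts, TBLOCK_SUBABORT
+ * ROLLBACK TO sp; -- subxact cleaned up, back to TBLOCK_SUBINPROGRESS
+ * COMMIT; -- the outer transaction still commits
+ */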
+
+/*
+ * PreventInTransactionBlock
+ *
+ * This routine is to be called by statements that must not run inside
+ * a transaction block, typically because they have non-rollback-able
+ * side effects or do internal commits.
+ *
+ * If this routine completes successfully, then the calling statement is
+ * guaranteed that if it completes without error, its results will be
+ * committed immediately.
+ *
+ * If we have already started a transaction block, issue an error; also issue
+ * an error if we appear to be running inside a user-defined function (which
+ * could issue more commands and possibly cause a failure after the statement
+ * completes). Subtransactions are verboten too.
+ *
+ * We must also set XACT_FLAGS_NEEDIMMEDIATECOMMIT in MyXactFlags, to ensure
+ * that postgres.c follows through by committing after the statement is done.
+ *
+ * isTopLevel: passed down from ProcessUtility to determine whether we are
+ * inside a function. (We will always fail if this is false, but it's
+ * convenient to centralize the check here instead of making callers do it.)
+ * stmtType: statement type name, for error messages.
+ */
+void
+PreventInTransactionBlock(bool isTopLevel, const char *stmtType)
+{
+ /*
+ * xact block already started?
+ */
+ if (IsTransactionBlock())
+ ereport(ERROR,
+ (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
+ /* translator: %s represents an SQL statement name */
+ errmsg("%s cannot run inside a transaction block",
+ stmtType)));
+
+ /*
+ * subtransaction?
+ */
+ if (IsSubTransaction())
+ ereport(ERROR,
+ (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
+ /* translator: %s represents an SQL statement name */
+ errmsg("%s cannot run inside a subtransaction",
+ stmtType)));
+
+ /*
+ * inside a pipeline that has started an implicit transaction?
+ */
+ if (MyXactFlags & XACT_FLAGS_PIPELINING)
+ ereport(ERROR,
+ (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
+ /* translator: %s represents an SQL statement name */
+ errmsg("%s cannot be executed within a pipeline",
+ stmtType)));
+
+ /*
+ * inside a function call?
+ */
+ if (!isTopLevel)
+ ereport(ERROR,
+ (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
+ /* translator: %s represents an SQL statement name */
+ errmsg("%s cannot be executed from a function", stmtType)));
+
+ /* If we got past IsTransactionBlock test, should be in default state */
+ if (CurrentTransactionState->blockState != TBLOCK_DEFAULT &&
+ CurrentTransactionState->blockState != TBLOCK_STARTED)
+ elog(FATAL, "cannot prevent transaction chain");
+
+ /* All okay. Set the flag to make sure the right thing happens later. */
+ MyXactFlags |= XACT_FLAGS_NEEDIMMEDIATECOMMIT;
+}
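+
+/*
+ * Typical usage sketch from a utility command's entry point (VACUUM is
+ * one real caller):
+ *
+ * PreventInTransactionBlock(isTopLevel, "VACUUM");
+ *
+ * Once this returns, the command can safely perform internal commits;
+ * postgres.c will then commit immediately after the statement finishes.
+ */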
+
+/*
+ * WarnNoTransactionBlock
+ * RequireTransactionBlock
+ *
+ * These two functions allow for warnings or errors if a command is executed
+ * outside of a transaction block. This is useful for commands that have no
+ * effects that persist past transaction end (and so calling them outside a
+ * transaction block is presumably an error). DECLARE CURSOR is an example.
+ * Top-level transaction control commands (BEGIN/COMMIT/ABORT) and SET
+ * commands that have no effect issue warnings; all other no-effect
+ * commands generate errors.
+ *
+ * If we appear to be running inside a user-defined function, we do not
+ * issue anything, since the function could issue more commands that make
+ * use of the current statement's results. Likewise for subtransactions.
+ * Thus these functions are the inverse of PreventInTransactionBlock.
+ *
+ * isTopLevel: passed down from ProcessUtility to determine whether we are
+ * inside a function.
+ * stmtType: statement type name, for warning or error messages.
+ */
+void
+WarnNoTransactionBlock(bool isTopLevel, const char *stmtType)
+{
+ CheckTransactionBlock(isTopLevel, false, stmtType);
+}
+
+void
+RequireTransactionBlock(bool isTopLevel, const char *stmtType)
+{
+ CheckTransactionBlock(isTopLevel, true, stmtType);
+}
+
+/*
+ * This is the implementation of the above two.
+ */
+static void
+CheckTransactionBlock(bool isTopLevel, bool throwError, const char *stmtType)
+{
+ /*
+ * xact block already started?
+ */
+ if (IsTransactionBlock())
+ return;
+
+ /*
+ * subtransaction?
+ */
+ if (IsSubTransaction())
+ return;
+
+ /*
+ * inside a function call?
+ */
+ if (!isTopLevel)
+ return;
+
+ ereport(throwError ? ERROR : WARNING,
+ (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION),
+ /* translator: %s represents an SQL statement name */
+ errmsg("%s can only be used in transaction blocks",
+ stmtType)));
+}
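+
+/*
+ * Usage sketch: DECLARE CURSOR without WITH HOLD is a representative
+ * caller of the error form,
+ *
+ * RequireTransactionBlock(isTopLevel, "DECLARE CURSOR");
+ *
+ * since a non-holdable cursor would disappear at transaction end anyway.
+ */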
+
+/*
+ * IsInTransactionBlock
+ *
+ * This routine is for statements that need to behave differently inside
+ * a transaction block than when running as single commands. ANALYZE is
+ * currently the only example.
+ *
+ * If this routine returns "false", then the calling statement is allowed
+ * to perform internal transaction-commit-and-start cycles; there is no
+ * risk of messing up any transaction already in progress. (Note that this
+ * is not the identical guarantee provided by PreventInTransactionBlock,
+ * since we will not force a post-statement commit.)
+ *
+ * isTopLevel: passed down from ProcessUtility to determine whether we are
+ * inside a function.
+ */
+bool
+IsInTransactionBlock(bool isTopLevel)
+{
+ /*
+ * Return true under the same conditions that would make
+ * PreventInTransactionBlock error out.
+ */
+ if (IsTransactionBlock())
+ return true;
+
+ if (IsSubTransaction())
+ return true;
+
+ if (MyXactFlags & XACT_FLAGS_PIPELINING)
+ return true;
+
+ if (!isTopLevel)
+ return true;
+
+ if (CurrentTransactionState->blockState != TBLOCK_DEFAULT &&
+ CurrentTransactionState->blockState != TBLOCK_STARTED)
+ return true;
+
+ return false;
+}
+
+
+/*
+ * Register or deregister callback functions for start- and end-of-xact
+ * operations.
+ *
+ * These functions are intended for use by dynamically loaded modules.
+ * For built-in modules we generally just hardwire the appropriate calls
+ * (mainly because it's easier to control the order that way, where needed).
+ *
+ * At transaction end, the callback occurs post-commit or post-abort, so the
+ * callback functions can only do noncritical cleanup.
+ */
+void
+RegisterXactCallback(XactCallback callback, void *arg)
+{
+ XactCallbackItem *item;
+
+ item = (XactCallbackItem *)
+ MemoryContextAlloc(TopMemoryContext, sizeof(XactCallbackItem));
+ item->callback = callback;
+ item->arg = arg;
+ item->next = Xact_callbacks;
+ Xact_callbacks = item;
+}
+
+void
+UnregisterXactCallback(XactCallback callback, void *arg)
+{
+ XactCallbackItem *item;
+ XactCallbackItem *prev;
+
+ prev = NULL;
+ for (item = Xact_callbacks; item; prev = item, item = item->next)
+ {
+ if (item->callback == callback && item->arg == arg)
+ {
+ if (prev)
+ prev->next = item->next;
+ else
+ Xact_callbacks = item->next;
+ pfree(item);
+ break;
+ }
+ }
+}
+
+static void
+CallXactCallbacks(XactEvent event)
+{
+ XactCallbackItem *item;
+
+ for (item = Xact_callbacks; item; item = item->next)
+ item->callback(event, item->arg);
+}
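+
+/*
+ * Usage sketch for a dynamically loaded module (all names hypothetical):
+ *
+ * static void
+ * my_xact_callback(XactEvent event, void *arg)
+ * {
+ * if (event == XACT_EVENT_COMMIT || event == XACT_EVENT_ABORT)
+ * my_reset_module_state(); (noncritical cleanup only)
+ * }
+ *
+ * void
+ * _PG_init(void)
+ * {
+ * RegisterXactCallback(my_xact_callback, NULL);
+ * }
+ */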
+
+
+/*
+ * Register or deregister callback functions for start- and end-of-subxact
+ * operations.
+ *
+ * Pretty much same as above, but for subtransaction events.
+ *
+ * At subtransaction end, the callback occurs post-subcommit or post-subabort,
+ * so the callback functions can only do noncritical cleanup. At
+ * subtransaction start, the callback is called when the subtransaction has
+ * finished initializing.
+ */
+void
+RegisterSubXactCallback(SubXactCallback callback, void *arg)
+{
+ SubXactCallbackItem *item;
+
+ item = (SubXactCallbackItem *)
+ MemoryContextAlloc(TopMemoryContext, sizeof(SubXactCallbackItem));
+ item->callback = callback;
+ item->arg = arg;
+ item->next = SubXact_callbacks;
+ SubXact_callbacks = item;
+}
+
+void
+UnregisterSubXactCallback(SubXactCallback callback, void *arg)
+{
+ SubXactCallbackItem *item;
+ SubXactCallbackItem *prev;
+
+ prev = NULL;
+ for (item = SubXact_callbacks; item; prev = item, item = item->next)
+ {
+ if (item->callback == callback && item->arg == arg)
+ {
+ if (prev)
+ prev->next = item->next;
+ else
+ SubXact_callbacks = item->next;
+ pfree(item);
+ break;
+ }
+ }
+}
+
+static void
+CallSubXactCallbacks(SubXactEvent event,
+ SubTransactionId mySubid,
+ SubTransactionId parentSubid)
+{
+ SubXactCallbackItem *item;
+
+ for (item = SubXact_callbacks; item; item = item->next)
+ item->callback(event, mySubid, parentSubid, item->arg);
+}
+
+
+/* ----------------------------------------------------------------
+ * transaction block support
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * BeginTransactionBlock
+ * This executes a BEGIN command.
+ */
+void
+BeginTransactionBlock(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ switch (s->blockState)
+ {
+ /*
+ * We are not inside a transaction block, so allow one to begin.
+ */
+ case TBLOCK_STARTED:
+ s->blockState = TBLOCK_BEGIN;
+ break;
+
+ /*
+ * BEGIN converts an implicit transaction block to a regular one.
+ * (Note that we allow this even if we've already done some
+ * commands, which is a bit odd but matches historical practice.)
+ */
+ case TBLOCK_IMPLICIT_INPROGRESS:
+ s->blockState = TBLOCK_BEGIN;
+ break;
+
+ /*
+ * Already a transaction block in progress.
+ */
+ case TBLOCK_INPROGRESS:
+ case TBLOCK_PARALLEL_INPROGRESS:
+ case TBLOCK_SUBINPROGRESS:
+ case TBLOCK_ABORT:
+ case TBLOCK_SUBABORT:
+ ereport(WARNING,
+ (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
+ errmsg("there is already a transaction in progress")));
+ break;
+
+ /* These cases are invalid. */
+ case TBLOCK_DEFAULT:
+ case TBLOCK_BEGIN:
+ case TBLOCK_SUBBEGIN:
+ case TBLOCK_END:
+ case TBLOCK_SUBRELEASE:
+ case TBLOCK_SUBCOMMIT:
+ case TBLOCK_ABORT_END:
+ case TBLOCK_SUBABORT_END:
+ case TBLOCK_ABORT_PENDING:
+ case TBLOCK_SUBABORT_PENDING:
+ case TBLOCK_SUBRESTART:
+ case TBLOCK_SUBABORT_RESTART:
+ case TBLOCK_PREPARE:
+ elog(FATAL, "BeginTransactionBlock: unexpected state %s",
+ BlockStateAsString(s->blockState));
+ break;
+ }
+}
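+
+/*
+ * Illustrative warning case (SQL):
+ *
+ * BEGIN;
+ * BEGIN; -- WARNING: there is already a transaction in progress
+ *
+ * The second BEGIN is otherwise a no-op; the existing block continues.
+ */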
+
+/*
+ * PrepareTransactionBlock
+ * This executes a PREPARE command.
+ *
+ * Since PREPARE may actually do a ROLLBACK, the result indicates what
+ * happened: true for PREPARE, false for ROLLBACK.
+ *
+ * Note that we don't actually do anything here except change blockState.
+ * The real work will be done in the upcoming PrepareTransaction().
+ * We do it this way because it's not convenient to change memory context,
+ * resource owner, etc while executing inside a Portal.
+ */
+bool
+PrepareTransactionBlock(const char *gid)
+{
+ TransactionState s;
+ bool result;
+
+ /* Set up to commit the current transaction */
+ result = EndTransactionBlock(false);
+
+ /* If successful, change outer tblock state to PREPARE */
+ if (result)
+ {
+ s = CurrentTransactionState;
+
+ while (s->parent != NULL)
+ s = s->parent;
+
+ if (s->blockState == TBLOCK_END)
+ {
+ /* Save GID where PrepareTransaction can find it again */
+ prepareGID = MemoryContextStrdup(TopTransactionContext, gid);
+
+ s->blockState = TBLOCK_PREPARE;
+ }
+ else
+ {
+ /*
+ * ignore case where we are not in a transaction;
+ * EndTransactionBlock already issued a warning.
+ */
+ Assert(s->blockState == TBLOCK_STARTED ||
+ s->blockState == TBLOCK_IMPLICIT_INPROGRESS);
+ /* Don't send back a PREPARE result tag... */
+ result = false;
+ }
+ }
+
+ return result;
+}
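+
+/*
+ * Illustrative two-phase sequence (SQL; requires
+ * max_prepared_transactions > 0):
+ *
+ * BEGIN;
+ * ...
+ * PREPARE TRANSACTION 'gid1'; -- this function, then PrepareTransaction()
+ * COMMIT PREPARED 'gid1'; -- possibly from a different session
+ */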
+
+/*
+ * EndTransactionBlock
+ * This executes a COMMIT command.
+ *
+ * Since COMMIT may actually do a ROLLBACK, the result indicates what
+ * happened: true for COMMIT, false for ROLLBACK.
+ *
+ * Note that we don't actually do anything here except change blockState.
+ * The real work will be done in the upcoming CommitTransactionCommand().
+ * We do it this way because it's not convenient to change memory context,
+ * resource owner, etc while executing inside a Portal.
+ */
+bool
+EndTransactionBlock(bool chain)
+{
+ TransactionState s = CurrentTransactionState;
+ bool result = false;
+
+ switch (s->blockState)
+ {
+ /*
+ * We are in a transaction block, so tell CommitTransactionCommand
+ * to COMMIT.
+ */
+ case TBLOCK_INPROGRESS:
+ s->blockState = TBLOCK_END;
+ result = true;
+ break;
+
+ /*
+ * We are in an implicit transaction block. If AND CHAIN was
+ * specified, raise an error. Otherwise commit, but issue a warning
+ * because there was no explicit BEGIN before this.
+ */
+ case TBLOCK_IMPLICIT_INPROGRESS:
+ if (chain)
+ ereport(ERROR,
+ (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION),
+ /* translator: %s represents an SQL statement name */
+ errmsg("%s can only be used in transaction blocks",
+ "COMMIT AND CHAIN")));
+ else
+ ereport(WARNING,
+ (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION),
+ errmsg("there is no transaction in progress")));
+ s->blockState = TBLOCK_END;
+ result = true;
+ break;
+
+ /*
+ * We are in a failed transaction block. Tell
+ * CommitTransactionCommand it's time to exit the block.
+ */
+ case TBLOCK_ABORT:
+ s->blockState = TBLOCK_ABORT_END;
+ break;
+
+ /*
+ * We are in a live subtransaction block. Set up to subcommit all
+ * open subtransactions and then commit the main transaction.
+ */
+ case TBLOCK_SUBINPROGRESS:
+ while (s->parent != NULL)
+ {
+ if (s->blockState == TBLOCK_SUBINPROGRESS)
+ s->blockState = TBLOCK_SUBCOMMIT;
+ else
+ elog(FATAL, "EndTransactionBlock: unexpected state %s",
+ BlockStateAsString(s->blockState));
+ s = s->parent;
+ }
+ if (s->blockState == TBLOCK_INPROGRESS)
+ s->blockState = TBLOCK_END;
+ else
+ elog(FATAL, "EndTransactionBlock: unexpected state %s",
+ BlockStateAsString(s->blockState));
+ result = true;
+ break;
+
+ /*
+ * Here we are inside an aborted subtransaction. Treat the COMMIT
+ * as ROLLBACK: set up to abort everything and exit the main
+ * transaction.
+ */
+ case TBLOCK_SUBABORT:
+ while (s->parent != NULL)
+ {
+ if (s->blockState == TBLOCK_SUBINPROGRESS)
+ s->blockState = TBLOCK_SUBABORT_PENDING;
+ else if (s->blockState == TBLOCK_SUBABORT)
+ s->blockState = TBLOCK_SUBABORT_END;
+ else
+ elog(FATAL, "EndTransactionBlock: unexpected state %s",
+ BlockStateAsString(s->blockState));
+ s = s->parent;
+ }
+ if (s->blockState == TBLOCK_INPROGRESS)
+ s->blockState = TBLOCK_ABORT_PENDING;
+ else if (s->blockState == TBLOCK_ABORT)
+ s->blockState = TBLOCK_ABORT_END;
+ else
+ elog(FATAL, "EndTransactionBlock: unexpected state %s",
+ BlockStateAsString(s->blockState));
+ break;
+
+ /*
+ * The user issued COMMIT when not inside a transaction. For
+ * COMMIT without CHAIN, issue a WARNING, staying in
+ * TBLOCK_STARTED state. The upcoming call to
+ * CommitTransactionCommand() will then close the transaction and
+ * put us back into the default state. For COMMIT AND CHAIN,
+ * error.
+ */
+ case TBLOCK_STARTED:
+ if (chain)
+ ereport(ERROR,
+ (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION),
+ /* translator: %s represents an SQL statement name */
+ errmsg("%s can only be used in transaction blocks",
+ "COMMIT AND CHAIN")));
+ else
+ ereport(WARNING,
+ (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION),
+ errmsg("there is no transaction in progress")));
+ result = true;
+ break;
+
+ /*
+ * The user issued a COMMIT that somehow ran inside a parallel
+ * worker. We can't cope with that.
+ */
+ case TBLOCK_PARALLEL_INPROGRESS:
+ ereport(FATAL,
+ (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+ errmsg("cannot commit during a parallel operation")));
+ break;
+
+ /* These cases are invalid. */
+ case TBLOCK_DEFAULT:
+ case TBLOCK_BEGIN:
+ case TBLOCK_SUBBEGIN:
+ case TBLOCK_END:
+ case TBLOCK_SUBRELEASE:
+ case TBLOCK_SUBCOMMIT:
+ case TBLOCK_ABORT_END:
+ case TBLOCK_SUBABORT_END:
+ case TBLOCK_ABORT_PENDING:
+ case TBLOCK_SUBABORT_PENDING:
+ case TBLOCK_SUBRESTART:
+ case TBLOCK_SUBABORT_RESTART:
+ case TBLOCK_PREPARE:
+ elog(FATAL, "EndTransactionBlock: unexpected state %s",
+ BlockStateAsString(s->blockState));
+ break;
+ }
+
+ Assert(s->blockState == TBLOCK_STARTED ||
+ s->blockState == TBLOCK_END ||
+ s->blockState == TBLOCK_ABORT_END ||
+ s->blockState == TBLOCK_ABORT_PENDING);
+
+ s->chain = chain;
+
+ return result;
+}
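+
+/*
+ * Illustrative example (not referenced by the code): the chain flag saved
+ * above is what lets COMMIT AND CHAIN start a new transaction with the same
+ * characteristics as the one just committed, e.g.
+ *
+ *		BEGIN ISOLATION LEVEL REPEATABLE READ;
+ *		...
+ *		COMMIT AND CHAIN;	-- commits, then opens another REPEATABLE READ
+ *							-- transaction without a fresh BEGIN
+ */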
+
+/*
+ * UserAbortTransactionBlock
+ * This executes a ROLLBACK command.
+ *
+ * As above, we don't actually do anything here except change blockState.
+ */
+void
+UserAbortTransactionBlock(bool chain)
+{
+ TransactionState s = CurrentTransactionState;
+
+ switch (s->blockState)
+ {
+ /*
+ * We are inside a transaction block and we got a ROLLBACK command
+ * from the user, so tell CommitTransactionCommand to abort and
+ * exit the transaction block.
+ */
+ case TBLOCK_INPROGRESS:
+ s->blockState = TBLOCK_ABORT_PENDING;
+ break;
+
+ /*
+ * We are inside a failed transaction block and we got a ROLLBACK
+ * command from the user. Abort processing is already done, so
+ * CommitTransactionCommand just has to cleanup and go back to
+ * idle state.
+ */
+ case TBLOCK_ABORT:
+ s->blockState = TBLOCK_ABORT_END;
+ break;
+
+ /*
+ * We are inside a subtransaction. Mark everything up to top
+ * level as exitable.
+ */
+ case TBLOCK_SUBINPROGRESS:
+ case TBLOCK_SUBABORT:
+ while (s->parent != NULL)
+ {
+ if (s->blockState == TBLOCK_SUBINPROGRESS)
+ s->blockState = TBLOCK_SUBABORT_PENDING;
+ else if (s->blockState == TBLOCK_SUBABORT)
+ s->blockState = TBLOCK_SUBABORT_END;
+ else
+ elog(FATAL, "UserAbortTransactionBlock: unexpected state %s",
+ BlockStateAsString(s->blockState));
+ s = s->parent;
+ }
+ if (s->blockState == TBLOCK_INPROGRESS)
+ s->blockState = TBLOCK_ABORT_PENDING;
+ else if (s->blockState == TBLOCK_ABORT)
+ s->blockState = TBLOCK_ABORT_END;
+ else
+ elog(FATAL, "UserAbortTransactionBlock: unexpected state %s",
+ BlockStateAsString(s->blockState));
+ break;
+
+ /*
+ * The user issued ABORT when not inside a transaction. For
+ * ROLLBACK without CHAIN, issue a WARNING and go to abort state.
+ * The upcoming call to CommitTransactionCommand() will then put
+ * us back into the default state. For ROLLBACK AND CHAIN, error.
+ *
+ * We do the same thing with ABORT inside an implicit transaction,
+ * although in this case we might be rolling back actual database
+ * state changes. (It's debatable whether we should issue a
+ * WARNING in this case, but we have done so historically.)
+ */
+ case TBLOCK_STARTED:
+ case TBLOCK_IMPLICIT_INPROGRESS:
+ if (chain)
+ ereport(ERROR,
+ (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION),
+ /* translator: %s represents an SQL statement name */
+ errmsg("%s can only be used in transaction blocks",
+ "ROLLBACK AND CHAIN")));
+ else
+ ereport(WARNING,
+ (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION),
+ errmsg("there is no transaction in progress")));
+ s->blockState = TBLOCK_ABORT_PENDING;
+ break;
+
+ /*
+ * The user issued an ABORT that somehow ran inside a parallel
+ * worker. We can't cope with that.
+ */
+ case TBLOCK_PARALLEL_INPROGRESS:
+ ereport(FATAL,
+ (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+ errmsg("cannot abort during a parallel operation")));
+ break;
+
+ /* These cases are invalid. */
+ case TBLOCK_DEFAULT:
+ case TBLOCK_BEGIN:
+ case TBLOCK_SUBBEGIN:
+ case TBLOCK_END:
+ case TBLOCK_SUBRELEASE:
+ case TBLOCK_SUBCOMMIT:
+ case TBLOCK_ABORT_END:
+ case TBLOCK_SUBABORT_END:
+ case TBLOCK_ABORT_PENDING:
+ case TBLOCK_SUBABORT_PENDING:
+ case TBLOCK_SUBRESTART:
+ case TBLOCK_SUBABORT_RESTART:
+ case TBLOCK_PREPARE:
+ elog(FATAL, "UserAbortTransactionBlock: unexpected state %s",
+ BlockStateAsString(s->blockState));
+ break;
+ }
+
+ Assert(s->blockState == TBLOCK_ABORT_END ||
+ s->blockState == TBLOCK_ABORT_PENDING);
+
+ s->chain = chain;
+}
+
+/*
+ * BeginImplicitTransactionBlock
+ * Start an implicit transaction block if we're not already in one.
+ *
+ * Unlike BeginTransactionBlock, this is called directly from the main loop
+ * in postgres.c, not within a Portal. So we can just change blockState
+ * without a lot of ceremony. We do not expect caller to do
+ * CommitTransactionCommand/StartTransactionCommand.
+ */
+void
+BeginImplicitTransactionBlock(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ /*
+ * If we are in STARTED state (that is, no transaction block is open),
+ * switch to IMPLICIT_INPROGRESS state, creating an implicit transaction
+ * block.
+ *
+ * For caller convenience, we consider all other transaction states as
+ * legal here; otherwise the caller would need its own state check, which
+ * seems rather pointless.
+ */
+ if (s->blockState == TBLOCK_STARTED)
+ s->blockState = TBLOCK_IMPLICIT_INPROGRESS;
+}
+
+/*
+ * EndImplicitTransactionBlock
+ * End an implicit transaction block, if we're in one.
+ *
+ * Like EndTransactionBlock, we just make any needed blockState change here.
+ * The real work will be done in the upcoming CommitTransactionCommand().
+ */
+void
+EndImplicitTransactionBlock(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ /*
+ * If we are in IMPLICIT_INPROGRESS state, switch back to STARTED state,
+ * allowing CommitTransactionCommand to commit whatever happened during
+ * the implicit transaction block as though it were a single statement.
+ *
+ * For caller convenience, we consider all other transaction states as
+ * legal here; otherwise the caller would need its own state check, which
+ * seems rather pointless.
+ */
+ if (s->blockState == TBLOCK_IMPLICIT_INPROGRESS)
+ s->blockState = TBLOCK_STARTED;
+}
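+
+/*
+ * Illustrative example (table name is arbitrary): an implicit block arises
+ * when one simple-query message carries several statements, e.g. a client
+ * sending the single string
+ *
+ *		INSERT INTO t VALUES (1); INSERT INTO t VALUES (2);
+ *
+ * exec_simple_query brackets the statements with
+ * BeginImplicitTransactionBlock/EndImplicitTransactionBlock, so both
+ * INSERTs commit or abort as a unit even though no BEGIN was issued.
+ */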
+
+/*
+ * DefineSavepoint
+ * This executes a SAVEPOINT command.
+ */
+void
+DefineSavepoint(const char *name)
+{
+ TransactionState s = CurrentTransactionState;
+
+ /*
+ * Workers synchronize transaction state at the beginning of each parallel
+ * operation, so we can't account for new subtransactions after that
+ * point. (Note that this check will certainly error out if s->blockState
+ * is TBLOCK_PARALLEL_INPROGRESS, so we can treat that as an invalid case
+ * below.)
+ */
+ if (IsInParallelMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+ errmsg("cannot define savepoints during a parallel operation")));
+
+ switch (s->blockState)
+ {
+ case TBLOCK_INPROGRESS:
+ case TBLOCK_SUBINPROGRESS:
+ /* Normal subtransaction start */
+ PushTransaction();
+ s = CurrentTransactionState; /* changed by push */
+
+ /*
+ * Savepoint names, like the TransactionState block itself, live
+ * in TopTransactionContext.
+ */
+ if (name)
+ s->name = MemoryContextStrdup(TopTransactionContext, name);
+ break;
+
+ /*
+ * We disallow savepoint commands in implicit transaction blocks.
+ * There would be no great difficulty in allowing them so far as
+ * this module is concerned, but a savepoint seems inconsistent
+ * with exec_simple_query's behavior of abandoning the whole query
+ * string upon error. Also, the point of an implicit transaction
+ * block (as opposed to a regular one) is to automatically close
+ * after an error, so it's hard to see how a savepoint would fit
+ * into that.
+ *
+ * The error messages for this are phrased as if there were no
+ * active transaction block at all, which is historical but
+ * perhaps could be improved.
+ */
+ case TBLOCK_IMPLICIT_INPROGRESS:
+ ereport(ERROR,
+ (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION),
+ /* translator: %s represents an SQL statement name */
+ errmsg("%s can only be used in transaction blocks",
+ "SAVEPOINT")));
+ break;
+
+ /* These cases are invalid. */
+ case TBLOCK_DEFAULT:
+ case TBLOCK_STARTED:
+ case TBLOCK_BEGIN:
+ case TBLOCK_PARALLEL_INPROGRESS:
+ case TBLOCK_SUBBEGIN:
+ case TBLOCK_END:
+ case TBLOCK_SUBRELEASE:
+ case TBLOCK_SUBCOMMIT:
+ case TBLOCK_ABORT:
+ case TBLOCK_SUBABORT:
+ case TBLOCK_ABORT_END:
+ case TBLOCK_SUBABORT_END:
+ case TBLOCK_ABORT_PENDING:
+ case TBLOCK_SUBABORT_PENDING:
+ case TBLOCK_SUBRESTART:
+ case TBLOCK_SUBABORT_RESTART:
+ case TBLOCK_PREPARE:
+ elog(FATAL, "DefineSavepoint: unexpected state %s",
+ BlockStateAsString(s->blockState));
+ break;
+ }
+}
+
+/*
+ * ReleaseSavepoint
+ * This executes a RELEASE command.
+ *
+ * As above, we don't actually do anything here except change blockState.
+ */
+void
+ReleaseSavepoint(const char *name)
+{
+ TransactionState s = CurrentTransactionState;
+ TransactionState target,
+ xact;
+
+ /*
+ * Workers synchronize transaction state at the beginning of each parallel
+ * operation, so we can't account for transaction state change after that
+ * point. (Note that this check will certainly error out if s->blockState
+ * is TBLOCK_PARALLEL_INPROGRESS, so we can treat that as an invalid case
+ * below.)
+ */
+ if (IsInParallelMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+ errmsg("cannot release savepoints during a parallel operation")));
+
+ switch (s->blockState)
+ {
+ /*
+ * We can't release a savepoint if there is no savepoint defined.
+ */
+ case TBLOCK_INPROGRESS:
+ ereport(ERROR,
+ (errcode(ERRCODE_S_E_INVALID_SPECIFICATION),
+ errmsg("savepoint \"%s\" does not exist", name)));
+ break;
+
+ case TBLOCK_IMPLICIT_INPROGRESS:
+ /* See comment about implicit transactions in DefineSavepoint */
+ ereport(ERROR,
+ (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION),
+ /* translator: %s represents an SQL statement name */
+ errmsg("%s can only be used in transaction blocks",
+ "RELEASE SAVEPOINT")));
+ break;
+
+ /*
+ * We are in a non-aborted subtransaction. This is the only valid
+ * case.
+ */
+ case TBLOCK_SUBINPROGRESS:
+ break;
+
+ /* These cases are invalid. */
+ case TBLOCK_DEFAULT:
+ case TBLOCK_STARTED:
+ case TBLOCK_BEGIN:
+ case TBLOCK_PARALLEL_INPROGRESS:
+ case TBLOCK_SUBBEGIN:
+ case TBLOCK_END:
+ case TBLOCK_SUBRELEASE:
+ case TBLOCK_SUBCOMMIT:
+ case TBLOCK_ABORT:
+ case TBLOCK_SUBABORT:
+ case TBLOCK_ABORT_END:
+ case TBLOCK_SUBABORT_END:
+ case TBLOCK_ABORT_PENDING:
+ case TBLOCK_SUBABORT_PENDING:
+ case TBLOCK_SUBRESTART:
+ case TBLOCK_SUBABORT_RESTART:
+ case TBLOCK_PREPARE:
+ elog(FATAL, "ReleaseSavepoint: unexpected state %s",
+ BlockStateAsString(s->blockState));
+ break;
+ }
+
+ for (target = s; PointerIsValid(target); target = target->parent)
+ {
+ if (PointerIsValid(target->name) && strcmp(target->name, name) == 0)
+ break;
+ }
+
+ if (!PointerIsValid(target))
+ ereport(ERROR,
+ (errcode(ERRCODE_S_E_INVALID_SPECIFICATION),
+ errmsg("savepoint \"%s\" does not exist", name)));
+
+ /* disallow crossing savepoint level boundaries */
+ if (target->savepointLevel != s->savepointLevel)
+ ereport(ERROR,
+ (errcode(ERRCODE_S_E_INVALID_SPECIFICATION),
+ errmsg("savepoint \"%s\" does not exist within current savepoint level", name)));
+
+ /*
+ * Mark "commit pending" all subtransactions up to the target
+ * subtransaction. The actual commits will happen when control gets to
+ * CommitTransactionCommand.
+ */
+ xact = CurrentTransactionState;
+ for (;;)
+ {
+ Assert(xact->blockState == TBLOCK_SUBINPROGRESS);
+ xact->blockState = TBLOCK_SUBRELEASE;
+ if (xact == target)
+ break;
+ xact = xact->parent;
+ Assert(PointerIsValid(xact));
+ }
+}
+
+/*
+ * RollbackToSavepoint
+ * This executes a ROLLBACK TO <savepoint> command.
+ *
+ * As above, we don't actually do anything here except change blockState.
+ */
+void
+RollbackToSavepoint(const char *name)
+{
+ TransactionState s = CurrentTransactionState;
+ TransactionState target,
+ xact;
+
+ /*
+ * Workers synchronize transaction state at the beginning of each parallel
+ * operation, so we can't account for transaction state change after that
+ * point. (Note that this check will certainly error out if s->blockState
+ * is TBLOCK_PARALLEL_INPROGRESS, so we can treat that as an invalid case
+ * below.)
+ */
+ if (IsInParallelMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+ errmsg("cannot rollback to savepoints during a parallel operation")));
+
+ switch (s->blockState)
+ {
+ /*
+ * We can't rollback to a savepoint if there is no savepoint
+ * defined.
+ */
+ case TBLOCK_INPROGRESS:
+ case TBLOCK_ABORT:
+ ereport(ERROR,
+ (errcode(ERRCODE_S_E_INVALID_SPECIFICATION),
+ errmsg("savepoint \"%s\" does not exist", name)));
+ break;
+
+ case TBLOCK_IMPLICIT_INPROGRESS:
+ /* See comment about implicit transactions in DefineSavepoint */
+ ereport(ERROR,
+ (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION),
+ /* translator: %s represents an SQL statement name */
+ errmsg("%s can only be used in transaction blocks",
+ "ROLLBACK TO SAVEPOINT")));
+ break;
+
+ /*
+ * There is at least one savepoint, so proceed.
+ */
+ case TBLOCK_SUBINPROGRESS:
+ case TBLOCK_SUBABORT:
+ break;
+
+ /* These cases are invalid. */
+ case TBLOCK_DEFAULT:
+ case TBLOCK_STARTED:
+ case TBLOCK_BEGIN:
+ case TBLOCK_PARALLEL_INPROGRESS:
+ case TBLOCK_SUBBEGIN:
+ case TBLOCK_END:
+ case TBLOCK_SUBRELEASE:
+ case TBLOCK_SUBCOMMIT:
+ case TBLOCK_ABORT_END:
+ case TBLOCK_SUBABORT_END:
+ case TBLOCK_ABORT_PENDING:
+ case TBLOCK_SUBABORT_PENDING:
+ case TBLOCK_SUBRESTART:
+ case TBLOCK_SUBABORT_RESTART:
+ case TBLOCK_PREPARE:
+ elog(FATAL, "RollbackToSavepoint: unexpected state %s",
+ BlockStateAsString(s->blockState));
+ break;
+ }
+
+ for (target = s; PointerIsValid(target); target = target->parent)
+ {
+ if (PointerIsValid(target->name) && strcmp(target->name, name) == 0)
+ break;
+ }
+
+ if (!PointerIsValid(target))
+ ereport(ERROR,
+ (errcode(ERRCODE_S_E_INVALID_SPECIFICATION),
+ errmsg("savepoint \"%s\" does not exist", name)));
+
+ /* disallow crossing savepoint level boundaries */
+ if (target->savepointLevel != s->savepointLevel)
+ ereport(ERROR,
+ (errcode(ERRCODE_S_E_INVALID_SPECIFICATION),
+ errmsg("savepoint \"%s\" does not exist within current savepoint level", name)));
+
+ /*
+ * Mark "abort pending" all subtransactions up to the target
+ * subtransaction. The actual aborts will happen when control gets to
+ * CommitTransactionCommand.
+ */
+ xact = CurrentTransactionState;
+ for (;;)
+ {
+ if (xact == target)
+ break;
+ if (xact->blockState == TBLOCK_SUBINPROGRESS)
+ xact->blockState = TBLOCK_SUBABORT_PENDING;
+ else if (xact->blockState == TBLOCK_SUBABORT)
+ xact->blockState = TBLOCK_SUBABORT_END;
+ else
+ elog(FATAL, "RollbackToSavepoint: unexpected state %s",
+ BlockStateAsString(xact->blockState));
+ xact = xact->parent;
+ Assert(PointerIsValid(xact));
+ }
+
+ /* And mark the target as "restart pending" */
+ if (xact->blockState == TBLOCK_SUBINPROGRESS)
+ xact->blockState = TBLOCK_SUBRESTART;
+ else if (xact->blockState == TBLOCK_SUBABORT)
+ xact->blockState = TBLOCK_SUBABORT_RESTART;
+ else
+ elog(FATAL, "RollbackToSavepoint: unexpected state %s",
+ BlockStateAsString(xact->blockState));
+}
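+
+/*
+ * Illustrative example (savepoint names are arbitrary) of the stack walks
+ * performed by ReleaseSavepoint and RollbackToSavepoint:
+ *
+ *		BEGIN;
+ *		SAVEPOINT a;
+ *		SAVEPOINT b;				-- stack is now: b -> a -> top
+ *		ROLLBACK TO SAVEPOINT a;	-- b becomes SUBABORT_PENDING,
+ *									-- a becomes SUBRESTART
+ *		RELEASE SAVEPOINT a;		-- the restarted a becomes SUBRELEASE
+ *		COMMIT;
+ */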
+
+/*
+ * BeginInternalSubTransaction
+ * This is the same as DefineSavepoint except it allows TBLOCK_STARTED,
+ * TBLOCK_IMPLICIT_INPROGRESS, TBLOCK_END, and TBLOCK_PREPARE states,
+ * and therefore it can safely be used in functions that might be called
+ * when not inside a BEGIN block or when running deferred triggers at
+ * COMMIT/PREPARE time. Also, it automatically does
+ * CommitTransactionCommand/StartTransactionCommand instead of expecting
+ * the caller to do it.
+ */
+void
+BeginInternalSubTransaction(const char *name)
+{
+ TransactionState s = CurrentTransactionState;
+
+ /*
+ * Workers synchronize transaction state at the beginning of each parallel
+ * operation, so we can't account for new subtransactions after that
+ * point. We might be able to make an exception for the type of
+ * subtransaction established by this function, which is typically used in
+ * contexts where we're going to release or roll back the subtransaction
+ * before proceeding further, so that no enduring change to the
+ * transaction state occurs. For now, however, we prohibit this case along
+ * with all the others.
+ */
+ if (IsInParallelMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+ errmsg("cannot start subtransactions during a parallel operation")));
+
+ switch (s->blockState)
+ {
+ case TBLOCK_STARTED:
+ case TBLOCK_INPROGRESS:
+ case TBLOCK_IMPLICIT_INPROGRESS:
+ case TBLOCK_END:
+ case TBLOCK_PREPARE:
+ case TBLOCK_SUBINPROGRESS:
+ /* Normal subtransaction start */
+ PushTransaction();
+ s = CurrentTransactionState; /* changed by push */
+
+ /*
+ * Savepoint names, like the TransactionState block itself, live
+ * in TopTransactionContext.
+ */
+ if (name)
+ s->name = MemoryContextStrdup(TopTransactionContext, name);
+ break;
+
+ /* These cases are invalid. */
+ case TBLOCK_DEFAULT:
+ case TBLOCK_BEGIN:
+ case TBLOCK_PARALLEL_INPROGRESS:
+ case TBLOCK_SUBBEGIN:
+ case TBLOCK_SUBRELEASE:
+ case TBLOCK_SUBCOMMIT:
+ case TBLOCK_ABORT:
+ case TBLOCK_SUBABORT:
+ case TBLOCK_ABORT_END:
+ case TBLOCK_SUBABORT_END:
+ case TBLOCK_ABORT_PENDING:
+ case TBLOCK_SUBABORT_PENDING:
+ case TBLOCK_SUBRESTART:
+ case TBLOCK_SUBABORT_RESTART:
+ elog(FATAL, "BeginInternalSubTransaction: unexpected state %s",
+ BlockStateAsString(s->blockState));
+ break;
+ }
+
+ CommitTransactionCommand();
+ StartTransactionCommand();
+}
+
+/*
+ * ReleaseCurrentSubTransaction
+ *
+ * RELEASE (ie, commit) the innermost subtransaction, regardless of its
+ * savepoint name (if any).
+ * NB: do NOT use CommitTransactionCommand/StartTransactionCommand with this.
+ */
+void
+ReleaseCurrentSubTransaction(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ /*
+ * Workers synchronize transaction state at the beginning of each parallel
+ * operation, so we can't account for commit of subtransactions after that
+ * point. This should not happen anyway. Code calling this would
+ * typically have called BeginInternalSubTransaction() first, failing
+ * there.
+ */
+ if (IsInParallelMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+ errmsg("cannot commit subtransactions during a parallel operation")));
+
+ if (s->blockState != TBLOCK_SUBINPROGRESS)
+ elog(ERROR, "ReleaseCurrentSubTransaction: unexpected state %s",
+ BlockStateAsString(s->blockState));
+ Assert(s->state == TRANS_INPROGRESS);
+ MemoryContextSwitchTo(CurTransactionContext);
+ CommitSubTransaction();
+ s = CurrentTransactionState; /* changed by pop */
+ Assert(s->state == TRANS_INPROGRESS);
+}
+
+/*
+ * RollbackAndReleaseCurrentSubTransaction
+ *
+ * ROLLBACK and RELEASE (ie, abort) the innermost subtransaction, regardless
+ * of its savepoint name (if any).
+ * NB: do NOT use CommitTransactionCommand/StartTransactionCommand with this.
+ */
+void
+RollbackAndReleaseCurrentSubTransaction(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ /*
+ * Unlike ReleaseCurrentSubTransaction(), this is nominally permitted
+ * during parallel operations. That's because we may be in the leader,
+ * recovering from an error thrown while we were in parallel mode. We
+ * won't reach here in a worker, because BeginInternalSubTransaction()
+ * will have failed.
+ */
+
+ switch (s->blockState)
+ {
+ /* Must be in a subtransaction */
+ case TBLOCK_SUBINPROGRESS:
+ case TBLOCK_SUBABORT:
+ break;
+
+ /* These cases are invalid. */
+ case TBLOCK_DEFAULT:
+ case TBLOCK_STARTED:
+ case TBLOCK_BEGIN:
+ case TBLOCK_IMPLICIT_INPROGRESS:
+ case TBLOCK_PARALLEL_INPROGRESS:
+ case TBLOCK_SUBBEGIN:
+ case TBLOCK_INPROGRESS:
+ case TBLOCK_END:
+ case TBLOCK_SUBRELEASE:
+ case TBLOCK_SUBCOMMIT:
+ case TBLOCK_ABORT:
+ case TBLOCK_ABORT_END:
+ case TBLOCK_SUBABORT_END:
+ case TBLOCK_ABORT_PENDING:
+ case TBLOCK_SUBABORT_PENDING:
+ case TBLOCK_SUBRESTART:
+ case TBLOCK_SUBABORT_RESTART:
+ case TBLOCK_PREPARE:
+ elog(FATAL, "RollbackAndReleaseCurrentSubTransaction: unexpected state %s",
+ BlockStateAsString(s->blockState));
+ break;
+ }
+
+ /*
+ * Abort the current subtransaction, if needed.
+ */
+ if (s->blockState == TBLOCK_SUBINPROGRESS)
+ AbortSubTransaction();
+
+ /* And clean it up, too */
+ CleanupSubTransaction();
+
+ s = CurrentTransactionState; /* changed by pop */
+ AssertState(s->blockState == TBLOCK_SUBINPROGRESS ||
+ s->blockState == TBLOCK_INPROGRESS ||
+ s->blockState == TBLOCK_IMPLICIT_INPROGRESS ||
+ s->blockState == TBLOCK_STARTED);
+}
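+
+/*
+ * Illustrative sketch of the usual calling pattern for the two functions
+ * above, as used by PL handlers to implement exception blocks (simplified;
+ * real callers also save and restore CurrentResourceOwner):
+ *
+ *		MemoryContext oldcontext = CurrentMemoryContext;
+ *
+ *		BeginInternalSubTransaction(NULL);
+ *		PG_TRY();
+ *		{
+ *			... code that may elog(ERROR) ...
+ *			ReleaseCurrentSubTransaction();
+ *		}
+ *		PG_CATCH();
+ *		{
+ *			MemoryContextSwitchTo(oldcontext);
+ *			RollbackAndReleaseCurrentSubTransaction();
+ *			FlushErrorState();
+ *		}
+ *		PG_END_TRY();
+ */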
+
+/*
+ * AbortOutOfAnyTransaction
+ *
+ * This routine is provided for error recovery purposes. It aborts any
+ * active transaction or transaction block, leaving the system in a known
+ * idle state.
+ */
+void
+AbortOutOfAnyTransaction(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ /* Ensure we're not running in a doomed memory context */
+ AtAbort_Memory();
+
+ /*
+ * Get out of any transaction or nested transaction
+ */
+ do
+ {
+ switch (s->blockState)
+ {
+ case TBLOCK_DEFAULT:
+ if (s->state == TRANS_DEFAULT)
+ {
+ /* Not in a transaction, do nothing */
+ }
+ else
+ {
+ /*
+ * We can get here after an error during transaction start
+ * (state will be TRANS_START). Need to clean up the
+ * incompletely started transaction. First, adjust the
+ * low-level state to suppress warning message from
+ * AbortTransaction.
+ */
+ if (s->state == TRANS_START)
+ s->state = TRANS_INPROGRESS;
+ AbortTransaction();
+ CleanupTransaction();
+ }
+ break;
+ case TBLOCK_STARTED:
+ case TBLOCK_BEGIN:
+ case TBLOCK_INPROGRESS:
+ case TBLOCK_IMPLICIT_INPROGRESS:
+ case TBLOCK_PARALLEL_INPROGRESS:
+ case TBLOCK_END:
+ case TBLOCK_ABORT_PENDING:
+ case TBLOCK_PREPARE:
+ /* In a transaction, so clean up */
+ AbortTransaction();
+ CleanupTransaction();
+ s->blockState = TBLOCK_DEFAULT;
+ break;
+ case TBLOCK_ABORT:
+ case TBLOCK_ABORT_END:
+
+ /*
+ * AbortTransaction is already done, still need Cleanup.
+ * However, if we failed partway through running ROLLBACK,
+ * there will be an active portal running that command, which
+ * we need to shut down before doing CleanupTransaction.
+ */
+ AtAbort_Portals();
+ CleanupTransaction();
+ s->blockState = TBLOCK_DEFAULT;
+ break;
+
+ /*
+ * In a subtransaction, so clean it up and abort parent too
+ */
+ case TBLOCK_SUBBEGIN:
+ case TBLOCK_SUBINPROGRESS:
+ case TBLOCK_SUBRELEASE:
+ case TBLOCK_SUBCOMMIT:
+ case TBLOCK_SUBABORT_PENDING:
+ case TBLOCK_SUBRESTART:
+ AbortSubTransaction();
+ CleanupSubTransaction();
+ s = CurrentTransactionState; /* changed by pop */
+ break;
+
+ case TBLOCK_SUBABORT:
+ case TBLOCK_SUBABORT_END:
+ case TBLOCK_SUBABORT_RESTART:
+ /* As above, but AbortSubTransaction already done */
+ if (s->curTransactionOwner)
+ {
+ /* As in TBLOCK_ABORT, might have a live portal to zap */
+ AtSubAbort_Portals(s->subTransactionId,
+ s->parent->subTransactionId,
+ s->curTransactionOwner,
+ s->parent->curTransactionOwner);
+ }
+ CleanupSubTransaction();
+ s = CurrentTransactionState; /* changed by pop */
+ break;
+ }
+ } while (s->blockState != TBLOCK_DEFAULT);
+
+ /* Should be out of all subxacts now */
+ Assert(s->parent == NULL);
+
+ /* If we didn't actually have anything to do, revert to TopMemoryContext */
+ AtCleanup_Memory();
+}
+
+/*
+ * IsTransactionBlock --- are we within a transaction block?
+ */
+bool
+IsTransactionBlock(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ if (s->blockState == TBLOCK_DEFAULT || s->blockState == TBLOCK_STARTED)
+ return false;
+
+ return true;
+}
+
+/*
+ * IsTransactionOrTransactionBlock --- are we within either a transaction
+ * or a transaction block? (The backend is only really "idle" when this
+ * returns false.)
+ *
+ * This should match up with IsTransactionBlock and IsTransactionState.
+ */
+bool
+IsTransactionOrTransactionBlock(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ if (s->blockState == TBLOCK_DEFAULT)
+ return false;
+
+ return true;
+}
+
+/*
+ * TransactionBlockStatusCode - return status code to send in ReadyForQuery
+ */
+char
+TransactionBlockStatusCode(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ switch (s->blockState)
+ {
+ case TBLOCK_DEFAULT:
+ case TBLOCK_STARTED:
+ return 'I'; /* idle --- not in transaction */
+ case TBLOCK_BEGIN:
+ case TBLOCK_SUBBEGIN:
+ case TBLOCK_INPROGRESS:
+ case TBLOCK_IMPLICIT_INPROGRESS:
+ case TBLOCK_PARALLEL_INPROGRESS:
+ case TBLOCK_SUBINPROGRESS:
+ case TBLOCK_END:
+ case TBLOCK_SUBRELEASE:
+ case TBLOCK_SUBCOMMIT:
+ case TBLOCK_PREPARE:
+ return 'T'; /* in transaction */
+ case TBLOCK_ABORT:
+ case TBLOCK_SUBABORT:
+ case TBLOCK_ABORT_END:
+ case TBLOCK_SUBABORT_END:
+ case TBLOCK_ABORT_PENDING:
+ case TBLOCK_SUBABORT_PENDING:
+ case TBLOCK_SUBRESTART:
+ case TBLOCK_SUBABORT_RESTART:
+ return 'E'; /* in failed transaction */
+ }
+
+ /* should never get here */
+ elog(FATAL, "invalid transaction block state: %s",
+ BlockStateAsString(s->blockState));
+ return 0; /* keep compiler quiet */
+}
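+
+/*
+ * For reference: this byte travels in the ReadyForQuery protocol message,
+ * and libpq surfaces it via PQtransactionStatus() as PQTRANS_IDLE ('I'),
+ * PQTRANS_INTRANS ('T'), or PQTRANS_INERROR ('E').
+ */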
+
+/*
+ * IsSubTransaction
+ */
+bool
+IsSubTransaction(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ if (s->nestingLevel >= 2)
+ return true;
+
+ return false;
+}
+
+/*
+ * StartSubTransaction
+ *
+ * If you're wondering why this is separate from PushTransaction: it's because
+ * we can't conveniently do this stuff right inside DefineSavepoint. The
+ * SAVEPOINT utility command will be executed inside a Portal, and if we
+ * muck with CurrentMemoryContext or CurrentResourceOwner then exit from
+ * the Portal will undo those settings. So we make DefineSavepoint just
+ * push a dummy transaction block, and when control returns to the main
+ * idle loop, CommitTransactionCommand will be called, and we'll come here
+ * to finish starting the subtransaction.
+ */
+static void
+StartSubTransaction(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ if (s->state != TRANS_DEFAULT)
+ elog(WARNING, "StartSubTransaction while in %s state",
+ TransStateAsString(s->state));
+
+ s->state = TRANS_START;
+
+ /*
+ * Initialize subsystems for new subtransaction
+ *
+ * must initialize resource-management stuff first
+ */
+ AtSubStart_Memory();
+ AtSubStart_ResourceOwner();
+ AfterTriggerBeginSubXact();
+
+ s->state = TRANS_INPROGRESS;
+
+ /*
+ * Call start-of-subxact callbacks
+ */
+ CallSubXactCallbacks(SUBXACT_EVENT_START_SUB, s->subTransactionId,
+ s->parent->subTransactionId);
+
+ ShowTransactionState("StartSubTransaction");
+}
+
+/*
+ * CommitSubTransaction
+ *
+ * The caller has to make sure to always reassign CurrentTransactionState
+ * if it has a local pointer to it after calling this function.
+ */
+static void
+CommitSubTransaction(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ ShowTransactionState("CommitSubTransaction");
+
+ if (s->state != TRANS_INPROGRESS)
+ elog(WARNING, "CommitSubTransaction while in %s state",
+ TransStateAsString(s->state));
+
+ /* Pre-commit processing goes here */
+
+ CallSubXactCallbacks(SUBXACT_EVENT_PRE_COMMIT_SUB, s->subTransactionId,
+ s->parent->subTransactionId);
+
+ /* If in parallel mode, clean up workers and exit parallel mode. */
+ if (IsInParallelMode())
+ {
+ AtEOSubXact_Parallel(true, s->subTransactionId);
+ s->parallelModeLevel = 0;
+ }
+
+ /* Do the actual "commit", such as it is */
+ s->state = TRANS_COMMIT;
+
+ /* Must CCI to ensure commands of subtransaction are seen as done */
+ CommandCounterIncrement();
+
+ /*
+ * Prior to 8.4 we marked subcommit in clog at this point. We now only
+ * perform that step, if required, as part of the atomic update of the
+ * whole transaction tree at top level commit or abort.
+ */
+
+ /* Post-commit cleanup */
+ if (FullTransactionIdIsValid(s->fullTransactionId))
+ AtSubCommit_childXids();
+ AfterTriggerEndSubXact(true);
+ AtSubCommit_Portals(s->subTransactionId,
+ s->parent->subTransactionId,
+ s->parent->nestingLevel,
+ s->parent->curTransactionOwner);
+ AtEOSubXact_LargeObject(true, s->subTransactionId,
+ s->parent->subTransactionId);
+ AtSubCommit_Notify();
+
+ CallSubXactCallbacks(SUBXACT_EVENT_COMMIT_SUB, s->subTransactionId,
+ s->parent->subTransactionId);
+
+ ResourceOwnerRelease(s->curTransactionOwner,
+ RESOURCE_RELEASE_BEFORE_LOCKS,
+ true, false);
+ AtEOSubXact_RelationCache(true, s->subTransactionId,
+ s->parent->subTransactionId);
+ AtEOSubXact_Inval(true);
+ AtSubCommit_smgr();
+
+ /*
+ * The only lock we actually release here is the subtransaction XID lock.
+ */
+ CurrentResourceOwner = s->curTransactionOwner;
+ if (FullTransactionIdIsValid(s->fullTransactionId))
+ XactLockTableDelete(XidFromFullTransactionId(s->fullTransactionId));
+
+ /*
+ * Other locks should get transferred to their parent resource owner.
+ */
+ ResourceOwnerRelease(s->curTransactionOwner,
+ RESOURCE_RELEASE_LOCKS,
+ true, false);
+ ResourceOwnerRelease(s->curTransactionOwner,
+ RESOURCE_RELEASE_AFTER_LOCKS,
+ true, false);
+
+ AtEOXact_GUC(true, s->gucNestLevel);
+ AtEOSubXact_SPI(true, s->subTransactionId);
+ AtEOSubXact_on_commit_actions(true, s->subTransactionId,
+ s->parent->subTransactionId);
+ AtEOSubXact_Namespace(true, s->subTransactionId,
+ s->parent->subTransactionId);
+ AtEOSubXact_Files(true, s->subTransactionId,
+ s->parent->subTransactionId);
+ AtEOSubXact_HashTables(true, s->nestingLevel);
+ AtEOSubXact_PgStat(true, s->nestingLevel);
+ AtSubCommit_Snapshot(s->nestingLevel);
+
+ /*
+ * We need to restore the upper transaction's read-only state, in case the
+ * upper is read-write while the child is read-only; GUC will incorrectly
+ * think it should leave the child state in place.
+ */
+ XactReadOnly = s->prevXactReadOnly;
+
+ CurrentResourceOwner = s->parent->curTransactionOwner;
+ CurTransactionResourceOwner = s->parent->curTransactionOwner;
+ ResourceOwnerDelete(s->curTransactionOwner);
+ s->curTransactionOwner = NULL;
+
+ AtSubCommit_Memory();
+
+ s->state = TRANS_DEFAULT;
+
+ PopTransaction();
+}
+
+/*
+ * AbortSubTransaction
+ */
+static void
+AbortSubTransaction(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ /* Prevent cancel/die interrupt while cleaning up */
+ HOLD_INTERRUPTS();
+
+ /* Make sure we have a valid memory context and resource owner */
+ AtSubAbort_Memory();
+ AtSubAbort_ResourceOwner();
+
+ /*
+ * Release any LW locks we might be holding as quickly as possible.
+ * (Regular locks, however, must be held till we finish aborting.)
+ * Releasing LW locks is critical since we might try to grab them again
+ * while cleaning up!
+ *
+ * FIXME This may be incorrect --- Are there some locks we should keep?
+ * Buffer locks, for example? I don't think so but I'm not sure.
+ */
+ LWLockReleaseAll();
+
+ pgstat_report_wait_end();
+ pgstat_progress_end_command();
+ AbortBufferIO();
+ UnlockBuffers();
+
+ /* Reset WAL record construction state */
+ XLogResetInsertion();
+
+ /* Cancel condition variable sleep */
+ ConditionVariableCancelSleep();
+
+ /*
+ * Also clean up any open wait for lock, since the lock manager will choke
+ * if we try to wait for another lock before doing this.
+ */
+ LockErrorCleanup();
+
+ /*
+ * If any timeout events are still active, make sure the timeout interrupt
+ * is scheduled. This covers possible loss of a timeout interrupt due to
+ * longjmp'ing out of the SIGINT handler (see notes in handle_sig_alarm).
+ * We delay this till after LockErrorCleanup so that we don't uselessly
+ * reschedule lock or deadlock check timeouts.
+ */
+ reschedule_timeouts();
+
+ /*
+ * Re-enable signals, in case we got here by longjmp'ing out of a signal
+ * handler. We do this fairly early in the sequence so that the timeout
+ * infrastructure will be functional if needed while aborting.
+ */
+ PG_SETMASK(&UnBlockSig);
+
+ /*
+ * check the current transaction state
+ */
+ ShowTransactionState("AbortSubTransaction");
+
+ if (s->state != TRANS_INPROGRESS)
+ elog(WARNING, "AbortSubTransaction while in %s state",
+ TransStateAsString(s->state));
+
+ s->state = TRANS_ABORT;
+
+ /*
+ * Reset user ID which might have been changed transiently. (See notes in
+ * AbortTransaction.)
+ */
+ SetUserIdAndSecContext(s->prevUser, s->prevSecContext);
+
+ /* Forget about any active REINDEX. */
+ ResetReindexState(s->nestingLevel);
+
+ /* Reset logical streaming state. */
+ ResetLogicalStreamingState();
+
+ /*
+ * No need for SnapBuildResetExportedSnapshotState() here, snapshot
+ * exports are not supported in subtransactions.
+ */
+
+ /* Exit from parallel mode, if necessary. */
+ if (IsInParallelMode())
+ {
+ AtEOSubXact_Parallel(false, s->subTransactionId);
+ s->parallelModeLevel = 0;
+ }
+
+ /*
+ * We can skip all this stuff if the subxact failed before creating a
+ * ResourceOwner...
+ */
+ if (s->curTransactionOwner)
+ {
+ AfterTriggerEndSubXact(false);
+ AtSubAbort_Portals(s->subTransactionId,
+ s->parent->subTransactionId,
+ s->curTransactionOwner,
+ s->parent->curTransactionOwner);
+ AtEOSubXact_LargeObject(false, s->subTransactionId,
+ s->parent->subTransactionId);
+ AtSubAbort_Notify();
+
+ /* Advertise the fact that we aborted in pg_xact. */
+ (void) RecordTransactionAbort(true);
+
+ /* Post-abort cleanup */
+ if (FullTransactionIdIsValid(s->fullTransactionId))
+ AtSubAbort_childXids();
+
+ CallSubXactCallbacks(SUBXACT_EVENT_ABORT_SUB, s->subTransactionId,
+ s->parent->subTransactionId);
+
+ ResourceOwnerRelease(s->curTransactionOwner,
+ RESOURCE_RELEASE_BEFORE_LOCKS,
+ false, false);
+ AtEOSubXact_RelationCache(false, s->subTransactionId,
+ s->parent->subTransactionId);
+ AtEOSubXact_Inval(false);
+ ResourceOwnerRelease(s->curTransactionOwner,
+ RESOURCE_RELEASE_LOCKS,
+ false, false);
+ ResourceOwnerRelease(s->curTransactionOwner,
+ RESOURCE_RELEASE_AFTER_LOCKS,
+ false, false);
+ AtSubAbort_smgr();
+
+ AtEOXact_GUC(false, s->gucNestLevel);
+ AtEOSubXact_SPI(false, s->subTransactionId);
+ AtEOSubXact_on_commit_actions(false, s->subTransactionId,
+ s->parent->subTransactionId);
+ AtEOSubXact_Namespace(false, s->subTransactionId,
+ s->parent->subTransactionId);
+ AtEOSubXact_Files(false, s->subTransactionId,
+ s->parent->subTransactionId);
+ AtEOSubXact_HashTables(false, s->nestingLevel);
+ AtEOSubXact_PgStat(false, s->nestingLevel);
+ AtSubAbort_Snapshot(s->nestingLevel);
+ }
+
+ /*
+ * Restore the upper transaction's read-only state, too. This should be
+ * redundant with GUC's cleanup but we may as well do it for consistency
+ * with the commit case.
+ */
+ XactReadOnly = s->prevXactReadOnly;
+
+ RESUME_INTERRUPTS();
+}
+
+/*
+ * CleanupSubTransaction
+ *
+ * The caller has to make sure to always reassign CurrentTransactionState
+ * if it has a local pointer to it after calling this function.
+ */
+static void
+CleanupSubTransaction(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ ShowTransactionState("CleanupSubTransaction");
+
+ if (s->state != TRANS_ABORT)
+ elog(WARNING, "CleanupSubTransaction while in %s state",
+ TransStateAsString(s->state));
+
+ AtSubCleanup_Portals(s->subTransactionId);
+
+ CurrentResourceOwner = s->parent->curTransactionOwner;
+ CurTransactionResourceOwner = s->parent->curTransactionOwner;
+ if (s->curTransactionOwner)
+ ResourceOwnerDelete(s->curTransactionOwner);
+ s->curTransactionOwner = NULL;
+
+ AtSubCleanup_Memory();
+
+ s->state = TRANS_DEFAULT;
+
+ PopTransaction();
+}
+
+/*
+ * PushTransaction
+ * Create transaction state stack entry for a subtransaction
+ *
+ * The caller has to make sure to always reassign CurrentTransactionState
+ * if it has a local pointer to it after calling this function.
+ */
+static void
+PushTransaction(void)
+{
+ TransactionState p = CurrentTransactionState;
+ TransactionState s;
+
+ /*
+ * We keep subtransaction state nodes in TopTransactionContext.
+ */
+ s = (TransactionState)
+ MemoryContextAllocZero(TopTransactionContext,
+ sizeof(TransactionStateData));
+
+ /*
+ * Assign a subtransaction ID, watching out for counter wraparound.
+ */
+ currentSubTransactionId += 1;
+ if (currentSubTransactionId == InvalidSubTransactionId)
+ {
+ currentSubTransactionId -= 1;
+ pfree(s);
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot have more than 2^32-1 subtransactions in a transaction")));
+ }
+
+ /*
+ * We can now stack a minimally valid subtransaction without fear of
+ * failure.
+ */
+ s->fullTransactionId = InvalidFullTransactionId; /* until assigned */
+ s->subTransactionId = currentSubTransactionId;
+ s->parent = p;
+ s->nestingLevel = p->nestingLevel + 1;
+ s->gucNestLevel = NewGUCNestLevel();
+ s->savepointLevel = p->savepointLevel;
+ s->state = TRANS_DEFAULT;
+ s->blockState = TBLOCK_SUBBEGIN;
+ GetUserIdAndSecContext(&s->prevUser, &s->prevSecContext);
+ s->prevXactReadOnly = XactReadOnly;
+ s->parallelModeLevel = 0;
+ s->topXidLogged = false;
+
+ CurrentTransactionState = s;
+
+ /*
+ * AbortSubTransaction and CleanupSubTransaction have to be able to cope
+ * with the subtransaction from here on out; in particular they should not
+ * assume that it necessarily has a transaction context, resource owner,
+ * or XID.
+ */
+}
+
+/*
+ * PopTransaction
+ * Pop back to parent transaction state
+ *
+ * The caller has to make sure to always reassign CurrentTransactionState
+ * if it has a local pointer to it after calling this function.
+ */
+static void
+PopTransaction(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ if (s->state != TRANS_DEFAULT)
+ elog(WARNING, "PopTransaction while in %s state",
+ TransStateAsString(s->state));
+
+ if (s->parent == NULL)
+ elog(FATAL, "PopTransaction with no parent");
+
+ CurrentTransactionState = s->parent;
+
+ /* Let's just make sure CurTransactionContext is good */
+ CurTransactionContext = s->parent->curTransactionContext;
+ MemoryContextSwitchTo(CurTransactionContext);
+
+ /* Ditto for ResourceOwner links */
+ CurTransactionResourceOwner = s->parent->curTransactionOwner;
+ CurrentResourceOwner = s->parent->curTransactionOwner;
+
+ /* Free the old child structure */
+ if (s->name)
+ pfree(s->name);
+ pfree(s);
+}
+
+/*
+ * EstimateTransactionStateSpace
+ * Estimate the amount of space that will be needed by
+ * SerializeTransactionState. It would be OK to overestimate slightly,
+ * but it's simple for us to work out the precise value, so we do.
+ */
+Size
+EstimateTransactionStateSpace(void)
+{
+ TransactionState s;
+ Size nxids = 0;
+ Size size = SerializedTransactionStateHeaderSize;
+
+ for (s = CurrentTransactionState; s != NULL; s = s->parent)
+ {
+ if (FullTransactionIdIsValid(s->fullTransactionId))
+ nxids = add_size(nxids, 1);
+ nxids = add_size(nxids, s->nChildXids);
+ }
+
+ return add_size(size, mul_size(sizeof(TransactionId), nxids));
+}
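+
+/*
+ * Worked example: a top-level transaction with an assigned XID, one open
+ * subtransaction with an XID, and two committed child XIDs recorded at the
+ * top level gives nxids = 1 + 1 + 2 = 4, hence an estimate of
+ * SerializedTransactionStateHeaderSize + 4 * sizeof(TransactionId).
+ */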
+
+/*
+ * SerializeTransactionState
+ * Write out relevant details of our transaction state that will be
+ * needed by a parallel worker.
+ *
+ * We need to save and restore XactDeferrable, XactIsoLevel, and the XIDs
+ * associated with this transaction. These are serialized into a
+ * caller-supplied buffer big enough to hold the number of bytes reported by
+ * EstimateTransactionStateSpace(). We emit the XIDs in sorted order for the
+ * convenience of the receiving process.
+ */
+void
+SerializeTransactionState(Size maxsize, char *start_address)
+{
+ TransactionState s;
+ Size nxids = 0;
+ Size i = 0;
+ TransactionId *workspace;
+ SerializedTransactionState *result;
+
+ result = (SerializedTransactionState *) start_address;
+
+ result->xactIsoLevel = XactIsoLevel;
+ result->xactDeferrable = XactDeferrable;
+ result->topFullTransactionId = XactTopFullTransactionId;
+ result->currentFullTransactionId =
+ CurrentTransactionState->fullTransactionId;
+ result->currentCommandId = currentCommandId;
+
+ /*
+ * If we're running in a parallel worker and launching a parallel worker
+ * of our own, we can just pass along the information that was passed to
+ * us.
+ */
+ if (nParallelCurrentXids > 0)
+ {
+ result->nParallelCurrentXids = nParallelCurrentXids;
+ memcpy(&result->parallelCurrentXids[0], ParallelCurrentXids,
+ nParallelCurrentXids * sizeof(TransactionId));
+ return;
+ }
+
+ /*
+ * OK, we need to generate a sorted list of XIDs that our workers should
+ * view as current. First, figure out how many there are.
+ */
+ for (s = CurrentTransactionState; s != NULL; s = s->parent)
+ {
+ if (FullTransactionIdIsValid(s->fullTransactionId))
+ nxids = add_size(nxids, 1);
+ nxids = add_size(nxids, s->nChildXids);
+ }
+ Assert(SerializedTransactionStateHeaderSize + nxids * sizeof(TransactionId)
+ <= maxsize);
+
+ /* Copy them to our scratch space. */
+ workspace = palloc(nxids * sizeof(TransactionId));
+ for (s = CurrentTransactionState; s != NULL; s = s->parent)
+ {
+ if (FullTransactionIdIsValid(s->fullTransactionId))
+ workspace[i++] = XidFromFullTransactionId(s->fullTransactionId);
+ if (s->nChildXids > 0)
+ memcpy(&workspace[i], s->childXids,
+ s->nChildXids * sizeof(TransactionId));
+ i += s->nChildXids;
+ }
+ Assert(i == nxids);
+
+ /* Sort them. */
+ qsort(workspace, nxids, sizeof(TransactionId), xidComparator);
+
+ /* Copy data into output area. */
+ result->nParallelCurrentXids = nxids;
+ memcpy(&result->parallelCurrentXids[0], workspace,
+ nxids * sizeof(TransactionId));
+}
+
+/*
+ * StartParallelWorkerTransaction
+ * Start a parallel worker transaction, restoring the relevant
+ * transaction state serialized by SerializeTransactionState.
+ */
+void
+StartParallelWorkerTransaction(char *tstatespace)
+{
+ SerializedTransactionState *tstate;
+
+ Assert(CurrentTransactionState->blockState == TBLOCK_DEFAULT);
+ StartTransaction();
+
+ tstate = (SerializedTransactionState *) tstatespace;
+ XactIsoLevel = tstate->xactIsoLevel;
+ XactDeferrable = tstate->xactDeferrable;
+ XactTopFullTransactionId = tstate->topFullTransactionId;
+ CurrentTransactionState->fullTransactionId =
+ tstate->currentFullTransactionId;
+ currentCommandId = tstate->currentCommandId;
+ nParallelCurrentXids = tstate->nParallelCurrentXids;
+ ParallelCurrentXids = &tstate->parallelCurrentXids[0];
+
+ CurrentTransactionState->blockState = TBLOCK_PARALLEL_INPROGRESS;
+}
+
+/*
+ * EndParallelWorkerTransaction
+ * End a parallel worker transaction.
+ */
+void
+EndParallelWorkerTransaction(void)
+{
+ Assert(CurrentTransactionState->blockState == TBLOCK_PARALLEL_INPROGRESS);
+ CommitTransaction();
+ CurrentTransactionState->blockState = TBLOCK_DEFAULT;
+}
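+
+/*
+ * Illustrative sketch of the leader/worker handoff built on the functions
+ * above (simplified; see parallel.c for the real sequence):
+ *
+ * In the leader:
+ *		Size	tstatelen = EstimateTransactionStateSpace();
+ *		char   *tstatespace = shm_toc_allocate(pcxt->toc, tstatelen);
+ *
+ *		SerializeTransactionState(tstatelen, tstatespace);
+ *
+ * In each worker, before doing any work:
+ *		StartParallelWorkerTransaction(tstatespace);
+ *		...
+ *		EndParallelWorkerTransaction();
+ */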
+
+/*
+ * ShowTransactionState
+ * Debug support
+ */
+static void
+ShowTransactionState(const char *str)
+{
+ /* skip work if message will definitely not be printed */
+ if (message_level_is_interesting(DEBUG5))
+ ShowTransactionStateRec(str, CurrentTransactionState);
+}
+
+/*
+ * ShowTransactionStateRec
+ * Recursive subroutine for ShowTransactionState
+ */
+static void
+ShowTransactionStateRec(const char *str, TransactionState s)
+{
+ StringInfoData buf;
+
+ initStringInfo(&buf);
+
+ if (s->nChildXids > 0)
+ {
+ int i;
+
+ appendStringInfo(&buf, ", children: %u", s->childXids[0]);
+ for (i = 1; i < s->nChildXids; i++)
+ appendStringInfo(&buf, " %u", s->childXids[i]);
+ }
+
+ if (s->parent)
+ ShowTransactionStateRec(str, s->parent);
+
+ ereport(DEBUG5,
+ (errmsg_internal("%s(%d) name: %s; blockState: %s; state: %s, xid/subid/cid: %u/%u/%u%s%s",
+ str, s->nestingLevel,
+ PointerIsValid(s->name) ? s->name : "unnamed",
+ BlockStateAsString(s->blockState),
+ TransStateAsString(s->state),
+ (unsigned int) XidFromFullTransactionId(s->fullTransactionId),
+ (unsigned int) s->subTransactionId,
+ (unsigned int) currentCommandId,
+ currentCommandIdUsed ? " (used)" : "",
+ buf.data)));
+
+ pfree(buf.data);
+}
+
+/*
+ * BlockStateAsString
+ * Debug support
+ */
+static const char *
+BlockStateAsString(TBlockState blockState)
+{
+ switch (blockState)
+ {
+ case TBLOCK_DEFAULT:
+ return "DEFAULT";
+ case TBLOCK_STARTED:
+ return "STARTED";
+ case TBLOCK_BEGIN:
+ return "BEGIN";
+ case TBLOCK_INPROGRESS:
+ return "INPROGRESS";
+ case TBLOCK_IMPLICIT_INPROGRESS:
+ return "IMPLICIT_INPROGRESS";
+ case TBLOCK_PARALLEL_INPROGRESS:
+ return "PARALLEL_INPROGRESS";
+ case TBLOCK_END:
+ return "END";
+ case TBLOCK_ABORT:
+ return "ABORT";
+ case TBLOCK_ABORT_END:
+ return "ABORT_END";
+ case TBLOCK_ABORT_PENDING:
+ return "ABORT_PENDING";
+ case TBLOCK_PREPARE:
+ return "PREPARE";
+ case TBLOCK_SUBBEGIN:
+ return "SUBBEGIN";
+ case TBLOCK_SUBINPROGRESS:
+ return "SUBINPROGRESS";
+ case TBLOCK_SUBRELEASE:
+ return "SUBRELEASE";
+ case TBLOCK_SUBCOMMIT:
+ return "SUBCOMMIT";
+ case TBLOCK_SUBABORT:
+ return "SUBABORT";
+ case TBLOCK_SUBABORT_END:
+ return "SUBABORT_END";
+ case TBLOCK_SUBABORT_PENDING:
+ return "SUBABORT_PENDING";
+ case TBLOCK_SUBRESTART:
+ return "SUBRESTART";
+ case TBLOCK_SUBABORT_RESTART:
+ return "SUBABORT_RESTART";
+ }
+ return "UNRECOGNIZED";
+}
+
+/*
+ * TransStateAsString
+ * Debug support
+ */
+static const char *
+TransStateAsString(TransState state)
+{
+ switch (state)
+ {
+ case TRANS_DEFAULT:
+ return "DEFAULT";
+ case TRANS_START:
+ return "START";
+ case TRANS_INPROGRESS:
+ return "INPROGRESS";
+ case TRANS_COMMIT:
+ return "COMMIT";
+ case TRANS_ABORT:
+ return "ABORT";
+ case TRANS_PREPARE:
+ return "PREPARE";
+ }
+ return "UNRECOGNIZED";
+}
+
+/*
+ * xactGetCommittedChildren
+ *
+ * Gets the list of committed children of the current transaction. The return
+ * value is the number of child transactions. *ptr is set to point to an
+ * array of TransactionIds. The array is allocated in TopTransactionContext;
+ * the caller should *not* pfree() it (this is a change from pre-8.4 code!).
+ * If there are no subxacts, *ptr is set to NULL.
+ */
+int
+xactGetCommittedChildren(TransactionId **ptr)
+{
+ TransactionState s = CurrentTransactionState;
+
+ if (s->nChildXids == 0)
+ *ptr = NULL;
+ else
+ *ptr = s->childXids;
+
+ return s->nChildXids;
+}
+
+/*
+ * XLOG support routines
+ */
+
+
+/*
+ * Log the commit record for a plain or twophase transaction commit.
+ *
+ * A 2pc commit will be emitted when twophase_xid is valid, a plain one
+ * otherwise.
+ */
+XLogRecPtr
+XactLogCommitRecord(TimestampTz commit_time,
+ int nsubxacts, TransactionId *subxacts,
+ int nrels, RelFileNode *rels,
+ int ndroppedstats, xl_xact_stats_item *droppedstats,
+ int nmsgs, SharedInvalidationMessage *msgs,
+ bool relcacheInval,
+ int xactflags, TransactionId twophase_xid,
+ const char *twophase_gid)
+{
+ xl_xact_commit xlrec;
+ xl_xact_xinfo xl_xinfo;
+ xl_xact_dbinfo xl_dbinfo;
+ xl_xact_subxacts xl_subxacts;
+ xl_xact_relfilenodes xl_relfilenodes;
+ xl_xact_stats_items xl_dropped_stats;
+ xl_xact_invals xl_invals;
+ xl_xact_twophase xl_twophase;
+ xl_xact_origin xl_origin;
+ uint8 info;
+
+ Assert(CritSectionCount > 0);
+
+ xl_xinfo.xinfo = 0;
+
+ /* decide between a plain and 2pc commit */
+ if (!TransactionIdIsValid(twophase_xid))
+ info = XLOG_XACT_COMMIT;
+ else
+ info = XLOG_XACT_COMMIT_PREPARED;
+
+ /* First figure out and collect all the information needed */
+
+ xlrec.xact_time = commit_time;
+
+ if (relcacheInval)
+ xl_xinfo.xinfo |= XACT_COMPLETION_UPDATE_RELCACHE_FILE;
+ if (forceSyncCommit)
+ xl_xinfo.xinfo |= XACT_COMPLETION_FORCE_SYNC_COMMIT;
+ if ((xactflags & XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK))
+ xl_xinfo.xinfo |= XACT_XINFO_HAS_AE_LOCKS;
+
+ /*
+ * Check if the caller would like to ask standbys for immediate feedback
+ * once this commit is applied.
+ */
+ if (synchronous_commit >= SYNCHRONOUS_COMMIT_REMOTE_APPLY)
+ xl_xinfo.xinfo |= XACT_COMPLETION_APPLY_FEEDBACK;
+
+ /*
+	 * Relcache invalidations require information about the current database,
+ * and so does logical decoding.
+ */
+ if (nmsgs > 0 || XLogLogicalInfoActive())
+ {
+ xl_xinfo.xinfo |= XACT_XINFO_HAS_DBINFO;
+ xl_dbinfo.dbId = MyDatabaseId;
+ xl_dbinfo.tsId = MyDatabaseTableSpace;
+ }
+
+ if (nsubxacts > 0)
+ {
+ xl_xinfo.xinfo |= XACT_XINFO_HAS_SUBXACTS;
+ xl_subxacts.nsubxacts = nsubxacts;
+ }
+
+ if (nrels > 0)
+ {
+ xl_xinfo.xinfo |= XACT_XINFO_HAS_RELFILENODES;
+ xl_relfilenodes.nrels = nrels;
+ info |= XLR_SPECIAL_REL_UPDATE;
+ }
+
+ if (ndroppedstats > 0)
+ {
+ xl_xinfo.xinfo |= XACT_XINFO_HAS_DROPPED_STATS;
+ xl_dropped_stats.nitems = ndroppedstats;
+ }
+
+ if (nmsgs > 0)
+ {
+ xl_xinfo.xinfo |= XACT_XINFO_HAS_INVALS;
+ xl_invals.nmsgs = nmsgs;
+ }
+
+ if (TransactionIdIsValid(twophase_xid))
+ {
+ xl_xinfo.xinfo |= XACT_XINFO_HAS_TWOPHASE;
+ xl_twophase.xid = twophase_xid;
+ Assert(twophase_gid != NULL);
+
+ if (XLogLogicalInfoActive())
+ xl_xinfo.xinfo |= XACT_XINFO_HAS_GID;
+ }
+
+ /* dump transaction origin information */
+ if (replorigin_session_origin != InvalidRepOriginId)
+ {
+ xl_xinfo.xinfo |= XACT_XINFO_HAS_ORIGIN;
+
+ xl_origin.origin_lsn = replorigin_session_origin_lsn;
+ xl_origin.origin_timestamp = replorigin_session_origin_timestamp;
+ }
+
+ if (xl_xinfo.xinfo != 0)
+ info |= XLOG_XACT_HAS_INFO;
+
+ /* Then include all the collected data into the commit record. */
+
+ XLogBeginInsert();
+
+ XLogRegisterData((char *) (&xlrec), sizeof(xl_xact_commit));
+
+ if (xl_xinfo.xinfo != 0)
+ XLogRegisterData((char *) (&xl_xinfo.xinfo), sizeof(xl_xinfo.xinfo));
+
+ if (xl_xinfo.xinfo & XACT_XINFO_HAS_DBINFO)
+ XLogRegisterData((char *) (&xl_dbinfo), sizeof(xl_dbinfo));
+
+ if (xl_xinfo.xinfo & XACT_XINFO_HAS_SUBXACTS)
+ {
+ XLogRegisterData((char *) (&xl_subxacts),
+ MinSizeOfXactSubxacts);
+ XLogRegisterData((char *) subxacts,
+ nsubxacts * sizeof(TransactionId));
+ }
+
+ if (xl_xinfo.xinfo & XACT_XINFO_HAS_RELFILENODES)
+ {
+ XLogRegisterData((char *) (&xl_relfilenodes),
+ MinSizeOfXactRelfilenodes);
+ XLogRegisterData((char *) rels,
+ nrels * sizeof(RelFileNode));
+ }
+
+ if (xl_xinfo.xinfo & XACT_XINFO_HAS_DROPPED_STATS)
+ {
+ XLogRegisterData((char *) (&xl_dropped_stats),
+ MinSizeOfXactStatsItems);
+ XLogRegisterData((char *) droppedstats,
+ ndroppedstats * sizeof(xl_xact_stats_item));
+ }
+
+ if (xl_xinfo.xinfo & XACT_XINFO_HAS_INVALS)
+ {
+ XLogRegisterData((char *) (&xl_invals), MinSizeOfXactInvals);
+ XLogRegisterData((char *) msgs,
+ nmsgs * sizeof(SharedInvalidationMessage));
+ }
+
+ if (xl_xinfo.xinfo & XACT_XINFO_HAS_TWOPHASE)
+ {
+ XLogRegisterData((char *) (&xl_twophase), sizeof(xl_xact_twophase));
+ if (xl_xinfo.xinfo & XACT_XINFO_HAS_GID)
+ XLogRegisterData(unconstify(char *, twophase_gid), strlen(twophase_gid) + 1);
+ }
+
+ if (xl_xinfo.xinfo & XACT_XINFO_HAS_ORIGIN)
+ XLogRegisterData((char *) (&xl_origin), sizeof(xl_xact_origin));
+
+	/* Include the replication origin; logical decoding filters xacts by it */
+ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
+
+ return XLogInsert(RM_XACT_ID, info);
+}
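+
+/*
+ * Illustrative example: a commit carrying two subtransactions and three
+ * invalidation messages, with logical decoding enabled, ends up with
+ * info = XLOG_XACT_COMMIT | XLOG_XACT_HAS_INFO and
+ * xinfo = XACT_XINFO_HAS_DBINFO | XACT_XINFO_HAS_SUBXACTS |
+ * XACT_XINFO_HAS_INVALS, and the record body contains, in registration
+ * order: xlrec, xinfo, dbinfo, subxacts, invals.
+ */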
+
+/*
+ * Log the abort record for a plain or twophase transaction abort.
+ *
+ * A 2pc abort will be emitted when twophase_xid is valid, a plain one
+ * otherwise.
+ */
+XLogRecPtr
+XactLogAbortRecord(TimestampTz abort_time,
+ int nsubxacts, TransactionId *subxacts,
+ int nrels, RelFileNode *rels,
+ int ndroppedstats, xl_xact_stats_item *droppedstats,
+ int xactflags, TransactionId twophase_xid,
+ const char *twophase_gid)
+{
+ xl_xact_abort xlrec;
+ xl_xact_xinfo xl_xinfo;
+ xl_xact_subxacts xl_subxacts;
+ xl_xact_relfilenodes xl_relfilenodes;
+ xl_xact_stats_items xl_dropped_stats;
+ xl_xact_twophase xl_twophase;
+ xl_xact_dbinfo xl_dbinfo;
+ xl_xact_origin xl_origin;
+
+ uint8 info;
+
+ Assert(CritSectionCount > 0);
+
+ xl_xinfo.xinfo = 0;
+
+ /* decide between a plain and 2pc abort */
+ if (!TransactionIdIsValid(twophase_xid))
+ info = XLOG_XACT_ABORT;
+ else
+ info = XLOG_XACT_ABORT_PREPARED;
+
+
+ /* First figure out and collect all the information needed */
+
+ xlrec.xact_time = abort_time;
+
+ if ((xactflags & XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK))
+ xl_xinfo.xinfo |= XACT_XINFO_HAS_AE_LOCKS;
+
+ if (nsubxacts > 0)
+ {
+ xl_xinfo.xinfo |= XACT_XINFO_HAS_SUBXACTS;
+ xl_subxacts.nsubxacts = nsubxacts;
+ }
+
+ if (nrels > 0)
+ {
+ xl_xinfo.xinfo |= XACT_XINFO_HAS_RELFILENODES;
+ xl_relfilenodes.nrels = nrels;
+ info |= XLR_SPECIAL_REL_UPDATE;
+ }
+
+ if (ndroppedstats > 0)
+ {
+ xl_xinfo.xinfo |= XACT_XINFO_HAS_DROPPED_STATS;
+ xl_dropped_stats.nitems = ndroppedstats;
+ }
+
+ if (TransactionIdIsValid(twophase_xid))
+ {
+ xl_xinfo.xinfo |= XACT_XINFO_HAS_TWOPHASE;
+ xl_twophase.xid = twophase_xid;
+ Assert(twophase_gid != NULL);
+
+ if (XLogLogicalInfoActive())
+ xl_xinfo.xinfo |= XACT_XINFO_HAS_GID;
+ }
+
+ if (TransactionIdIsValid(twophase_xid) && XLogLogicalInfoActive())
+ {
+ xl_xinfo.xinfo |= XACT_XINFO_HAS_DBINFO;
+ xl_dbinfo.dbId = MyDatabaseId;
+ xl_dbinfo.tsId = MyDatabaseTableSpace;
+ }
+
+ /*
+	 * Dump transaction origin information, but only when aborting a prepared
+	 * transaction.  We need it during recovery to update the replication
+	 * origin progress.
+ */
+ if ((replorigin_session_origin != InvalidRepOriginId) &&
+ TransactionIdIsValid(twophase_xid))
+ {
+ xl_xinfo.xinfo |= XACT_XINFO_HAS_ORIGIN;
+
+ xl_origin.origin_lsn = replorigin_session_origin_lsn;
+ xl_origin.origin_timestamp = replorigin_session_origin_timestamp;
+ }
+
+ if (xl_xinfo.xinfo != 0)
+ info |= XLOG_XACT_HAS_INFO;
+
+ /* Then include all the collected data into the abort record. */
+
+ XLogBeginInsert();
+
+ XLogRegisterData((char *) (&xlrec), MinSizeOfXactAbort);
+
+ if (xl_xinfo.xinfo != 0)
+ XLogRegisterData((char *) (&xl_xinfo), sizeof(xl_xinfo));
+
+ if (xl_xinfo.xinfo & XACT_XINFO_HAS_DBINFO)
+ XLogRegisterData((char *) (&xl_dbinfo), sizeof(xl_dbinfo));
+
+ if (xl_xinfo.xinfo & XACT_XINFO_HAS_SUBXACTS)
+ {
+ XLogRegisterData((char *) (&xl_subxacts),
+ MinSizeOfXactSubxacts);
+ XLogRegisterData((char *) subxacts,
+ nsubxacts * sizeof(TransactionId));
+ }
+
+ if (xl_xinfo.xinfo & XACT_XINFO_HAS_RELFILENODES)
+ {
+ XLogRegisterData((char *) (&xl_relfilenodes),
+ MinSizeOfXactRelfilenodes);
+ XLogRegisterData((char *) rels,
+ nrels * sizeof(RelFileNode));
+ }
+
+ if (xl_xinfo.xinfo & XACT_XINFO_HAS_DROPPED_STATS)
+ {
+ XLogRegisterData((char *) (&xl_dropped_stats),
+ MinSizeOfXactStatsItems);
+ XLogRegisterData((char *) droppedstats,
+ ndroppedstats * sizeof(xl_xact_stats_item));
+ }
+
+ if (xl_xinfo.xinfo & XACT_XINFO_HAS_TWOPHASE)
+ {
+ XLogRegisterData((char *) (&xl_twophase), sizeof(xl_xact_twophase));
+ if (xl_xinfo.xinfo & XACT_XINFO_HAS_GID)
+ XLogRegisterData(unconstify(char *, twophase_gid), strlen(twophase_gid) + 1);
+ }
+
+ if (xl_xinfo.xinfo & XACT_XINFO_HAS_ORIGIN)
+ XLogRegisterData((char *) (&xl_origin), sizeof(xl_xact_origin));
+
+ if (TransactionIdIsValid(twophase_xid))
+ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
+
+ return XLogInsert(RM_XACT_ID, info);
+}
+
+/*
+ * Before 9.0 this was a fairly short function, but now it performs many
+ * actions for which the order of execution is critical.
+ */
+static void
+xact_redo_commit(xl_xact_parsed_commit *parsed,
+ TransactionId xid,
+ XLogRecPtr lsn,
+ RepOriginId origin_id)
+{
+ TransactionId max_xid;
+ TimestampTz commit_time;
+
+ Assert(TransactionIdIsValid(xid));
+
+ max_xid = TransactionIdLatest(xid, parsed->nsubxacts, parsed->subxacts);
+
+ /* Make sure nextXid is beyond any XID mentioned in the record. */
+ AdvanceNextFullTransactionIdPastXid(max_xid);
+
+ Assert(((parsed->xinfo & XACT_XINFO_HAS_ORIGIN) == 0) ==
+ (origin_id == InvalidRepOriginId));
+
+ if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN)
+ commit_time = parsed->origin_timestamp;
+ else
+ commit_time = parsed->xact_time;
+
+ /* Set the transaction commit timestamp and metadata */
+ TransactionTreeSetCommitTsData(xid, parsed->nsubxacts, parsed->subxacts,
+ commit_time, origin_id);
+
+ if (standbyState == STANDBY_DISABLED)
+ {
+ /*
+ * Mark the transaction committed in pg_xact.
+ */
+ TransactionIdCommitTree(xid, parsed->nsubxacts, parsed->subxacts);
+ }
+ else
+ {
+ /*
+ * If a transaction completion record arrives that has as-yet
+ * unobserved subtransactions then this will not have been fully
+ * handled by the call to RecordKnownAssignedTransactionIds() in the
+ * main recovery loop in xlog.c. So we need to do bookkeeping again to
+ * cover that case. This is confusing and it is easy to think this
+ * call is irrelevant, which has happened three times in development
+ * already. Leave it in.
+ */
+ RecordKnownAssignedTransactionIds(max_xid);
+
+ /*
+ * Mark the transaction committed in pg_xact. We use async commit
+ * protocol during recovery to provide information on database
+ * consistency for when users try to set hint bits. It is important
+ * that we do not set hint bits until the minRecoveryPoint is past
+ * this commit record. This ensures that if we crash we don't see hint
+ * bits set on changes made by transactions that haven't yet
+ * recovered. It's unlikely but it's good to be safe.
+ */
+ TransactionIdAsyncCommitTree(xid, parsed->nsubxacts, parsed->subxacts, lsn);
+
+ /*
+ * We must mark clog before we update the ProcArray.
+ */
+ ExpireTreeKnownAssignedTransactionIds(xid, parsed->nsubxacts, parsed->subxacts, max_xid);
+
+ /*
+ * Send any cache invalidations attached to the commit. We must
+		 * maintain the same order, invalidation followed by lock release,
+		 * as occurs in CommitTransaction().
+ */
+ ProcessCommittedInvalidationMessages(parsed->msgs, parsed->nmsgs,
+ XactCompletionRelcacheInitFileInval(parsed->xinfo),
+ parsed->dbId, parsed->tsId);
+
+ /*
+		 * Release locks, if any. We do this for both two-phase and normal
+		 * one-phase transactions. In effect we are ignoring the prepare phase
+		 * and just going straight to lock release.
+ */
+ if (parsed->xinfo & XACT_XINFO_HAS_AE_LOCKS)
+ StandbyReleaseLockTree(xid, parsed->nsubxacts, parsed->subxacts);
+ }
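+
+	/*
+	 * To recap, the standby-mode branch above must run in this order: mark
+	 * the transaction in pg_xact, update the ProcArray, send invalidation
+	 * messages, and only then release locks, mirroring CommitTransaction().
+	 */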
+
+ if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN)
+ {
+ /* recover apply progress */
+ replorigin_advance(origin_id, parsed->origin_lsn, lsn,
+ false /* backward */ , false /* WAL */ );
+ }
+
+ /* Make sure files supposed to be dropped are dropped */
+ if (parsed->nrels > 0)
+ {
+ /*
+ * First update minimum recovery point to cover this WAL record. Once
+ * a relation is deleted, there's no going back. The buffer manager
+ * enforces the WAL-first rule for normal updates to relation files,
+ * so that the minimum recovery point is always updated before the
+ * corresponding change in the data file is flushed to disk, but we
+ * have to do the same here since we're bypassing the buffer manager.
+ *
+ * Doing this before deleting the files means that if a deletion fails
+ * for some reason, you cannot start up the system even after restart,
+ * until you fix the underlying situation so that the deletion will
+ * succeed. Alternatively, we could update the minimum recovery point
+ * after deletion, but that would leave a small window where the
+ * WAL-first rule would be violated.
+ */
+ XLogFlush(lsn);
+
+ /* Make sure files supposed to be dropped are dropped */
+ DropRelationFiles(parsed->xnodes, parsed->nrels, true);
+ }
+
+ if (parsed->nstats > 0)
+ {
+ /* see equivalent call for relations above */
+ XLogFlush(lsn);
+
+ pgstat_execute_transactional_drops(parsed->nstats, parsed->stats, true);
+ }
+
+ /*
+ * We issue an XLogFlush() for the same reason we emit ForceSyncCommit()
+ * in normal operation. For example, in CREATE DATABASE, we copy all files
+ * from the template database, and then commit the transaction. If we
+ * crash after all the files have been copied but before the commit, you
+ * have files in the data directory without an entry in pg_database. To
+ * minimize the window for that, we use ForceSyncCommit() to rush the
+	 * commit record to disk as quickly as possible. We have the same window
+ * during recovery, and forcing an XLogFlush() (which updates
+ * minRecoveryPoint during recovery) helps to reduce that problem window,
+ * for any user that requested ForceSyncCommit().
+ */
+ if (XactCompletionForceSyncCommit(parsed->xinfo))
+ XLogFlush(lsn);
+
+ /*
+ * If asked by the primary (because someone is waiting for a synchronous
+ * commit = remote_apply), we will need to ask walreceiver to send a reply
+ * immediately.
+ */
+ if (XactCompletionApplyFeedback(parsed->xinfo))
+ XLogRequestWalReceiverReply();
+}
+
+/*
+ * Be careful with the order of execution, as with xact_redo_commit().
+ * The two functions are similar but differ in key places.
+ *
+ * Note also that an abort can be for a subtransaction and its children,
+ * not just for a top level abort. That means we have to consider
+ * topxid != xid, whereas in commit we would find topxid == xid always
+ * because subtransaction commit is never WAL logged.
+ */
+static void
+xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid,
+ XLogRecPtr lsn, RepOriginId origin_id)
+{
+ TransactionId max_xid;
+
+ Assert(TransactionIdIsValid(xid));
+
+ /* Make sure nextXid is beyond any XID mentioned in the record. */
+ max_xid = TransactionIdLatest(xid,
+ parsed->nsubxacts,
+ parsed->subxacts);
+ AdvanceNextFullTransactionIdPastXid(max_xid);
+
+ if (standbyState == STANDBY_DISABLED)
+ {
+ /* Mark the transaction aborted in pg_xact, no need for async stuff */
+ TransactionIdAbortTree(xid, parsed->nsubxacts, parsed->subxacts);
+ }
+ else
+ {
+ /*
+ * If a transaction completion record arrives that has as-yet
+ * unobserved subtransactions then this will not have been fully
+ * handled by the call to RecordKnownAssignedTransactionIds() in the
+ * main recovery loop in xlog.c. So we need to do bookkeeping again to
+ * cover that case. This is confusing and it is easy to think this
+ * call is irrelevant, which has happened three times in development
+ * already. Leave it in.
+ */
+ RecordKnownAssignedTransactionIds(max_xid);
+
+ /* Mark the transaction aborted in pg_xact, no need for async stuff */
+ TransactionIdAbortTree(xid, parsed->nsubxacts, parsed->subxacts);
+
+ /*
+ * We must update the ProcArray after we have marked clog.
+ */
+ ExpireTreeKnownAssignedTransactionIds(xid, parsed->nsubxacts, parsed->subxacts, max_xid);
+
+ /*
+ * There are no invalidation messages to send or undo.
+ */
+
+ /*
+ * Release locks, if any. There are no invalidations to send.
+ */
+ if (parsed->xinfo & XACT_XINFO_HAS_AE_LOCKS)
+ StandbyReleaseLockTree(xid, parsed->nsubxacts, parsed->subxacts);
+ }
+
+ if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN)
+ {
+ /* recover apply progress */
+ replorigin_advance(origin_id, parsed->origin_lsn, lsn,
+ false /* backward */ , false /* WAL */ );
+ }
+
+ /* Make sure files supposed to be dropped are dropped */
+ if (parsed->nrels > 0)
+ {
+ /*
+ * See comments about update of minimum recovery point on truncation,
+ * in xact_redo_commit().
+ */
+ XLogFlush(lsn);
+
+ DropRelationFiles(parsed->xnodes, parsed->nrels, true);
+ }
+
+ if (parsed->nstats > 0)
+ {
+ /* see equivalent call for relations above */
+ XLogFlush(lsn);
+
+ pgstat_execute_transactional_drops(parsed->nstats, parsed->stats, true);
+ }
+}
+
+void
+xact_redo(XLogReaderState *record)
+{
+ uint8 info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
+
+ /* Backup blocks are not used in xact records */
+ Assert(!XLogRecHasAnyBlockRefs(record));
+
+ if (info == XLOG_XACT_COMMIT)
+ {
+ xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
+ xl_xact_parsed_commit parsed;
+
+ ParseCommitRecord(XLogRecGetInfo(record), xlrec, &parsed);
+ xact_redo_commit(&parsed, XLogRecGetXid(record),
+ record->EndRecPtr, XLogRecGetOrigin(record));
+ }
+ else if (info == XLOG_XACT_COMMIT_PREPARED)
+ {
+ xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
+ xl_xact_parsed_commit parsed;
+
+ ParseCommitRecord(XLogRecGetInfo(record), xlrec, &parsed);
+ xact_redo_commit(&parsed, parsed.twophase_xid,
+ record->EndRecPtr, XLogRecGetOrigin(record));
+
+ /* Delete TwoPhaseState gxact entry and/or 2PC file. */
+ LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+ PrepareRedoRemove(parsed.twophase_xid, false);
+ LWLockRelease(TwoPhaseStateLock);
+ }
+ else if (info == XLOG_XACT_ABORT)
+ {
+ xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
+ xl_xact_parsed_abort parsed;
+
+ ParseAbortRecord(XLogRecGetInfo(record), xlrec, &parsed);
+ xact_redo_abort(&parsed, XLogRecGetXid(record),
+ record->EndRecPtr, XLogRecGetOrigin(record));
+ }
+ else if (info == XLOG_XACT_ABORT_PREPARED)
+ {
+ xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
+ xl_xact_parsed_abort parsed;
+
+ ParseAbortRecord(XLogRecGetInfo(record), xlrec, &parsed);
+ xact_redo_abort(&parsed, parsed.twophase_xid,
+ record->EndRecPtr, XLogRecGetOrigin(record));
+
+ /* Delete TwoPhaseState gxact entry and/or 2PC file. */
+ LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+ PrepareRedoRemove(parsed.twophase_xid, false);
+ LWLockRelease(TwoPhaseStateLock);
+ }
+ else if (info == XLOG_XACT_PREPARE)
+ {
+ /*
+ * Store xid and start/end pointers of the WAL record in TwoPhaseState
+ * gxact entry.
+ */
+ LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
+ PrepareRedoAdd(XLogRecGetData(record),
+ record->ReadRecPtr,
+ record->EndRecPtr,
+ XLogRecGetOrigin(record));
+ LWLockRelease(TwoPhaseStateLock);
+ }
+ else if (info == XLOG_XACT_ASSIGNMENT)
+ {
+ xl_xact_assignment *xlrec = (xl_xact_assignment *) XLogRecGetData(record);
+
+ if (standbyState >= STANDBY_INITIALIZED)
+ ProcArrayApplyXidAssignment(xlrec->xtop,
+ xlrec->nsubxacts, xlrec->xsub);
+ }
+ else if (info == XLOG_XACT_INVALIDATIONS)
+ {
+ /*
+		 * XXX we ignore this for now; what matters are the invalidations
+		 * written into the commit record.
+ */
+ }
+ else
+ elog(PANIC, "xact_redo: unknown op code %u", info);
+}
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
new file mode 100644
index 0000000..59f94b0
--- /dev/null
+++ b/src/backend/access/transam/xlog.c
@@ -0,0 +1,8906 @@
+/*-------------------------------------------------------------------------
+ *
+ * xlog.c
+ * PostgreSQL write-ahead log manager
+ *
+ * The Write-Ahead Log (WAL) functionality is split into several source
+ * files, in addition to this one:
+ *
+ * xloginsert.c - Functions for constructing WAL records
+ * xlogrecovery.c - WAL recovery and standby code
+ * xlogreader.c - Facility for reading WAL files and parsing WAL records
+ * xlogutils.c - Helper functions for WAL redo routines
+ *
+ * This file contains functions for coordinating database startup and
+ * checkpointing, and managing the write-ahead log buffers when the
+ * system is running.
+ *
+ * StartupXLOG() is the main entry point of the startup process. It
+ * coordinates database startup, performing WAL recovery, and the
+ * transition from WAL recovery into normal operations.
+ *
+ * XLogInsertRecord() inserts a WAL record into the WAL buffers. Most
+ * callers should not call this directly, but use the functions in
+ * xloginsert.c to construct the WAL record. XLogFlush() can be used
+ * to force the WAL to disk.
+ *
+ * In addition to those, there are many other functions for interrogating
+ * the current system state, and for starting/stopping backups.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/xlog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <ctype.h>
+#include <math.h>
+#include <time.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "access/clog.h"
+#include "access/commit_ts.h"
+#include "access/heaptoast.h"
+#include "access/multixact.h"
+#include "access/rewriteheap.h"
+#include "access/subtrans.h"
+#include "access/timeline.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/xact.h"
+#include "access/xlog_internal.h"
+#include "access/xlogarchive.h"
+#include "access/xloginsert.h"
+#include "access/xlogprefetcher.h"
+#include "access/xlogreader.h"
+#include "access/xlogrecovery.h"
+#include "access/xlogutils.h"
+#include "backup/basebackup.h"
+#include "catalog/catversion.h"
+#include "catalog/pg_control.h"
+#include "catalog/pg_database.h"
+#include "common/controldata_utils.h"
+#include "common/file_utils.h"
+#include "executor/instrument.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "port/atomics.h"
+#include "port/pg_iovec.h"
+#include "postmaster/bgwriter.h"
+#include "postmaster/startup.h"
+#include "postmaster/walwriter.h"
+#include "replication/logical.h"
+#include "replication/origin.h"
+#include "replication/slot.h"
+#include "replication/snapbuild.h"
+#include "replication/walreceiver.h"
+#include "replication/walsender.h"
+#include "storage/bufmgr.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/large_object.h"
+#include "storage/latch.h"
+#include "storage/pmsignal.h"
+#include "storage/predicate.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/reinit.h"
+#include "storage/smgr.h"
+#include "storage/spin.h"
+#include "storage/sync.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/ps_status.h"
+#include "utils/relmapper.h"
+#include "utils/pg_rusage.h"
+#include "utils/snapmgr.h"
+#include "utils/timeout.h"
+#include "utils/timestamp.h"
+
+extern uint32 bootstrap_data_checksum_version;
+
+/* timeline ID to be used when bootstrapping */
+#define BootstrapTimeLineID 1
+
+/* User-settable parameters */
+int max_wal_size_mb = 1024; /* 1 GB */
+int min_wal_size_mb = 80; /* 80 MB */
+int wal_keep_size_mb = 0;
+int XLOGbuffers = -1;
+int XLogArchiveTimeout = 0;
+int XLogArchiveMode = ARCHIVE_MODE_OFF;
+char *XLogArchiveCommand = NULL;
+bool EnableHotStandby = false;
+bool fullPageWrites = true;
+bool wal_log_hints = false;
+int wal_compression = WAL_COMPRESSION_NONE;
+char *wal_consistency_checking_string = NULL;
+bool *wal_consistency_checking = NULL;
+bool wal_init_zero = true;
+bool wal_recycle = true;
+bool log_checkpoints = true;
+int sync_method = DEFAULT_SYNC_METHOD;
+int wal_level = WAL_LEVEL_MINIMAL;
+int CommitDelay = 0; /* precommit delay in microseconds */
+int CommitSiblings = 5; /* # concurrent xacts needed to sleep */
+int wal_retrieve_retry_interval = 5000;
+int max_slot_wal_keep_size_mb = -1;
+int wal_decode_buffer_size = 512 * 1024;
+bool track_wal_io_timing = false;
+
+#ifdef WAL_DEBUG
+bool XLOG_DEBUG = false;
+#endif
+
+int wal_segment_size = DEFAULT_XLOG_SEG_SIZE;
+
+/*
+ * Number of WAL insertion locks to use. A higher value allows more insertions
+ * to happen concurrently, but adds some CPU overhead to flushing the WAL,
+ * which needs to iterate over all the locks.
+ */
+#define NUM_XLOGINSERT_LOCKS 8
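+
+/*
+ * For example, each backend starts with the lock at index
+ * MyProc->pgprocno % NUM_XLOGINSERT_LOCKS (see WALInsertLockAcquire below),
+ * so with the default of 8 locks concurrent inserters spread roughly evenly
+ * across the locks.
+ */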
+
+/*
+ * Max distance from last checkpoint, before triggering a new xlog-based
+ * checkpoint.
+ */
+int CheckPointSegments;
+
+/* Estimated distance between checkpoints, in bytes */
+static double CheckPointDistanceEstimate = 0;
+static double PrevCheckPointDistance = 0;
+
+/*
+ * GUC support
+ */
+const struct config_enum_entry sync_method_options[] = {
+ {"fsync", SYNC_METHOD_FSYNC, false},
+#ifdef HAVE_FSYNC_WRITETHROUGH
+ {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
+#endif
+#ifdef HAVE_FDATASYNC
+ {"fdatasync", SYNC_METHOD_FDATASYNC, false},
+#endif
+#ifdef OPEN_SYNC_FLAG
+ {"open_sync", SYNC_METHOD_OPEN, false},
+#endif
+#ifdef OPEN_DATASYNC_FLAG
+ {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
+#endif
+ {NULL, 0, false}
+};
+
+
+/*
+ * Although only "on", "off", and "always" are documented,
+ * we accept all the likely variants of "on" and "off".
+ */
+const struct config_enum_entry archive_mode_options[] = {
+ {"always", ARCHIVE_MODE_ALWAYS, false},
+ {"on", ARCHIVE_MODE_ON, false},
+ {"off", ARCHIVE_MODE_OFF, false},
+ {"true", ARCHIVE_MODE_ON, true},
+ {"false", ARCHIVE_MODE_OFF, true},
+ {"yes", ARCHIVE_MODE_ON, true},
+ {"no", ARCHIVE_MODE_OFF, true},
+ {"1", ARCHIVE_MODE_ON, true},
+ {"0", ARCHIVE_MODE_OFF, true},
+ {NULL, 0, false}
+};
+
+/*
+ * Statistics for current checkpoint are collected in this global struct.
+ * Because only the checkpointer or a stand-alone backend can perform
+ * checkpoints, this will be unused in normal backends.
+ */
+CheckpointStatsData CheckpointStats;
+
+/*
+ * During recovery, lastFullPageWrites keeps track of full_page_writes that
+ * the replayed WAL records indicate. It's initialized with full_page_writes
+ * that the recovery starting checkpoint record indicates, and then updated
+ * each time an XLOG_FPW_CHANGE record is replayed.
+ */
+static bool lastFullPageWrites;
+
+/*
+ * Local copy of the state tracked by SharedRecoveryState in shared memory.
+ * It is false if SharedRecoveryState is RECOVERY_STATE_DONE. True actually
+ * means "not known, need to check the shared state".
+ */
+static bool LocalRecoveryInProgress = true;
+
+/*
+ * Local state for XLogInsertAllowed():
+ * 1: unconditionally allowed to insert XLOG
+ * 0: unconditionally not allowed to insert XLOG
+ * -1: must check RecoveryInProgress(); disallow until it is false
+ * Most processes start with -1 and transition to 1 after seeing that recovery
+ * is not in progress. But we can also force the value for special cases.
+ * The coding in XLogInsertAllowed() depends on the first two of these states
+ * being numerically the same as bool true and false.
+ */
+static int LocalXLogInsertAllowed = -1;
+
+/*
+ * ProcLastRecPtr points to the start of the last XLOG record inserted by the
+ * current backend. It is updated for all inserts. XactLastRecEnd points to
+ * end+1 of the last record, and is reset when we end a top-level transaction,
+ * or start a new one; so it can be used to tell if the current transaction has
+ * created any XLOG records.
+ *
+ * While in parallel mode, this may not be fully up to date. When committing,
+ * a transaction can assume this covers all xlog records written either by the
+ * user backend or by any parallel worker which was present at any point during
+ * the transaction. But when aborting, or when still in parallel mode, other
+ * parallel backends may have written WAL records at later LSNs than the value
+ * stored here. The parallel leader advances its own copy, when necessary,
+ * in WaitForParallelWorkersToFinish.
+ */
+XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
+XLogRecPtr XactLastRecEnd = InvalidXLogRecPtr;
+XLogRecPtr XactLastCommitEnd = InvalidXLogRecPtr;
+
+/*
+ * RedoRecPtr is this backend's local copy of the REDO record pointer
+ * (which is almost but not quite the same as a pointer to the most recent
+ * CHECKPOINT record). We update this from the shared-memory copy,
+ * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
+ * hold an insertion lock). See XLogInsertRecord for details. We are also
+ * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
+ * see GetRedoRecPtr.
+ *
+ * NB: Code that uses this variable must be prepared not only for the
+ * possibility that it may be arbitrarily out of date, but also for the
+ * possibility that it might be set to InvalidXLogRecPtr. We used to
+ * initialize it as a side effect of the first call to RecoveryInProgress(),
+ * which meant that most code that might use it could assume that it had a
+ * real if perhaps stale value. That's no longer the case.
+ */
+static XLogRecPtr RedoRecPtr;
+
+/*
+ * doPageWrites is this backend's local copy of (forcePageWrites ||
+ * fullPageWrites). It is used together with RedoRecPtr to decide whether
+ * a full-page image of a page needs to be taken.
+ *
+ * NB: Initially this is false, and there's no guarantee that it will be
+ * initialized to any other value before it is first used. Any code that
+ * makes use of it must recheck the value after obtaining a WALInsertLock,
+ * and respond appropriately if it turns out that the previous value wasn't
+ * accurate.
+ */
+static bool doPageWrites;
+
+/*----------
+ * Shared-memory data structures for XLOG control
+ *
+ * LogwrtRqst indicates a byte position that we need to write and/or fsync
+ * the log up to (all records before that point must be written or fsynced).
+ * LogwrtResult indicates the byte positions we have already written/fsynced.
+ * These structs are identical but are declared separately to indicate their
+ * slightly different functions.
+ *
+ * To read XLogCtl->LogwrtResult, you must hold either info_lck or
+ * WALWriteLock. To update it, you need to hold both locks. The point of
+ * this arrangement is that the value can be examined by code that already
+ * holds WALWriteLock without needing to grab info_lck as well. In addition
+ * to the shared variable, each backend has a private copy of LogwrtResult,
+ * which is updated when convenient.
+ *
+ * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
+ * (protected by info_lck), but we don't need to cache any copies of it.
+ *
+ * info_lck is only held long enough to read/update the protected variables,
+ * so it's a plain spinlock. The other locks are held longer (potentially
+ * over I/O operations), so we use LWLocks for them. These locks are:
+ *
+ * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
+ * It is only held while initializing and changing the mapping. If the
+ * contents of the buffer being replaced haven't been written yet, the mapping
+ * lock is released while the write is done, and reacquired afterwards.
+ *
+ * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
+ * XLogFlush).
+ *
+ * ControlFileLock: must be held to read/update control file or create
+ * new log file.
+ *
+ *----------
+ */
+
+typedef struct XLogwrtRqst
+{
+ XLogRecPtr Write; /* last byte + 1 to write out */
+ XLogRecPtr Flush; /* last byte + 1 to flush */
+} XLogwrtRqst;
+
+typedef struct XLogwrtResult
+{
+ XLogRecPtr Write; /* last byte + 1 written out */
+ XLogRecPtr Flush; /* last byte + 1 flushed */
+} XLogwrtResult;
+
+/*
+ * Inserting to WAL is protected by a small fixed number of WAL insertion
+ * locks. To insert to the WAL, you must hold one of the locks - it doesn't
+ * matter which one. To lock out other concurrent insertions, you must hold
+ * all of them. Each WAL insertion lock consists of a lightweight lock, plus
+ * an indicator of how far the insertion has progressed (insertingAt).
+ *
+ * The insertingAt values are read when a process wants to flush WAL from
+ * the in-memory buffers to disk, to check that all the insertions to the
+ * region the process is about to write out have finished. You could simply
+ * wait for all currently in-progress insertions to finish, but the
+ * insertingAt indicator allows you to ignore insertions to later positions
+ * in the WAL, so that you only wait for the insertions that are modifying
+ * the buffers you're about to write out.
+ *
+ * This isn't just an optimization. If all the WAL buffers are dirty, an
+ * inserter that's holding a WAL insert lock might need to evict an old WAL
+ * buffer, which requires flushing the WAL. If it's possible for an inserter
+ * to block on another inserter unnecessarily, deadlock can arise when two
+ * inserters holding a WAL insert lock wait for each other to finish their
+ * insertion.
+ *
+ * Small WAL records that don't cross a page boundary never update the value;
+ * the WAL record is just copied to the page and the lock is released. But
+ * to avoid the deadlock-scenario explained above, the indicator is always
+ * updated before sleeping while holding an insertion lock.
+ *
+ * lastImportantAt contains the LSN of the last important WAL record inserted
+ * using a given lock. This value is used to detect if there has been
+ * important WAL activity since the last time some action, like a checkpoint,
+ * was performed, allowing the action to be skipped if there was none. The
+ * LSN is updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
+ * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
+ * records. Tracking the WAL activity directly in WALInsertLock has the
+ * advantage of not needing any additional locks to update the value.
+ */
+typedef struct
+{
+ LWLock lock;
+ XLogRecPtr insertingAt;
+ XLogRecPtr lastImportantAt;
+} WALInsertLock;
+
+/*
+ * All the WAL insertion locks are allocated as an array in shared memory. We
+ * force the array stride to be a power of 2, which saves a few cycles in
+ * indexing, but more importantly also ensures that individual slots don't
+ * cross cache line boundaries. (Of course, we have to also ensure that the
+ * array start address is suitably aligned.)
+ */
+typedef union WALInsertLockPadded
+{
+ WALInsertLock l;
+ char pad[PG_CACHE_LINE_SIZE];
+} WALInsertLockPadded;
+
+/*
+ * Session status of running backup, used for sanity checks in SQL-callable
+ * functions to start and stop backups.
+ */
+static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
+
+/*
+ * Shared state data for WAL insertion.
+ */
+typedef struct XLogCtlInsert
+{
+ slock_t insertpos_lck; /* protects CurrBytePos and PrevBytePos */
+
+ /*
+ * CurrBytePos is the end of reserved WAL. The next record will be
+ * inserted at that position. PrevBytePos is the start position of the
+ * previously inserted (or rather, reserved) record - it is copied to the
+ * prev-link of the next record. These are stored as "usable byte
+ * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
+ */
+ uint64 CurrBytePos;
+ uint64 PrevBytePos;
+
+ /*
+ * Make sure the above heavily-contended spinlock and byte positions are
+ * on their own cache line. In particular, the RedoRecPtr and full page
+ * write variables below should be on a different cache line. They are
+ * read on every WAL insertion, but updated rarely, and we don't want
+ * those reads to steal the cache line containing Curr/PrevBytePos.
+ */
+ char pad[PG_CACHE_LINE_SIZE];
+
+ /*
+ * fullPageWrites is the authoritative value used by all backends to
+	 * determine whether to write full-page images to WAL. This shared value,
+ * instead of the process-local fullPageWrites, is required because, when
+ * full_page_writes is changed by SIGHUP, we must WAL-log it before it
+	 * actually affects WAL-logging by backends. The checkpointer sets it at
+	 * startup or after SIGHUP.
+ *
+ * To read these fields, you must hold an insertion lock. To modify them,
+ * you must hold ALL the locks.
+ */
+ XLogRecPtr RedoRecPtr; /* current redo point for insertions */
+ bool forcePageWrites; /* forcing full-page writes for PITR? */
+ bool fullPageWrites;
+
+ /*
+ * runningBackups is a counter indicating the number of backups currently
+ * in progress. forcePageWrites is set to true when runningBackups is
+ * non-zero. lastBackupStart is the latest checkpoint redo location used
+ * as a starting point for an online backup.
+ */
+ int runningBackups;
+ XLogRecPtr lastBackupStart;
+
+ /*
+ * WAL insertion locks.
+ */
+ WALInsertLockPadded *WALInsertLocks;
+} XLogCtlInsert;
+
+/*
+ * Total shared-memory state for XLOG.
+ */
+typedef struct XLogCtlData
+{
+ XLogCtlInsert Insert;
+
+ /* Protected by info_lck: */
+ XLogwrtRqst LogwrtRqst;
+ XLogRecPtr RedoRecPtr; /* a recent copy of Insert->RedoRecPtr */
+ FullTransactionId ckptFullXid; /* nextXid of latest checkpoint */
+ XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */
+ XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */
+
+ XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG segment */
+
+ /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
+ XLogRecPtr unloggedLSN;
+ slock_t ulsn_lck;
+
+ /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
+ pg_time_t lastSegSwitchTime;
+ XLogRecPtr lastSegSwitchLSN;
+
+ /*
+ * Protected by info_lck and WALWriteLock (you must hold either lock to
+ * read it, but both to update)
+ */
+ XLogwrtResult LogwrtResult;
+
+ /*
+ * Latest initialized page in the cache (last byte position + 1).
+ *
+ * To change the identity of a buffer (and InitializedUpTo), you need to
+ * hold WALBufMappingLock. To change the identity of a buffer that's
+ * still dirty, the old page needs to be written out first, and for that
+ * you need WALWriteLock, and you need to ensure that there are no
+ * in-progress insertions to the page by calling
+ * WaitXLogInsertionsToFinish().
+ */
+ XLogRecPtr InitializedUpTo;
+
+ /*
+ * These values do not change after startup, although the pointed-to pages
+ * and xlblocks values certainly do. xlblocks values are protected by
+ * WALBufMappingLock.
+ */
+ char *pages; /* buffers for unwritten XLOG pages */
+	XLogRecPtr *xlblocks;		/* 1st byte ptr + XLOG_BLCKSZ of each page */
+ int XLogCacheBlck; /* highest allocated xlog buffer index */
+
+ /*
+ * InsertTimeLineID is the timeline into which new WAL is being inserted
+ * and flushed. It is zero during recovery, and does not change once set.
+ *
+	 * If we created a new timeline when the system was started up,
+ * PrevTimeLineID is the old timeline's ID that we forked off from.
+ * Otherwise it's equal to InsertTimeLineID.
+ */
+ TimeLineID InsertTimeLineID;
+ TimeLineID PrevTimeLineID;
+
+ /*
+ * SharedRecoveryState indicates if we're still in crash or archive
+ * recovery. Protected by info_lck.
+ */
+ RecoveryState SharedRecoveryState;
+
+ /*
+ * InstallXLogFileSegmentActive indicates whether the checkpointer should
+ * arrange for future segments by recycling and/or PreallocXlogFiles().
+ * Protected by ControlFileLock. Only the startup process changes it. If
+ * true, anyone can use InstallXLogFileSegment(). If false, the startup
+ * process owns the exclusive right to install segments, by reading from
+ * the archive and possibly replacing existing files.
+ */
+ bool InstallXLogFileSegmentActive;
+
+ /*
+ * WalWriterSleeping indicates whether the WAL writer is currently in
+ * low-power mode (and hence should be nudged if an async commit occurs).
+ * Protected by info_lck.
+ */
+ bool WalWriterSleeping;
+
+ /*
+ * During recovery, we keep a copy of the latest checkpoint record here.
+ * lastCheckPointRecPtr points to start of checkpoint record and
+ * lastCheckPointEndPtr points to end+1 of checkpoint record. Used by the
+ * checkpointer when it wants to create a restartpoint.
+ *
+ * Protected by info_lck.
+ */
+ XLogRecPtr lastCheckPointRecPtr;
+ XLogRecPtr lastCheckPointEndPtr;
+ CheckPoint lastCheckPoint;
+
+ /*
+ * lastFpwDisableRecPtr points to the start of the last replayed
+	 * XLOG_FPW_CHANGE record that indicates full_page_writes is disabled.
+ */
+ XLogRecPtr lastFpwDisableRecPtr;
+
+ slock_t info_lck; /* locks shared variables shown above */
+} XLogCtlData;
+
+static XLogCtlData *XLogCtl = NULL;
+
+/* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
+static WALInsertLockPadded *WALInsertLocks = NULL;
+
+/*
+ * We maintain an image of pg_control in shared memory.
+ */
+static ControlFileData *ControlFile = NULL;
+
+/*
+ * Calculate the amount of space left on the page after 'endptr'. Beware
+ * multiple evaluation!
+ */
+#define INSERT_FREESPACE(endptr) \
+ (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
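+
+/*
+ * For example, with the default XLOG_BLCKSZ of 8192, INSERT_FREESPACE(8192)
+ * is 0 (the pointer sits exactly on a page boundary), while
+ * INSERT_FREESPACE(8200) is 8184.
+ */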
+
+/* Macro to advance to next buffer index. */
+#define NextBufIdx(idx) \
+ (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
+
+/*
+ * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
+ * would hold if it was in cache, the page containing 'recptr'.
+ */
+#define XLogRecPtrToBufIdx(recptr) \
+ (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
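+
+/*
+ * For example, with XLOG_BLCKSZ = 8192 and 512 WAL buffers (so XLogCacheBlck
+ * is 511), recptr 0x1234A000 lies on page 0x91A5, which maps to buffer index
+ * 0x91A5 % 512 = 421.
+ */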
+
+/*
+ * The number of bytes in a WAL page usable for WAL data.
+ */
+#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
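+
+/*
+ * With the default XLOG_BLCKSZ of 8192 and the typical 24-byte short page
+ * header, that is 8168 usable bytes per page.
+ */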
+
+/*
+ * Convert values of GUCs measured in megabytes to equiv. segment count.
+ * Rounds down.
+ */
+#define ConvertToXSegs(x, segsize) XLogMBVarToSegs((x), (segsize))
+
+/* The number of bytes in a WAL segment usable for WAL data. */
+static int UsableBytesInSegment;
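+
+/*
+ * Assuming the default 16 MB segments and the page layout above, a segment
+ * holds 2048 pages: the first one behind a 40-byte long header, the rest
+ * behind 24-byte short headers, giving 8152 + 2047 * 8168 = 16728048 usable
+ * bytes.
+ */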
+
+/*
+ * Private, possibly out-of-date copy of shared LogwrtResult.
+ * See discussion above.
+ */
+static XLogwrtResult LogwrtResult = {0, 0};
+
+/*
+ * openLogFile is -1 or a kernel FD for an open log file segment.
+ * openLogSegNo identifies the segment, and openLogTLI the corresponding TLI.
+ * These variables are only used to write the XLOG, and so will normally refer
+ * to the active segment.
+ *
+ * Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
+ */
+static int openLogFile = -1;
+static XLogSegNo openLogSegNo = 0;
+static TimeLineID openLogTLI = 0;
+
+/*
+ * Local copies of equivalent fields in the control file. When running
+ * crash recovery, LocalMinRecoveryPoint is set to InvalidXLogRecPtr as we
+ * expect to replay all the WAL available, and updateMinRecoveryPoint is
+ * switched to false to prevent any updates while replaying records.
+ * Those values are kept consistent as long as crash recovery runs.
+ */
+static XLogRecPtr LocalMinRecoveryPoint;
+static TimeLineID LocalMinRecoveryPointTLI;
+static bool updateMinRecoveryPoint = true;
+
+/* For WALInsertLockAcquire/Release functions */
+static int MyLockNo = 0;
+static bool holdingAllLocks = false;
+
+#ifdef WAL_DEBUG
+static MemoryContext walDebugCxt = NULL;
+#endif
+
+static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI,
+ XLogRecPtr EndOfLog,
+ TimeLineID newTLI);
+static void CheckRequiredParameterValues(void);
+static void XLogReportParameters(void);
+static int LocalSetXLogInsertAllowed(void);
+static void CreateEndOfRecoveryRecord(void);
+static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn,
+ XLogRecPtr missingContrecPtr,
+ TimeLineID newTLI);
+static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
+static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
+static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
+
+static void AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli,
+ bool opportunistic);
+static void XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible);
+static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
+ bool find_free, XLogSegNo max_segno,
+ TimeLineID tli);
+static void XLogFileClose(void);
+static void PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli);
+static void RemoveTempXlogFiles(void);
+static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr,
+ XLogRecPtr endptr, TimeLineID insertTLI);
+static void RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo,
+ XLogSegNo *endlogSegNo, TimeLineID insertTLI);
+static void UpdateLastRemovedPtr(char *filename);
+static void ValidateXLOGDirectoryStructure(void);
+static void CleanupBackupHistory(void);
+static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
+static bool PerformRecoveryXLogAction(void);
+static void InitControlFile(uint64 sysidentifier);
+static void WriteControlFile(void);
+static void ReadControlFile(void);
+static void UpdateControlFile(void);
+static char *str_time(pg_time_t tnow);
+
+static void pg_backup_start_callback(int code, Datum arg);
+
+static int get_sync_bit(int method);
+
+static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
+ XLogRecData *rdata,
+ XLogRecPtr StartPos, XLogRecPtr EndPos,
+ TimeLineID tli);
+static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
+ XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
+static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
+ XLogRecPtr *PrevPtr);
+static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
+static char *GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli);
+static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
+static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
+static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
+
+static void WALInsertLockAcquire(void);
+static void WALInsertLockAcquireExclusive(void);
+static void WALInsertLockRelease(void);
+static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
+
+/*
+ * Insert an XLOG record represented by an already-constructed chain of data
+ * chunks. This is a low-level routine; to construct the WAL record header
+ * and data, use the higher-level routines in xloginsert.c.
+ *
+ * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
+ * WAL record applies to, that were not included in the record as full page
+ * images. If fpw_lsn <= RedoRecPtr, the function does not perform the
+ * insertion and returns InvalidXLogRecPtr. The caller can then recalculate
+ * which pages need a full-page image, and retry. If fpw_lsn is invalid, the
+ * record is always inserted.
+ *
+ * 'flags' gives more in-depth control on the record being inserted. See
+ * XLogSetRecordFlags() for details.
+ *
+ * 'topxid_included' tells whether the top-transaction id is logged along with
+ * current subtransaction. See XLogRecordAssemble().
+ *
+ * The first XLogRecData in the chain must be for the record header, and its
+ * data must be MAXALIGNed. XLogInsertRecord fills in the xl_prev and
+ * xl_crc fields in the header; the rest of the header must already be filled
+ * by the caller.
+ *
+ * Returns XLOG pointer to end of record (beginning of next record).
+ * This can be used as LSN for data pages affected by the logged action.
+ * (LSN is the XLOG point up to which the XLOG must be flushed to disk
+ * before the data page can be written out. This implements the basic
+ * WAL rule "write the log before the data".)
+ */
+XLogRecPtr
+XLogInsertRecord(XLogRecData *rdata,
+ XLogRecPtr fpw_lsn,
+ uint8 flags,
+ int num_fpi,
+ bool topxid_included)
+{
+ XLogCtlInsert *Insert = &XLogCtl->Insert;
+ pg_crc32c rdata_crc;
+ bool inserted;
+ XLogRecord *rechdr = (XLogRecord *) rdata->data;
+ uint8 info = rechdr->xl_info & ~XLR_INFO_MASK;
+ bool isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
+ info == XLOG_SWITCH);
+ XLogRecPtr StartPos;
+ XLogRecPtr EndPos;
+ bool prevDoPageWrites = doPageWrites;
+ TimeLineID insertTLI;
+
+ /* we assume that all of the record header is in the first chunk */
+ Assert(rdata->len >= SizeOfXLogRecord);
+
+ /* cross-check on whether we should be here or not */
+ if (!XLogInsertAllowed())
+ elog(ERROR, "cannot make new WAL entries during recovery");
+
+ /*
+ * Given that we're not in recovery, InsertTimeLineID is set and can't
+ * change, so we can read it without a lock.
+ */
+ insertTLI = XLogCtl->InsertTimeLineID;
+
+ /*----------
+ *
+ * We have now done all the preparatory work we can without holding a
+ * lock or modifying shared state. From here on, inserting the new WAL
+ * record to the shared WAL buffer cache is a two-step process:
+ *
+ * 1. Reserve the right amount of space from the WAL. The current head of
+ * reserved space is kept in Insert->CurrBytePos, and is protected by
+ * insertpos_lck.
+ *
+ * 2. Copy the record to the reserved WAL space. This involves finding the
+ * correct WAL buffer containing the reserved space, and copying the
+ * record in place. This can be done concurrently in multiple processes.
+ *
+ * To keep track of which insertions are still in-progress, each concurrent
+ * inserter acquires an insertion lock. In addition to just indicating that
+ * an insertion is in progress, the lock tells others how far the inserter
+ * has progressed. There is a small fixed number of insertion locks,
+ * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
+	 * boundary, it updates the value stored in the lock to how far it has
+ * inserted, to allow the previous buffer to be flushed.
+ *
+ * Holding onto an insertion lock also protects RedoRecPtr and
+ * fullPageWrites from changing until the insertion is finished.
+ *
+ * Step 2 can usually be done completely in parallel. If the required WAL
+ * page is not initialized yet, you have to grab WALBufMappingLock to
+ * initialize it, but the WAL writer tries to do that ahead of insertions
+ * to avoid that from happening in the critical path.
+	 * to keep that from happening in the critical path.
+ *----------
+ */
+ START_CRIT_SECTION();
+ if (isLogSwitch)
+ WALInsertLockAcquireExclusive();
+ else
+ WALInsertLockAcquire();
+
+ /*
+	 * Check to see if my copy of RedoRecPtr is out of date. If so, we may
+	 * have to go back and have the caller recompute everything. This can only
+ * happen just after a checkpoint, so it's better to be slow in this case
+ * and fast otherwise.
+ *
+ * Also check to see if fullPageWrites or forcePageWrites was just turned
+ * on; if we weren't already doing full-page writes then go back and
+ * recompute.
+ *
+ * If we aren't doing full-page writes then RedoRecPtr doesn't actually
+ * affect the contents of the XLOG record, so we'll update our local copy
+ * but not force a recomputation. (If doPageWrites was just turned off,
+ * we could recompute the record without full pages, but we choose not to
+ * bother.)
+ */
+ if (RedoRecPtr != Insert->RedoRecPtr)
+ {
+ Assert(RedoRecPtr < Insert->RedoRecPtr);
+ RedoRecPtr = Insert->RedoRecPtr;
+ }
+ doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
+
+ if (doPageWrites &&
+ (!prevDoPageWrites ||
+ (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr)))
+ {
+ /*
+ * Oops, some buffer now needs to be backed up that the caller didn't
+ * back up. Start over.
+ */
+ WALInsertLockRelease();
+ END_CRIT_SECTION();
+ return InvalidXLogRecPtr;
+ }
+
+ /*
+ * Reserve space for the record in the WAL. This also sets the xl_prev
+ * pointer.
+ */
+ if (isLogSwitch)
+ inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
+ else
+ {
+ ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
+ &rechdr->xl_prev);
+ inserted = true;
+ }
+
+ if (inserted)
+ {
+ /*
+ * Now that xl_prev has been filled in, calculate CRC of the record
+ * header.
+ */
+ rdata_crc = rechdr->xl_crc;
+ COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
+ FIN_CRC32C(rdata_crc);
+ rechdr->xl_crc = rdata_crc;
+
+ /*
+ * All the record data, including the header, is now ready to be
+ * inserted. Copy the record in the space reserved.
+ */
+ CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
+ StartPos, EndPos, insertTLI);
+
+ /*
+ * Unless record is flagged as not important, update LSN of last
+ * important record in the current slot. When holding all locks, just
+ * update the first one.
+ */
+ if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
+ {
+ int lockno = holdingAllLocks ? 0 : MyLockNo;
+
+ WALInsertLocks[lockno].l.lastImportantAt = StartPos;
+ }
+ }
+ else
+ {
+ /*
+ * This was an xlog-switch record, but the current insert location was
+ * already exactly at the beginning of a segment, so there was no need
+ * to do anything.
+ */
+ }
+
+ /*
+ * Done! Let others know that we're finished.
+ */
+ WALInsertLockRelease();
+
+ END_CRIT_SECTION();
+
+ MarkCurrentTransactionIdLoggedIfAny();
+
+ /*
+	 * Mark the top transaction id as logged (if needed) so that we do not try
+	 * to log it again with the next WAL record in the current subtransaction.
+ */
+ if (topxid_included)
+ MarkSubxactTopXidLogged();
+
+ /*
+	 * Update shared LogwrtRqst.Write, if we crossed a page boundary.
+ */
+ if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
+ {
+ SpinLockAcquire(&XLogCtl->info_lck);
+ /* advance global request to include new block(s) */
+ if (XLogCtl->LogwrtRqst.Write < EndPos)
+ XLogCtl->LogwrtRqst.Write = EndPos;
+ /* update local result copy while I have the chance */
+ LogwrtResult = XLogCtl->LogwrtResult;
+ SpinLockRelease(&XLogCtl->info_lck);
+ }
+
+ /*
+ * If this was an XLOG_SWITCH record, flush the record and the empty
+ * padding space that fills the rest of the segment, and perform
+ * end-of-segment actions (eg, notifying archiver).
+ */
+ if (isLogSwitch)
+ {
+ TRACE_POSTGRESQL_WAL_SWITCH();
+ XLogFlush(EndPos);
+
+ /*
+ * Even though we reserved the rest of the segment for us, which is
+ * reflected in EndPos, we return a pointer to just the end of the
+ * xlog-switch record.
+ */
+ if (inserted)
+ {
+ EndPos = StartPos + SizeOfXLogRecord;
+ if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
+ {
+ uint64 offset = XLogSegmentOffset(EndPos, wal_segment_size);
+
+ if (offset == EndPos % XLOG_BLCKSZ)
+ EndPos += SizeOfXLogLongPHD;
+ else
+ EndPos += SizeOfXLogShortPHD;
+ }
+ }
+ }
+
+#ifdef WAL_DEBUG
+ if (XLOG_DEBUG)
+ {
+ static XLogReaderState *debug_reader = NULL;
+ XLogRecord *record;
+ DecodedXLogRecord *decoded;
+ StringInfoData buf;
+ StringInfoData recordBuf;
+ char *errormsg = NULL;
+ MemoryContext oldCxt;
+
+ oldCxt = MemoryContextSwitchTo(walDebugCxt);
+
+ initStringInfo(&buf);
+ appendStringInfo(&buf, "INSERT @ %X/%X: ", LSN_FORMAT_ARGS(EndPos));
+
+ /*
+ * We have to piece together the WAL record data from the XLogRecData
+ * entries, so that we can pass it to the rm_desc function as one
+ * contiguous chunk.
+ */
+ initStringInfo(&recordBuf);
+ for (; rdata != NULL; rdata = rdata->next)
+ appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
+
+ /* We also need temporary space to decode the record. */
+ record = (XLogRecord *) recordBuf.data;
+ decoded = (DecodedXLogRecord *)
+ palloc(DecodeXLogRecordRequiredSpace(record->xl_tot_len));
+
+ if (!debug_reader)
+ debug_reader = XLogReaderAllocate(wal_segment_size, NULL,
+ XL_ROUTINE(), NULL);
+
+ if (!debug_reader)
+ {
+ appendStringInfoString(&buf, "error decoding record: out of memory while allocating a WAL reading processor");
+ }
+ else if (!DecodeXLogRecord(debug_reader,
+ decoded,
+ record,
+ EndPos,
+ &errormsg))
+ {
+ appendStringInfo(&buf, "error decoding record: %s",
+ errormsg ? errormsg : "no error message");
+ }
+ else
+ {
+ appendStringInfoString(&buf, " - ");
+
+ debug_reader->record = decoded;
+ xlog_outdesc(&buf, debug_reader);
+ debug_reader->record = NULL;
+ }
+ elog(LOG, "%s", buf.data);
+
+ pfree(decoded);
+ pfree(buf.data);
+ pfree(recordBuf.data);
+ MemoryContextSwitchTo(oldCxt);
+ }
+#endif
+
+ /*
+ * Update our global variables
+ */
+ ProcLastRecPtr = StartPos;
+ XactLastRecEnd = EndPos;
+
+ /* Report WAL traffic to the instrumentation. */
+ if (inserted)
+ {
+ pgWalUsage.wal_bytes += rechdr->xl_tot_len;
+ pgWalUsage.wal_records++;
+ pgWalUsage.wal_fpi += num_fpi;
+ }
+
+ return EndPos;
+}
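+
+/*
+ * A minimal sketch of the usual call pattern, going through xloginsert.c
+ * rather than calling XLogInsertRecord() directly (the payload variable
+ * here is hypothetical):
+ *
+ *		XLogBeginInsert();
+ *		XLogRegisterData((char *) &payload, sizeof(payload));
+ *		recptr = XLogInsert(RM_XLOG_ID, XLOG_NOOP);
+ *		XLogFlush(recptr);
+ *
+ * The final XLogFlush() is only needed if the caller requires the record to
+ * be durable before proceeding.
+ */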
+
+/*
+ * Reserves the right amount of space for a record of given size from the WAL.
+ * *StartPos is set to the beginning of the reserved section, *EndPos to
+ * its end+1. *PrevPtr is set to the beginning of the previous record; it is
+ * used to set the xl_prev of this record.
+ *
+ * This is the performance-critical part of XLogInsert that must be serialized
+ * across backends. The rest can happen mostly in parallel. Try to keep this
+ * section as short as possible; insertpos_lck can be heavily contended on a
+ * busy system.
+ *
+ * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
+ * where we actually copy the record to the reserved space.
+ */
+static void
+ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
+ XLogRecPtr *PrevPtr)
+{
+ XLogCtlInsert *Insert = &XLogCtl->Insert;
+ uint64 startbytepos;
+ uint64 endbytepos;
+ uint64 prevbytepos;
+
+ size = MAXALIGN(size);
+
+ /* All (non xlog-switch) records should contain data. */
+ Assert(size > SizeOfXLogRecord);
+
+ /*
+ * The duration the spinlock needs to be held is minimized by minimizing
+ * the calculations that have to be done while holding the lock. The
+ * current tip of reserved WAL is kept in CurrBytePos, as a byte position
+ * that only counts "usable" bytes in WAL, that is, it excludes all WAL
+ * page headers. The mapping between "usable" byte positions and physical
+ * positions (XLogRecPtrs) can be done outside the locked region, and
+ * because the usable byte position doesn't include any headers, reserving
+ * X bytes from WAL is almost as simple as "CurrBytePos += X".
+ */
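+	/*
+	 * For example, assuming 8 KB pages with the typical header sizes (40
+	 * bytes for a segment's first page, 24 bytes elsewhere), usable position
+	 * 10000 lands 10000 - 8152 = 1848 bytes into the second page of the
+	 * first segment, i.e. at physical offset 8192 + 24 + 1848 = 10064.
+	 */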
+ SpinLockAcquire(&Insert->insertpos_lck);
+
+ startbytepos = Insert->CurrBytePos;
+ endbytepos = startbytepos + size;
+ prevbytepos = Insert->PrevBytePos;
+ Insert->CurrBytePos = endbytepos;
+ Insert->PrevBytePos = startbytepos;
+
+ SpinLockRelease(&Insert->insertpos_lck);
+
+ *StartPos = XLogBytePosToRecPtr(startbytepos);
+ *EndPos = XLogBytePosToEndRecPtr(endbytepos);
+ *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
+
+ /*
+ * Check that the conversions between "usable byte positions" and
+ * XLogRecPtrs work consistently in both directions.
+ */
+ Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
+ Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
+ Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
+}
+
+/*
+ * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
+ *
+ * A log-switch record is handled slightly differently. The rest of the
+ * segment will be reserved for this insertion, as indicated by the returned
+ * *EndPos value. However, if we are already at the beginning of the current
+ * segment, *StartPos and *EndPos are set to the current location without
+ * reserving any space, and the function returns false.
+ */
+static bool
+ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
+{
+ XLogCtlInsert *Insert = &XLogCtl->Insert;
+ uint64 startbytepos;
+ uint64 endbytepos;
+ uint64 prevbytepos;
+ uint32 size = MAXALIGN(SizeOfXLogRecord);
+ XLogRecPtr ptr;
+ uint32 segleft;
+
+ /*
+ * These calculations are a bit heavy-weight to be done while holding a
+ * spinlock, but since we're holding all the WAL insertion locks, there
+ * are no other inserters competing for it. GetXLogInsertRecPtr() does
+ * compete for it, but that's not called very frequently.
+ */
+ SpinLockAcquire(&Insert->insertpos_lck);
+
+ startbytepos = Insert->CurrBytePos;
+
+ ptr = XLogBytePosToEndRecPtr(startbytepos);
+ if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
+ {
+ SpinLockRelease(&Insert->insertpos_lck);
+ *EndPos = *StartPos = ptr;
+ return false;
+ }
+
+ endbytepos = startbytepos + size;
+ prevbytepos = Insert->PrevBytePos;
+
+ *StartPos = XLogBytePosToRecPtr(startbytepos);
+ *EndPos = XLogBytePosToEndRecPtr(endbytepos);
+
+ segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
+ if (segleft != wal_segment_size)
+ {
+ /* consume the rest of the segment */
+ *EndPos += segleft;
+ endbytepos = XLogRecPtrToBytePos(*EndPos);
+ }
+ Insert->CurrBytePos = endbytepos;
+ Insert->PrevBytePos = startbytepos;
+
+ SpinLockRelease(&Insert->insertpos_lck);
+
+ *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
+
+ Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
+ Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
+ Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
+ Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
+
+ return true;
+}
+
+/*
+ * Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved
+ * area in the WAL.
+ */
+static void
+CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
+ XLogRecPtr StartPos, XLogRecPtr EndPos, TimeLineID tli)
+{
+ char *currpos;
+ int freespace;
+ int written;
+ XLogRecPtr CurrPos;
+ XLogPageHeader pagehdr;
+
+ /*
+ * Get a pointer to the right place in the right WAL buffer to start
+ * inserting to.
+ */
+ CurrPos = StartPos;
+ currpos = GetXLogBuffer(CurrPos, tli);
+ freespace = INSERT_FREESPACE(CurrPos);
+
+ /*
+ * there should be enough space for at least the first field (xl_tot_len)
+	 * There should be enough space for at least the first field (xl_tot_len)
+ */
+ Assert(freespace >= sizeof(uint32));
+
+ /* Copy record data */
+ written = 0;
+ while (rdata != NULL)
+ {
+ char *rdata_data = rdata->data;
+ int rdata_len = rdata->len;
+
+ while (rdata_len > freespace)
+ {
+ /*
+ * Write what fits on this page, and continue on the next page.
+ */
+ Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
+ memcpy(currpos, rdata_data, freespace);
+ rdata_data += freespace;
+ rdata_len -= freespace;
+ written += freespace;
+ CurrPos += freespace;
+
+ /*
+ * Get pointer to beginning of next page, and set the xlp_rem_len
+ * in the page header. Set XLP_FIRST_IS_CONTRECORD.
+ *
+ * It's safe to set the contrecord flag and xlp_rem_len without a
+ * lock on the page. All the other flags were already set when the
+ * page was initialized, in AdvanceXLInsertBuffer, and we're the
+ * only backend that needs to set the contrecord flag.
+ */
+ currpos = GetXLogBuffer(CurrPos, tli);
+ pagehdr = (XLogPageHeader) currpos;
+ pagehdr->xlp_rem_len = write_len - written;
+ pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
+
+ /* skip over the page header */
+ if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
+ {
+ CurrPos += SizeOfXLogLongPHD;
+ currpos += SizeOfXLogLongPHD;
+ }
+ else
+ {
+ CurrPos += SizeOfXLogShortPHD;
+ currpos += SizeOfXLogShortPHD;
+ }
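+
+			/*
+			 * The long page header appears only on a segment's first page;
+			 * all later pages use the short header, which is why the two
+			 * branches above advance by different amounts.
+			 */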
+ freespace = INSERT_FREESPACE(CurrPos);
+ }
+
+ Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
+ memcpy(currpos, rdata_data, rdata_len);
+ currpos += rdata_len;
+ CurrPos += rdata_len;
+ freespace -= rdata_len;
+ written += rdata_len;
+
+ rdata = rdata->next;
+ }
+ Assert(written == write_len);
+
+ /*
+	 * If this was an xlog-switch, it's not enough to write the switch record;
+ * we also have to consume all the remaining space in the WAL segment. We
+ * have already reserved that space, but we need to actually fill it.
+ */
+ if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
+ {
+ /* An xlog-switch record doesn't contain any data besides the header */
+ Assert(write_len == SizeOfXLogRecord);
+
+ /* Assert that we did reserve the right amount of space */
+ Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);
+
+ /* Use up all the remaining space on the current page */
+ CurrPos += freespace;
+
+ /*
+ * Cause all remaining pages in the segment to be flushed, leaving the
+ * XLog position where it should be, at the start of the next segment.
+ * We do this one page at a time, to make sure we don't deadlock
+ * against ourselves if wal_buffers < wal_segment_size.
+ */
+ while (CurrPos < EndPos)
+ {
+ /*
+ * The minimal action to flush the page would be to call
+ * WALInsertLockUpdateInsertingAt(CurrPos) followed by
+ * AdvanceXLInsertBuffer(...). The page would be left initialized
+ * mostly to zeros, except for the page header (always the short
+ * variant, as this is never a segment's first page).
+ *
+ * The large vistas of zeros are good for compressibility, but the
+ * headers interrupting them every XLOG_BLCKSZ (with values that
+ * differ from page to page) are not. The effect varies with
+ * compression tool, but bzip2 for instance compresses about an
+ * order of magnitude worse if those headers are left in place.
+ *
+ * Rather than complicating AdvanceXLInsertBuffer itself (which is
+ * called in heavily-loaded circumstances as well as this lightly-
+ * loaded one) with variant behavior, we just use GetXLogBuffer
+ * (which itself calls the two methods we need) to get the pointer
+ * and zero most of the page. Then we just zero the page header.
+ */
+ currpos = GetXLogBuffer(CurrPos, tli);
+ MemSet(currpos, 0, SizeOfXLogShortPHD);
+
+ CurrPos += XLOG_BLCKSZ;
+ }
+ }
+ else
+ {
+ /* Align the end position, so that the next record starts aligned */
+ CurrPos = MAXALIGN64(CurrPos);
+ }
+
+ if (CurrPos != EndPos)
+ elog(PANIC, "space reserved for WAL record does not match what was written");
+}
+
+/*
+ * Acquire a WAL insertion lock, for inserting to WAL.
+ */
+static void
+WALInsertLockAcquire(void)
+{
+ bool immed;
+
+ /*
+ * It doesn't matter which of the WAL insertion locks we acquire, so try
+ * the one we used last time. If the system isn't particularly busy, it's
+ * a good bet that it's still available, and it's good to have some
+ * affinity to a particular lock so that you don't unnecessarily bounce
+ * cache lines between processes when there's no contention.
+ *
+ * If this is the first time through in this backend, pick a lock
+ * (semi-)randomly. This allows the locks to be used evenly if you have a
+ * lot of very short connections.
+ */
+ static int lockToTry = -1;
+
+ if (lockToTry == -1)
+ lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
+ MyLockNo = lockToTry;
+
+ /*
+ * The insertingAt value is initially set to 0, as we don't know our
+ * insert location yet.
+ */
+ immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
+ if (!immed)
+ {
+ /*
+ * If we couldn't get the lock immediately, try another lock next
+ * time. On a system with more insertion locks than concurrent
+ * inserters, this causes all the inserters to eventually migrate to a
+ * lock that no-one else is using. On a system with more inserters
+ * than locks, it still helps to distribute the inserters evenly
+ * across the locks.
+ */
+ lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
+ }
+}
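+
+/*
+ * For illustration, with NUM_XLOGINSERT_LOCKS at its compile-time value of
+ * 8: a backend whose pgprocno is 13 first tries lock 13 % 8 = 5. If lock 5
+ * turns out to be busy, the backend still waits for it this time, but
+ * remembers to start with lock 6 on its next insertion, so sustained
+ * contention gradually spreads inserters across all the locks.
+ */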
+
+/*
+ * Acquire all WAL insertion locks, to prevent other backends from inserting
+ * to WAL.
+ */
+static void
+WALInsertLockAcquireExclusive(void)
+{
+ int i;
+
+ /*
+ * When holding all the locks, the insertingAt indicator of every lock
+ * but the last is set to 0xFFFFFFFFFFFFFFFF, which is higher than any
+ * real XLogRecPtr value, to make sure that no-one blocks waiting on
+ * those.
+ */
+ for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
+ {
+ LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
+ LWLockUpdateVar(&WALInsertLocks[i].l.lock,
+ &WALInsertLocks[i].l.insertingAt,
+ PG_UINT64_MAX);
+ }
+ /* Variable value reset to 0 at release */
+ LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
+
+ holdingAllLocks = true;
+}
+
+/*
+ * Release our insertion lock (or locks, if we're holding them all).
+ *
+ * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
+ * next time the lock is acquired.
+ */
+static void
+WALInsertLockRelease(void)
+{
+ if (holdingAllLocks)
+ {
+ int i;
+
+ for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
+ LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
+ &WALInsertLocks[i].l.insertingAt,
+ 0);
+
+ holdingAllLocks = false;
+ }
+ else
+ {
+ LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
+ &WALInsertLocks[MyLockNo].l.insertingAt,
+ 0);
+ }
+}
+
+/*
+ * Update our insertingAt value, to let others know that we've finished
+ * inserting up to that point.
+ */
+static void
+WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
+{
+ if (holdingAllLocks)
+ {
+ /*
+ * We use the last lock to mark our actual position, see comments in
+ * WALInsertLockAcquireExclusive.
+ */
+ LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
+ &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
+ insertingAt);
+ }
+ else
+ LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
+ &WALInsertLocks[MyLockNo].l.insertingAt,
+ insertingAt);
+}
+
+/*
+ * Wait for any WAL insertions < upto to finish.
+ *
+ * Returns the location of the oldest insertion that is still in-progress.
+ * Any WAL prior to that point has been fully copied into WAL buffers, and
+ * can be flushed out to disk. Because this waits for any insertions older
+ * than 'upto' to finish, the return value is always >= 'upto'.
+ *
+ * Note: When you are about to write out WAL, you must call this function
+ * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
+ * need to wait for an insertion to finish (or at least advance to next
+ * uninitialized page), and the inserter might need to evict an old WAL buffer
+ * to make room for a new one, which in turn requires WALWriteLock.
+ */
+static XLogRecPtr
+WaitXLogInsertionsToFinish(XLogRecPtr upto)
+{
+ uint64 bytepos;
+ XLogRecPtr reservedUpto;
+ XLogRecPtr finishedUpto;
+ XLogCtlInsert *Insert = &XLogCtl->Insert;
+ int i;
+
+ if (MyProc == NULL)
+ elog(PANIC, "cannot wait without a PGPROC structure");
+
+ /* Read the current insert position */
+ SpinLockAcquire(&Insert->insertpos_lck);
+ bytepos = Insert->CurrBytePos;
+ SpinLockRelease(&Insert->insertpos_lck);
+ reservedUpto = XLogBytePosToEndRecPtr(bytepos);
+
+ /*
+ * No-one should request to flush a piece of WAL that hasn't even been
+ * reserved yet. However, it can happen if there is a block with a bogus
+ * LSN on disk, for example. XLogFlush checks for that situation and
+ * complains, but only after the flush. Here we just assume that to mean
+ * that all WAL that has been reserved needs to be finished. In this
+ * corner-case, the return value can be smaller than the 'upto' argument.
+ */
+ if (upto > reservedUpto)
+ {
+ ereport(LOG,
+ (errmsg("request to flush past end of generated WAL; request %X/%X, current position %X/%X",
+ LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto))));
+ upto = reservedUpto;
+ }
+
+ /*
+ * Loop through all the locks, sleeping on any in-progress insert older
+ * than 'upto'.
+ *
+ * finishedUpto is our return value, indicating the point up to which all
+ * the WAL insertions have been finished. Initialize it to the head of
+ * reserved WAL, and as we iterate through the insertion locks, back it
+ * out for any insertion that's still in progress.
+ */
+ finishedUpto = reservedUpto;
+ for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
+ {
+ XLogRecPtr insertingat = InvalidXLogRecPtr;
+
+ do
+ {
+ /*
+ * See if this insertion is in progress. LWLockWaitForVar will
+ * wait for the lock to be released, or for the 'value' to be set
+ * by a LWLockUpdateVar call. When a lock is initially acquired,
+ * its value is 0 (InvalidXLogRecPtr), which means that we don't
+ * know where it's inserting yet. We will have to wait for it. If
+ * it's a small insertion, the record will most likely fit on the
+ * same page and the inserter will release the lock without ever
+ * calling LWLockUpdateVar. But if it has to sleep, it will
+ * advertise the insertion point with LWLockUpdateVar before
+ * sleeping.
+ */
+ if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
+ &WALInsertLocks[i].l.insertingAt,
+ insertingat, &insertingat))
+ {
+ /* the lock was free, so no insertion in progress */
+ insertingat = InvalidXLogRecPtr;
+ break;
+ }
+
+ /*
+ * This insertion is still in progress. Have to wait, unless the
+ * inserter has proceeded past 'upto'.
+ */
+ } while (insertingat < upto);
+
+ if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
+ finishedUpto = insertingat;
+ }
+ return finishedUpto;
+}
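+
+/*
+ * A small illustrative scenario for the wait loop above: suppose
+ * reservedUpto = 0/1500 and three locks are held, advertising insertingAt
+ * values 0/1000, 0/1200 and 0 (not yet advertised). For upto = 0/1100 we
+ * must sleep on the first and third locks until they advance past 0/1100
+ * or are released; the second is already past it. finishedUpto then comes
+ * out as the smallest position still being inserted at, so the caller may
+ * safely flush everything below it.
+ */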
+
+/*
+ * Get a pointer to the right location in the WAL buffer containing the
+ * given XLogRecPtr.
+ *
+ * If the page is not initialized yet, it is initialized. That might require
+ * evicting an old dirty buffer from the buffer cache, which means I/O.
+ *
+ * The caller must ensure that the page containing the requested location
+ * isn't evicted yet, and won't be evicted. The way to ensure that is to
+ * hold onto a WAL insertion lock with the insertingAt position set to
+ * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
+ * to evict an old page from the buffer. (This means that once you call
+ * GetXLogBuffer() with a given 'ptr', you must not access anything before
+ * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
+ * later, because older buffers might already be recycled.)
+ */
+static char *
+GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
+{
+ int idx;
+ XLogRecPtr endptr;
+ static uint64 cachedPage = 0;
+ static char *cachedPos = NULL;
+ XLogRecPtr expectedEndPtr;
+
+ /*
+ * Fast path for the common case that we need to access the same page as
+ * last time.
+ */
+ if (ptr / XLOG_BLCKSZ == cachedPage)
+ {
+ Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
+ Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
+ return cachedPos + ptr % XLOG_BLCKSZ;
+ }
+
+ /*
+ * The XLog buffer cache is organized so that a page is always loaded into
+ * a particular buffer. That way we can easily calculate the buffer a given
+ * page must be loaded into, from the XLogRecPtr alone.
+ */
+ idx = XLogRecPtrToBufIdx(ptr);
+
+ /*
+ * See what page is loaded in the buffer at the moment. It could be the
+ * page we're looking for, or something older. It can't be anything newer
+ * - that would imply the page we're looking for has already been written
+ * out to disk and evicted, and the caller is responsible for making sure
+ * that doesn't happen.
+ *
+ * However, we don't hold a lock while we read the value. If someone has
+ * just initialized the page, it's possible that we get a "torn read" of
+ * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
+ * that case we will see a bogus value. That's ok, we'll grab the mapping
+ * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
+ * the page we're looking for. But it means that when we do this unlocked
+ * read, we might see a value that appears to be ahead of the page we're
+ * looking for. Don't PANIC on that, until we've verified the value while
+ * holding the lock.
+ */
+ expectedEndPtr = ptr;
+ expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
+
+ endptr = XLogCtl->xlblocks[idx];
+ if (expectedEndPtr != endptr)
+ {
+ XLogRecPtr initializedUpto;
+
+ /*
+ * Before calling AdvanceXLInsertBuffer(), which can block, let others
+ * know how far we're finished with inserting the record.
+ *
+ * NB: If 'ptr' points to just after the page header, advertise a
+ * position at the beginning of the page rather than 'ptr' itself. If
+ * there are no other insertions running, someone might try to flush
+ * up to our advertised location. If we advertised a position after
+ * the page header, someone might try to flush the page header, even
+ * though the page might not actually be initialized yet. As the first
+ * inserter on the page, we are effectively responsible for making
+ * sure that it's initialized before we let insertingAt move past
+ * the page header.
+ */
+ if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
+ XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
+ initializedUpto = ptr - SizeOfXLogShortPHD;
+ else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
+ XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
+ initializedUpto = ptr - SizeOfXLogLongPHD;
+ else
+ initializedUpto = ptr;
+
+ WALInsertLockUpdateInsertingAt(initializedUpto);
+
+ AdvanceXLInsertBuffer(ptr, tli, false);
+ endptr = XLogCtl->xlblocks[idx];
+
+ if (expectedEndPtr != endptr)
+ elog(PANIC, "could not find WAL buffer for %X/%X",
+ LSN_FORMAT_ARGS(ptr));
+ }
+ else
+ {
+ /*
+ * Make sure the initialization of the page is visible to us, and
+ * won't arrive later to overwrite the WAL data we write on the page.
+ */
+ pg_memory_barrier();
+ }
+
+ /*
+ * Found the buffer holding this page. Return a pointer to the right
+ * offset within the page.
+ */
+ cachedPage = ptr / XLOG_BLCKSZ;
+ cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
+
+ Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
+ Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
+
+ return cachedPos + ptr % XLOG_BLCKSZ;
+}
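+
+/*
+ * The fixed page-to-buffer mapping used above is essentially
+ * (ptr / XLOG_BLCKSZ) modulo the number of WAL buffers. As an example
+ * (numbers illustrative): with 8 kB pages and wal_buffers = 16MB, i.e.
+ * 2048 buffers, the page starting at 0/01234000 is page 2330 and always
+ * maps to buffer 2330 % 2048 = 282.
+ */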
+
+/*
+ * Converts a "usable byte position" to XLogRecPtr. A usable byte position
+ * is the position starting from the beginning of WAL, excluding all WAL
+ * page headers.
+ */
+static XLogRecPtr
+XLogBytePosToRecPtr(uint64 bytepos)
+{
+ uint64 fullsegs;
+ uint64 fullpages;
+ uint64 bytesleft;
+ uint32 seg_offset;
+ XLogRecPtr result;
+
+ fullsegs = bytepos / UsableBytesInSegment;
+ bytesleft = bytepos % UsableBytesInSegment;
+
+ if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
+ {
+ /* fits on first page of segment */
+ seg_offset = bytesleft + SizeOfXLogLongPHD;
+ }
+ else
+ {
+ /* account for the first page on segment with long header */
+ seg_offset = XLOG_BLCKSZ;
+ bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
+
+ fullpages = bytesleft / UsableBytesInPage;
+ bytesleft = bytesleft % UsableBytesInPage;
+
+ seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
+ }
+
+ XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
+
+ return result;
+}
+
+/*
+ * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
+ * returns a pointer to the beginning of the page (ie. before page header),
+ * not to where the first xlog record on that page would go to. This is used
+ * when converting a pointer to the end of a record.
+ */
+static XLogRecPtr
+XLogBytePosToEndRecPtr(uint64 bytepos)
+{
+ uint64 fullsegs;
+ uint64 fullpages;
+ uint64 bytesleft;
+ uint32 seg_offset;
+ XLogRecPtr result;
+
+ fullsegs = bytepos / UsableBytesInSegment;
+ bytesleft = bytepos % UsableBytesInSegment;
+
+ if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
+ {
+ /* fits on first page of segment */
+ if (bytesleft == 0)
+ seg_offset = 0;
+ else
+ seg_offset = bytesleft + SizeOfXLogLongPHD;
+ }
+ else
+ {
+ /* account for the first page on segment with long header */
+ seg_offset = XLOG_BLCKSZ;
+ bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
+
+ fullpages = bytesleft / UsableBytesInPage;
+ bytesleft = bytesleft % UsableBytesInPage;
+
+ if (bytesleft == 0)
+ seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
+ else
+ seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
+ }
+
+ XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
+
+ return result;
+}
+
+/*
+ * Convert an XLogRecPtr to a "usable byte position".
+ */
+static uint64
+XLogRecPtrToBytePos(XLogRecPtr ptr)
+{
+ uint64 fullsegs;
+ uint32 fullpages;
+ uint32 offset;
+ uint64 result;
+
+ XLByteToSeg(ptr, fullsegs, wal_segment_size);
+
+ fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
+ offset = ptr % XLOG_BLCKSZ;
+
+ if (fullpages == 0)
+ {
+ result = fullsegs * UsableBytesInSegment;
+ if (offset > 0)
+ {
+ Assert(offset >= SizeOfXLogLongPHD);
+ result += offset - SizeOfXLogLongPHD;
+ }
+ }
+ else
+ {
+ result = fullsegs * UsableBytesInSegment +
+ (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
+ (fullpages - 1) * UsableBytesInPage; /* full pages */
+ if (offset > 0)
+ {
+ Assert(offset >= SizeOfXLogShortPHD);
+ result += offset - SizeOfXLogShortPHD;
+ }
+ }
+
+ return result;
+}
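+
+/*
+ * A worked round trip through the conversions above, assuming the typical
+ * values XLOG_BLCKSZ = 8192, SizeOfXLogShortPHD = 24 and
+ * SizeOfXLogLongPHD = 40 (the header sizes depend on alignment and build
+ * options), giving UsableBytesInPage = 8168:
+ *
+ *   XLogBytePosToRecPtr(10000):
+ *     doesn't fit on the segment's first page (8192 - 40 = 8152 usable),
+ *     so seg_offset = 8192 + 0 full pages + (10000 - 8152) + 24 = 10064
+ *
+ *   XLogRecPtrToBytePos() reverses it:
+ *     fullpages = 10064 / 8192 = 1, offset = 10064 % 8192 = 1872,
+ *     result = (8192 - 40) + 0 * 8168 + (1872 - 24) = 10000
+ */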
+
+/*
+ * Initialize XLOG buffers, writing out old buffers if they still contain
+ * unwritten data, up to the page containing 'upto'. Or if 'opportunistic'
+ * is true, initialize as many pages as we can without having to write out
+ * unwritten data. Any new pages are initialized to zeros, with page
+ * headers initialized properly.
+ */
+static void
+AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
+{
+ XLogCtlInsert *Insert = &XLogCtl->Insert;
+ int nextidx;
+ XLogRecPtr OldPageRqstPtr;
+ XLogwrtRqst WriteRqst;
+ XLogRecPtr NewPageEndPtr = InvalidXLogRecPtr;
+ XLogRecPtr NewPageBeginPtr;
+ XLogPageHeader NewPage;
+ int npages pg_attribute_unused() = 0;
+
+ LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
+
+ /*
+ * Now that we have the lock, check if someone initialized the page
+ * already.
+ */
+ while (upto >= XLogCtl->InitializedUpTo || opportunistic)
+ {
+ nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
+
+ /*
+ * Get ending-offset of the buffer page we need to replace (this may
+ * be zero if the buffer hasn't been used yet). Fall through if it's
+ * already written out.
+ */
+ OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
+ if (LogwrtResult.Write < OldPageRqstPtr)
+ {
+ /*
+ * Nope, got work to do. If we just want to pre-initialize as much
+ * as we can without flushing, give up now.
+ */
+ if (opportunistic)
+ break;
+
+ /* Before waiting, get info_lck and update LogwrtResult */
+ SpinLockAcquire(&XLogCtl->info_lck);
+ if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
+ XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
+ LogwrtResult = XLogCtl->LogwrtResult;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ /*
+ * Now that we have an up-to-date LogwrtResult value, see if we
+ * still need to write it or if someone else already did.
+ */
+ if (LogwrtResult.Write < OldPageRqstPtr)
+ {
+ /*
+ * Must acquire write lock. Release WALBufMappingLock first,
+ * to make sure that all insertions that we need to wait for
+ * can finish (up to this same position). Otherwise we risk
+ * deadlock.
+ */
+ LWLockRelease(WALBufMappingLock);
+
+ WaitXLogInsertionsToFinish(OldPageRqstPtr);
+
+ LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
+
+ LogwrtResult = XLogCtl->LogwrtResult;
+ if (LogwrtResult.Write >= OldPageRqstPtr)
+ {
+ /* OK, someone wrote it already */
+ LWLockRelease(WALWriteLock);
+ }
+ else
+ {
+ /* Have to write it ourselves */
+ TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
+ WriteRqst.Write = OldPageRqstPtr;
+ WriteRqst.Flush = 0;
+ XLogWrite(WriteRqst, tli, false);
+ LWLockRelease(WALWriteLock);
+ PendingWalStats.wal_buffers_full++;
+ TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
+ }
+ /* Re-acquire WALBufMappingLock and retry */
+ LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
+ continue;
+ }
+ }
+
+ /*
+ * Now the next buffer slot is free and we can set it up to be the
+ * next output page.
+ */
+ NewPageBeginPtr = XLogCtl->InitializedUpTo;
+ NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
+
+ Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
+
+ NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
+
+ /*
+ * Be sure to re-zero the buffer so that bytes beyond what we've
+ * written will look like zeroes and not valid XLOG records...
+ */
+ MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
+
+ /*
+ * Fill the new page's header
+ */
+ NewPage->xlp_magic = XLOG_PAGE_MAGIC;
+
+ /* NewPage->xlp_info = 0; */ /* done by memset */
+ NewPage->xlp_tli = tli;
+ NewPage->xlp_pageaddr = NewPageBeginPtr;
+
+ /* NewPage->xlp_rem_len = 0; */ /* done by memset */
+
+ /*
+ * If online backup is not in progress, mark the header to indicate
+ * that WAL records beginning in this page have removable backup
+ * blocks. This allows the WAL archiver to know whether it is safe to
+ * compress archived WAL data by transforming full-block records into
+ * the non-full-block format. It is sufficient to record this at the
+ * page level because we force a page switch (in fact a segment
+ * switch) when starting a backup, so the flag will be off before any
+ * records can be written during the backup. At the end of a backup,
+ * the last page will be marked as all unsafe when perhaps only part
+ * is unsafe, but at worst the archiver would miss the opportunity to
+ * compress a few records.
+ */
+ if (!Insert->forcePageWrites)
+ NewPage->xlp_info |= XLP_BKP_REMOVABLE;
+
+ /*
+ * If first page of an XLOG segment file, make it a long header.
+ */
+ if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
+ {
+ XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
+
+ NewLongPage->xlp_sysid = ControlFile->system_identifier;
+ NewLongPage->xlp_seg_size = wal_segment_size;
+ NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
+ NewPage->xlp_info |= XLP_LONG_HEADER;
+ }
+
+ /*
+ * Make sure the initialization of the page becomes visible to others
+ * before the xlblocks update. GetXLogBuffer() reads xlblocks without
+ * holding a lock.
+ */
+ pg_write_barrier();
+
+ *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
+
+ XLogCtl->InitializedUpTo = NewPageEndPtr;
+
+ npages++;
+ }
+ LWLockRelease(WALBufMappingLock);
+
+#ifdef WAL_DEBUG
+ if (XLOG_DEBUG && npages > 0)
+ {
+ elog(DEBUG1, "initialized %d pages, up to %X/%X",
+ npages, LSN_FORMAT_ARGS(NewPageEndPtr));
+ }
+#endif
+}
+
+/*
+ * Calculate CheckPointSegments based on max_wal_size_mb and
+ * checkpoint_completion_target.
+ */
+static void
+CalculateCheckpointSegments(void)
+{
+ double target;
+
+ /*-------
+ * Calculate the distance at which to trigger a checkpoint, to avoid
+ * exceeding max_wal_size_mb. This is based on two assumptions:
+ *
+ * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
+ * WAL for two checkpoint cycles to allow us to recover from the
+ * secondary checkpoint if the first checkpoint failed, though we
+ * only did this on the primary anyway, not on standby. Keeping just
+ * one checkpoint simplifies processing and reduces disk space in
+ * many smaller databases.)
+ * b) during a checkpoint, we consume up to checkpoint_completion_target
+ * times the number of segments consumed between checkpoints.
+ *-------
+ */
+ target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
+ (1.0 + CheckPointCompletionTarget);
+
+ /* round down */
+ CheckPointSegments = (int) target;
+
+ if (CheckPointSegments < 1)
+ CheckPointSegments = 1;
+}
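+
+/*
+ * Example with round numbers (actual settings will differ): max_wal_size =
+ * 1GB on 16MB segments is 64 segments; with checkpoint_completion_target =
+ * 0.9 the trigger distance is 64 / 1.9 = 33.68, so CheckPointSegments
+ * becomes 33. A checkpoint is then requested about every 33 segments, and
+ * the ~0.9 * 33 segments consumed while the spread-out checkpoint runs
+ * keep the total near, but under, max_wal_size.
+ */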
+
+void
+assign_max_wal_size(int newval, void *extra)
+{
+ max_wal_size_mb = newval;
+ CalculateCheckpointSegments();
+}
+
+void
+assign_checkpoint_completion_target(double newval, void *extra)
+{
+ CheckPointCompletionTarget = newval;
+ CalculateCheckpointSegments();
+}
+
+/*
+ * At a checkpoint, how many WAL segments to recycle as preallocated future
+ * XLOG segments? Returns the highest segment that should be preallocated.
+ */
+static XLogSegNo
+XLOGfileslop(XLogRecPtr lastredoptr)
+{
+ XLogSegNo minSegNo;
+ XLogSegNo maxSegNo;
+ double distance;
+ XLogSegNo recycleSegNo;
+
+ /*
+ * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
+ * correspond to. Always recycle enough segments to meet the minimum, and
+ * remove enough segments to stay below the maximum.
+ */
+ minSegNo = lastredoptr / wal_segment_size +
+ ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
+ maxSegNo = lastredoptr / wal_segment_size +
+ ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
+
+ /*
+ * Between those limits, recycle enough segments to get us through to the
+ * estimated end of next checkpoint.
+ *
+ * To estimate where the next checkpoint will finish, assume that the
+ * system runs steadily, consuming CheckPointDistanceEstimate bytes between
+ * checkpoints.
+ */
+ distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
+ /* add 10% for good measure. */
+ distance *= 1.10;
+
+ recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) /
+ wal_segment_size);
+
+ if (recycleSegNo < minSegNo)
+ recycleSegNo = minSegNo;
+ if (recycleSegNo > maxSegNo)
+ recycleSegNo = maxSegNo;
+
+ return recycleSegNo;
+}
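+
+/*
+ * Putting illustrative numbers to it: with min_wal_size = 80MB and
+ * max_wal_size = 1GB on 16MB segments, minSegNo = redo + 4 and maxSegNo =
+ * redo + 63. If CheckPointDistanceEstimate is 512MB and
+ * checkpoint_completion_target is 0.9, then distance = 1.9 * 512MB * 1.1,
+ * about 67 segments, so the raw recycle target of redo + 67 is clamped
+ * down to maxSegNo = redo + 63.
+ */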
+
+/*
+ * Check whether we've consumed enough xlog space that a checkpoint is needed.
+ *
+ * new_segno indicates a log file that has just been filled up (or read
+ * during recovery). We measure the distance from RedoRecPtr to new_segno
+ * and see if that exceeds CheckPointSegments.
+ *
+ * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
+ */
+bool
+XLogCheckpointNeeded(XLogSegNo new_segno)
+{
+ XLogSegNo old_segno;
+
+ XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
+
+ if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
+ return true;
+ return false;
+}
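+
+/*
+ * For instance (values illustrative), with CheckPointSegments = 33 and
+ * RedoRecPtr in segment 100, the test "new_segno >= 100 + 32" first
+ * becomes true when segment 132 fills, i.e. after roughly
+ * CheckPointSegments' worth of WAL has been consumed since the last
+ * checkpoint's redo point.
+ */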
+
+/*
+ * Write and/or fsync the log at least as far as WriteRqst indicates.
+ *
+ * If flexible == true, we don't have to write as far as WriteRqst, but
+ * may stop at any convenient boundary (such as a cache or logfile boundary).
+ * This option allows us to avoid uselessly issuing multiple writes when a
+ * single one would do.
+ *
+ * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
+ * must be called before grabbing the lock, to make sure the data is ready to
+ * write.
+ */
+static void
+XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
+{
+ bool ispartialpage;
+ bool last_iteration;
+ bool finishing_seg;
+ int curridx;
+ int npages;
+ int startidx;
+ uint32 startoffset;
+
+ /* We should always be inside a critical section here */
+ Assert(CritSectionCount > 0);
+
+ /*
+ * Update local LogwrtResult (caller probably did this already, but...)
+ */
+ LogwrtResult = XLogCtl->LogwrtResult;
+
+ /*
+ * Since successive pages in the xlog cache are consecutively allocated,
+ * we can usually gather multiple pages together and issue just one
+ * write() call. npages is the number of pages we have determined can be
+ * written together; startidx is the cache block index of the first one,
+ * and startoffset is the file offset at which it should go. The latter
+ * two variables are only valid when npages > 0, but we must initialize
+ * all of them to keep the compiler quiet.
+ */
+ npages = 0;
+ startidx = 0;
+ startoffset = 0;
+
+ /*
+ * Within the loop, curridx is the cache block index of the page to
+ * consider writing. Begin at the buffer containing the next unwritten
+ * page, or last partially written page.
+ */
+ curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
+
+ while (LogwrtResult.Write < WriteRqst.Write)
+ {
+ /*
+ * Make sure we're not ahead of the insert process. This could happen
+ * if we're passed a bogus WriteRqst.Write that is past the end of the
+ * last page that's been initialized by AdvanceXLInsertBuffer.
+ */
+ XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx];
+
+ if (LogwrtResult.Write >= EndPtr)
+ elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
+ LSN_FORMAT_ARGS(LogwrtResult.Write),
+ LSN_FORMAT_ARGS(EndPtr));
+
+ /* Advance LogwrtResult.Write to end of current buffer page */
+ LogwrtResult.Write = EndPtr;
+ ispartialpage = WriteRqst.Write < LogwrtResult.Write;
+
+ if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
+ wal_segment_size))
+ {
+ /*
+ * Switch to new logfile segment. We cannot have any pending
+ * pages here (since we dump what we have at segment end).
+ */
+ Assert(npages == 0);
+ if (openLogFile >= 0)
+ XLogFileClose();
+ XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
+ wal_segment_size);
+ openLogTLI = tli;
+
+ /* create/use new log file */
+ openLogFile = XLogFileInit(openLogSegNo, tli);
+ ReserveExternalFD();
+ }
+
+ /* Make sure we have the current logfile open */
+ if (openLogFile < 0)
+ {
+ XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
+ wal_segment_size);
+ openLogTLI = tli;
+ openLogFile = XLogFileOpen(openLogSegNo, tli);
+ ReserveExternalFD();
+ }
+
+ /* Add current page to the set of pending pages-to-dump */
+ if (npages == 0)
+ {
+ /* first of group */
+ startidx = curridx;
+ startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
+ wal_segment_size);
+ }
+ npages++;
+
+ /*
+ * Dump the set if this will be the last loop iteration, or if we are
+ * at the last page of the cache area (since the next page won't be
+ * contiguous in memory), or if we are at the end of the logfile
+ * segment.
+ */
+ last_iteration = WriteRqst.Write <= LogwrtResult.Write;
+
+ finishing_seg = !ispartialpage &&
+ (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
+
+ if (last_iteration ||
+ curridx == XLogCtl->XLogCacheBlck ||
+ finishing_seg)
+ {
+ char *from;
+ Size nbytes;
+ Size nleft;
+ int written;
+ instr_time start;
+
+ /* OK to write the page(s) */
+ from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
+ nbytes = npages * (Size) XLOG_BLCKSZ;
+ nleft = nbytes;
+ do
+ {
+ errno = 0;
+
+ /* Measure I/O timing to write WAL data */
+ if (track_wal_io_timing)
+ INSTR_TIME_SET_CURRENT(start);
+
+ pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
+ written = pg_pwrite(openLogFile, from, nleft, startoffset);
+ pgstat_report_wait_end();
+
+ /*
+ * Increment the I/O timing and the number of times WAL data
+ * were written out to disk.
+ */
+ if (track_wal_io_timing)
+ {
+ instr_time duration;
+
+ INSTR_TIME_SET_CURRENT(duration);
+ INSTR_TIME_SUBTRACT(duration, start);
+ PendingWalStats.wal_write_time += INSTR_TIME_GET_MICROSEC(duration);
+ }
+
+ PendingWalStats.wal_write++;
+
+ if (written <= 0)
+ {
+ char xlogfname[MAXFNAMELEN];
+ int save_errno;
+
+ if (errno == EINTR)
+ continue;
+
+ save_errno = errno;
+ XLogFileName(xlogfname, tli, openLogSegNo,
+ wal_segment_size);
+ errno = save_errno;
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not write to log file %s "
+ "at offset %u, length %zu: %m",
+ xlogfname, startoffset, nleft)));
+ }
+ nleft -= written;
+ from += written;
+ startoffset += written;
+ } while (nleft > 0);
+
+ npages = 0;
+
+ /*
+ * If we just wrote the whole last page of a logfile segment,
+ * fsync the segment immediately. This avoids having to go back
+ * and re-open prior segments when an fsync request comes along
+ * later. Doing it here ensures that one and only one backend will
+ * perform this fsync.
+ *
+ * This is also the right place to notify the Archiver that the
+ * segment is ready to copy to archival storage, and to update the
+ * timer for archive_timeout, and to signal for a checkpoint if
+ * too many logfile segments have been used since the last
+ * checkpoint.
+ */
+ if (finishing_seg)
+ {
+ issue_xlog_fsync(openLogFile, openLogSegNo, tli);
+
+ /* signal that we need to wakeup walsenders later */
+ WalSndWakeupRequest();
+
+ LogwrtResult.Flush = LogwrtResult.Write; /* end of page */
+
+ if (XLogArchivingActive())
+ XLogArchiveNotifySeg(openLogSegNo, tli);
+
+ XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
+ XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
+
+ /*
+ * Request a checkpoint if we've consumed too much xlog since
+ * the last one. For speed, we first check using the local
+ * copy of RedoRecPtr, which might be out of date; if it looks
+ * like a checkpoint is needed, forcibly update RedoRecPtr and
+ * recheck.
+ */
+ if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
+ {
+ (void) GetRedoRecPtr();
+ if (XLogCheckpointNeeded(openLogSegNo))
+ RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
+ }
+ }
+ }
+
+ if (ispartialpage)
+ {
+ /* Only asked to write a partial page */
+ LogwrtResult.Write = WriteRqst.Write;
+ break;
+ }
+ curridx = NextBufIdx(curridx);
+
+ /* If flexible, break out of loop as soon as we wrote something */
+ if (flexible && npages == 0)
+ break;
+ }
+
+ Assert(npages == 0);
+
+ /*
+ * If asked to flush, do so
+ */
+ if (LogwrtResult.Flush < WriteRqst.Flush &&
+ LogwrtResult.Flush < LogwrtResult.Write)
+ {
+ /*
+ * Could get here without iterating through the above loop, in which
+ * case we might have no open file or the wrong one. However, we do
+ * not need to fsync more than one file.
+ */
+ if (sync_method != SYNC_METHOD_OPEN &&
+ sync_method != SYNC_METHOD_OPEN_DSYNC)
+ {
+ if (openLogFile >= 0 &&
+ !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
+ wal_segment_size))
+ XLogFileClose();
+ if (openLogFile < 0)
+ {
+ XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
+ wal_segment_size);
+ openLogTLI = tli;
+ openLogFile = XLogFileOpen(openLogSegNo, tli);
+ ReserveExternalFD();
+ }
+
+ issue_xlog_fsync(openLogFile, openLogSegNo, tli);
+ }
+
+ /* signal that we need to wakeup walsenders later */
+ WalSndWakeupRequest();
+
+ LogwrtResult.Flush = LogwrtResult.Write;
+ }
+
+ /*
+ * Update shared-memory status
+ *
+ * We make sure that the shared 'request' values do not fall behind the
+ * 'result' values. This is not absolutely essential, but it saves some
+ * code in a couple of places.
+ */
+ {
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->LogwrtResult = LogwrtResult;
+ if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
+ XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
+ if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
+ XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
+ SpinLockRelease(&XLogCtl->info_lck);
+ }
+}
+
+/*
+ * Record the LSN for an asynchronous transaction commit/abort
+ * and nudge the WALWriter if there is work for it to do.
+ * (This should not be called for synchronous commits.)
+ */
+void
+XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
+{
+ XLogRecPtr WriteRqstPtr = asyncXactLSN;
+ bool sleeping;
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ LogwrtResult = XLogCtl->LogwrtResult;
+ sleeping = XLogCtl->WalWriterSleeping;
+ if (XLogCtl->asyncXactLSN < asyncXactLSN)
+ XLogCtl->asyncXactLSN = asyncXactLSN;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ /*
+ * If the WALWriter is sleeping, we should kick it to make it come out of
+ * low-power mode. Otherwise, determine whether there's a full page of
+ * WAL available to write.
+ */
+ if (!sleeping)
+ {
+ /* back off to last completed page boundary */
+ WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
+
+ /* if we have already flushed that far, we're done */
+ if (WriteRqstPtr <= LogwrtResult.Flush)
+ return;
+ }
+
+ /*
+ * Nudge the WALWriter: it has a full page of WAL to write, or we want it
+ * to come out of low-power mode so that this async commit will reach disk
+ * within the expected amount of time.
+ */
+ if (ProcGlobal->walwriterLatch)
+ SetLatch(ProcGlobal->walwriterLatch);
+}
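+
+/*
+ * Example of the page-boundary logic above (addresses illustrative),
+ * assuming the walwriter is not in low-power sleep: an async commit at
+ * 0/12003456 backs the request off to the page boundary 0/12002000. If
+ * LogwrtResult.Flush is already at or past that boundary, the commit
+ * record merely sits in a partially filled page, the walwriter's next
+ * scheduled wakeup will pick it up, and no latch is set.
+ */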
+
+/*
+ * Record the LSN up to which we can remove WAL because it's not required by
+ * any replication slot.
+ */
+void
+XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
+{
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->replicationSlotMinLSN = lsn;
+ SpinLockRelease(&XLogCtl->info_lck);
+}
+
+
+/*
+ * Return the oldest LSN we must retain to satisfy the needs of some
+ * replication slot.
+ */
+static XLogRecPtr
+XLogGetReplicationSlotMinimumLSN(void)
+{
+ XLogRecPtr retval;
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ retval = XLogCtl->replicationSlotMinLSN;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ return retval;
+}
+
+/*
+ * Advance minRecoveryPoint in control file.
+ *
+ * If we crash during recovery, we must reach this point again before the
+ * database is consistent.
+ *
+ * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
+ * is only updated if it's not already greater than or equal to 'lsn'.
+ */
+static void
+UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
+{
+ /* Quick check using our local copy of the variable */
+ if (!updateMinRecoveryPoint || (!force && lsn <= LocalMinRecoveryPoint))
+ return;
+
+ /*
+ * An invalid minRecoveryPoint means that we need to recover all the WAL,
+ * i.e., we're doing crash recovery. We never modify the control file's
+ * value in that case, so we can short-circuit future checks here too. The
+ * local values of minRecoveryPoint and minRecoveryPointTLI should not be
+ * updated until crash recovery finishes. We only do this for the startup
+ * process, as it should not update its own copy of minRecoveryPoint
+ * until it has finished crash recovery, to make sure that all available
+ * WAL is replayed in this case. This also saves the startup process from
+ * taking extra locks on the control file.
+ */
+ if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
+ {
+ updateMinRecoveryPoint = false;
+ return;
+ }
+
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+
+ /* update local copy */
+ LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
+ LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+
+ if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
+ updateMinRecoveryPoint = false;
+ else if (force || LocalMinRecoveryPoint < lsn)
+ {
+ XLogRecPtr newMinRecoveryPoint;
+ TimeLineID newMinRecoveryPointTLI;
+
+ /*
+ * To avoid having to update the control file too often, we update it
+ * all the way to the last record being replayed, even though 'lsn'
+ * would suffice for correctness. This also allows the 'force' case
+ * to not need a valid 'lsn' value.
+ *
+ * Another important reason for doing it this way is that the passed
+ * 'lsn' value could be bogus, i.e., past the end of available WAL, if
+ * the caller got it from a corrupted heap page. Accepting such a
+ * value as the min recovery point would prevent us from coming up at
+ * all. Instead, we just log a warning and continue with recovery.
+ * (See also the comments about corrupt LSNs in XLogFlush.)
+ */
+ newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI);
+ if (!force && newMinRecoveryPoint < lsn)
+ elog(WARNING,
+ "xlog min recovery request %X/%X is past current point %X/%X",
+ LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint));
+
+ /* update control file */
+ if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
+ {
+ ControlFile->minRecoveryPoint = newMinRecoveryPoint;
+ ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
+ UpdateControlFile();
+ LocalMinRecoveryPoint = newMinRecoveryPoint;
+ LocalMinRecoveryPointTLI = newMinRecoveryPointTLI;
+
+ ereport(DEBUG2,
+ (errmsg_internal("updated min recovery point to %X/%X on timeline %u",
+ LSN_FORMAT_ARGS(newMinRecoveryPoint),
+ newMinRecoveryPointTLI)));
+ }
+ }
+ LWLockRelease(ControlFileLock);
+}
+
+/*
+ * Ensure that all XLOG data through the given position is flushed to disk.
+ *
+ * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
+ * already held, and we try to avoid acquiring it if possible.
+ */
+void
+XLogFlush(XLogRecPtr record)
+{
+ XLogRecPtr WriteRqstPtr;
+ XLogwrtRqst WriteRqst;
+ TimeLineID insertTLI = XLogCtl->InsertTimeLineID;
+
+ /*
+ * During REDO, we are reading not writing WAL. Therefore, instead of
+ * trying to flush the WAL, we should update minRecoveryPoint instead. We
+ * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
+ * to act this way too, and because when it tries to write the
+ * end-of-recovery checkpoint, it should indeed flush.
+ */
+ if (!XLogInsertAllowed())
+ {
+ UpdateMinRecoveryPoint(record, false);
+ return;
+ }
+
+ /* Quick exit if already known flushed */
+ if (record <= LogwrtResult.Flush)
+ return;
+
+#ifdef WAL_DEBUG
+ if (XLOG_DEBUG)
+ elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
+ LSN_FORMAT_ARGS(record),
+ LSN_FORMAT_ARGS(LogwrtResult.Write),
+ LSN_FORMAT_ARGS(LogwrtResult.Flush));
+#endif
+
+ START_CRIT_SECTION();
+
+ /*
+ * Since fsync is usually a horribly expensive operation, we try to
+ * piggyback as much data as we can on each fsync: if we see any more data
+ * entered into the xlog buffer, we'll write and fsync that too, so that
+ * the final value of LogwrtResult.Flush is as large as possible. This
+ * gives us some chance of avoiding another fsync immediately after.
+ */
+
+ /* initialize to given target; may increase below */
+ WriteRqstPtr = record;
+
+ /*
+ * Now wait until we get the write lock, or someone else does the flush
+ * for us.
+ */
+ for (;;)
+ {
+ XLogRecPtr insertpos;
+
+ /* read LogwrtResult and update local state */
+ SpinLockAcquire(&XLogCtl->info_lck);
+ if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
+ WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
+ LogwrtResult = XLogCtl->LogwrtResult;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ /* done already? */
+ if (record <= LogwrtResult.Flush)
+ break;
+
+ /*
+ * Before actually performing the write, wait for all in-flight
+ * insertions to the pages we're about to write to finish.
+ */
+ insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
+
+ /*
+ * Try to get the write lock. If we can't get it immediately, wait
+ * until it's released, and recheck if we still need to do the flush
+ * or if the backend that held the lock did it for us already. This
+ * helps to maintain a good rate of group committing when the system
+ * is bottlenecked by the speed of fsyncing.
+ */
+ if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
+ {
+ /*
+ * The lock is now free, but we didn't acquire it yet. Before we
+ * do, loop back to check if someone else flushed the record for
+ * us already.
+ */
+ continue;
+ }
+
+ /* Got the lock; recheck whether request is satisfied */
+ LogwrtResult = XLogCtl->LogwrtResult;
+ if (record <= LogwrtResult.Flush)
+ {
+ LWLockRelease(WALWriteLock);
+ break;
+ }
+
+ /*
+ * Sleep before flush! By adding a delay here, we may give further
+ * backends the opportunity to join the backlog of group commit
+ * followers; this can significantly improve transaction throughput,
+ * at the risk of increasing transaction latency.
+ *
+ * We do not sleep if enableFsync is not turned on, nor if there are
+ * fewer than CommitSiblings other backends with active transactions.
+ */
+ if (CommitDelay > 0 && enableFsync &&
+ MinimumActiveBackends(CommitSiblings))
+ {
+ pg_usleep(CommitDelay);
+
+ /*
+ * Re-check how far we can now flush the WAL. It's generally not
+ * safe to call WaitXLogInsertionsToFinish while holding
+ * WALWriteLock, because an in-progress insertion might need to
+ * also grab WALWriteLock to make progress. But we know that all
+ * the insertions up to insertpos have already finished, because
+ * that's what the earlier WaitXLogInsertionsToFinish() returned.
+ * We're only calling it again to allow insertpos to be moved
+ * further forward, not to actually wait for anyone.
+ */
+ insertpos = WaitXLogInsertionsToFinish(insertpos);
+ }
+
+ /* try to write/flush later additions to XLOG as well */
+ WriteRqst.Write = insertpos;
+ WriteRqst.Flush = insertpos;
+
+ XLogWrite(WriteRqst, insertTLI, false);
+
+ LWLockRelease(WALWriteLock);
+ /* done */
+ break;
+ }
+
+ END_CRIT_SECTION();
+
+ /* wake up walsenders now that we've released heavily contended locks */
+ WalSndWakeupProcessRequests();
+
+ /*
+ * If we still haven't flushed to the request point then we have a
+ * problem; most likely, the requested flush point is past end of XLOG.
+ * This has been seen to occur when a disk page has a corrupted LSN.
+ *
+ * Formerly we treated this as a PANIC condition, but that hurts the
+ * system's robustness rather than helping it: we do not want to take down
+ * the whole system due to corruption on one data page. In particular, if
+ * the bad page is encountered again during recovery then we would be
+ * unable to restart the database at all! (This scenario actually
+ * happened in the field several times with 7.1 releases.) As of 8.4, bad
+ * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
+ * the only time we can reach here during recovery is while flushing the
+ * end-of-recovery checkpoint record, and we don't expect that to have a
+ * bad LSN.
+ *
+ * Note that for calls from xact.c, the ERROR will be promoted to PANIC
+ * since xact.c calls this routine inside a critical section. However,
+ * calls from bufmgr.c are not within critical sections and so we will not
+ * force a restart for a bad LSN on a data page.
+ */
+ if (LogwrtResult.Flush < record)
+ elog(ERROR,
+ "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
+ LSN_FORMAT_ARGS(record),
+ LSN_FORMAT_ARGS(LogwrtResult.Flush));
+}
+
+/*
+ * Write & flush xlog, but without specifying exactly where to.
+ *
+ * We normally write only completed blocks; but if there is nothing to do on
+ * that basis, we check for unwritten async commits in the current incomplete
+ * block, and write through the latest one of those. Thus, if async commits
+ * are not being used, we will write complete blocks only.
+ *
+ * If, based on the above, there's anything to write we do so immediately. But
+ * to avoid calling fsync, fdatasync et al. at a rate that'd impact
+ * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
+ * more than wal_writer_flush_after unflushed blocks.
+ *
+ * We can guarantee that async commits reach disk after at most three
+ * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
+ * to write "flexibly", meaning it can stop at the end of the buffer ring;
+ * this makes a difference only with very high load or long wal_writer_delay,
+ * but imposes one extra cycle for the worst case for async commits.)
+ *
+ * This routine is invoked periodically by the background walwriter process.
+ *
+ * Returns true if there was any work to do, even if we skipped flushing due
+ * to wal_writer_delay/wal_writer_flush_after.
+ */
+bool
+XLogBackgroundFlush(void)
+{
+ XLogwrtRqst WriteRqst;
+ bool flexible = true;
+ static TimestampTz lastflush;
+ TimestampTz now;
+ int flushbytes;
+ TimeLineID insertTLI;
+
+ /* XLOG doesn't need flushing during recovery */
+ if (RecoveryInProgress())
+ return false;
+
+ /*
+ * Since we're not in recovery, InsertTimeLineID is set and can't change,
+ * so we can read it without a lock.
+ */
+ insertTLI = XLogCtl->InsertTimeLineID;
+
+ /* read LogwrtResult and update local state */
+ SpinLockAcquire(&XLogCtl->info_lck);
+ LogwrtResult = XLogCtl->LogwrtResult;
+ WriteRqst = XLogCtl->LogwrtRqst;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ /* back off to last completed page boundary */
+ WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
+
+ /* if we have already flushed that far, consider async commit records */
+ if (WriteRqst.Write <= LogwrtResult.Flush)
+ {
+ SpinLockAcquire(&XLogCtl->info_lck);
+ WriteRqst.Write = XLogCtl->asyncXactLSN;
+ SpinLockRelease(&XLogCtl->info_lck);
+ flexible = false; /* ensure it all gets written */
+ }
+
+ /*
+ * If already known flushed, we're done. Just need to check if we are
+ * holding an open file handle to a logfile that's no longer in use,
+ * preventing the file from being deleted.
+ */
+ if (WriteRqst.Write <= LogwrtResult.Flush)
+ {
+ if (openLogFile >= 0)
+ {
+ if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
+ wal_segment_size))
+ {
+ XLogFileClose();
+ }
+ }
+ return false;
+ }
+
+ /*
+ * Determine how far to flush WAL, based on the wal_writer_delay and
+ * wal_writer_flush_after GUCs.
+ */
+ now = GetCurrentTimestamp();
+ flushbytes =
+ WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
+
+ if (WalWriterFlushAfter == 0 || lastflush == 0)
+ {
+ /* first call, or block based limits disabled */
+ WriteRqst.Flush = WriteRqst.Write;
+ lastflush = now;
+ }
+ else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
+ {
+ /*
+ * Flush the writes at least every WalWriterDelay ms. This is
+ * important to bound the amount of time it takes for an asynchronous
+ * commit to hit disk.
+ */
+ WriteRqst.Flush = WriteRqst.Write;
+ lastflush = now;
+ }
+ else if (flushbytes >= WalWriterFlushAfter)
+ {
+ /* exceeded wal_writer_flush_after blocks, flush */
+ WriteRqst.Flush = WriteRqst.Write;
+ lastflush = now;
+ }
+ else
+ {
+ /* no flushing, this time round */
+ WriteRqst.Flush = 0;
+ }
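+
+ /*
+ * With the stock settings wal_writer_delay = 200ms and
+ * wal_writer_flush_after = 1MB (128 pages at the default 8 kB block
+ * size), the chain above amounts to: flush if at least 128 completed
+ * pages are pending or 200ms have passed since the last flush;
+ * otherwise just write without syncing this round. (Despite its name,
+ * flushbytes is counted in pages.)
+ */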
+
+#ifdef WAL_DEBUG
+ if (XLOG_DEBUG)
+ elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
+ LSN_FORMAT_ARGS(WriteRqst.Write),
+ LSN_FORMAT_ARGS(WriteRqst.Flush),
+ LSN_FORMAT_ARGS(LogwrtResult.Write),
+ LSN_FORMAT_ARGS(LogwrtResult.Flush));
+#endif
+
+ START_CRIT_SECTION();
+
+ /* now wait for any in-progress insertions to finish and get write lock */
+ WaitXLogInsertionsToFinish(WriteRqst.Write);
+ LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
+ LogwrtResult = XLogCtl->LogwrtResult;
+ if (WriteRqst.Write > LogwrtResult.Write ||
+ WriteRqst.Flush > LogwrtResult.Flush)
+ {
+ XLogWrite(WriteRqst, insertTLI, flexible);
+ }
+ LWLockRelease(WALWriteLock);
+
+ END_CRIT_SECTION();
+
+ /* wake up walsenders now that we've released heavily contended locks */
+ WalSndWakeupProcessRequests();
+
+ /*
+ * Great, done. To take some work off the critical path, try to initialize
+ * as many of the no-longer-needed WAL buffers for future use as we can.
+ */
+ AdvanceXLInsertBuffer(InvalidXLogRecPtr, insertTLI, true);
+
+ /*
+ * If we determined that we need to write data, but somebody else
+ * wrote/flushed already, the walwriter should still be considered
+ * active, to avoid hibernating too early.
+ */
+ return true;
+}
+
+/*
+ * Test whether XLOG data has been flushed up to (at least) the given position.
+ *
+ * Returns true if a flush is still needed. (It may be that someone else
+ * is already in process of flushing that far, however.)
+ */
+bool
+XLogNeedsFlush(XLogRecPtr record)
+{
+ /*
+ * During recovery, we don't flush WAL but update minRecoveryPoint
+ * instead. So "needs flush" is taken to mean whether minRecoveryPoint
+ * would need to be updated.
+ */
+ if (RecoveryInProgress())
+ {
+ /*
+ * An invalid minRecoveryPoint means that we need to recover all the
+ * WAL, i.e., we're doing crash recovery. We never modify the control
+ * file's value in that case, so we can short-circuit future checks
+ * here too. This triggers a quick exit path for the startup process,
+ * which cannot update its local copy of minRecoveryPoint as long as
+ * it has not replayed all WAL available when doing crash recovery.
+ */
+ if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
+ updateMinRecoveryPoint = false;
+
+ /* Quick exit if already known to be updated or cannot be updated */
+ if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
+ return false;
+
+ /*
+ * Update local copy of minRecoveryPoint. But if the lock is busy,
+ * just return a conservative guess.
+ */
+ if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
+ return true;
+ LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
+ LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+ LWLockRelease(ControlFileLock);
+
+ /*
+ * Check minRecoveryPoint for any other process than the startup
+ * process doing crash recovery, which should not update the control
+ * file value if crash recovery is still running.
+ */
+ if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
+ updateMinRecoveryPoint = false;
+
+ /* check again */
+ if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
+ return false;
+ else
+ return true;
+ }
+
+ /* Quick exit if already known flushed */
+ if (record <= LogwrtResult.Flush)
+ return false;
+
+ /* read LogwrtResult and update local state */
+ SpinLockAcquire(&XLogCtl->info_lck);
+ LogwrtResult = XLogCtl->LogwrtResult;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ /* check again */
+ if (record <= LogwrtResult.Flush)
+ return false;
+
+ return true;
+}
+
+/*
+ * Try to make a given XLOG file segment exist.
+ *
+ * logsegno: identify segment.
+ *
+ * *added: on return, true if this call raised the number of extant segments.
+ *
+ * path: on return, this char[MAXPGPATH] has the path to the logsegno file.
+ *
+ * Returns -1 or FD of opened file. A -1 here is not an error; a caller
+ * wanting an open segment should attempt to open "path", which usually will
+ * succeed. (This is weird, but it's efficient for the callers.)
+ */
+static int
+XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
+ bool *added, char *path)
+{
+ char tmppath[MAXPGPATH];
+ PGAlignedXLogBlock zbuffer;
+ XLogSegNo installed_segno;
+ XLogSegNo max_segno;
+ int fd;
+ int save_errno;
+
+ Assert(logtli != 0);
+
+ XLogFilePath(path, logtli, logsegno, wal_segment_size);
+
+ /*
+ * Try to use an existing file (the checkpoint maker may have created it already)
+ */
+ *added = false;
+ fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
+ if (fd < 0)
+ {
+ if (errno != ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+ }
+ else
+ return fd;
+
+ /*
+ * Initialize an empty (all zeroes) segment. NOTE: it is possible that
+ * another process is doing the same thing. If so, we will end up
+ * pre-creating an extra log segment. That seems OK, and better than
+ * holding the lock throughout this lengthy process.
+ */
+ elog(DEBUG2, "creating and filling new WAL file");
+
+ snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
+
+ unlink(tmppath);
+
+ /* do not use get_sync_bit() here --- want to fsync only at end of fill */
+ fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+ if (fd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create file \"%s\": %m", tmppath)));
+
+ memset(zbuffer.data, 0, XLOG_BLCKSZ);
+
+ pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
+ save_errno = 0;
+ if (wal_init_zero)
+ {
+ struct iovec iov[PG_IOV_MAX];
+ int blocks;
+
+ /*
+ * Zero-fill the file. With this setting, we do this the hard way to
+ * ensure that all the file space has really been allocated. On
+ * platforms that allow "holes" in files, just seeking to the end
+ * doesn't allocate intermediate space. This way, we know that we
+ * have all the space and (after the fsync below) that all the
+ * indirect blocks are down on disk. Therefore, fdatasync(2) or
+ * O_DSYNC will be sufficient to sync future writes to the log file.
+ */
+
+ /* Prepare to write out a lot of copies of our zero buffer at once. */
+ for (int i = 0; i < lengthof(iov); ++i)
+ {
+ iov[i].iov_base = zbuffer.data;
+ iov[i].iov_len = XLOG_BLCKSZ;
+ }
+
+ /* Loop, writing as many blocks as we can for each system call. */
+ blocks = wal_segment_size / XLOG_BLCKSZ;
+ for (int i = 0; i < blocks;)
+ {
+ int iovcnt = Min(blocks - i, lengthof(iov));
+ off_t offset = i * XLOG_BLCKSZ;
+
+ if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0)
+ {
+ save_errno = errno;
+ break;
+ }
+
+ i += iovcnt;
+ }
+ }
+ else
+ {
+ /*
+ * Otherwise, seeking to the end and writing a solitary byte is
+ * enough.
+ */
+ errno = 0;
+ if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1)
+ {
+ /* if write didn't set errno, assume no disk space */
+ save_errno = errno ? errno : ENOSPC;
+ }
+ }
+ pgstat_report_wait_end();
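+
+ /*
+ * To put rough numbers on the two strategies (sizes illustrative): a
+ * 16MB segment of 8 kB blocks has 2048 blocks to zero. With iovec
+ * batching at, say, 32 buffers per call, the wal_init_zero path issues
+ * about 64 pg_pwritev_with_retry() calls, while the alternative issues
+ * a single one-byte pg_pwrite() at offset wal_segment_size - 1 and
+ * leaves a hole, trading guaranteed allocation for speed.
+ */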
+
+ if (save_errno)
+ {
+ /*
+ * If we fail to make the file, delete it to release disk space
+ */
+ unlink(tmppath);
+
+ close(fd);
+
+ errno = save_errno;
+
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\": %m", tmppath)));
+ }
+
+ pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
+ if (pg_fsync(fd) != 0)
+ {
+ int save_errno = errno;
+
+ close(fd);
+ errno = save_errno;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", tmppath)));
+ }
+ pgstat_report_wait_end();
+
+ if (close(fd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", tmppath)));
+
+ /*
+ * Now move the segment into place with its final name. Cope with
+ * possibility that someone else has created the file while we were
+ * filling ours: if so, use ours to pre-create a future log segment.
+ */
+ installed_segno = logsegno;
+
+ /*
+ * XXX: What should we use as max_segno? We used to use XLOGfileslop when
+ * that was a constant, but that was always a bit dubious: normally, at a
+ * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
+ * here, it was the offset from the insert location. We can't do the
+ * normal XLOGfileslop calculation here because we don't have access to
+ * the prior checkpoint's redo location. So somewhat arbitrarily, just use
+ * CheckPointSegments.
+ */
+ max_segno = logsegno + CheckPointSegments;
+ if (InstallXLogFileSegment(&installed_segno, tmppath, true, max_segno,
+ logtli))
+ {
+ *added = true;
+ elog(DEBUG2, "done creating and filling new WAL file");
+ }
+ else
+ {
+ /*
+ * No need for any more future segments, or InstallXLogFileSegment()
+ * failed to rename the file into place. If the rename failed, a
+ * caller opening the file may fail.
+ */
+ unlink(tmppath);
+ elog(DEBUG2, "abandoned new WAL file");
+ }
+
+ return -1;
+}
+
+/*
+ * Create a new XLOG file segment, or open a pre-existing one.
+ *
+ * logsegno: identify segment to be created/opened.
+ *
+ * Returns FD of opened file.
+ *
+ * Note: errors here are ERROR not PANIC because we might or might not be
+ * inside a critical section (eg, during checkpoint there is no reason to
+ * take down the system on failure). They will promote to PANIC if we are
+ * in a critical section.
+ */
+int
+XLogFileInit(XLogSegNo logsegno, TimeLineID logtli)
+{
+ bool ignore_added;
+ char path[MAXPGPATH];
+ int fd;
+
+ Assert(logtli != 0);
+
+ fd = XLogFileInitInternal(logsegno, logtli, &ignore_added, path);
+ if (fd >= 0)
+ return fd;
+
+ /* Now open original target segment (might not be file I just made) */
+ fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
+ if (fd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+ return fd;
+}
+
+/*
+ * Create a new XLOG file segment by copying a pre-existing one.
+ *
+ * destsegno: identify segment to be created.
+ *
+ * srcTLI, srcsegno: identify segment to be copied (could be from
+ * a different timeline)
+ *
+ * upto: how much of the source file to copy (the rest is filled with
+ * zeros)
+ *
+ * Currently this is only used during recovery, and so there are no locking
+ * considerations. But we should be just as careful as XLogFileInit to
+ * avoid installing a bogus file.
+ */
+static void
+XLogFileCopy(TimeLineID destTLI, XLogSegNo destsegno,
+ TimeLineID srcTLI, XLogSegNo srcsegno,
+ int upto)
+{
+ char path[MAXPGPATH];
+ char tmppath[MAXPGPATH];
+ PGAlignedXLogBlock buffer;
+ int srcfd;
+ int fd;
+ int nbytes;
+
+ /*
+ * Open the source file
+ */
+ XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
+ srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
+ if (srcfd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+
+ /*
+ * Copy into a temp file name.
+ */
+ snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
+
+ unlink(tmppath);
+
+ /* do not use get_sync_bit() here --- want to fsync only at end of fill */
+ fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+ if (fd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create file \"%s\": %m", tmppath)));
+
+ /*
+ * Do the data copying.
+ */
+ for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
+ {
+ int nread;
+
+ nread = upto - nbytes;
+
+ /*
+ * The part that is not read from the source file is filled with
+ * zeros.
+ */
+ if (nread < sizeof(buffer))
+ memset(buffer.data, 0, sizeof(buffer));
+
+ if (nread > 0)
+ {
+ int r;
+
+ if (nread > sizeof(buffer))
+ nread = sizeof(buffer);
+ pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
+ r = read(srcfd, buffer.data, nread);
+ if (r != nread)
+ {
+ if (r < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m",
+ path)));
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not read file \"%s\": read %d of %zu",
+ path, r, (Size) nread)));
+ }
+ pgstat_report_wait_end();
+ }
+ errno = 0;
+ pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
+ if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
+ {
+ int save_errno = errno;
+
+ /*
+ * If we fail to make the file, delete it to release disk space
+ */
+ unlink(tmppath);
+ /* if write didn't set errno, assume problem is no disk space */
+ errno = save_errno ? save_errno : ENOSPC;
+
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\": %m", tmppath)));
+ }
+ pgstat_report_wait_end();
+ }
+
+ pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
+ if (pg_fsync(fd) != 0)
+ ereport(data_sync_elevel(ERROR),
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", tmppath)));
+ pgstat_report_wait_end();
+
+ if (CloseTransientFile(fd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", tmppath)));
+
+ if (CloseTransientFile(srcfd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", path)));
+
+ /*
+ * Now move the segment into place with its final name.
+ */
+ if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, destTLI))
+ elog(ERROR, "InstallXLogFileSegment should not have failed");
+}
+
+/*
+ * Install a new XLOG segment file as a current or future log segment.
+ *
+ * This is used both to install a newly-created segment (which has a temp
+ * filename while it's being created) and to recycle an old segment.
+ *
+ * *segno: identify segment to install as (or first possible target).
+ * When find_free is true, this is modified on return to indicate the
+ * actual installation location or last segment searched.
+ *
+ * tmppath: initial name of file to install. It will be renamed into place.
+ *
+ * find_free: if true, install the new segment at the first empty segno
+ * number at or after the passed numbers. If false, install the new segment
+ * exactly where specified, deleting any existing segment file there.
+ *
+ * max_segno: maximum segment number to install the new file as. Fail if no
+ * free slot is found between *segno and max_segno. (Ignored when find_free
+ * is false.)
+ *
+ * tli: The timeline on which the new segment should be installed.
+ *
+ * Returns true if the file was installed successfully. false indicates that
+ * the max_segno limit was exceeded, the startup process has disabled this
+ * function for now, or an error occurred while renaming the file into place.
+ */
+static bool
+InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
+ bool find_free, XLogSegNo max_segno, TimeLineID tli)
+{
+ char path[MAXPGPATH];
+ struct stat stat_buf;
+
+ Assert(tli != 0);
+
+ XLogFilePath(path, tli, *segno, wal_segment_size);
+
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ if (!XLogCtl->InstallXLogFileSegmentActive)
+ {
+ LWLockRelease(ControlFileLock);
+ return false;
+ }
+
+ if (!find_free)
+ {
+ /* Force installation: get rid of any pre-existing segment file */
+ durable_unlink(path, DEBUG1);
+ }
+ else
+ {
+ /* Find a free slot to put it in */
+ while (stat(path, &stat_buf) == 0)
+ {
+ if ((*segno) >= max_segno)
+ {
+ /* Failed to find a free slot within specified range */
+ LWLockRelease(ControlFileLock);
+ return false;
+ }
+ (*segno)++;
+ XLogFilePath(path, tli, *segno, wal_segment_size);
+ }
+ }
+
+ /*
+ * Perform the rename using link if available, paranoidly trying to avoid
+ * overwriting an existing file (there shouldn't be one).
+ */
+ if (durable_rename_excl(tmppath, path, LOG) != 0)
+ {
+ LWLockRelease(ControlFileLock);
+ /* durable_rename_excl already emitted log message */
+ return false;
+ }
+
+ LWLockRelease(ControlFileLock);
+
+ return true;
+}
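+
+/*
+ * Editorial sketch (not upstream code): the two calling conventions of
+ * InstallXLogFileSegment(), with hypothetical values.  With find_free =
+ * false the target slot is taken over unconditionally; with find_free =
+ * true, *segno is advanced past occupied slots up to max_segno, which is
+ * how RemoveXlogFile() recycles old segments into future ones.
+ *
+ *		XLogSegNo	segno = 100;
+ *
+ *		(a) exact placement, removing any existing file at that slot:
+ *			InstallXLogFileSegment(&segno, tmppath, false, 0, tli);
+ *
+ *		(b) first free slot between 100 and 110, if any:
+ *			if (InstallXLogFileSegment(&segno, tmppath, true, 110, tli))
+ *				elog(DEBUG2, "installed as segment %u", (uint32) segno);
+ */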
+
+/*
+ * Open a pre-existing logfile segment for writing.
+ */
+int
+XLogFileOpen(XLogSegNo segno, TimeLineID tli)
+{
+ char path[MAXPGPATH];
+ int fd;
+
+ XLogFilePath(path, tli, segno, wal_segment_size);
+
+ fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
+ if (fd < 0)
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+
+ return fd;
+}
+
+/*
+ * Close the current logfile segment for writing.
+ */
+static void
+XLogFileClose(void)
+{
+ Assert(openLogFile >= 0);
+
+ /*
+ * WAL segment files will not be re-read in normal operation, so we advise
+ * the OS to release any cached pages. But do not do so if WAL archiving
+ * or streaming is active, because archiver and walsender process could
+ * use the cache to read the WAL segment.
+ */
+#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+ if (!XLogIsNeeded())
+ (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
+#endif
+
+ if (close(openLogFile) != 0)
+ {
+ char xlogfname[MAXFNAMELEN];
+ int save_errno = errno;
+
+ XLogFileName(xlogfname, openLogTLI, openLogSegNo, wal_segment_size);
+ errno = save_errno;
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", xlogfname)));
+ }
+
+ openLogFile = -1;
+ ReleaseExternalFD();
+}
+
+/*
+ * Preallocate log files beyond the specified log endpoint.
+ *
+ * XXX this is currently extremely conservative, since it forces only one
+ * future log segment to exist, and even that only if we are 75% done with
+ * the current one. This is only appropriate for very low-WAL-volume systems.
+ * High-volume systems will be OK once they've built up a sufficient set of
+ * recycled log segments, but the startup transient is likely to include
+ * a lot of segment creations by foreground processes, which is not so good.
+ *
+ * XLogFileInitInternal() can ereport(ERROR). All known causes indicate big
+ * trouble; for example, a full filesystem is one cause. The checkpoint WAL
+ * and/or ControlFile updates already completed. If a RequestCheckpoint()
+ * initiated the present checkpoint and an ERROR ends this function, the
+ * command that called RequestCheckpoint() fails. That's not ideal, but it's
+ * not worth contorting more functions to use caller-specified elevel values.
+ * (With or without RequestCheckpoint(), an ERROR forestalls some inessential
+ * reporting and resource reclamation.)
+ */
+static void
+PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli)
+{
+ XLogSegNo _logSegNo;
+ int lf;
+ bool added;
+ char path[MAXPGPATH];
+ uint64 offset;
+
+ if (!XLogCtl->InstallXLogFileSegmentActive)
+ return; /* unlocked check says no */
+
+ XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
+ offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
+ if (offset >= (uint32) (0.75 * wal_segment_size))
+ {
+ _logSegNo++;
+ lf = XLogFileInitInternal(_logSegNo, tli, &added, path);
+ if (lf >= 0)
+ close(lf);
+ if (added)
+ CheckpointStats.ckpt_segs_added++;
+ }
+}
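+
+/*
+ * Editorial worked example (not upstream code): with the default 16 MB
+ * wal_segment_size, the 0.75 threshold above is 12582912 bytes (12 MB).
+ * If endptr points 13 MB into the current segment:
+ *
+ *		offset = 13 * 1024 * 1024 = 13631488
+ *		13631488 >= 12582912, so the next segment is created in advance
+ *
+ * At 11 MB into the segment nothing is preallocated, which is why a
+ * freshly started high-volume system may see foreground processes create
+ * segments on demand until enough recycled segments accumulate.
+ */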
+
+/*
+ * Throws an error if the given log segment has already been removed or
+ * recycled. The caller should only pass a segment that it knows to have
+ * existed while the server has been running, as this function always
+ * succeeds if no WAL segments have been removed since startup.
+ * 'tli' is only used in the error message.
+ *
+ * Note: this function guarantees to keep errno unchanged on return.
+ * This supports callers that use this to possibly deliver a better
+ * error message about a missing file, while still being able to throw
+ * a normal file-access error afterwards, if this does return.
+ */
+void
+CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
+{
+ int save_errno = errno;
+ XLogSegNo lastRemovedSegNo;
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ if (segno <= lastRemovedSegNo)
+ {
+ char filename[MAXFNAMELEN];
+
+ XLogFileName(filename, tli, segno, wal_segment_size);
+ errno = save_errno;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("requested WAL segment %s has already been removed",
+ filename)));
+ }
+ errno = save_errno;
+}
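+
+/*
+ * Editorial sketch (not upstream code): the errno-preserving contract of
+ * CheckXLogRemoved() lets a caller attempt a friendlier message for a
+ * recycled segment while keeping the ordinary file-access error as the
+ * fallback.  If the segment is gone the call below throws; otherwise it
+ * returns with errno still set by the failed open().  path, segno and tli
+ * are hypothetical.
+ *
+ *		fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
+ *		if (fd < 0)
+ *		{
+ *			CheckXLogRemoved(segno, tli);
+ *			ereport(ERROR,
+ *					(errcode_for_file_access(),
+ *					 errmsg("could not open file \"%s\": %m", path)));
+ *		}
+ */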
+
+/*
+ * Return the last WAL segment removed, or 0 if no segment has been removed
+ * since startup.
+ *
+ * NB: the result can be out of date arbitrarily fast, the caller has to deal
+ * with that.
+ */
+XLogSegNo
+XLogGetLastRemovedSegno(void)
+{
+ XLogSegNo lastRemovedSegNo;
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ return lastRemovedSegNo;
+}
+
+
+/*
+ * Update the last removed segno pointer in shared memory, to reflect that the
+ * given XLOG file has been removed.
+ */
+static void
+UpdateLastRemovedPtr(char *filename)
+{
+ uint32 tli;
+ XLogSegNo segno;
+
+ XLogFromFileName(filename, &tli, &segno, wal_segment_size);
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ if (segno > XLogCtl->lastRemovedSegNo)
+ XLogCtl->lastRemovedSegNo = segno;
+ SpinLockRelease(&XLogCtl->info_lck);
+}
+
+/*
+ * Remove all temporary log files in pg_wal
+ *
+ * This is called at the beginning of recovery after a previous crash,
+ * at a point where no other processes write fresh WAL data.
+ */
+static void
+RemoveTempXlogFiles(void)
+{
+ DIR *xldir;
+ struct dirent *xlde;
+
+ elog(DEBUG2, "removing all temporary WAL segments");
+
+ xldir = AllocateDir(XLOGDIR);
+ while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
+ {
+ char path[MAXPGPATH];
+
+ if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
+ continue;
+
+ snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
+ unlink(path);
+ elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
+ }
+ FreeDir(xldir);
+}
+
+/*
+ * Recycle or remove all log files older or equal to passed segno.
+ *
+ * endptr is current (or recent) end of xlog, and lastredoptr is the
+ * redo pointer of the last checkpoint. These are used to determine
+ * whether we want to recycle rather than delete no-longer-wanted log files.
+ *
+ * insertTLI is the current timeline for XLOG insertion. Any recycled
+ * segments should be reused for this timeline.
+ */
+static void
+RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr,
+ TimeLineID insertTLI)
+{
+ DIR *xldir;
+ struct dirent *xlde;
+ char lastoff[MAXFNAMELEN];
+ XLogSegNo endlogSegNo;
+ XLogSegNo recycleSegNo;
+
+ /* Initialize info about where to try to recycle to */
+ XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
+ recycleSegNo = XLOGfileslop(lastredoptr);
+
+ /*
+ * Construct a filename of the last segment to be kept. The timeline ID
+ * doesn't matter, we ignore that in the comparison. (During recovery,
+ * InsertTimeLineID isn't set, so we can't use that.)
+ */
+ XLogFileName(lastoff, 0, segno, wal_segment_size);
+
+ elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
+ lastoff);
+
+ xldir = AllocateDir(XLOGDIR);
+
+ while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
+ {
+ /* Ignore files that are not XLOG segments */
+ if (!IsXLogFileName(xlde->d_name) &&
+ !IsPartialXLogFileName(xlde->d_name))
+ continue;
+
+ /*
+ * We ignore the timeline part of the XLOG segment identifiers in
+ * deciding whether a segment is still needed. This ensures that we
+ * won't prematurely remove a segment from a parent timeline. We could
+ * probably be a little more proactive about removing segments of
+ * non-parent timelines, but that would be a whole lot more
+ * complicated.
+ *
+ * We use the alphanumeric sorting property of the filenames to decide
+ * which ones are earlier than the lastoff segment.
+ */
+ if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
+ {
+ if (XLogArchiveCheckDone(xlde->d_name))
+ {
+ /* Update the last removed location in shared memory first */
+ UpdateLastRemovedPtr(xlde->d_name);
+
+ RemoveXlogFile(xlde->d_name, recycleSegNo, &endlogSegNo,
+ insertTLI);
+ }
+ }
+ }
+
+ FreeDir(xldir);
+}
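+
+/*
+ * Editorial note (not upstream code): the "+ 8" in the comparison above
+ * skips the timeline part of a WAL file name.  Such names are 24 hex
+ * digits: 8 for the timeline ID followed by 16 encoding the segment
+ * number.  For example, timeline 2, segment 0x4D (16 MB segments):
+ *
+ *		00000002 00000000 0000004D
+ *		  TLI      segment number
+ *
+ * Comparing from offset 8 onward therefore orders files purely by
+ * segment number, regardless of which timeline they belong to.
+ */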
+
+/*
+ * Remove WAL files that are not part of the given timeline's history.
+ *
+ * This is called during recovery, whenever we switch to follow a new
+ * timeline, and at the end of recovery when we create a new timeline. We
+ * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
+ * might be leftover pre-allocated or recycled WAL segments on the old timeline
+ * that we haven't used yet, and contain garbage. If we just leave them in
+ * pg_wal, they will eventually be archived, and we can't let that happen.
+ * Files that belong to our timeline history are valid, because we have
+ * successfully replayed them, but from others we can't be sure.
+ *
+ * 'switchpoint' is the current point in WAL where we switch to new timeline,
+ * and 'newTLI' is the new timeline we switch to.
+ */
+void
+RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
+{
+ DIR *xldir;
+ struct dirent *xlde;
+ char switchseg[MAXFNAMELEN];
+ XLogSegNo endLogSegNo;
+ XLogSegNo switchLogSegNo;
+ XLogSegNo recycleSegNo;
+
+ /*
+ * Initialize info about where to begin the work. This will recycle,
+ * somewhat arbitrarily, 10 future segments.
+ */
+ XLByteToPrevSeg(switchpoint, switchLogSegNo, wal_segment_size);
+ XLByteToSeg(switchpoint, endLogSegNo, wal_segment_size);
+ recycleSegNo = endLogSegNo + 10;
+
+ /*
+ * Construct a filename of the last segment to be kept.
+ */
+ XLogFileName(switchseg, newTLI, switchLogSegNo, wal_segment_size);
+
+ elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
+ switchseg);
+
+ xldir = AllocateDir(XLOGDIR);
+
+ while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
+ {
+ /* Ignore files that are not XLOG segments */
+ if (!IsXLogFileName(xlde->d_name))
+ continue;
+
+ /*
+ * Remove files that are on a timeline older than the new one we're
+ * switching to, but with a segment number >= the first segment on the
+ * new timeline.
+ */
+ if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
+ strcmp(xlde->d_name + 8, switchseg + 8) > 0)
+ {
+ /*
+ * If the file has already been marked as .ready, however, don't
+ * remove it yet. It should be OK to remove it - files that are
+ * not part of our timeline history are not required for recovery
+ * - but seems safer to let them be archived and removed later.
+ */
+ if (!XLogArchiveIsReady(xlde->d_name))
+ RemoveXlogFile(xlde->d_name, recycleSegNo, &endLogSegNo,
+ newTLI);
+ }
+ }
+
+ FreeDir(xldir);
+}
+
+/*
+ * Recycle or remove a log file that's no longer needed.
+ *
+ * segname is the name of the segment to recycle or remove. recycleSegNo
+ * is the segment number to recycle up to. endlogSegNo is the segment
+ * number of the current (or recent) end of WAL.
+ *
+ * endlogSegNo gets incremented if the segment is recycled, so that it is
+ * not checked again by future callers of this function.
+ *
+ * insertTLI is the current timeline for XLOG insertion. Any recycled segments
+ * should be used for this timeline.
+ */
+static void
+RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo,
+ XLogSegNo *endlogSegNo, TimeLineID insertTLI)
+{
+ char path[MAXPGPATH];
+#ifdef WIN32
+ char newpath[MAXPGPATH];
+#endif
+ struct stat statbuf;
+
+ snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
+
+ /*
+ * Before deleting the file, see if it can be recycled as a future log
+ * segment. Only recycle normal files, because we don't want to recycle
+ * symbolic links pointing to a separate archive directory.
+ */
+ if (wal_recycle &&
+ *endlogSegNo <= recycleSegNo &&
+ XLogCtl->InstallXLogFileSegmentActive && /* callee rechecks this */
+ lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
+ InstallXLogFileSegment(endlogSegNo, path,
+ true, recycleSegNo, insertTLI))
+ {
+ ereport(DEBUG2,
+ (errmsg_internal("recycled write-ahead log file \"%s\"",
+ segname)));
+ CheckpointStats.ckpt_segs_recycled++;
+ /* Needn't recheck that slot on future iterations */
+ (*endlogSegNo)++;
+ }
+ else
+ {
+ /* No need for any more future segments, or recycling failed ... */
+ int rc;
+
+ ereport(DEBUG2,
+ (errmsg_internal("removing write-ahead log file \"%s\"",
+ segname)));
+
+#ifdef WIN32
+
+ /*
+ * On Windows, if another process (e.g. another backend) holds the file
+ * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
+ * will still show up in directory listings until the last handle is
+ * closed. To avoid confusing the lingering deleted file for a live
+ * WAL file that needs to be archived, rename it before deleting it.
+ *
+ * If another process holds the file open without FILE_SHARE_DELETE
+ * flag, rename will fail. We'll try again at the next checkpoint.
+ */
+ snprintf(newpath, MAXPGPATH, "%s.deleted", path);
+ if (rename(path, newpath) != 0)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not rename file \"%s\": %m",
+ path)));
+ return;
+ }
+ rc = durable_unlink(newpath, LOG);
+#else
+ rc = durable_unlink(path, LOG);
+#endif
+ if (rc != 0)
+ {
+ /* Message already logged by durable_unlink() */
+ return;
+ }
+ CheckpointStats.ckpt_segs_removed++;
+ }
+
+ XLogArchiveCleanup(segname);
+}
+
+/*
+ * Verify whether pg_wal and pg_wal/archive_status exist.
+ * If the latter does not exist, recreate it.
+ *
+ * It is not the goal of this function to verify the contents of these
+ * directories, but to help in cases where someone has performed a cluster
+ * copy for PITR purposes but omitted pg_wal from the copy.
+ *
+ * We could also recreate pg_wal if it doesn't exist, but a deliberate
+ * policy decision was made not to. It is fairly common for pg_wal to be
+ * a symlink, and if that was the DBA's intent then automatically making a
+ * plain directory would result in degraded performance with no notice.
+ */
+static void
+ValidateXLOGDirectoryStructure(void)
+{
+ char path[MAXPGPATH];
+ struct stat stat_buf;
+
+ /* Check for pg_wal; if it doesn't exist, error out */
+ if (stat(XLOGDIR, &stat_buf) != 0 ||
+ !S_ISDIR(stat_buf.st_mode))
+ ereport(FATAL,
+ (errmsg("required WAL directory \"%s\" does not exist",
+ XLOGDIR)));
+
+ /* Check for archive_status */
+ snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
+ if (stat(path, &stat_buf) == 0)
+ {
+ /* Check for weird cases where it exists but isn't a directory */
+ if (!S_ISDIR(stat_buf.st_mode))
+ ereport(FATAL,
+ (errmsg("required WAL directory \"%s\" does not exist",
+ path)));
+ }
+ else
+ {
+ ereport(LOG,
+ (errmsg("creating missing WAL directory \"%s\"", path)));
+ if (MakePGDirectory(path) < 0)
+ ereport(FATAL,
+ (errmsg("could not create missing directory \"%s\": %m",
+ path)));
+ }
+}
+
+/*
+ * Remove previous backup history files. This also retries creation of
+ * .ready files for any backup history files for which XLogArchiveNotify
+ * failed earlier.
+ */
+static void
+CleanupBackupHistory(void)
+{
+ DIR *xldir;
+ struct dirent *xlde;
+ char path[MAXPGPATH + sizeof(XLOGDIR)];
+
+ xldir = AllocateDir(XLOGDIR);
+
+ while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
+ {
+ if (IsBackupHistoryFileName(xlde->d_name))
+ {
+ if (XLogArchiveCheckDone(xlde->d_name))
+ {
+ elog(DEBUG2, "removing WAL backup history file \"%s\"",
+ xlde->d_name);
+ snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
+ unlink(path);
+ XLogArchiveCleanup(xlde->d_name);
+ }
+ }
+ }
+
+ FreeDir(xldir);
+}
+
+/*
+ * I/O routines for pg_control
+ *
+ * *ControlFile is a buffer in shared memory that holds an image of the
+ * contents of pg_control. WriteControlFile() initializes pg_control
+ * given a preloaded buffer, ReadControlFile() loads the buffer from
+ * the pg_control file (during postmaster or standalone-backend startup),
+ * and UpdateControlFile() rewrites pg_control after we modify xlog state.
+ * InitControlFile() fills the buffer with initial values.
+ *
+ * For simplicity, WriteControlFile() initializes the fields of pg_control
+ * that are related to checking backend/database compatibility, and
+ * ReadControlFile() verifies they are correct. We could split out the
+ * I/O and compatibility-check functions, but there seems no need currently.
+ */
+
+static void
+InitControlFile(uint64 sysidentifier)
+{
+ char mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
+
+ /*
+ * Generate a random nonce. This is used for authentication requests that
+ * will fail because the user does not exist. The nonce is used to create
+ * a genuine-looking password challenge for the non-existent user, in lieu
+ * of an actual stored password.
+ */
+ if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
+ ereport(PANIC,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("could not generate secret authorization token")));
+
+ memset(ControlFile, 0, sizeof(ControlFileData));
+ /* Initialize pg_control status fields */
+ ControlFile->system_identifier = sysidentifier;
+ memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
+ ControlFile->state = DB_SHUTDOWNED;
+ ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
+
+ /* Set important parameter values for use when replaying WAL */
+ ControlFile->MaxConnections = MaxConnections;
+ ControlFile->max_worker_processes = max_worker_processes;
+ ControlFile->max_wal_senders = max_wal_senders;
+ ControlFile->max_prepared_xacts = max_prepared_xacts;
+ ControlFile->max_locks_per_xact = max_locks_per_xact;
+ ControlFile->wal_level = wal_level;
+ ControlFile->wal_log_hints = wal_log_hints;
+ ControlFile->track_commit_timestamp = track_commit_timestamp;
+ ControlFile->data_checksum_version = bootstrap_data_checksum_version;
+}
+
+static void
+WriteControlFile(void)
+{
+ int fd;
+ char buffer[PG_CONTROL_FILE_SIZE]; /* need not be aligned */
+
+ /*
+ * Ensure that the size of the pg_control data structure is sane. See the
+ * comments for these symbols in pg_control.h.
+ */
+ StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE,
+ "pg_control is too large for atomic disk writes");
+ StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE,
+ "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE");
+
+ /*
+ * Initialize version and compatibility-check fields
+ */
+ ControlFile->pg_control_version = PG_CONTROL_VERSION;
+ ControlFile->catalog_version_no = CATALOG_VERSION_NO;
+
+ ControlFile->maxAlign = MAXIMUM_ALIGNOF;
+ ControlFile->floatFormat = FLOATFORMAT_VALUE;
+
+ ControlFile->blcksz = BLCKSZ;
+ ControlFile->relseg_size = RELSEG_SIZE;
+ ControlFile->xlog_blcksz = XLOG_BLCKSZ;
+ ControlFile->xlog_seg_size = wal_segment_size;
+
+ ControlFile->nameDataLen = NAMEDATALEN;
+ ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
+
+ ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
+ ControlFile->loblksize = LOBLKSIZE;
+
+ ControlFile->float8ByVal = FLOAT8PASSBYVAL;
+
+ /* Contents are protected with a CRC */
+ INIT_CRC32C(ControlFile->crc);
+ COMP_CRC32C(ControlFile->crc,
+ (char *) ControlFile,
+ offsetof(ControlFileData, crc));
+ FIN_CRC32C(ControlFile->crc);
+
+ /*
+ * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
+ * the excess over sizeof(ControlFileData). This reduces the odds of
+ * premature-EOF errors when reading pg_control. We'll still fail when we
+ * check the contents of the file, but hopefully with a more specific
+ * error than "couldn't read pg_control".
+ */
+ memset(buffer, 0, PG_CONTROL_FILE_SIZE);
+ memcpy(buffer, ControlFile, sizeof(ControlFileData));
+
+ fd = BasicOpenFile(XLOG_CONTROL_FILE,
+ O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+ if (fd < 0)
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not create file \"%s\": %m",
+ XLOG_CONTROL_FILE)));
+
+ errno = 0;
+ pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
+ if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
+ {
+ /* if write didn't set errno, assume problem is no disk space */
+ if (errno == 0)
+ errno = ENOSPC;
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\": %m",
+ XLOG_CONTROL_FILE)));
+ }
+ pgstat_report_wait_end();
+
+ pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
+ if (pg_fsync(fd) != 0)
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m",
+ XLOG_CONTROL_FILE)));
+ pgstat_report_wait_end();
+
+ if (close(fd) != 0)
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m",
+ XLOG_CONTROL_FILE)));
+}
+
+static void
+ReadControlFile(void)
+{
+ pg_crc32c crc;
+ int fd;
+ static char wal_segsz_str[20];
+ int r;
+
+ /*
+ * Read data...
+ */
+ fd = BasicOpenFile(XLOG_CONTROL_FILE,
+ O_RDWR | PG_BINARY);
+ if (fd < 0)
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m",
+ XLOG_CONTROL_FILE)));
+
+ pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
+ r = read(fd, ControlFile, sizeof(ControlFileData));
+ if (r != sizeof(ControlFileData))
+ {
+ if (r < 0)
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m",
+ XLOG_CONTROL_FILE)));
+ else
+ ereport(PANIC,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not read file \"%s\": read %d of %zu",
+ XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
+ }
+ pgstat_report_wait_end();
+
+ close(fd);
+
+ /*
+ * Check for expected pg_control format version. If this is wrong, the
+ * CRC check will likely fail because we'll be checking the wrong number
+ * of bytes. Complaining about wrong version will probably be more
+ * enlightening than complaining about wrong CRC.
+ */
+
+ if (ControlFile->pg_control_version != PG_CONTROL_VERSION &&
+ ControlFile->pg_control_version % 65536 == 0 &&
+ ControlFile->pg_control_version / 65536 != 0)
+ ereport(FATAL,
+ (errmsg("database files are incompatible with server"),
+ errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
+ " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
+ ControlFile->pg_control_version, ControlFile->pg_control_version,
+ PG_CONTROL_VERSION, PG_CONTROL_VERSION),
+ errhint("This could be a problem of mismatched byte ordering. It looks like you need to initdb.")));
+
+ if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
+ ereport(FATAL,
+ (errmsg("database files are incompatible with server"),
+ errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
+ " but the server was compiled with PG_CONTROL_VERSION %d.",
+ ControlFile->pg_control_version, PG_CONTROL_VERSION),
+ errhint("It looks like you need to initdb.")));
+
+ /* Now check the CRC. */
+ INIT_CRC32C(crc);
+ COMP_CRC32C(crc,
+ (char *) ControlFile,
+ offsetof(ControlFileData, crc));
+ FIN_CRC32C(crc);
+
+ if (!EQ_CRC32C(crc, ControlFile->crc))
+ ereport(FATAL,
+ (errmsg("incorrect checksum in control file")));
+
+ /*
+ * Do compatibility checking immediately. If the database isn't
+ * compatible with the backend executable, we want to abort before we can
+ * possibly do any damage.
+ */
+ if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
+ ereport(FATAL,
+ (errmsg("database files are incompatible with server"),
+ errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
+ " but the server was compiled with CATALOG_VERSION_NO %d.",
+ ControlFile->catalog_version_no, CATALOG_VERSION_NO),
+ errhint("It looks like you need to initdb.")));
+ if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
+ ereport(FATAL,
+ (errmsg("database files are incompatible with server"),
+ errdetail("The database cluster was initialized with MAXALIGN %d,"
+ " but the server was compiled with MAXALIGN %d.",
+ ControlFile->maxAlign, MAXIMUM_ALIGNOF),
+ errhint("It looks like you need to initdb.")));
+ if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
+ ereport(FATAL,
+ (errmsg("database files are incompatible with server"),
+ errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
+ errhint("It looks like you need to initdb.")));
+ if (ControlFile->blcksz != BLCKSZ)
+ ereport(FATAL,
+ (errmsg("database files are incompatible with server"),
+ errdetail("The database cluster was initialized with BLCKSZ %d,"
+ " but the server was compiled with BLCKSZ %d.",
+ ControlFile->blcksz, BLCKSZ),
+ errhint("It looks like you need to recompile or initdb.")));
+ if (ControlFile->relseg_size != RELSEG_SIZE)
+ ereport(FATAL,
+ (errmsg("database files are incompatible with server"),
+ errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
+ " but the server was compiled with RELSEG_SIZE %d.",
+ ControlFile->relseg_size, RELSEG_SIZE),
+ errhint("It looks like you need to recompile or initdb.")));
+ if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
+ ereport(FATAL,
+ (errmsg("database files are incompatible with server"),
+ errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
+ " but the server was compiled with XLOG_BLCKSZ %d.",
+ ControlFile->xlog_blcksz, XLOG_BLCKSZ),
+ errhint("It looks like you need to recompile or initdb.")));
+ if (ControlFile->nameDataLen != NAMEDATALEN)
+ ereport(FATAL,
+ (errmsg("database files are incompatible with server"),
+ errdetail("The database cluster was initialized with NAMEDATALEN %d,"
+ " but the server was compiled with NAMEDATALEN %d.",
+ ControlFile->nameDataLen, NAMEDATALEN),
+ errhint("It looks like you need to recompile or initdb.")));
+ if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
+ ereport(FATAL,
+ (errmsg("database files are incompatible with server"),
+ errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
+ " but the server was compiled with INDEX_MAX_KEYS %d.",
+ ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
+ errhint("It looks like you need to recompile or initdb.")));
+ if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
+ ereport(FATAL,
+ (errmsg("database files are incompatible with server"),
+ errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
+ " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
+ ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
+ errhint("It looks like you need to recompile or initdb.")));
+ if (ControlFile->loblksize != LOBLKSIZE)
+ ereport(FATAL,
+ (errmsg("database files are incompatible with server"),
+ errdetail("The database cluster was initialized with LOBLKSIZE %d,"
+ " but the server was compiled with LOBLKSIZE %d.",
+ ControlFile->loblksize, (int) LOBLKSIZE),
+ errhint("It looks like you need to recompile or initdb.")));
+
+#ifdef USE_FLOAT8_BYVAL
+ if (ControlFile->float8ByVal != true)
+ ereport(FATAL,
+ (errmsg("database files are incompatible with server"),
+ errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
+ " but the server was compiled with USE_FLOAT8_BYVAL."),
+ errhint("It looks like you need to recompile or initdb.")));
+#else
+ if (ControlFile->float8ByVal != false)
+ ereport(FATAL,
+ (errmsg("database files are incompatible with server"),
+ errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
+ " but the server was compiled without USE_FLOAT8_BYVAL."),
+ errhint("It looks like you need to recompile or initdb.")));
+#endif
+
+ wal_segment_size = ControlFile->xlog_seg_size;
+
+ if (!IsValidWalSegSize(wal_segment_size))
+ ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg_plural("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
+ "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
+ wal_segment_size,
+ wal_segment_size)));
+
+ snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
+ SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
+ PGC_S_DYNAMIC_DEFAULT);
+
+ /* check and update variables dependent on wal_segment_size */
+ if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
+ ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("\"min_wal_size\" must be at least twice \"wal_segment_size\"")));
+
+ if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
+ ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\"")));
+
+ UsableBytesInSegment =
+ (wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
+ (SizeOfXLogLongPHD - SizeOfXLogShortPHD);
+
+ CalculateCheckpointSegments();
+
+ /* Make the initdb settings visible as GUC variables, too */
+ SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
+ PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
+}
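+
+/*
+ * Editorial worked example (not upstream code): the UsableBytesInSegment
+ * computation above, assuming common build values of a 16 MB segment,
+ * XLOG_BLCKSZ = 8192, SizeOfXLogShortPHD = 24 and SizeOfXLogLongPHD = 40
+ * (the header sizes are MAXALIGN-dependent, so these are illustrative):
+ *
+ *		pages per segment: 16777216 / 8192 = 2048
+ *		UsableBytesInPage: 8192 - 24 = 8168
+ *		2048 * 8168 = 16728064
+ *		first page carries a long header: 16728064 - (40 - 24) = 16728048
+ *
+ * leaving about 48 kB of each segment as page-header overhead.
+ */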
+
+/*
+ * Utility wrapper to update the control file. Note that the control
+ * file gets flushed.
+ */
+static void
+UpdateControlFile(void)
+{
+ update_controlfile(DataDir, ControlFile, true);
+}
+
+/*
+ * Returns the unique system identifier from control file.
+ */
+uint64
+GetSystemIdentifier(void)
+{
+ Assert(ControlFile != NULL);
+ return ControlFile->system_identifier;
+}
+
+/*
+ * Returns the random nonce from control file.
+ */
+char *
+GetMockAuthenticationNonce(void)
+{
+ Assert(ControlFile != NULL);
+ return ControlFile->mock_authentication_nonce;
+}
+
+/*
+ * Are checksums enabled for data pages?
+ */
+bool
+DataChecksumsEnabled(void)
+{
+ Assert(ControlFile != NULL);
+ return (ControlFile->data_checksum_version > 0);
+}
+
+/*
+ * Returns a fake LSN for unlogged relations.
+ *
+ * Each call generates an LSN that is greater than any previous value
+ * returned. The current counter value is saved and restored across clean
+ * shutdowns, but like unlogged relations, does not survive a crash. This can
+ * be used in lieu of real LSN values returned by XLogInsert, if you need an
+ * LSN-like increasing sequence of numbers without writing any WAL.
+ */
+XLogRecPtr
+GetFakeLSNForUnloggedRel(void)
+{
+ XLogRecPtr nextUnloggedLSN;
+
+ /* increment the unloggedLSN counter, need SpinLock */
+ SpinLockAcquire(&XLogCtl->ulsn_lck);
+ nextUnloggedLSN = XLogCtl->unloggedLSN++;
+ SpinLockRelease(&XLogCtl->ulsn_lck);
+
+ return nextUnloggedLSN;
+}
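+
+/*
+ * Editorial sketch (not upstream code): a typical use of
+ * GetFakeLSNForUnloggedRel() is to stamp a page of an unlogged relation
+ * so that LSN-based ordering checks keep working without writing WAL:
+ *
+ *		if (!RelationNeedsWAL(rel))
+ *			PageSetLSN(page, GetFakeLSNForUnloggedRel());
+ *
+ * (gistGetFakeLSN() in access/gist is an in-tree consumer of this
+ * function for unlogged GiST indexes.)
+ */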
+
+/*
+ * Auto-tune the number of XLOG buffers.
+ *
+ * The preferred setting for wal_buffers is about 3% of shared_buffers, with
+ * a maximum of one XLOG segment (there is little reason to think that more
+ * is helpful, at least so long as we force an fsync when switching log files)
+ * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
+ * 9.1, when auto-tuning was added).
+ *
+ * This should not be called until NBuffers has received its final value.
+ */
+static int
+XLOGChooseNumBuffers(void)
+{
+ int xbuffers;
+
+ xbuffers = NBuffers / 32;
+ if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
+ xbuffers = (wal_segment_size / XLOG_BLCKSZ);
+ if (xbuffers < 8)
+ xbuffers = 8;
+ return xbuffers;
+}
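+
+/*
+ * Editorial worked example (not upstream code): with shared_buffers at
+ * 128 MB (NBuffers = 16384 with BLCKSZ = 8192) and a 16 MB segment:
+ *
+ *		16384 / 32 = 512 buffers  (512 * 8 kB = 4 MB of WAL buffers)
+ *		cap:   wal_segment_size / XLOG_BLCKSZ = 2048  (not reached)
+ *		floor: 8                                      (not reached)
+ *
+ * A tiny 1 MB shared_buffers (NBuffers = 128) would compute 4 and be
+ * raised to the floor of 8.
+ */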
+
+/*
+ * GUC check_hook for wal_buffers
+ */
+bool
+check_wal_buffers(int *newval, void **extra, GucSource source)
+{
+ /*
+ * -1 indicates a request for auto-tune.
+ */
+ if (*newval == -1)
+ {
+ /*
+ * If we haven't yet changed the boot_val default of -1, just let it
+ * be. We'll fix it when XLOGShmemSize is called.
+ */
+ if (XLOGbuffers == -1)
+ return true;
+
+ /* Otherwise, substitute the auto-tune value */
+ *newval = XLOGChooseNumBuffers();
+ }
+
+ /*
+ * We clamp manually-set values to at least 4 blocks. Prior to PostgreSQL
+ * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
+ * the case, we just silently treat such values as a request for the
+ * minimum. (We could throw an error instead, but that doesn't seem very
+ * helpful.)
+ */
+ if (*newval < 4)
+ *newval = 4;
+
+ return true;
+}
+
+/*
+ * Read the control file, set respective GUCs.
+ *
+ * This is to be called during startup, including a crash recovery cycle,
+ * unless in bootstrap mode, where no control file yet exists. As there's no
+ * usable shared memory yet (its sizing can depend on the contents of the
+ * control file!), first store the contents in local memory. XLOGShmemInit()
+ * will then copy it to shared memory later.
+ *
+ * reset just controls whether previous contents are to be expected (in the
+ * reset case, there's a dangling pointer into old shared memory), or not.
+ */
+void
+LocalProcessControlFile(bool reset)
+{
+ Assert(reset || ControlFile == NULL);
+ ControlFile = palloc(sizeof(ControlFileData));
+ ReadControlFile();
+}
+
+/*
+ * Initialization of shared memory for XLOG
+ */
+Size
+XLOGShmemSize(void)
+{
+ Size size;
+
+ /*
+ * If the value of wal_buffers is -1, use the preferred auto-tune value.
+ * This isn't an amazingly clean place to do this, but we must wait till
+ * NBuffers has received its final value, and must do it before using the
+ * value of XLOGbuffers to do anything important.
+ *
+ * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT.
+ * However, if the DBA explicitly set wal_buffers = -1 in the config file,
+ * then PGC_S_DYNAMIC_DEFAULT will fail to override that and we must force
+ * the matter with PGC_S_OVERRIDE.
+ */
+ if (XLOGbuffers == -1)
+ {
+ char buf[32];
+
+ snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
+ SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
+ PGC_S_DYNAMIC_DEFAULT);
+ if (XLOGbuffers == -1) /* failed to apply it? */
+ SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
+ PGC_S_OVERRIDE);
+ }
+ Assert(XLOGbuffers > 0);
+
+ /* XLogCtl */
+ size = sizeof(XLogCtlData);
+
+ /* WAL insertion locks, plus alignment */
+ size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
+ /* xlblocks array */
+ size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
+ /* extra alignment padding for XLOG I/O buffers */
+ size = add_size(size, XLOG_BLCKSZ);
+ /* and the buffers themselves */
+ size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
+
+ /*
+ * Note: we don't count ControlFileData, it comes out of the "slop factor"
+ * added by CreateSharedMemoryAndSemaphores. This lets us use this
+ * routine again below to compute the actual allocation size.
+ */
+
+ return size;
+}
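+
+/*
+ * Editorial note (not upstream code): the size computed above corresponds
+ * to the following layout, carved out in this order by XLOGShmemInit()
+ * below:
+ *
+ *		XLogCtlData struct
+ *		xlblocks[XLOGbuffers]                 (one XLogRecPtr per buffer)
+ *		padding up to sizeof(WALInsertLockPadded)
+ *		WALInsertLocks[NUM_XLOGINSERT_LOCKS]
+ *		padding up to XLOG_BLCKSZ
+ *		pages[XLOGbuffers * XLOG_BLCKSZ]      (the WAL buffers themselves)
+ */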
+
+void
+XLOGShmemInit(void)
+{
+ bool foundCFile,
+ foundXLog;
+ char *allocptr;
+ int i;
+ ControlFileData *localControlFile;
+
+#ifdef WAL_DEBUG
+
+ /*
+ * Create a memory context for WAL debugging that's exempt from the normal
+ * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
+ * an allocation fails, but wal_debug is not for production use anyway.
+ */
+ if (walDebugCxt == NULL)
+ {
+ walDebugCxt = AllocSetContextCreate(TopMemoryContext,
+ "WAL Debug",
+ ALLOCSET_DEFAULT_SIZES);
+ MemoryContextAllowInCriticalSection(walDebugCxt, true);
+ }
+#endif
+
+ XLogCtl = (XLogCtlData *)
+ ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
+
+ localControlFile = ControlFile;
+ ControlFile = (ControlFileData *)
+ ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
+
+ if (foundCFile || foundXLog)
+ {
+ /* both should be present or neither */
+ Assert(foundCFile && foundXLog);
+
+ /* Initialize local copy of WALInsertLocks */
+ WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
+
+ if (localControlFile)
+ pfree(localControlFile);
+ return;
+ }
+ memset(XLogCtl, 0, sizeof(XLogCtlData));
+
+ /*
+ * Already have read control file locally, unless in bootstrap mode. Move
+ * contents into shared memory.
+ */
+ if (localControlFile)
+ {
+ memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
+ pfree(localControlFile);
+ }
+
+ /*
+ * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
+ * multiple of the alignment for same, so no extra alignment padding is
+ * needed here.
+ */
+ allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
+ XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
+ memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
+ allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
+
+ /* WAL insertion locks. Ensure they're aligned to the full padded size */
+ allocptr += sizeof(WALInsertLockPadded) -
+ ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
+ WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
+ (WALInsertLockPadded *) allocptr;
+ allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
+
+ for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
+ {
+ LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
+ WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
+ WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
+ }
+
+ /*
+ * Align the start of the page buffers to a full xlog block size boundary.
+ * This simplifies some calculations in XLOG insertion. It is also
+ * required for O_DIRECT.
+ */
+ allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
+ XLogCtl->pages = allocptr;
+ memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
+
+ /*
+ * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
+ * in additional info.)
+ */
+ XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
+ XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
+ XLogCtl->InstallXLogFileSegmentActive = false;
+ XLogCtl->WalWriterSleeping = false;
+
+ SpinLockInit(&XLogCtl->Insert.insertpos_lck);
+ SpinLockInit(&XLogCtl->info_lck);
+ SpinLockInit(&XLogCtl->ulsn_lck);
+}
+
+/*
+ * This function must be called ONCE on system install. It creates pg_control
+ * and the initial XLOG segment.
+ */
+void
+BootStrapXLOG(void)
+{
+ CheckPoint checkPoint;
+ char *buffer;
+ XLogPageHeader page;
+ XLogLongPageHeader longpage;
+ XLogRecord *record;
+ char *recptr;
+ uint64 sysidentifier;
+ struct timeval tv;
+ pg_crc32c crc;
+
+ /* allow ordinary WAL segment creation, like StartupXLOG() would */
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ XLogCtl->InstallXLogFileSegmentActive = true;
+ LWLockRelease(ControlFileLock);
+
+ /*
+ * Select a hopefully-unique system identifier code for this installation.
+ * We use the result of gettimeofday(), including the fractional seconds
+ * field, as being about as unique as we can easily get. (Think not to
+ * use random(), since it hasn't been seeded and there's no portable way
+ * to seed it other than the system clock value...) The upper half of the
+ * uint64 value is just the tv_sec part, while the lower half contains the
+ * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
+ * PID for a little extra uniqueness. A person knowing this encoding can
+ * determine the initialization time of the installation, which could
+ * perhaps be useful sometimes.
+ */
+ gettimeofday(&tv, NULL);
+ sysidentifier = ((uint64) tv.tv_sec) << 32;
+ sysidentifier |= ((uint64) tv.tv_usec) << 12;
+ sysidentifier |= getpid() & 0xFFF;
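+
+ /*
+ * Editorial note (not upstream code): the encoding above can be unpacked
+ * as follows, which is how the installation time mentioned in the
+ * preceding comment could be recovered:
+ *
+ * init_time_sec = sysidentifier >> 32;
+ * init_time_usec = (sysidentifier >> 12) & 0xFFFFF;
+ * initdb_pid_low = sysidentifier & 0xFFF;
+ */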
+
+ /* page buffer must be aligned suitably for O_DIRECT */
+ buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
+ page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
+ memset(page, 0, XLOG_BLCKSZ);
+
+ /*
+ * Set up information for the initial checkpoint record
+ *
+ * The initial checkpoint record is written to the beginning of the WAL
+ * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
+ * used, so that we can use 0/0 to mean "before any valid WAL segment".
+ */
+ checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
+ checkPoint.ThisTimeLineID = BootstrapTimeLineID;
+ checkPoint.PrevTimeLineID = BootstrapTimeLineID;
+ checkPoint.fullPageWrites = fullPageWrites;
+ checkPoint.nextXid =
+ FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
+ checkPoint.nextOid = FirstGenbkiObjectId;
+ checkPoint.nextMulti = FirstMultiXactId;
+ checkPoint.nextMultiOffset = 0;
+ checkPoint.oldestXid = FirstNormalTransactionId;
+ checkPoint.oldestXidDB = Template1DbOid;
+ checkPoint.oldestMulti = FirstMultiXactId;
+ checkPoint.oldestMultiDB = Template1DbOid;
+ checkPoint.oldestCommitTsXid = InvalidTransactionId;
+ checkPoint.newestCommitTsXid = InvalidTransactionId;
+ checkPoint.time = (pg_time_t) time(NULL);
+ checkPoint.oldestActiveXid = InvalidTransactionId;
+
+ ShmemVariableCache->nextXid = checkPoint.nextXid;
+ ShmemVariableCache->nextOid = checkPoint.nextOid;
+ ShmemVariableCache->oidCount = 0;
+ MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
+ AdvanceOldestClogXid(checkPoint.oldestXid);
+ SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
+ SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
+ SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
+
+ /* Set up the XLOG page header */
+ page->xlp_magic = XLOG_PAGE_MAGIC;
+ page->xlp_info = XLP_LONG_HEADER;
+ page->xlp_tli = BootstrapTimeLineID;
+ page->xlp_pageaddr = wal_segment_size;
+ longpage = (XLogLongPageHeader) page;
+ longpage->xlp_sysid = sysidentifier;
+ longpage->xlp_seg_size = wal_segment_size;
+ longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
+
+ /* Insert the initial checkpoint record */
+ recptr = ((char *) page + SizeOfXLogLongPHD);
+ record = (XLogRecord *) recptr;
+ record->xl_prev = 0;
+ record->xl_xid = InvalidTransactionId;
+ record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
+ record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
+ record->xl_rmid = RM_XLOG_ID;
+ recptr += SizeOfXLogRecord;
+ /* fill the XLogRecordDataHeaderShort struct */
+ *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
+ *(recptr++) = sizeof(checkPoint);
+ memcpy(recptr, &checkPoint, sizeof(checkPoint));
+ recptr += sizeof(checkPoint);
+ Assert(recptr - (char *) record == record->xl_tot_len);
+
+ INIT_CRC32C(crc);
+ COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
+ COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
+ FIN_CRC32C(crc);
+ record->xl_crc = crc;
+
+ /* Create first XLOG segment file */
+ openLogTLI = BootstrapTimeLineID;
+ openLogFile = XLogFileInit(1, BootstrapTimeLineID);
+
+ /*
+ * We needn't bother with Reserve/ReleaseExternalFD here, since we'll
+ * close the file again in a moment.
+ */
+
+ /* Write the first page with the initial record */
+ errno = 0;
+ pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
+ if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
+ {
+ /* if write didn't set errno, assume problem is no disk space */
+ if (errno == 0)
+ errno = ENOSPC;
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not write bootstrap write-ahead log file: %m")));
+ }
+ pgstat_report_wait_end();
+
+ pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
+ if (pg_fsync(openLogFile) != 0)
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not fsync bootstrap write-ahead log file: %m")));
+ pgstat_report_wait_end();
+
+ if (close(openLogFile) != 0)
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not close bootstrap write-ahead log file: %m")));
+
+ openLogFile = -1;
+
+ /* Now create pg_control */
+ InitControlFile(sysidentifier);
+ ControlFile->time = checkPoint.time;
+ ControlFile->checkPoint = checkPoint.redo;
+ ControlFile->checkPointCopy = checkPoint;
+
+ /* some additional ControlFile fields are set in WriteControlFile() */
+ WriteControlFile();
+
+ /* Bootstrap the commit log, too */
+ BootStrapCLOG();
+ BootStrapCommitTs();
+ BootStrapSUBTRANS();
+ BootStrapMultiXact();
+
+ pfree(buffer);
+
+ /*
+ * Force control file to be read - in contrast to normal processing we'd
+ * otherwise never run the checks and GUC-related initializations therein.
+ */
+ ReadControlFile();
+}
+
+static char *
+str_time(pg_time_t tnow)
+{
+ static char buf[128];
+
+ pg_strftime(buf, sizeof(buf),
+ "%Y-%m-%d %H:%M:%S %Z",
+ pg_localtime(&tnow, log_timezone));
+
+ return buf;
+}
+
+/*
+ * Initialize the first WAL segment on new timeline.
+ */
+static void
+XLogInitNewTimeline(TimeLineID endTLI, XLogRecPtr endOfLog, TimeLineID newTLI)
+{
+ char xlogfname[MAXFNAMELEN];
+ XLogSegNo endLogSegNo;
+ XLogSegNo startLogSegNo;
+
+ /* we always switch to a new timeline after archive recovery */
+ Assert(endTLI != newTLI);
+
+ /*
+ * Update min recovery point one last time.
+ */
+ UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
+
+ /*
+ * Calculate the last segment on the old timeline, and the first segment
+ * on the new timeline. If the switch happens in the middle of a segment,
+ * they are the same, but if the switch happens exactly at a segment
+ * boundary, startLogSegNo will be endLogSegNo + 1.
+ */
+ XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
+ XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);
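+
+ /*
+ * Editorial worked example (not upstream code): with 16 MB segments, an
+ * endOfLog of 0/3000000 sits exactly on a segment boundary, so
+ * XLByteToPrevSeg() gives segment 2 while XLByteToSeg() gives segment 3.
+ * For a mid-segment endOfLog such as 0/3000058, both give segment 3.
+ */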
+
+ /*
+ * Initialize the starting WAL segment for the new timeline. If the switch
+ * happens in the middle of a segment, copy data from the last WAL segment
+ * of the old timeline up to the switch point, to the starting WAL segment
+ * on the new timeline.
+ */
+ if (endLogSegNo == startLogSegNo)
+ {
+ /*
+ * Make a copy of the file on the new timeline.
+ *
+ * Writing WAL isn't allowed yet, so there are no locking
+ * considerations. But we should be just as careful as XLogFileInit to
+ * avoid emplacing a bogus file.
+ */
+ XLogFileCopy(newTLI, endLogSegNo, endTLI, endLogSegNo,
+ XLogSegmentOffset(endOfLog, wal_segment_size));
+ }
+ else
+ {
+ /*
+ * The switch happened at a segment boundary, so just create the next
+ * segment on the new timeline.
+ */
+ int fd;
+
+ fd = XLogFileInit(startLogSegNo, newTLI);
+
+ if (close(fd) != 0)
+ {
+ char xlogfname[MAXFNAMELEN];
+ int save_errno = errno;
+
+ XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
+ errno = save_errno;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", xlogfname)));
+ }
+ }
+
+ /*
+ * Let's just make real sure there are not .ready or .done flags posted
+ * for the new segment.
+ */
+ XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
+ XLogArchiveCleanup(xlogfname);
+}
+
+/*
+ * Perform cleanup actions at the conclusion of archive recovery.
+ */
+static void
+CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog,
+ TimeLineID newTLI)
+{
+ /*
+ * Execute the recovery_end_command, if any.
+ */
+ if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
+ ExecuteRecoveryCommand(recoveryEndCommand,
+ "recovery_end_command",
+ true,
+ WAIT_EVENT_RECOVERY_END_COMMAND);
+
+ /*
+ * We switched to a new timeline. Clean up segments on the old timeline.
+ *
+ * If there are any higher-numbered segments on the old timeline, remove
+ * them. They might contain valid WAL, but they might also be
+ * pre-allocated files containing garbage. In any case, they are not part
+ * of the new timeline's history so we don't need them.
+ */
+ RemoveNonParentXlogFiles(EndOfLog, newTLI);
+
+ /*
+ * If the switch happened in the middle of a segment, what to do with the
+ * last, partial segment on the old timeline? If we don't archive it, and
+ * the server that created the WAL never archives it either (e.g. because
+ * it was hit by a meteor), it will never make it to the archive. That's
+ * OK from our point of view, because the new segment that we created with
+ * the new TLI contains all the WAL from the old timeline up to the switch
+ * point. But if you later try to do PITR to the "missing" WAL on the old
+ * timeline, recovery won't find it in the archive. It's physically
+ * present in the new file with new TLI, but recovery won't look there
+ * when it's recovering to the older timeline. On the other hand, if we
+ * archive the partial segment, and the original server on that timeline
+ * is still running and archives the completed version of the same segment
+ * later, it will fail. (We used to do that in 9.4 and below, and it
+ * caused such problems.)
+ *
+ * As a compromise, we rename the last segment with the .partial suffix,
+ * and archive it. Archive recovery will never try to read .partial
+ * segments, so they will normally go unused. But in the odd PITR case,
+ * the administrator can copy them manually to the pg_wal directory
+ * (removing the suffix). They can be useful in debugging, too.
+ *
+ * If a .done or .ready file already exists for the old timeline, however,
+ * we had already determined that the segment is complete, so we can let
+ * it be archived normally. (In particular, if it was restored from the
+ * archive to begin with, it's expected to have a .done file).
+ */
+ if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
+ XLogArchivingActive())
+ {
+ char origfname[MAXFNAMELEN];
+ XLogSegNo endLogSegNo;
+
+ XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
+ XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
+
+ if (!XLogArchiveIsReadyOrDone(origfname))
+ {
+ char origpath[MAXPGPATH];
+ char partialfname[MAXFNAMELEN];
+ char partialpath[MAXPGPATH];
+
+ XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
+ snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
+ snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
+
+ /*
+ * Make sure there's no .done or .ready file for the .partial
+ * file.
+ */
+ XLogArchiveCleanup(partialfname);
+
+ durable_rename(origpath, partialpath, ERROR);
+ XLogArchiveNotify(partialfname);
+ }
+ }
+}
+
+/*
+ * Check to see if required parameters are set high enough on this server
+ * for various aspects of recovery operation.
+ *
+ * Note that all the parameters which this function tests need to be
+ * listed in Administrator's Overview section in high-availability.sgml.
+ * If you change them, don't forget to update the list.
+ */
+static void
+CheckRequiredParameterValues(void)
+{
+ /*
+ * For archive recovery, the WAL must be generated with at least 'replica'
+ * wal_level.
+ */
+ if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
+ {
+ ereport(FATAL,
+ (errmsg("WAL was generated with wal_level=minimal, cannot continue recovering"),
+ errdetail("This happens if you temporarily set wal_level=minimal on the server."),
+ errhint("Use a backup taken after setting wal_level to higher than minimal.")));
+ }
+
+ /*
+ * For Hot Standby, the WAL must be generated with 'replica' mode, and we
+ * must have at least as many backend slots as the primary.
+ */
+ if (ArchiveRecoveryRequested && EnableHotStandby)
+ {
+ /* We ignore autovacuum_max_workers when we make this test. */
+ RecoveryRequiresIntParameter("max_connections",
+ MaxConnections,
+ ControlFile->MaxConnections);
+ RecoveryRequiresIntParameter("max_worker_processes",
+ max_worker_processes,
+ ControlFile->max_worker_processes);
+ RecoveryRequiresIntParameter("max_wal_senders",
+ max_wal_senders,
+ ControlFile->max_wal_senders);
+ RecoveryRequiresIntParameter("max_prepared_transactions",
+ max_prepared_xacts,
+ ControlFile->max_prepared_xacts);
+ RecoveryRequiresIntParameter("max_locks_per_transaction",
+ max_locks_per_xact,
+ ControlFile->max_locks_per_xact);
+ }
+}
+
+/*
+ * This must be called ONCE during postmaster or standalone-backend startup
+ */
+void
+StartupXLOG(void)
+{
+ XLogCtlInsert *Insert;
+ CheckPoint checkPoint;
+ bool wasShutdown;
+ bool didCrash;
+ bool haveTblspcMap;
+ bool haveBackupLabel;
+ XLogRecPtr EndOfLog;
+ TimeLineID EndOfLogTLI;
+ TimeLineID newTLI;
+ bool performedWalRecovery;
+ EndOfWalRecoveryInfo *endOfRecoveryInfo;
+ XLogRecPtr abortedRecPtr;
+ XLogRecPtr missingContrecPtr;
+ TransactionId oldestActiveXID;
+ bool promoted = false;
+
+ /*
+ * We should have an aux process resource owner to use, and we should not
+ * be in a transaction that's installed some other resowner.
+ */
+ Assert(AuxProcessResourceOwner != NULL);
+ Assert(CurrentResourceOwner == NULL ||
+ CurrentResourceOwner == AuxProcessResourceOwner);
+ CurrentResourceOwner = AuxProcessResourceOwner;
+
+ /*
+ * Check that contents look valid.
+ */
+ if (!XRecOffIsValid(ControlFile->checkPoint))
+ ereport(FATAL,
+ (errmsg("control file contains invalid checkpoint location")));
+
+ switch (ControlFile->state)
+ {
+ case DB_SHUTDOWNED:
+
+ /*
+ * This is the expected case, so don't be chatty in standalone
+ * mode
+ */
+ ereport(IsPostmasterEnvironment ? LOG : NOTICE,
+ (errmsg("database system was shut down at %s",
+ str_time(ControlFile->time))));
+ break;
+
+ case DB_SHUTDOWNED_IN_RECOVERY:
+ ereport(LOG,
+ (errmsg("database system was shut down in recovery at %s",
+ str_time(ControlFile->time))));
+ break;
+
+ case DB_SHUTDOWNING:
+ ereport(LOG,
+ (errmsg("database system shutdown was interrupted; last known up at %s",
+ str_time(ControlFile->time))));
+ break;
+
+ case DB_IN_CRASH_RECOVERY:
+ ereport(LOG,
+ (errmsg("database system was interrupted while in recovery at %s",
+ str_time(ControlFile->time)),
+ errhint("This probably means that some data is corrupted and"
+ " you will have to use the last backup for recovery.")));
+ break;
+
+ case DB_IN_ARCHIVE_RECOVERY:
+ ereport(LOG,
+ (errmsg("database system was interrupted while in recovery at log time %s",
+ str_time(ControlFile->checkPointCopy.time)),
+ errhint("If this has occurred more than once some data might be corrupted"
+ " and you might need to choose an earlier recovery target.")));
+ break;
+
+ case DB_IN_PRODUCTION:
+ ereport(LOG,
+ (errmsg("database system was interrupted; last known up at %s",
+ str_time(ControlFile->time))));
+ break;
+
+ default:
+ ereport(FATAL,
+ (errmsg("control file contains invalid database cluster state")));
+ }
+
+ /* This is just to allow attaching to startup process with a debugger */
+#ifdef XLOG_REPLAY_DELAY
+ if (ControlFile->state != DB_SHUTDOWNED)
+ pg_usleep(60000000L);
+#endif
+
+ /*
+ * Verify that pg_wal and pg_wal/archive_status exist. In cases where
+ * someone has performed a copy for PITR, these directories may have been
+ * excluded and need to be re-created.
+ */
+ ValidateXLOGDirectoryStructure();
+
+ /* Set up timeout handler needed to report startup progress. */
+ if (!IsBootstrapProcessingMode())
+ RegisterTimeout(STARTUP_PROGRESS_TIMEOUT,
+ startup_progress_timeout_handler);
+
+ /*----------
+ * If we previously crashed, perform a couple of actions:
+ *
+ * - The pg_wal directory may still include some temporary WAL segments
+ * used when creating a new segment, so perform some cleanup to avoid
+ * bloating this path. This is done first, as there is no point in
+ * syncing this temporary data.
+ *
+ * - There might be data which we had written, intending to fsync it, but
+ * which we had not actually fsync'd yet. Therefore, a power failure in
+ * the near future might cause earlier unflushed writes to be lost, even
+ * though more recent data written to disk from here on would be
+ * persisted. To avoid that, fsync the entire data directory.
+ */
+ if (ControlFile->state != DB_SHUTDOWNED &&
+ ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
+ {
+ RemoveTempXlogFiles();
+ SyncDataDirectory();
+ didCrash = true;
+ }
+ else
+ didCrash = false;
+
+ /*
+ * Prepare for WAL recovery if needed.
+ *
+ * InitWalRecovery analyzes the control file and the backup label file, if
+ * any. It updates the in-memory ControlFile buffer according to the
+ * starting checkpoint, and sets InRecovery and ArchiveRecoveryRequested.
+ * It also applies the tablespace map file, if any.
+ */
+ InitWalRecovery(ControlFile, &wasShutdown,
+ &haveBackupLabel, &haveTblspcMap);
+ checkPoint = ControlFile->checkPointCopy;
+
+ /* initialize shared memory variables from the checkpoint record */
+ ShmemVariableCache->nextXid = checkPoint.nextXid;
+ ShmemVariableCache->nextOid = checkPoint.nextOid;
+ ShmemVariableCache->oidCount = 0;
+ MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
+ AdvanceOldestClogXid(checkPoint.oldestXid);
+ SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
+ SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
+ SetCommitTsLimit(checkPoint.oldestCommitTsXid,
+ checkPoint.newestCommitTsXid);
+ XLogCtl->ckptFullXid = checkPoint.nextXid;
+
+ /*
+ * Clear out any old relcache cache files. This is *necessary* if we do
+ * any WAL replay, since that would probably result in the cache files
+ * being out of sync with database reality. In theory we could leave them
+ * in place if the database had been cleanly shut down, but it seems
+ * safest to just remove them always and let them be rebuilt during the
+ * first backend startup. These files need to be removed from all
+ * directories, including pg_tblspc; however, in archive recovery from a
+ * backup the symlinks are created only after the tablespace_map file has
+ * been read, so the old relcache files must be cleared here, after the
+ * symlinks have been created.
+ */
+ RelationCacheInitFileRemove();
+
+ /*
+ * Initialize replication slots, before there's a chance to remove
+ * required resources.
+ */
+ StartupReplicationSlots();
+
+ /*
+ * Start up logical decoding state; it needs to be set up now so that we
+ * have proper data during crash recovery.
+ */
+ StartupReorderBuffer();
+
+ /*
+ * Startup CLOG. This must be done after ShmemVariableCache->nextXid has
+ * been initialized and before we accept connections or begin WAL replay.
+ */
+ StartupCLOG();
+
+ /*
+ * Startup MultiXact. We need to do this early to be able to replay
+ * truncations.
+ */
+ StartupMultiXact();
+
+ /*
+ * Ditto for commit timestamps. Activate the facility if the setting is
+ * enabled in the control file, as there should be no tracking of commit
+ * timestamps done when the setting was disabled. This facility can be
+ * started or stopped when replaying an XLOG_PARAMETER_CHANGE record.
+ */
+ if (ControlFile->track_commit_timestamp)
+ StartupCommitTs();
+
+ /*
+ * Recover knowledge about replay progress of known replication partners.
+ */
+ StartupReplicationOrigin();
+
+ /*
+ * Initialize unlogged LSN. On a clean shutdown, it's restored from the
+ * control file. On recovery, all unlogged relations are blown away, so
+ * the unlogged LSN counter can be reset too.
+ */
+ if (ControlFile->state == DB_SHUTDOWNED)
+ XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
+ else
+ XLogCtl->unloggedLSN = FirstNormalUnloggedLSN;
+
+ /*
+ * Copy any missing timeline history files between 'now' and the recovery
+ * target timeline from archive to pg_wal. While we don't need those files
+ * ourselves - the history file of the recovery target timeline covers all
+ * the previous timelines in the history too - a cascading standby server
+ * might be interested in them. Or, if you archive the WAL from this
+ * server to a different archive than the primary, it'd be good for all
+ * the history files to get archived there after failover, so that you can
+ * use one of the old timelines as a PITR target. Timeline history files
+ * are small, so it's better to copy them unnecessarily than not copy them
+ * and regret later.
+ */
+ restoreTimeLineHistoryFiles(checkPoint.ThisTimeLineID, recoveryTargetTLI);
+
+ /*
+ * Before running in recovery, scan pg_twophase and fill in its status to
+ * be able to work on entries generated by redo. Scanning before taking
+ * any recovery action has the merit of discarding any 2PC files that are
+ * newer than the first record to replay, avoiding conflicts at replay
+ * time. It also avoids any subsequent scans when recovering the on-disk
+ * two-phase data.
+ */
+ restoreTwoPhaseData();
+
+ /*
+ * When starting with crash recovery, reset pgstat data - it might not be
+ * valid. Otherwise restore pgstat data. It's safe to do this here,
+ * because postmaster will not yet have started any other processes.
+ *
+ * NB: Restoring replication slot stats relies on slot state to have
+ * already been restored from disk.
+ *
+ * TODO: With a bit of extra work we could just start with a pgstat file
+ * associated with the checkpoint redo location we're starting from.
+ */
+ if (didCrash)
+ pgstat_discard_stats();
+ else
+ pgstat_restore_stats();
+
+ lastFullPageWrites = checkPoint.fullPageWrites;
+
+ RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
+ doPageWrites = lastFullPageWrites;
+
+ /* REDO */
+ if (InRecovery)
+ {
+ /* Initialize state for RecoveryInProgress() */
+ SpinLockAcquire(&XLogCtl->info_lck);
+ if (InArchiveRecovery)
+ XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
+ else
+ XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ /*
+ * Update pg_control to show that we are recovering and to show the
+ * selected checkpoint as the place we are starting from. We also mark
+ * pg_control with any minimum recovery stop point obtained from a
+ * backup history file.
+ *
+ * No need to hold ControlFileLock yet, we aren't up far enough.
+ */
+ UpdateControlFile();
+
+ /*
+ * If there was a backup label file, it's done its job and the info
+ * has now been propagated into pg_control. We must get rid of the
+ * label file so that if we crash during recovery, we'll pick up at
+ * the latest recovery restartpoint instead of going all the way back
+ * to the backup start point. It seems prudent though to just rename
+ * the file out of the way rather than delete it completely.
+ */
+ if (haveBackupLabel)
+ {
+ unlink(BACKUP_LABEL_OLD);
+ durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
+ }
+
+ /*
+ * If there was a tablespace_map file, it's done its job and the
+ * symlinks have been created. We must get rid of the map file so
+ * that if we crash during recovery, we don't create symlinks again.
+ * It seems prudent though to just rename the file out of the way
+ * rather than delete it completely.
+ */
+ if (haveTblspcMap)
+ {
+ unlink(TABLESPACE_MAP_OLD);
+ durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
+ }
+
+ /*
+ * Initialize our local copy of minRecoveryPoint. When doing crash
+ * recovery we want to replay up to the end of WAL. In particular, for
+ * a promoted standby the minRecoveryPoint value in the control file is
+ * only updated after the first checkpoint; if the instance crashes
+ * before the first post-recovery checkpoint completes, recovery would
+ * otherwise use a stale location, causing the startup process to think
+ * that there are still invalid page references when checking for data
+ * consistency.
+ */
+ if (InArchiveRecovery)
+ {
+ LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
+ LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+ }
+ else
+ {
+ LocalMinRecoveryPoint = InvalidXLogRecPtr;
+ LocalMinRecoveryPointTLI = 0;
+ }
+
+ /* Check that the GUCs used to generate the WAL allow recovery */
+ CheckRequiredParameterValues();
+
+ /*
+ * We're in recovery, so unlogged relations may be trashed and must be
+ * reset. This should be done BEFORE allowing Hot Standby
+ * connections, so that read-only backends don't try to read whatever
+ * garbage is left over from before.
+ */
+ ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
+
+ /*
+ * Likewise, delete any saved transaction snapshot files that got left
+ * behind by crashed backends.
+ */
+ DeleteAllExportedSnapshotFiles();
+
+ /*
+ * Initialize for Hot Standby, if enabled. We won't let backends in
+ * yet, not until we've reached the min recovery point specified in
+ * control file and we've established a recovery snapshot from a
+ * running-xacts WAL record.
+ */
+ if (ArchiveRecoveryRequested && EnableHotStandby)
+ {
+ TransactionId *xids;
+ int nxids;
+
+ ereport(DEBUG1,
+ (errmsg_internal("initializing for hot standby")));
+
+ InitRecoveryTransactionEnvironment();
+
+ if (wasShutdown)
+ oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
+ else
+ oldestActiveXID = checkPoint.oldestActiveXid;
+ Assert(TransactionIdIsValid(oldestActiveXID));
+
+ /* Tell procarray about the range of xids it has to deal with */
+ ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextXid));
+
+ /*
+ * Start up subtrans only. CLOG, MultiXact and commit timestamps
+ * have already been started up; other SLRUs are not maintained
+ * during recovery and need not be started yet.
+ */
+ StartupSUBTRANS(oldestActiveXID);
+
+ /*
+ * If we're beginning at a shutdown checkpoint, we know that
+ * nothing was running on the primary at this point. So fake-up an
+ * empty running-xacts record and use that here and now. Recover
+ * additional standby state for prepared transactions.
+ */
+ if (wasShutdown)
+ {
+ RunningTransactionsData running;
+ TransactionId latestCompletedXid;
+
+ /*
+ * Construct a RunningTransactions snapshot representing a
+ * shut down server, with only prepared transactions still
+ * alive. We're never overflowed at this point because all
+ * subxids are listed with their parent prepared transactions.
+ */
+ running.xcnt = nxids;
+ running.subxcnt = 0;
+ running.subxid_overflow = false;
+ running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
+ running.oldestRunningXid = oldestActiveXID;
+ latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
+ TransactionIdRetreat(latestCompletedXid);
+ Assert(TransactionIdIsNormal(latestCompletedXid));
+ running.latestCompletedXid = latestCompletedXid;
+ running.xids = xids;
+
+ ProcArrayApplyRecoveryInfo(&running);
+
+ StandbyRecoverPreparedTransactions();
+ }
+ }
+
+ /*
+ * We're all set for replaying the WAL now. Do it.
+ */
+ PerformWalRecovery();
+ performedWalRecovery = true;
+ }
+ else
+ performedWalRecovery = false;
+
+ /*
+ * Finish WAL recovery.
+ */
+ endOfRecoveryInfo = FinishWalRecovery();
+ EndOfLog = endOfRecoveryInfo->endOfLog;
+ EndOfLogTLI = endOfRecoveryInfo->endOfLogTLI;
+ abortedRecPtr = endOfRecoveryInfo->abortedRecPtr;
+ missingContrecPtr = endOfRecoveryInfo->missingContrecPtr;
+
+ /*
+ * Reset the ps status display, so that no information related to
+ * recovery shows up.
+ */
+ set_ps_display("");
+
+ /*
+ * When recovering from a backup (we are in recovery, and archive recovery
+ * was requested), complain if we did not roll forward far enough to reach
+ * the point where the database is consistent. For regular online
+ * backup-from-primary, that means reaching the end-of-backup WAL record
+ * (at which point we reset backupStartPoint to be Invalid), for
+ * backup-from-replica (which can't inject records into the WAL stream),
+ * that point is when we reach the minRecoveryPoint in pg_control (which
+ * we purposefully copy last when backing up from a replica). For
+ * pg_rewind (which creates a backup_label with a method of "pg_rewind")
+ * or snapshot-style backups (which don't), backupEndRequired will be set
+ * to false.
+ *
+ * Note: it is indeed okay to look at the local variable
+ * LocalMinRecoveryPoint here, even though ControlFile->minRecoveryPoint
+ * might be further ahead --- ControlFile->minRecoveryPoint cannot have
+ * been advanced beyond the WAL we processed.
+ */
+ if (InRecovery &&
+ (EndOfLog < LocalMinRecoveryPoint ||
+ !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
+ {
+ /*
+ * Ran off end of WAL before reaching end-of-backup WAL record, or
+ * minRecoveryPoint. That's a bad sign, indicating that you tried to
+ * recover from an online backup but never called pg_backup_stop(), or
+ * you didn't archive all the WAL needed.
+ */
+ if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
+ {
+ if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) || ControlFile->backupEndRequired)
+ ereport(FATAL,
+ (errmsg("WAL ends before end of online backup"),
+ errhint("All WAL generated while online backup was taken must be available at recovery.")));
+ else
+ ereport(FATAL,
+ (errmsg("WAL ends before consistent recovery point")));
+ }
+ }
+
+ /*
+ * Reset unlogged relations to the contents of their INIT fork. This is
+ * done AFTER recovery is complete so as to include any unlogged relations
+ * created during recovery, but BEFORE recovery is marked as having
+ * completed successfully. Otherwise we'd not retry if any of the post
+ * end-of-recovery steps fail.
+ */
+ if (InRecovery)
+ ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
+
+ /*
+ * Pre-scan prepared transactions to find out the range of XIDs present.
+ * This information is not quite needed yet, but doing this here ensures
+ * that potential problems are detected before any on-disk change is made.
+ */
+ oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
+
+ /*
+ * Allow ordinary WAL segment creation before possibly switching to a new
+ * timeline, which creates a new segment, and after the last ReadRecord().
+ */
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ XLogCtl->InstallXLogFileSegmentActive = true;
+ LWLockRelease(ControlFileLock);
+
+ /*
+ * Consider whether we need to assign a new timeline ID.
+ *
+ * If we did archive recovery, we always assign a new ID. This handles a
+ * couple of issues. If we stopped short of the end of WAL during
+ * recovery, then we are clearly generating a new timeline and must assign
+ * it a unique new ID. Even if we ran to the end, modifying the current
+ * last segment is problematic because it may result in trying to
+ * overwrite an already-archived copy of that segment, and we encourage
+ * DBAs to make their archive_commands reject that. We can dodge the
+ * problem by making the new active segment have a new timeline ID.
+ *
+ * In a normal crash recovery, we can just extend the timeline we were in.
+ */
+ newTLI = endOfRecoveryInfo->lastRecTLI;
+ if (ArchiveRecoveryRequested)
+ {
+ newTLI = findNewestTimeLine(recoveryTargetTLI) + 1;
+ ereport(LOG,
+ (errmsg("selected new timeline ID: %u", newTLI)));
+
+ /*
+ * Make a writable copy of the last WAL segment. (Note that we also
+ * have a copy of the last block of the old WAL in
+ * endOfRecovery->lastPage; we will use that below.)
+ */
+ XLogInitNewTimeline(EndOfLogTLI, EndOfLog, newTLI);
+
+ /*
+ * Remove the signal files out of the way, so that we don't
+ * accidentally re-enter archive recovery mode in a subsequent crash.
+ */
+ if (endOfRecoveryInfo->standby_signal_file_found)
+ durable_unlink(STANDBY_SIGNAL_FILE, FATAL);
+
+ if (endOfRecoveryInfo->recovery_signal_file_found)
+ durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);
+
+ /*
+ * Write the timeline history file, and have it archived. After this
+ * point (or rather, as soon as the file is archived), the timeline
+ * will appear as "taken" in the WAL archive and to any standby
+ * servers. If we crash before actually switching to the new
+ * timeline, standby servers will nevertheless think that we switched
+ * to the new timeline, and will try to connect to the new timeline.
+ * To minimize the window for that, try to do as little as possible
+ * between here and writing the end-of-recovery record.
+ */
+ writeTimeLineHistory(newTLI, recoveryTargetTLI,
+ EndOfLog, endOfRecoveryInfo->recoveryStopReason);
+
+ ereport(LOG,
+ (errmsg("archive recovery complete")));
+ }
+
+ /* Save the selected TimeLineID in shared memory, too */
+ XLogCtl->InsertTimeLineID = newTLI;
+ XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI;
+
+ /*
+ * Actually, if WAL ended in an incomplete record, skip the parts that
+ * made it through and start writing after the portion that persisted.
+ * (It's critical to first write an OVERWRITE_CONTRECORD message, which
+ * we'll do as soon as we're open for writing new WAL.)
+ */
+ if (!XLogRecPtrIsInvalid(missingContrecPtr))
+ {
+ /*
+ * We should only have a missingContrecPtr if we're not switching to
+ * a new timeline. When a timeline switch occurs, WAL is copied from
+ * the old timeline to the new only up to the end of the last complete
+ * record, so there can't be an incomplete WAL record that we need to
+ * disregard.
+ */
+ Assert(newTLI == endOfRecoveryInfo->lastRecTLI);
+ Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
+ EndOfLog = missingContrecPtr;
+ }
+
+ /*
+ * Prepare to write WAL starting at EndOfLog location, and init xlog
+ * buffer cache using the block containing the last record from the
+ * previous incarnation.
+ */
+ Insert = &XLogCtl->Insert;
+ Insert->PrevBytePos = XLogRecPtrToBytePos(endOfRecoveryInfo->lastRec);
+ Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
+
+ /*
+ * Tricky point here: lastPage contains the *last* block that the LastRec
+ * record spans, not the one it starts in. The last block is indeed the
+ * one we want to use.
+ */
+ if (EndOfLog % XLOG_BLCKSZ != 0)
+ {
+ char *page;
+ int len;
+ int firstIdx;
+
+ firstIdx = XLogRecPtrToBufIdx(EndOfLog);
+ len = EndOfLog - endOfRecoveryInfo->lastPageBeginPtr;
+ Assert(len < XLOG_BLCKSZ);
+
+ /* Copy the valid part of the last block, and zero the rest */
+ page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
+ memcpy(page, endOfRecoveryInfo->lastPage, len);
+ memset(page + len, 0, XLOG_BLCKSZ - len);
+
+ XLogCtl->xlblocks[firstIdx] = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
+ XLogCtl->InitializedUpTo = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
+ }
+ else
+ {
+ /*
+ * There is no partial block to copy. Just set InitializedUpTo, and
+ * let the first attempt to insert a log record to initialize the next
+ * buffer.
+ */
+ XLogCtl->InitializedUpTo = EndOfLog;
+ }
+
+ LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
+
+ XLogCtl->LogwrtResult = LogwrtResult;
+
+ XLogCtl->LogwrtRqst.Write = EndOfLog;
+ XLogCtl->LogwrtRqst.Flush = EndOfLog;
+
+ /*
+ * Preallocate additional log files, if wanted.
+ */
+ PreallocXlogFiles(EndOfLog, newTLI);
+
+ /*
+ * Okay, we're officially UP.
+ */
+ InRecovery = false;
+
+ /* start the archive_timeout timer and LSN running */
+ XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
+ XLogCtl->lastSegSwitchLSN = EndOfLog;
+
+ /* also initialize latestCompletedXid, to nextXid - 1 */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
+ FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid);
+ LWLockRelease(ProcArrayLock);
+
+ /*
+ * Start up subtrans, if not already done for hot standby. (commit
+ * timestamps are started below, if necessary.)
+ */
+ if (standbyState == STANDBY_DISABLED)
+ StartupSUBTRANS(oldestActiveXID);
+
+ /*
+ * Perform end of recovery actions for any SLRUs that need it.
+ */
+ TrimCLOG();
+ TrimMultiXact();
+
+ /*
+ * Reload shared-memory state for prepared transactions. This needs to
+ * happen before renaming the last partial segment of the old timeline,
+ * as we may have to recover some transactions from it.
+ */
+ RecoverPreparedTransactions();
+
+ /* Shut down xlogreader */
+ ShutdownWalRecovery();
+
+ /* Enable WAL writes for this backend only. */
+ LocalSetXLogInsertAllowed();
+
+ /* If necessary, write overwrite-contrecord before doing anything else */
+ if (!XLogRecPtrIsInvalid(abortedRecPtr))
+ {
+ Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
+ CreateOverwriteContrecordRecord(abortedRecPtr, missingContrecPtr, newTLI);
+ }
+
+ /*
+ * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
+ * record before resource manager writes cleanup WAL records or checkpoint
+ * record is written.
+ */
+ Insert->fullPageWrites = lastFullPageWrites;
+ UpdateFullPageWrites();
+
+ /*
+ * Emit checkpoint or end-of-recovery record in XLOG, if required.
+ */
+ if (performedWalRecovery)
+ promoted = PerformRecoveryXLogAction();
+
+ /*
+ * If any of the critical GUCs have changed, log them before we allow
+ * backends to write WAL.
+ */
+ XLogReportParameters();
+
+ /* If this is archive recovery, perform post-recovery cleanup actions. */
+ if (ArchiveRecoveryRequested)
+ CleanupAfterArchiveRecovery(EndOfLogTLI, EndOfLog, newTLI);
+
+ /*
+ * Local WAL inserts enabled, so it's time to finish initialization of
+ * commit timestamp.
+ */
+ CompleteCommitTsInitialization();
+
+ /*
+ * All done with end-of-recovery actions.
+ *
+ * Now allow backends to write WAL and update the control file status
+ * accordingly. SharedRecoveryState, which controls whether backends can
+ * write WAL, is updated while holding ControlFileLock, to prevent other
+ * backends from seeing an inconsistent state of the control file in
+ * shared memory. There is still a small window during which backends
+ * can write WAL while the on-disk control file still refers to a system
+ * not in the DB_IN_PRODUCTION state.
+ *
+ * Also, we use info_lck to update SharedRecoveryState to ensure that
+ * there are no race conditions concerning visibility of other recent
+ * updates to shared memory.
+ */
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ ControlFile->state = DB_IN_PRODUCTION;
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ UpdateControlFile();
+ LWLockRelease(ControlFileLock);
+
+ /*
+ * Shut down the recovery environment. This must occur after
+ * RecoverPreparedTransactions() (see notes in lock_twophase_recover())
+ * and after switching SharedRecoveryState to RECOVERY_STATE_DONE, so
+ * that any session building a snapshot will not rely on
+ * KnownAssignedXids, since RecoveryInProgress() returns false at this
+ * stage. This is particularly critical for prepared 2PC transactions,
+ * which would still need to be included in snapshots once recovery has
+ * ended.
+ */
+ if (standbyState != STANDBY_DISABLED)
+ ShutdownRecoveryTransactionEnvironment();
+
+ /*
+ * If there were cascading standby servers connected to us, nudge any wal
+ * sender processes to notice that we've been promoted.
+ */
+ WalSndWakeup();
+
+ /*
+ * If this was a promotion, request an (online) checkpoint now. This isn't
+ * required for consistency, but the last restartpoint might be far back,
+ * and in case of a crash, recovering from it might take longer than is
+ * appropriate now that we're not in standby mode anymore.
+ */
+ if (promoted)
+ RequestCheckpoint(CHECKPOINT_FORCE);
+}
+
+/*
+ * Callback from PerformWalRecovery(), called when we switch from crash
+ * recovery to archive recovery mode. Updates the control file accordingly.
+ */
+void
+SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
+{
+ /* initialize minRecoveryPoint to this record */
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
+ if (ControlFile->minRecoveryPoint < EndRecPtr)
+ {
+ ControlFile->minRecoveryPoint = EndRecPtr;
+ ControlFile->minRecoveryPointTLI = replayTLI;
+ }
+ /* update local copy */
+ LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
+ LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+
+ /*
+ * The startup process can update its local copy of minRecoveryPoint from
+ * this point.
+ */
+ updateMinRecoveryPoint = true;
+
+ UpdateControlFile();
+
+ /*
+ * We update SharedRecoveryState while holding the lock on ControlFileLock
+ * so both states are consistent in shared memory.
+ */
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ LWLockRelease(ControlFileLock);
+}
+
+/*
+ * Callback from PerformWalRecovery(), called when we reach the end of backup.
+ * Updates the control file accordingly.
+ */
+void
+ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
+{
+ /*
+ * We have reached the end of the base backup, as indicated by pg_control.
+ * The data on disk is now consistent (unless minRecoveryPoint is further
+ * ahead, which can happen if we crashed during a previous recovery). Reset
+ * backupStartPoint and backupEndPoint, and update minRecoveryPoint to
+ * make sure we don't allow starting up at an earlier point even if
+ * recovery is stopped and restarted soon after this.
+ */
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+
+ if (ControlFile->minRecoveryPoint < EndRecPtr)
+ {
+ ControlFile->minRecoveryPoint = EndRecPtr;
+ ControlFile->minRecoveryPointTLI = tli;
+ }
+
+ ControlFile->backupStartPoint = InvalidXLogRecPtr;
+ ControlFile->backupEndPoint = InvalidXLogRecPtr;
+ ControlFile->backupEndRequired = false;
+ UpdateControlFile();
+
+ LWLockRelease(ControlFileLock);
+}
+
+/*
+ * Perform whatever XLOG actions are necessary at end of REDO.
+ *
+ * The goal here is to make sure that we'll be able to recover properly if
+ * we crash again. If we choose to write a checkpoint, we'll write a shutdown
+ * checkpoint rather than an on-line one. This is not particularly critical,
+ * but since we may be assigning a new TLI, using a shutdown checkpoint allows
+ * us to have the rule that TLI only changes in shutdown checkpoints, which
+ * allows some extra error checking in xlog_redo.
+ */
+static bool
+PerformRecoveryXLogAction(void)
+{
+ bool promoted = false;
+
+ /*
+ * Perform a checkpoint to update all our recovery activity to disk.
+ *
+ * Note that we write a shutdown checkpoint rather than an on-line one.
+ * This is not particularly critical, but since we may be assigning a new
+ * TLI, using a shutdown checkpoint allows us to have the rule that TLI
+ * only changes in shutdown checkpoints, which allows some extra error
+ * checking in xlog_redo.
+ *
+ * In promotion, only create a lightweight end-of-recovery record instead
+ * of a full checkpoint. A checkpoint is requested later, after we're
+ * fully out of recovery mode and already accepting queries.
+ */
+ if (ArchiveRecoveryRequested && IsUnderPostmaster &&
+ PromoteIsTriggered())
+ {
+ promoted = true;
+
+ /*
+ * Insert a special WAL record to mark the end of recovery, since we
+ * aren't doing a checkpoint. That means that the checkpointer process
+ * may likely be in the middle of a time-smoothed restartpoint and
+ * could continue to be for minutes after this. That sounds strange,
+ * but the effect is roughly the same and it would be stranger to try
+ * to come out of the restartpoint and then checkpoint. We request a
+ * checkpoint later anyway, just for safety.
+ */
+ CreateEndOfRecoveryRecord();
+ }
+ else
+ {
+ RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
+ CHECKPOINT_IMMEDIATE |
+ CHECKPOINT_WAIT);
+ }
+
+ return promoted;
+}
+
+/*
+ * Is the system still in recovery?
+ *
+ * Unlike testing InRecovery, this works in any process that's connected to
+ * shared memory.
+ */
+bool
+RecoveryInProgress(void)
+{
+ /*
+ * We check shared state each time only until we leave recovery mode. We
+ * can't re-enter recovery, so there's no need to keep checking after the
+ * shared variable has once been seen false.
+ */
+ if (!LocalRecoveryInProgress)
+ return false;
+ else
+ {
+ /*
+ * use volatile pointer to make sure we make a fresh read of the
+ * shared variable.
+ */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);
+
+ /*
+ * Note: We don't need a memory barrier when we're still in recovery.
+ * We might exit recovery immediately after return, so the caller
+ * can't rely on 'true' meaning that we're still in recovery anyway.
+ */
+
+ return LocalRecoveryInProgress;
+ }
+}
+
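+/*
+ * Illustrative caller pattern (a sketch, not part of xlog.c; the function
+ * name is hypothetical): code paths that must not run during recovery
+ * typically guard themselves with RecoveryInProgress() like this.
+ */
+static void
+ErrorIfRecoveryInProgress(void)
+{
+ if (RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress")));
+}
+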
+/*
+ * Returns current recovery state from shared memory.
+ *
+ * This returned state is kept consistent with the contents of the control
+ * file. See details about the possible values of RecoveryState in xlog.h.
+ */
+RecoveryState
+GetRecoveryState(void)
+{
+ RecoveryState retval;
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ retval = XLogCtl->SharedRecoveryState;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ return retval;
+}
+
+/*
+ * Is this process allowed to insert new WAL records?
+ *
+ * Ordinarily this is essentially equivalent to !RecoveryInProgress().
+ * But we also have provisions for forcing the result "true" or "false"
+ * within specific processes regardless of the global state.
+ */
+bool
+XLogInsertAllowed(void)
+{
+ /*
+ * If value is "unconditionally true" or "unconditionally false", just
+ * return it. This provides the normal fast path once recovery is known
+ * done.
+ */
+ if (LocalXLogInsertAllowed >= 0)
+ return (bool) LocalXLogInsertAllowed;
+
+ /*
+ * Else, must check to see if we're still in recovery.
+ */
+ if (RecoveryInProgress())
+ return false;
+
+ /*
+ * On exit from recovery, reset to "unconditionally true", since there is
+ * no need to keep checking.
+ */
+ LocalXLogInsertAllowed = 1;
+ return true;
+}
+
+/*
+ * Make XLogInsertAllowed() return true in the current process only.
+ *
+ * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
+ * and even call LocalSetXLogInsertAllowed() again after that.
+ *
+ * Returns the previous value of LocalXLogInsertAllowed.
+ */
+static int
+LocalSetXLogInsertAllowed(void)
+{
+ int oldXLogAllowed = LocalXLogInsertAllowed;
+
+ LocalXLogInsertAllowed = 1;
+
+ return oldXLogAllowed;
+}
+
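+/*
+ * Illustrative save/restore pattern (a sketch; CreateCheckPoint's
+ * end-of-recovery path below does essentially this):
+ *
+ * int oldXLogAllowed = LocalSetXLogInsertAllowed();
+ * ... insert the end-of-recovery checkpoint record ...
+ * LocalXLogInsertAllowed = oldXLogAllowed;
+ */
+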
+/*
+ * Return the current Redo pointer from shared memory.
+ *
+ * As a side-effect, the local RedoRecPtr copy is updated.
+ */
+XLogRecPtr
+GetRedoRecPtr(void)
+{
+ XLogRecPtr ptr;
+
+ /*
+ * The possibly not up-to-date copy in XLogCtl is enough. Even if we
+ * grabbed a WAL insertion lock to read the authoritative value in
+ * Insert->RedoRecPtr, someone might update it just after we've released
+ * the lock.
+ */
+ SpinLockAcquire(&XLogCtl->info_lck);
+ ptr = XLogCtl->RedoRecPtr;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ if (RedoRecPtr < ptr)
+ RedoRecPtr = ptr;
+
+ return RedoRecPtr;
+}
+
+/*
+ * Return information needed to decide whether a modified block needs a
+ * full-page image to be included in the WAL record.
+ *
+ * The returned values are cached copies from backend-private memory, and
+ * possibly out-of-date or, indeed, uninitialized, in which case they will
+ * be InvalidXLogRecPtr and false, respectively. XLogInsertRecord will
+ * re-check them against up-to-date values, while holding the WAL insert lock.
+ */
+void
+GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
+{
+ *RedoRecPtr_p = RedoRecPtr;
+ *doPageWrites_p = doPageWrites;
+}
+
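+/*
+ * Illustrative sketch (not part of xlog.c; the helper name is
+ * hypothetical): the kind of check XLogInsertRecord performs with these
+ * cached values. A block whose page LSN is at or before the redo pointer
+ * has not been written to WAL since the last checkpoint began, so a
+ * full-page image is needed when full-page writes are in effect.
+ */
+static inline bool
+PageNeedsFullPageImage(XLogRecPtr page_lsn)
+{
+ XLogRecPtr redo;
+ bool do_fpw;
+
+ GetFullPageWriteInfo(&redo, &do_fpw);
+ return do_fpw && page_lsn <= redo;
+}
+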
+/*
+ * GetInsertRecPtr -- Returns the current insert position.
+ *
+ * NOTE: The value *actually* returned is the position of the last full
+ * xlog page. It lags behind the real insert position by at most 1 page.
+ * Because of that, we don't need to scan through WAL insertion locks, and an
+ * approximation is enough for the current usage of this function.
+ */
+XLogRecPtr
+GetInsertRecPtr(void)
+{
+ XLogRecPtr recptr;
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ recptr = XLogCtl->LogwrtRqst.Write;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ return recptr;
+}
+
+/*
+ * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
+ * position known to be fsync'd to disk. This should only be used on a
+ * system that is known not to be in recovery.
+ */
+XLogRecPtr
+GetFlushRecPtr(TimeLineID *insertTLI)
+{
+ Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ LogwrtResult = XLogCtl->LogwrtResult;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ /*
+ * If we're writing and flushing WAL, the time line can't be changing, so
+ * no lock is required.
+ */
+ if (insertTLI)
+ *insertTLI = XLogCtl->InsertTimeLineID;
+
+ return LogwrtResult.Flush;
+}
+
+/*
+ * GetWALInsertionTimeLine -- Returns the current timeline of a system that
+ * is not in recovery.
+ */
+TimeLineID
+GetWALInsertionTimeLine(void)
+{
+ Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
+
+ /* Since the value can't be changing, no lock is required. */
+ return XLogCtl->InsertTimeLineID;
+}
+
+/*
+ * GetLastImportantRecPtr -- Returns the LSN of the last important record
+ * inserted. All records not explicitly marked as unimportant are considered
+ * important.
+ *
+ * The LSN is determined by computing the maximum of
+ * WALInsertLocks[i].lastImportantAt.
+ */
+XLogRecPtr
+GetLastImportantRecPtr(void)
+{
+ XLogRecPtr res = InvalidXLogRecPtr;
+ int i;
+
+ for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
+ {
+ XLogRecPtr last_important;
+
+ /*
+ * Need to take a lock to prevent torn reads of the LSN, which are
+ * possible on some of the supported platforms. WAL insert locks only
+ * support exclusive mode, so we have to use that.
+ */
+ LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
+ last_important = WALInsertLocks[i].l.lastImportantAt;
+ LWLockRelease(&WALInsertLocks[i].l.lock);
+
+ if (res < last_important)
+ res = last_important;
+ }
+
+ return res;
+}
+
+/*
+ * Get the time and LSN of the last xlog segment switch
+ */
+pg_time_t
+GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
+{
+ pg_time_t result;
+
+ /* Need WALWriteLock, but shared lock is sufficient */
+ LWLockAcquire(WALWriteLock, LW_SHARED);
+ result = XLogCtl->lastSegSwitchTime;
+ *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
+ LWLockRelease(WALWriteLock);
+
+ return result;
+}
+
+/*
+ * This must be called ONCE during postmaster or standalone-backend shutdown
+ */
+void
+ShutdownXLOG(int code, Datum arg)
+{
+ /*
+ * We should have an aux process resource owner to use, and we should not
+ * be in a transaction that's installed some other resowner.
+ */
+ Assert(AuxProcessResourceOwner != NULL);
+ Assert(CurrentResourceOwner == NULL ||
+ CurrentResourceOwner == AuxProcessResourceOwner);
+ CurrentResourceOwner = AuxProcessResourceOwner;
+
+ /* Don't be chatty in standalone mode */
+ ereport(IsPostmasterEnvironment ? LOG : NOTICE,
+ (errmsg("shutting down")));
+
+ /*
+ * Signal walsenders to move to stopping state.
+ */
+ WalSndInitStopping();
+
+ /*
+ * Wait for WAL senders to be in stopping state. This prevents commands
+ * from writing new WAL.
+ */
+ WalSndWaitStopping();
+
+ if (RecoveryInProgress())
+ CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
+ else
+ {
+ /*
+ * If archiving is enabled, rotate the last XLOG file so that all the
+ * remaining records are archived (postmaster wakes up the archiver
+ * process one more time at the end of shutdown). The checkpoint
+ * record will go to the next XLOG file and won't be archived (yet).
+ */
+ if (XLogArchivingActive())
+ RequestXLogSwitch(false);
+
+ CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
+ }
+}
+
+/*
+ * Log start of a checkpoint.
+ */
+static void
+LogCheckpointStart(int flags, bool restartpoint)
+{
+ if (restartpoint)
+ ereport(LOG,
+ /* translator: the placeholders show checkpoint options */
+ (errmsg("restartpoint starting:%s%s%s%s%s%s%s%s",
+ (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
+ (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
+ (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
+ (flags & CHECKPOINT_FORCE) ? " force" : "",
+ (flags & CHECKPOINT_WAIT) ? " wait" : "",
+ (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
+ (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
+ (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
+ else
+ ereport(LOG,
+ /* translator: the placeholders show checkpoint options */
+ (errmsg("checkpoint starting:%s%s%s%s%s%s%s%s",
+ (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
+ (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
+ (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
+ (flags & CHECKPOINT_FORCE) ? " force" : "",
+ (flags & CHECKPOINT_WAIT) ? " wait" : "",
+ (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
+ (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
+ (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
+}
+
+/*
+ * Log end of a checkpoint.
+ */
+static void
+LogCheckpointEnd(bool restartpoint)
+{
+ long write_msecs,
+ sync_msecs,
+ total_msecs,
+ longest_msecs,
+ average_msecs;
+ uint64 average_sync_time;
+
+ CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
+
+ write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
+ CheckpointStats.ckpt_sync_t);
+
+ sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
+ CheckpointStats.ckpt_sync_end_t);
+
+ /* Accumulate checkpoint timing summary data, in milliseconds. */
+ PendingCheckpointerStats.checkpoint_write_time += write_msecs;
+ PendingCheckpointerStats.checkpoint_sync_time += sync_msecs;
+
+ /*
+ * All of the published timing statistics are accounted for. Only
+ * continue if a log message is to be written.
+ */
+ if (!log_checkpoints)
+ return;
+
+ total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
+ CheckpointStats.ckpt_end_t);
+
+ /*
+ * Timing values returned from CheckpointStats are in microseconds.
+ * Convert to milliseconds for consistent printing.
+ */
+ longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);
+
+ average_sync_time = 0;
+ if (CheckpointStats.ckpt_sync_rels > 0)
+ average_sync_time = CheckpointStats.ckpt_agg_sync_time /
+ CheckpointStats.ckpt_sync_rels;
+ average_msecs = (long) ((average_sync_time + 999) / 1000);
+
+ if (restartpoint)
+ ereport(LOG,
+ (errmsg("restartpoint complete: wrote %d buffers (%.1f%%); "
+ "%d WAL file(s) added, %d removed, %d recycled; "
+ "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
+ "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
+ "distance=%d kB, estimate=%d kB",
+ CheckpointStats.ckpt_bufs_written,
+ (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+ CheckpointStats.ckpt_segs_added,
+ CheckpointStats.ckpt_segs_removed,
+ CheckpointStats.ckpt_segs_recycled,
+ write_msecs / 1000, (int) (write_msecs % 1000),
+ sync_msecs / 1000, (int) (sync_msecs % 1000),
+ total_msecs / 1000, (int) (total_msecs % 1000),
+ CheckpointStats.ckpt_sync_rels,
+ longest_msecs / 1000, (int) (longest_msecs % 1000),
+ average_msecs / 1000, (int) (average_msecs % 1000),
+ (int) (PrevCheckPointDistance / 1024.0),
+ (int) (CheckPointDistanceEstimate / 1024.0))));
+ else
+ ereport(LOG,
+ (errmsg("checkpoint complete: wrote %d buffers (%.1f%%); "
+ "%d WAL file(s) added, %d removed, %d recycled; "
+ "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
+ "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
+ "distance=%d kB, estimate=%d kB",
+ CheckpointStats.ckpt_bufs_written,
+ (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+ CheckpointStats.ckpt_segs_added,
+ CheckpointStats.ckpt_segs_removed,
+ CheckpointStats.ckpt_segs_recycled,
+ write_msecs / 1000, (int) (write_msecs % 1000),
+ sync_msecs / 1000, (int) (sync_msecs % 1000),
+ total_msecs / 1000, (int) (total_msecs % 1000),
+ CheckpointStats.ckpt_sync_rels,
+ longest_msecs / 1000, (int) (longest_msecs % 1000),
+ average_msecs / 1000, (int) (average_msecs % 1000),
+ (int) (PrevCheckPointDistance / 1024.0),
+ (int) (CheckPointDistanceEstimate / 1024.0))));
+}
+
+/*
+ * Update the estimate of distance between checkpoints.
+ *
+ * The estimate is used to calculate the number of WAL segments to keep
+ * preallocated, see XLOGfileslop().
+ */
+static void
+UpdateCheckPointDistanceEstimate(uint64 nbytes)
+{
+ /*
+ * To estimate the number of segments consumed between checkpoints, keep a
+ * moving average of the amount of WAL generated in previous checkpoint
+ * cycles. However, if the load is bursty, with quiet periods and busy
+ * periods, we want to cater for the peak load. So instead of a plain
+ * moving average, let the average decline slowly if the previous cycle
+ * used less WAL than estimated, but bump it up immediately if it used
+ * more.
+ *
+ * When checkpoints are triggered by max_wal_size, this should converge to
+ * CheckpointSegments * wal_segment_size.
+ *
+ * Note: This doesn't pay any attention to what caused the checkpoint.
+ * Checkpoints triggered manually with the CHECKPOINT command, or by e.g.
+ * starting a base backup, are counted the same as those created
+ * automatically. The slow decline will largely mask them out if they are
+ * not frequent. If they are frequent, it seems reasonable to count them
+ * in as any others; if you issue a manual checkpoint every 5 minutes and
+ * never let a timed checkpoint happen, it makes sense to base the
+ * preallocation on that 5 minute interval rather than whatever
+ * checkpoint_timeout is set to.
+ */
+ PrevCheckPointDistance = nbytes;
+ if (CheckPointDistanceEstimate < nbytes)
+ CheckPointDistanceEstimate = nbytes;
+ else
+ CheckPointDistanceEstimate =
+ (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
+}
+
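+/*
+ * Worked example (illustrative): with CheckPointDistanceEstimate =
+ * 1000000 bytes, a quiet cycle of nbytes = 500000 only lowers the
+ * estimate to 0.90 * 1000000 + 0.10 * 500000 = 950000, while a busy
+ * cycle of nbytes = 2000000 raises it immediately to 2000000.
+ */
+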
+/*
+ * Update the ps display for a process running a checkpoint. Note that
+ * this routine should not do any allocations, so that it can be called
+ * from a critical section.
+ */
+static void
+update_checkpoint_display(int flags, bool restartpoint, bool reset)
+{
+ /*
+ * The status is reported only for end-of-recovery and shutdown
+ * checkpoints or shutdown restartpoints. Updating the ps display is
+ * useful in those situations as it may not be possible to rely on
+ * pg_stat_activity to see the status of the checkpointer or the startup
+ * process.
+ */
+ if ((flags & (CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IS_SHUTDOWN)) == 0)
+ return;
+
+ if (reset)
+ set_ps_display("");
+ else
+ {
+ char activitymsg[128];
+
+ snprintf(activitymsg, sizeof(activitymsg), "performing %s%s%s",
+ (flags & CHECKPOINT_END_OF_RECOVERY) ? "end-of-recovery " : "",
+ (flags & CHECKPOINT_IS_SHUTDOWN) ? "shutdown " : "",
+ restartpoint ? "restartpoint" : "checkpoint");
+ set_ps_display(activitymsg);
+ }
+}
+
+
+/*
+ * Perform a checkpoint --- either during shutdown, or on-the-fly
+ *
+ * flags is a bitwise OR of the following:
+ * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
+ * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
+ * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
+ * ignoring checkpoint_completion_target parameter.
+ * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
+ * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
+ * CHECKPOINT_END_OF_RECOVERY).
+ * CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
+ *
+ * Note: flags contains other bits, of interest here only for logging purposes.
+ * In particular note that this routine is synchronous and does not pay
+ * attention to CHECKPOINT_WAIT.
+ *
+ * If !shutdown then we are writing an online checkpoint. This is a very special
+ * kind of operation and WAL record because the checkpoint action occurs over
+ * a period of time yet logically occurs at just a single LSN. The logical
+ * position of the WAL record (redo ptr) is the same or earlier than the
+ * physical position. When we replay WAL we locate the checkpoint via its
+ * physical position then read the redo ptr and actually start replay at the
+ * earlier logical position. Note that we don't write *anything* to WAL at
+ * the logical position, so that location could be any other kind of WAL record.
+ * All of this mechanism allows us to continue working while we checkpoint.
+ * As a result, timing of actions is critical here; note that this
+ * function will likely take minutes to execute on a busy system.
+ */
+void
+CreateCheckPoint(int flags)
+{
+ bool shutdown;
+ CheckPoint checkPoint;
+ XLogRecPtr recptr;
+ XLogSegNo _logSegNo;
+ XLogCtlInsert *Insert = &XLogCtl->Insert;
+ uint32 freespace;
+ XLogRecPtr PriorRedoPtr;
+ XLogRecPtr curInsert;
+ XLogRecPtr last_important_lsn;
+ VirtualTransactionId *vxids;
+ int nvxids;
+ int oldXLogAllowed = 0;
+
+ /*
+ * An end-of-recovery checkpoint is really a shutdown checkpoint, just
+ * issued at a different time.
+ */
+ if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
+ shutdown = true;
+ else
+ shutdown = false;
+
+ /* sanity check */
+ if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
+ elog(ERROR, "can't create a checkpoint during recovery");
+
+ /*
+ * Prepare to accumulate statistics.
+ *
+ * Note: because it is possible for log_checkpoints to change while a
+ * checkpoint proceeds, we always accumulate stats, even if
+ * log_checkpoints is currently off.
+ */
+ MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+ CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+
+ /*
+ * Let smgr prepare for checkpoint; this has to happen outside the
+ * critical section and before we determine the REDO pointer. Note that
+ * smgr must not do anything that'd have to be undone if we decide no
+ * checkpoint is needed.
+ */
+ SyncPreCheckpoint();
+
+ /*
+ * Use a critical section to force system panic if we have trouble.
+ */
+ START_CRIT_SECTION();
+
+ if (shutdown)
+ {
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ ControlFile->state = DB_SHUTDOWNING;
+ UpdateControlFile();
+ LWLockRelease(ControlFileLock);
+ }
+
+ /* Begin filling in the checkpoint WAL record */
+ MemSet(&checkPoint, 0, sizeof(checkPoint));
+ checkPoint.time = (pg_time_t) time(NULL);
+
+ /*
+ * For Hot Standby, derive the oldestActiveXid before we fix the redo
+ * pointer. This allows us to begin accumulating changes to assemble our
+ * starting snapshot of locks and transactions.
+ */
+ if (!shutdown && XLogStandbyInfoActive())
+ checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
+ else
+ checkPoint.oldestActiveXid = InvalidTransactionId;
+
+ /*
+ * Get the location of the last important record before acquiring the
+ * insert locks (as GetLastImportantRecPtr() also takes the WAL insertion
+ * locks).
+ */
+ last_important_lsn = GetLastImportantRecPtr();
+
+ /*
+ * We must block concurrent insertions while examining insert state to
+ * determine the checkpoint REDO pointer.
+ */
+ WALInsertLockAcquireExclusive();
+ curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
+
+ /*
+ * If this isn't a shutdown or forced checkpoint, and if there has been no
+ * WAL activity requiring a checkpoint, skip it. The idea here is to
+ * avoid inserting duplicate checkpoints when the system is idle.
+ */
+ if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
+ CHECKPOINT_FORCE)) == 0)
+ {
+ if (last_important_lsn == ControlFile->checkPoint)
+ {
+ WALInsertLockRelease();
+ END_CRIT_SECTION();
+ ereport(DEBUG1,
+ (errmsg_internal("checkpoint skipped because system is idle")));
+ return;
+ }
+ }
+
+ /*
+ * An end-of-recovery checkpoint is created before anyone is allowed to
+ * write WAL. To allow us to write the checkpoint record, temporarily
+ * enable XLogInsertAllowed.
+ */
+ if (flags & CHECKPOINT_END_OF_RECOVERY)
+ oldXLogAllowed = LocalSetXLogInsertAllowed();
+
+ checkPoint.ThisTimeLineID = XLogCtl->InsertTimeLineID;
+ if (flags & CHECKPOINT_END_OF_RECOVERY)
+ checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
+ else
+ checkPoint.PrevTimeLineID = checkPoint.ThisTimeLineID;
+
+ checkPoint.fullPageWrites = Insert->fullPageWrites;
+
+ /*
+ * Compute new REDO record ptr = location of next XLOG record.
+ *
+ * NB: this is NOT necessarily where the checkpoint record itself will be,
+ * since other backends may insert more XLOG records while we're off doing
+ * the buffer flush work. Those XLOG records are logically after the
+ * checkpoint, even though physically before it. Got that?
+ */
+ freespace = INSERT_FREESPACE(curInsert);
+ if (freespace == 0)
+ {
+ if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
+ curInsert += SizeOfXLogLongPHD;
+ else
+ curInsert += SizeOfXLogShortPHD;
+ }
+ checkPoint.redo = curInsert;
+
+ /*
+ * Here we update the shared RedoRecPtr for future XLogInsert calls; this
+ * must be done while holding all the insertion locks.
+ *
+ * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
+ * pointing past where it really needs to point. This is okay; the only
+ * consequence is that XLogInsert might back up whole buffers that it
+ * didn't really need to. We can't postpone advancing RedoRecPtr because
+ * XLogInserts that happen while we are dumping buffers must assume that
+ * their buffer changes are not included in the checkpoint.
+ */
+ RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
+
+ /*
+ * Now we can release the WAL insertion locks, allowing other xacts to
+ * proceed while we are flushing disk buffers.
+ */
+ WALInsertLockRelease();
+
+ /* Update the info_lck-protected copy of RedoRecPtr as well */
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->RedoRecPtr = checkPoint.redo;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ /*
+ * If enabled, log checkpoint start. We postpone this until now so as not
+ * to log anything if we decided to skip the checkpoint.
+ */
+ if (log_checkpoints)
+ LogCheckpointStart(flags, false);
+
+ /* Update the process title */
+ update_checkpoint_display(flags, false, false);
+
+ TRACE_POSTGRESQL_CHECKPOINT_START(flags);
+
+ /*
+ * Get the other info we need for the checkpoint record.
+ *
+ * We don't need to save oldestClogXid in the checkpoint, it only matters
+ * for the short period in which clog is being truncated, and if we crash
+ * during that we'll redo the clog truncation and fix up oldestClogXid
+ * there.
+ */
+ LWLockAcquire(XidGenLock, LW_SHARED);
+ checkPoint.nextXid = ShmemVariableCache->nextXid;
+ checkPoint.oldestXid = ShmemVariableCache->oldestXid;
+ checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
+ LWLockRelease(XidGenLock);
+
+ LWLockAcquire(CommitTsLock, LW_SHARED);
+ checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
+ checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
+ LWLockRelease(CommitTsLock);
+
+ LWLockAcquire(OidGenLock, LW_SHARED);
+ checkPoint.nextOid = ShmemVariableCache->nextOid;
+ if (!shutdown)
+ checkPoint.nextOid += ShmemVariableCache->oidCount;
+ LWLockRelease(OidGenLock);
+
+ MultiXactGetCheckptMulti(shutdown,
+ &checkPoint.nextMulti,
+ &checkPoint.nextMultiOffset,
+ &checkPoint.oldestMulti,
+ &checkPoint.oldestMultiDB);
+
+ /*
+ * Having constructed the checkpoint record, ensure all shmem disk buffers
+ * and commit-log buffers are flushed to disk.
+ *
+ * This I/O could fail for various reasons. If so, we will fail to
+ * complete the checkpoint, but there is no reason to force a system
+ * panic. Accordingly, exit critical section while doing it.
+ */
+ END_CRIT_SECTION();
+
+ /*
+ * In some cases there are groups of actions that must all occur on one
+ * side or the other of a checkpoint record. Before flushing the
+ * checkpoint record we must explicitly wait for any backend currently
+ * performing those groups of actions.
+ *
+ * One example is end of transaction, so we must wait for any transactions
+ * that are currently in commit critical sections. If an xact inserted
+ * its commit record into XLOG just before the REDO point, then a crash
+ * restart from the REDO point would not replay that record, which means
+ * that our flushing had better include the xact's update of pg_xact. So
+ * we wait until it's out of its commit critical section before proceeding.
+ * See notes in RecordTransactionCommit().
+ *
+ * Because we've already released the insertion locks, this test is a bit
+ * fuzzy: it is possible that we will wait for xacts we didn't really need
+ * to wait for. But the delay should be short and it seems better to make
+ * checkpoint take a bit longer than to hold off insertions longer than
+ * necessary. (In fact, the whole reason we have this issue is that xact.c
+ * does commit record XLOG insertion and clog update as two separate steps
+ * protected by different locks, but again that seems best on grounds of
+ * minimizing lock contention.)
+ *
+ * A transaction that has not yet set delayChkptFlags when we look cannot
+ * be at risk, since it has not inserted its commit record yet; and one
+ * that's already cleared it is not at risk either, since it's done fixing
+ * clog and we will correctly flush the update below. So we cannot miss
+ * any xacts we need to wait for.
+ */
+ vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
+ if (nvxids > 0)
+ {
+ do
+ {
+ pg_usleep(10000L); /* wait for 10 msec */
+ } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
+ DELAY_CHKPT_START));
+ }
+ pfree(vxids);
+
+ CheckPointGuts(checkPoint.redo, flags);
+
+ vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
+ if (nvxids > 0)
+ {
+ do
+ {
+ pg_usleep(10000L); /* wait for 10 msec */
+ } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
+ DELAY_CHKPT_COMPLETE));
+ }
+ pfree(vxids);
+
+ /*
+ * Take a snapshot of running transactions and write this to WAL. This
+ * allows us to reconstruct the state of running transactions during
+ * archive recovery, if required. Skip this if the info is disabled.
+ *
+ * If we are shutting down, or the startup process is completing crash
+ * recovery, we don't need to write running-xact data.
+ */
+ if (!shutdown && XLogStandbyInfoActive())
+ LogStandbySnapshot();
+
+ START_CRIT_SECTION();
+
+ /*
+ * Now insert the checkpoint record into XLOG.
+ */
+ XLogBeginInsert();
+ XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
+ recptr = XLogInsert(RM_XLOG_ID,
+ shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
+ XLOG_CHECKPOINT_ONLINE);
+
+ XLogFlush(recptr);
+
+ /*
+ * We mustn't write any new WAL after a shutdown checkpoint, or it will be
+ * overwritten at next startup. No one should even try; this just allows
+ * sanity-checking. In the case of an end-of-recovery checkpoint, we want
+ * to just temporarily disable writing until the system has exited
+ * recovery.
+ */
+ if (shutdown)
+ {
+ if (flags & CHECKPOINT_END_OF_RECOVERY)
+ LocalXLogInsertAllowed = oldXLogAllowed;
+ else
+ LocalXLogInsertAllowed = 0; /* never again write WAL */
+ }
+
+ /*
+ * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
+ * = end of actual checkpoint record.
+ */
+ if (shutdown && checkPoint.redo != ProcLastRecPtr)
+ ereport(PANIC,
+ (errmsg("concurrent write-ahead log activity while database system is shutting down")));
+
+ /*
+ * Remember the prior checkpoint's redo ptr for
+ * UpdateCheckPointDistanceEstimate()
+ */
+ PriorRedoPtr = ControlFile->checkPointCopy.redo;
+
+ /*
+ * Update the control file.
+ */
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ if (shutdown)
+ ControlFile->state = DB_SHUTDOWNED;
+ ControlFile->checkPoint = ProcLastRecPtr;
+ ControlFile->checkPointCopy = checkPoint;
+ /* crash recovery should always recover to the end of WAL */
+ ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
+ ControlFile->minRecoveryPointTLI = 0;
+
+ /*
+ * Persist unloggedLSN value. It's reset on crash recovery, so this goes
+ * unused on non-shutdown checkpoints, but seems useful to store it always
+ * for debugging purposes.
+ */
+ SpinLockAcquire(&XLogCtl->ulsn_lck);
+ ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
+ SpinLockRelease(&XLogCtl->ulsn_lck);
+
+ UpdateControlFile();
+ LWLockRelease(ControlFileLock);
+
+ /* Update shared-memory copy of checkpoint XID/epoch */
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->ckptFullXid = checkPoint.nextXid;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ /*
+ * We are now done with critical updates; no need for system panic if we
+ * have trouble while fooling with old log segments.
+ */
+ END_CRIT_SECTION();
+
+ /*
+ * Let smgr do post-checkpoint cleanup (eg, deleting old files).
+ */
+ SyncPostCheckpoint();
+
+ /*
+ * Update the average distance between checkpoints if the prior checkpoint
+ * exists.
+ */
+ if (PriorRedoPtr != InvalidXLogRecPtr)
+ UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
+
+ /*
+	 * Delete old log files that are no longer needed for the last
+	 * checkpoint, to prevent the disk holding the xlog from filling up.
+ */
+ XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
+ KeepLogSeg(recptr, &_logSegNo);
+ if (InvalidateObsoleteReplicationSlots(_logSegNo))
+ {
+ /*
+ * Some slots have been invalidated; recalculate the old-segment
+ * horizon, starting again from RedoRecPtr.
+ */
+ XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
+ KeepLogSeg(recptr, &_logSegNo);
+ }
+ _logSegNo--;
+ RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr,
+ checkPoint.ThisTimeLineID);
+
+ /*
+ * Make more log segments if needed. (Do this after recycling old log
+ * segments, since that may supply some of the needed files.)
+ */
+ if (!shutdown)
+ PreallocXlogFiles(recptr, checkPoint.ThisTimeLineID);
+
+ /*
+ * Truncate pg_subtrans if possible. We can throw away all data before
+ * the oldest XMIN of any running transaction. No future transaction will
+ * attempt to reference any pg_subtrans entry older than that (see Asserts
+ * in subtrans.c). During recovery, though, we mustn't do this because
+ * StartupSUBTRANS hasn't been called yet.
+ */
+ if (!RecoveryInProgress())
+ TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
+
+ /* Real work is done; log and update stats. */
+ LogCheckpointEnd(false);
+
+ /* Reset the process title */
+ update_checkpoint_display(flags, false, true);
+
+ TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
+ NBuffers,
+ CheckpointStats.ckpt_segs_added,
+ CheckpointStats.ckpt_segs_removed,
+ CheckpointStats.ckpt_segs_recycled);
+}
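+
+/*
+ * As an illustration of the DELAY_CHKPT handshake waited on above: a backend
+ * that must not have a checkpoint happen between its WAL insertion and the
+ * corresponding shared-state update sets a flag in its PGPROC around the
+ * critical sequence.  A minimal sketch only (see the real uses of
+ * DELAY_CHKPT_START, e.g. in transaction commit processing):
+ *
+ *		MyProc->delayChkptFlags |= DELAY_CHKPT_START;
+ *		... insert WAL record, then update clog ...
+ *		MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+ */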
+
+/*
+ * Mark the end of recovery in WAL, without running a full checkpoint. We
+ * can expect that a restartpoint is likely to be in progress as we do
+ * this, though we are unwilling to wait for it to complete.
+ *
+ * CreateRestartPoint() allows for the case where recovery may end before
+ * the restartpoint completes, so there is no concern about concurrent
+ * behaviour.
+ */
+static void
+CreateEndOfRecoveryRecord(void)
+{
+ xl_end_of_recovery xlrec;
+ XLogRecPtr recptr;
+
+ /* sanity check */
+ if (!RecoveryInProgress())
+ elog(ERROR, "can only be used to end recovery");
+
+ xlrec.end_time = GetCurrentTimestamp();
+
+ WALInsertLockAcquireExclusive();
+ xlrec.ThisTimeLineID = XLogCtl->InsertTimeLineID;
+ xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
+ WALInsertLockRelease();
+
+ START_CRIT_SECTION();
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
+ recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
+
+ XLogFlush(recptr);
+
+ /*
+ * Update the control file so that crash recovery can follow the timeline
+ * changes to this point.
+ */
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ ControlFile->minRecoveryPoint = recptr;
+ ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID;
+ UpdateControlFile();
+ LWLockRelease(ControlFileLock);
+
+ END_CRIT_SECTION();
+}
+
+/*
+ * Write an OVERWRITE_CONTRECORD message.
+ *
+ * When during WAL replay we expect a continuation record at the start of a
+ * page but it is not there, recovery ends and WAL writing resumes at that
+ * point.
+ * But it's wrong to resume writing new WAL back at the start of the record
+ * that was broken, because downstream consumers of that WAL (physical
+ * replicas) are not prepared to "rewind". So the first action after
+ * finishing replay of all valid WAL must be to write a record of this type
+ * at the point where the contrecord was missing; to support xlogreader
+ * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
+ * to the page header where the record occurs. xlogreader has an ad-hoc
+ * mechanism to report metadata about the broken record, which is what we
+ * use here.
+ *
+ * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
+ * skip the record it was reading, and pass back the LSN of the skipped
+ * record, so that its caller can verify (on "replay" of that record) that the
+ * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
+ *
+ * 'aborted_lsn' is the beginning position of the record that was incomplete.
+ * It is included in the WAL record. 'pagePtr' is the beginning of the XLOG
+ * page where the record is to be inserted, and 'newTLI' is the timeline it
+ * is inserted on. They must match the current WAL insert position; they're
+ * passed here just so that we can verify that.
+ */
+static XLogRecPtr
+CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr,
+ TimeLineID newTLI)
+{
+ xl_overwrite_contrecord xlrec;
+ XLogRecPtr recptr;
+ XLogPageHeader pagehdr;
+ XLogRecPtr startPos;
+
+ /* sanity checks */
+ if (!RecoveryInProgress())
+ elog(ERROR, "can only be used at end of recovery");
+ if (pagePtr % XLOG_BLCKSZ != 0)
+ elog(ERROR, "invalid position for missing continuation record %X/%X",
+ LSN_FORMAT_ARGS(pagePtr));
+
+ /* The current WAL insert position should be right after the page header */
+ startPos = pagePtr;
+ if (XLogSegmentOffset(startPos, wal_segment_size) == 0)
+ startPos += SizeOfXLogLongPHD;
+ else
+ startPos += SizeOfXLogShortPHD;
+ recptr = GetXLogInsertRecPtr();
+ if (recptr != startPos)
+ elog(ERROR, "invalid WAL insert position %X/%X for OVERWRITE_CONTRECORD",
+ LSN_FORMAT_ARGS(recptr));
+
+ START_CRIT_SECTION();
+
+ /*
+ * Initialize the XLOG page header (by GetXLogBuffer), and set the
+ * XLP_FIRST_IS_OVERWRITE_CONTRECORD flag.
+ *
+ * No other backend is allowed to write WAL yet, so acquiring the WAL
+ * insertion lock is just pro forma.
+ */
+ WALInsertLockAcquire();
+ pagehdr = (XLogPageHeader) GetXLogBuffer(pagePtr, newTLI);
+ pagehdr->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
+ WALInsertLockRelease();
+
+ /*
+ * Insert the XLOG_OVERWRITE_CONTRECORD record as the first record on the
+ * page. We know it becomes the first record, because no other backend is
+ * allowed to write WAL yet.
+ */
+ XLogBeginInsert();
+ xlrec.overwritten_lsn = aborted_lsn;
+ xlrec.overwrite_time = GetCurrentTimestamp();
+ XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord));
+ recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);
+
+ /* check that the record was inserted to the right place */
+ if (ProcLastRecPtr != startPos)
+ elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%X",
+ LSN_FORMAT_ARGS(ProcLastRecPtr));
+
+ XLogFlush(recptr);
+
+ END_CRIT_SECTION();
+
+ return recptr;
+}
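+
+/*
+ * A minimal sketch of the startPos arithmetic used above, for illustration
+ * only (the helper name is invented; the server uses the inline logic in
+ * CreateOverwriteContrecordRecord).  The first page of each segment carries
+ * the long page header, all later pages the short one, so the first
+ * insertable byte of a fresh page is:
+ */
+#ifdef NOT_USED
+static XLogRecPtr
+first_insert_pos_on_page(XLogRecPtr pagePtr)
+{
+	Assert(pagePtr % XLOG_BLCKSZ == 0); /* must be page-aligned */
+
+	if (XLogSegmentOffset(pagePtr, wal_segment_size) == 0)
+		return pagePtr + SizeOfXLogLongPHD; /* segment's first page */
+	else
+		return pagePtr + SizeOfXLogShortPHD;	/* any later page */
+}
+#endif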
+
+/*
+ * Flush all data in shared memory to disk, and fsync
+ *
+ * This is the common code shared between regular checkpoints and
+ * recovery restartpoints.
+ */
+static void
+CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
+{
+ CheckPointRelationMap();
+ CheckPointReplicationSlots();
+ CheckPointSnapBuild();
+ CheckPointLogicalRewriteHeap();
+ CheckPointReplicationOrigin();
+
+ /* Write out all dirty data in SLRUs and the main buffer pool */
+ TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
+ CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
+ CheckPointCLOG();
+ CheckPointCommitTs();
+ CheckPointSUBTRANS();
+ CheckPointMultiXact();
+ CheckPointPredicate();
+ CheckPointBuffers(flags);
+
+ /* Perform all queued up fsyncs */
+ TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
+ CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
+ ProcessSyncRequests();
+ CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
+ TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
+
+ /* We deliberately delay 2PC checkpointing as long as possible */
+ CheckPointTwoPhase(checkPointRedo);
+}
+
+/*
+ * Save a checkpoint for recovery restart if appropriate
+ *
+ * This function is called each time a checkpoint record is read from XLOG.
+ * It must determine whether the checkpoint represents a safe restartpoint or
+ * not. If so, the checkpoint record is stashed in shared memory so that
+ * CreateRestartPoint can consult it. (Note that the latter function is
+ * executed by the checkpointer, while this one will be executed by the
+ * startup process.)
+ */
+static void
+RecoveryRestartPoint(const CheckPoint *checkPoint, XLogReaderState *record)
+{
+ /*
+	 * Refrain from creating a restartpoint if we have seen any
+ * references to non-existent pages. Restarting recovery from the
+ * restartpoint would not see the references, so we would lose the
+ * cross-check that the pages belonged to a relation that was dropped
+ * later.
+ */
+ if (XLogHaveInvalidPages())
+ {
+ elog(trace_recovery(DEBUG2),
+ "could not record restart point at %X/%X because there "
+ "are unresolved references to invalid pages",
+ LSN_FORMAT_ARGS(checkPoint->redo));
+ return;
+ }
+
+ /*
+ * Copy the checkpoint record to shared memory, so that checkpointer can
+ * work out the next time it wants to perform a restartpoint.
+ */
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->lastCheckPointRecPtr = record->ReadRecPtr;
+ XLogCtl->lastCheckPointEndPtr = record->EndRecPtr;
+ XLogCtl->lastCheckPoint = *checkPoint;
+ SpinLockRelease(&XLogCtl->info_lck);
+}
+
+/*
+ * Establish a restartpoint if possible.
+ *
+ * This is similar to CreateCheckPoint, but is used during WAL recovery
+ * to establish a point from which recovery can roll forward without
+ * replaying the entire recovery log.
+ *
+ * Returns true if a new restartpoint was established. We can only establish
+ * a restartpoint if we have replayed a safe checkpoint record since last
+ * restartpoint.
+ */
+bool
+CreateRestartPoint(int flags)
+{
+ XLogRecPtr lastCheckPointRecPtr;
+ XLogRecPtr lastCheckPointEndPtr;
+ CheckPoint lastCheckPoint;
+ XLogRecPtr PriorRedoPtr;
+ XLogRecPtr receivePtr;
+ XLogRecPtr replayPtr;
+ TimeLineID replayTLI;
+ XLogRecPtr endptr;
+ XLogSegNo _logSegNo;
+ TimestampTz xtime;
+
+ /* Concurrent checkpoint/restartpoint cannot happen */
+ Assert(!IsUnderPostmaster || MyBackendType == B_CHECKPOINTER);
+
+ /* Get a local copy of the last safe checkpoint record. */
+ SpinLockAcquire(&XLogCtl->info_lck);
+ lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
+ lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
+ lastCheckPoint = XLogCtl->lastCheckPoint;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ /*
+	 * Check that we're still in recovery mode. It's ok if we exit recovery
+	 * mode after this check; the restart point is valid anyway.
+ */
+ if (!RecoveryInProgress())
+ {
+ ereport(DEBUG2,
+ (errmsg_internal("skipping restartpoint, recovery has already ended")));
+ return false;
+ }
+
+ /*
+ * If the last checkpoint record we've replayed is already our last
+ * restartpoint, we can't perform a new restart point. We still update
+ * minRecoveryPoint in that case, so that if this is a shutdown restart
+ * point, we won't start up earlier than before. That's not strictly
+ * necessary, but when hot standby is enabled, it would be rather weird if
+ * the database opened up for read-only connections at a point-in-time
+ * before the last shutdown. Such time travel is still possible in case of
+ * immediate shutdown, though.
+ *
+ * We don't explicitly advance minRecoveryPoint when we do create a
+ * restartpoint. It's assumed that flushing the buffers will do that as a
+ * side-effect.
+ */
+ if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
+ lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
+ {
+ ereport(DEBUG2,
+ (errmsg_internal("skipping restartpoint, already performed at %X/%X",
+ LSN_FORMAT_ARGS(lastCheckPoint.redo))));
+
+ UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
+ if (flags & CHECKPOINT_IS_SHUTDOWN)
+ {
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
+ UpdateControlFile();
+ LWLockRelease(ControlFileLock);
+ }
+ return false;
+ }
+
+ /*
+ * Update the shared RedoRecPtr so that the startup process can calculate
+ * the number of segments replayed since last restartpoint, and request a
+ * restartpoint if it exceeds CheckPointSegments.
+ *
+ * Like in CreateCheckPoint(), hold off insertions to update it, although
+ * during recovery this is just pro forma, because no WAL insertions are
+ * happening.
+ */
+ WALInsertLockAcquireExclusive();
+ RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
+ WALInsertLockRelease();
+
+ /* Also update the info_lck-protected copy */
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->RedoRecPtr = lastCheckPoint.redo;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ /*
+ * Prepare to accumulate statistics.
+ *
+ * Note: because it is possible for log_checkpoints to change while a
+ * checkpoint proceeds, we always accumulate stats, even if
+ * log_checkpoints is currently off.
+ */
+ MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+ CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+
+ if (log_checkpoints)
+ LogCheckpointStart(flags, true);
+
+ /* Update the process title */
+ update_checkpoint_display(flags, true, false);
+
+ CheckPointGuts(lastCheckPoint.redo, flags);
+
+ /*
+ * Remember the prior checkpoint's redo ptr for
+ * UpdateCheckPointDistanceEstimate()
+ */
+ PriorRedoPtr = ControlFile->checkPointCopy.redo;
+
+ /*
+ * Update pg_control, using current time. Check that it still shows an
+ * older checkpoint, else do nothing; this is a quick hack to make sure
+ * nothing really bad happens if somehow we get here after the
+ * end-of-recovery checkpoint.
+ */
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ if (ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
+ {
+ /*
+ * Update the checkpoint information. We do this even if the cluster
+		 * does not show DB_IN_ARCHIVE_RECOVERY, to stay consistent with the
+		 * set of WAL segments recycled below.
+ */
+ ControlFile->checkPoint = lastCheckPointRecPtr;
+ ControlFile->checkPointCopy = lastCheckPoint;
+
+ /*
+ * Ensure minRecoveryPoint is past the checkpoint record and update it
+ * if the control file still shows DB_IN_ARCHIVE_RECOVERY. Normally,
+ * this will have happened already while writing out dirty buffers,
+ * but not necessarily - e.g. because no buffers were dirtied. We do
+ * this because a backup performed in recovery uses minRecoveryPoint
+ * to determine which WAL files must be included in the backup, and
+ * the file (or files) containing the checkpoint record must be
+ * included, at a minimum. Note that for an ordinary restart of
+ * recovery there's no value in having the minimum recovery point any
+ * earlier than this anyway, because redo will begin just after the
+ * checkpoint record.
+ */
+ if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
+ {
+ if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
+ {
+ ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
+ ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
+
+ /* update local copy */
+ LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
+ LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+ }
+ if (flags & CHECKPOINT_IS_SHUTDOWN)
+ ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
+ }
+ UpdateControlFile();
+ }
+ LWLockRelease(ControlFileLock);
+
+ /*
+ * Update the average distance between checkpoints/restartpoints if the
+ * prior checkpoint exists.
+ */
+ if (PriorRedoPtr != InvalidXLogRecPtr)
+ UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
+
+ /*
+ * Delete old log files, those no longer needed for last restartpoint to
+ * prevent the disk holding the xlog from growing full.
+ */
+ XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
+
+ /*
+ * Retreat _logSegNo using the current end of xlog replayed or received,
+ * whichever is later.
+ */
+ receivePtr = GetWalRcvFlushRecPtr(NULL, NULL);
+ replayPtr = GetXLogReplayRecPtr(&replayTLI);
+ endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
+ KeepLogSeg(endptr, &_logSegNo);
+ if (InvalidateObsoleteReplicationSlots(_logSegNo))
+ {
+ /*
+ * Some slots have been invalidated; recalculate the old-segment
+ * horizon, starting again from RedoRecPtr.
+ */
+ XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
+ KeepLogSeg(endptr, &_logSegNo);
+ }
+ _logSegNo--;
+
+ /*
+ * Try to recycle segments on a useful timeline. If we've been promoted
+ * since the beginning of this restartpoint, use the new timeline chosen
+ * at end of recovery. If we're still in recovery, use the timeline we're
+ * currently replaying.
+ *
+ * There is no guarantee that the WAL segments will be useful on the
+ * current timeline; if recovery proceeds to a new timeline right after
+ * this, the pre-allocated WAL segments on this timeline will not be used,
+	 * and will be wasted until recycled on the next restartpoint. We'll live
+ * with that.
+ */
+ if (!RecoveryInProgress())
+ replayTLI = XLogCtl->InsertTimeLineID;
+
+ RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr, replayTLI);
+
+ /*
+ * Make more log segments if needed. (Do this after recycling old log
+ * segments, since that may supply some of the needed files.)
+ */
+ PreallocXlogFiles(endptr, replayTLI);
+
+ /*
+ * Truncate pg_subtrans if possible. We can throw away all data before
+ * the oldest XMIN of any running transaction. No future transaction will
+ * attempt to reference any pg_subtrans entry older than that (see Asserts
+ * in subtrans.c). When hot standby is disabled, though, we mustn't do
+ * this because StartupSUBTRANS hasn't been called yet.
+ */
+ if (EnableHotStandby)
+ TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
+
+ /* Real work is done; log and update stats. */
+ LogCheckpointEnd(true);
+
+ /* Reset the process title */
+ update_checkpoint_display(flags, true, true);
+
+ xtime = GetLatestXTime();
+ ereport((log_checkpoints ? LOG : DEBUG2),
+ (errmsg("recovery restart point at %X/%X",
+ LSN_FORMAT_ARGS(lastCheckPoint.redo)),
+ xtime ? errdetail("Last completed transaction was at log time %s.",
+ timestamptz_to_str(xtime)) : 0));
+
+ /*
+ * Finally, execute archive_cleanup_command, if any.
+ */
+ if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
+ ExecuteRecoveryCommand(archiveCleanupCommand,
+ "archive_cleanup_command",
+ false,
+ WAIT_EVENT_ARCHIVE_CLEANUP_COMMAND);
+
+ return true;
+}
+
+/*
+ * Report availability of WAL for the given target LSN
+ * (typically a slot's restart_lsn)
+ *
+ * Returns one of the following enum values:
+ *
+ * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of
+ * max_wal_size.
+ *
+ * * WALAVAIL_EXTENDED means it is still available by preserving extra
+ * segments beyond max_wal_size. If max_slot_wal_keep_size is smaller
+ * than max_wal_size, this state is not returned.
+ *
+ * * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will
+ * remove reserved segments. The walsender using this slot may return to
+ * one of the above states.
+ *
+ * * WALAVAIL_REMOVED means it has been removed. A replication stream on
+ * a slot with this LSN cannot continue after a restart.
+ *
+ * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL.
+ */
+WALAvailability
+GetWALAvailability(XLogRecPtr targetLSN)
+{
+ XLogRecPtr currpos; /* current write LSN */
+ XLogSegNo currSeg; /* segid of currpos */
+ XLogSegNo targetSeg; /* segid of targetLSN */
+ XLogSegNo oldestSeg; /* actual oldest segid */
+ XLogSegNo oldestSegMaxWalSize; /* oldest segid kept by max_wal_size */
+ XLogSegNo oldestSlotSeg; /* oldest segid kept by slot */
+ uint64 keepSegs;
+
+ /*
+	 * The slot does not reserve WAL: it is either deactivated, or it has
+	 * never been active.
+ */
+ if (XLogRecPtrIsInvalid(targetLSN))
+ return WALAVAIL_INVALID_LSN;
+
+ /*
+ * Calculate the oldest segment currently reserved by all slots,
+ * considering wal_keep_size and max_slot_wal_keep_size. Initialize
+ * oldestSlotSeg to the current segment.
+ */
+ currpos = GetXLogWriteRecPtr();
+ XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size);
+ KeepLogSeg(currpos, &oldestSlotSeg);
+
+ /*
+	 * Find the oldest extant segment file. We get 1 until a checkpoint
+	 * removes the first WAL segment file since startup, which can cause the
+	 * status to be wrong under certain abnormal conditions, but that does no
+	 * real harm.
+ */
+ oldestSeg = XLogGetLastRemovedSegno() + 1;
+
+ /* calculate oldest segment by max_wal_size */
+ XLByteToSeg(currpos, currSeg, wal_segment_size);
+ keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1;
+
+ if (currSeg > keepSegs)
+ oldestSegMaxWalSize = currSeg - keepSegs;
+ else
+ oldestSegMaxWalSize = 1;
+
+ /* the segment we care about */
+ XLByteToSeg(targetLSN, targetSeg, wal_segment_size);
+
+ /*
+ * No point in returning reserved or extended status values if the
+ * targetSeg is known to be lost.
+ */
+ if (targetSeg >= oldestSlotSeg)
+ {
+ /* show "reserved" when targetSeg is within max_wal_size */
+ if (targetSeg >= oldestSegMaxWalSize)
+ return WALAVAIL_RESERVED;
+
+ /* being retained by slots exceeding max_wal_size */
+ return WALAVAIL_EXTENDED;
+ }
+
+ /* WAL segments are no longer retained but haven't been removed yet */
+ if (targetSeg >= oldestSeg)
+ return WALAVAIL_UNRESERVED;
+
+ /* Definitely lost */
+ return WALAVAIL_REMOVED;
+}
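+
+/*
+ * For illustration, a sketch of how callers can map these states onto the
+ * wal_status text shown in pg_replication_slots (this mirrors the mapping
+ * done by pg_get_replication_slots(); the helper name is invented):
+ */
+#ifdef NOT_USED
+static const char *
+wal_availability_name(WALAvailability avail)
+{
+	switch (avail)
+	{
+		case WALAVAIL_RESERVED:
+			return "reserved";
+		case WALAVAIL_EXTENDED:
+			return "extended";
+		case WALAVAIL_UNRESERVED:
+			return "unreserved";
+		case WALAVAIL_REMOVED:
+			return "lost";
+		case WALAVAIL_INVALID_LSN:
+			return NULL;		/* slot reserves no WAL */
+	}
+	return NULL;				/* keep compilers quiet */
+}
+#endif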
+
+
+/*
+ * Retreat *logSegNo to the last segment that we need to retain because of
+ * either wal_keep_size or replication slots.
+ *
+ * This is calculated by subtracting wal_keep_size from the given xlog
+ * location, recptr, and by making sure that the result is below the
+ * requirement of replication slots. For the latter criterion we do consider
+ * the effects of max_slot_wal_keep_size: reserve at most that much space
+ * back from recptr.
+ *
+ * Note about replication slots: if this function calculates a value that's
+ * further ahead than what the slots need to have reserved, then the
+ * affected slots need to be invalidated and this function invoked again.
+ * XXX it might be a good idea to rewrite this function so that
+ * invalidation is optionally done here, instead.
+ */
+static void
+KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
+{
+ XLogSegNo currSegNo;
+ XLogSegNo segno;
+ XLogRecPtr keep;
+
+ XLByteToSeg(recptr, currSegNo, wal_segment_size);
+ segno = currSegNo;
+
+ /*
+ * Calculate how many segments are kept by slots first, adjusting for
+ * max_slot_wal_keep_size.
+ */
+ keep = XLogGetReplicationSlotMinimumLSN();
+ if (keep != InvalidXLogRecPtr && keep < recptr)
+ {
+ XLByteToSeg(keep, segno, wal_segment_size);
+
+ /* Cap by max_slot_wal_keep_size ... */
+ if (max_slot_wal_keep_size_mb >= 0)
+ {
+ uint64 slot_keep_segs;
+
+ slot_keep_segs =
+ ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);
+
+ if (currSegNo - segno > slot_keep_segs)
+ segno = currSegNo - slot_keep_segs;
+ }
+ }
+
+ /* but, keep at least wal_keep_size if that's set */
+ if (wal_keep_size_mb > 0)
+ {
+ uint64 keep_segs;
+
+ keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size);
+ if (currSegNo - segno < keep_segs)
+ {
+ /* avoid underflow, don't go below 1 */
+ if (currSegNo <= keep_segs)
+ segno = 1;
+ else
+ segno = currSegNo - keep_segs;
+ }
+ }
+
+ /* don't delete WAL segments newer than the calculated segment */
+ if (segno < *logSegNo)
+ *logSegNo = segno;
+}
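+
+/*
+ * A worked example of the calculation above, assuming a 16MB segment size:
+ * with recptr in segment 1000, the slots' minimum LSN in segment 900,
+ * max_slot_wal_keep_size = 1GB (64 segments) and wal_keep_size = 256MB
+ * (16 segments), the slots' demand of 100 segments is capped to 64, giving
+ * segno = 1000 - 64 = 936.  wal_keep_size only asks to keep back to segment
+ * 984, which 936 already satisfies, so *logSegNo ends up at most 936.
+ */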
+
+/*
+ * Write a NEXTOID log record
+ */
+void
+XLogPutNextOid(Oid nextOid)
+{
+ XLogBeginInsert();
+ XLogRegisterData((char *) (&nextOid), sizeof(Oid));
+ (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
+
+ /*
+ * We need not flush the NEXTOID record immediately, because any of the
+ * just-allocated OIDs could only reach disk as part of a tuple insert or
+ * update that would have its own XLOG record that must follow the NEXTOID
+ * record. Therefore, the standard buffer LSN interlock applied to those
+ * records will ensure no such OID reaches disk before the NEXTOID record
+ * does.
+ *
+ * Note, however, that the above statement only covers state "within" the
+ * database. When we use a generated OID as a file or directory name, we
+ * are in a sense violating the basic WAL rule, because that filesystem
+ * change may reach disk before the NEXTOID WAL record does. The impact
+ * of this is that if a database crash occurs immediately afterward, we
+ * might after restart re-generate the same OID and find that it conflicts
+ * with the leftover file or directory. But since for safety's sake we
+ * always loop until finding a nonconflicting filename, this poses no real
+ * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
+ */
+}
+
+/*
+ * Write an XLOG SWITCH record.
+ *
+ * Here we just blindly issue an XLogInsert request for the record.
+ * All the magic happens inside XLogInsert.
+ *
+ * The return value is either the end+1 address of the switch record,
+ * or the end+1 address of the prior segment if we did not need to
+ * write a switch record because we are already at segment start.
+ */
+XLogRecPtr
+RequestXLogSwitch(bool mark_unimportant)
+{
+ XLogRecPtr RecPtr;
+
+ /* XLOG SWITCH has no data */
+ XLogBeginInsert();
+
+ if (mark_unimportant)
+ XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
+ RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
+
+ return RecPtr;
+}
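+
+/*
+ * For example, the SQL-callable pg_switch_wal() is essentially a thin
+ * wrapper around this routine (a sketch; the real function in xlogfuncs.c
+ * also errors out when called during recovery):
+ *
+ *		switchpoint = RequestXLogSwitch(false);
+ */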
+
+/*
+ * Write a RESTORE POINT record
+ */
+XLogRecPtr
+XLogRestorePoint(const char *rpName)
+{
+ XLogRecPtr RecPtr;
+ xl_restore_point xlrec;
+
+ xlrec.rp_time = GetCurrentTimestamp();
+ strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
+
+ RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
+
+ ereport(LOG,
+ (errmsg("restore point \"%s\" created at %X/%X",
+ rpName, LSN_FORMAT_ARGS(RecPtr))));
+
+ return RecPtr;
+}
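+
+/*
+ * This is reached from the SQL function pg_create_restore_point(), e.g.:
+ *
+ *		SELECT pg_create_restore_point('before_schema_change');
+ *
+ * after which the name can be used as a PITR target via
+ * recovery_target_name = 'before_schema_change'.
+ */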
+
+/*
+ * Check if any of the GUC parameters that are critical for hot standby
+ * have changed, and update the value in pg_control file if necessary.
+ */
+static void
+XLogReportParameters(void)
+{
+ if (wal_level != ControlFile->wal_level ||
+ wal_log_hints != ControlFile->wal_log_hints ||
+ MaxConnections != ControlFile->MaxConnections ||
+ max_worker_processes != ControlFile->max_worker_processes ||
+ max_wal_senders != ControlFile->max_wal_senders ||
+ max_prepared_xacts != ControlFile->max_prepared_xacts ||
+ max_locks_per_xact != ControlFile->max_locks_per_xact ||
+ track_commit_timestamp != ControlFile->track_commit_timestamp)
+ {
+ /*
+ * The change in number of backend slots doesn't need to be WAL-logged
+ * if archiving is not enabled, as you can't start archive recovery
+ * with wal_level=minimal anyway. We don't really care about the
+		 * values in pg_control either if wal_level=minimal, but it seems better
+ * to keep them up-to-date to avoid confusion.
+ */
+ if (wal_level != ControlFile->wal_level || XLogIsNeeded())
+ {
+ xl_parameter_change xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.MaxConnections = MaxConnections;
+ xlrec.max_worker_processes = max_worker_processes;
+ xlrec.max_wal_senders = max_wal_senders;
+ xlrec.max_prepared_xacts = max_prepared_xacts;
+ xlrec.max_locks_per_xact = max_locks_per_xact;
+ xlrec.wal_level = wal_level;
+ xlrec.wal_log_hints = wal_log_hints;
+ xlrec.track_commit_timestamp = track_commit_timestamp;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+
+ recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
+ XLogFlush(recptr);
+ }
+
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+
+ ControlFile->MaxConnections = MaxConnections;
+ ControlFile->max_worker_processes = max_worker_processes;
+ ControlFile->max_wal_senders = max_wal_senders;
+ ControlFile->max_prepared_xacts = max_prepared_xacts;
+ ControlFile->max_locks_per_xact = max_locks_per_xact;
+ ControlFile->wal_level = wal_level;
+ ControlFile->wal_log_hints = wal_log_hints;
+ ControlFile->track_commit_timestamp = track_commit_timestamp;
+ UpdateControlFile();
+
+ LWLockRelease(ControlFileLock);
+ }
+}
+
+/*
+ * Update full_page_writes in shared memory, and write an
+ * XLOG_FPW_CHANGE record if necessary.
+ *
+ * Note: this function assumes there is no other process running
+ * concurrently that could update it.
+ */
+void
+UpdateFullPageWrites(void)
+{
+ XLogCtlInsert *Insert = &XLogCtl->Insert;
+ bool recoveryInProgress;
+
+ /*
+ * Do nothing if full_page_writes has not been changed.
+ *
+ * It's safe to check the shared full_page_writes without the lock,
+ * because we assume that there is no concurrently running process which
+ * can update it.
+ */
+ if (fullPageWrites == Insert->fullPageWrites)
+ return;
+
+ /*
+ * Perform this outside critical section so that the WAL insert
+ * initialization done by RecoveryInProgress() doesn't trigger an
+ * assertion failure.
+ */
+ recoveryInProgress = RecoveryInProgress();
+
+ START_CRIT_SECTION();
+
+ /*
+ * It's always safe to take full page images, even when not strictly
+	 * required, but not the other way round. So if we're setting full_page_writes
+ * to true, first set it true and then write the WAL record. If we're
+ * setting it to false, first write the WAL record and then set the global
+ * flag.
+ */
+ if (fullPageWrites)
+ {
+ WALInsertLockAcquireExclusive();
+ Insert->fullPageWrites = true;
+ WALInsertLockRelease();
+ }
+
+ /*
+ * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
+ * full_page_writes during archive recovery, if required.
+ */
+ if (XLogStandbyInfoActive() && !recoveryInProgress)
+ {
+ XLogBeginInsert();
+ XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));
+
+ XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
+ }
+
+ if (!fullPageWrites)
+ {
+ WALInsertLockAcquireExclusive();
+ Insert->fullPageWrites = false;
+ WALInsertLockRelease();
+ }
+ END_CRIT_SECTION();
+}
+
+/*
+ * XLOG resource manager's routines
+ *
+ * Definitions of info values are in include/catalog/pg_control.h, though
+ * not all record types are related to control file updates.
+ *
+ * NOTE: Some XLOG record types that are directly related to WAL recovery
+ * are handled in xlogrecovery_redo().
+ */
+void
+xlog_redo(XLogReaderState *record)
+{
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+ XLogRecPtr lsn = record->EndRecPtr;
+
+ /*
+ * In XLOG rmgr, backup blocks are only used by XLOG_FPI and
+ * XLOG_FPI_FOR_HINT records.
+ */
+ Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
+ !XLogRecHasAnyBlockRefs(record));
+
+ if (info == XLOG_NEXTOID)
+ {
+ Oid nextOid;
+
+ /*
+ * We used to try to take the maximum of ShmemVariableCache->nextOid
+ * and the recorded nextOid, but that fails if the OID counter wraps
+ * around. Since no OID allocation should be happening during replay
+ * anyway, better to just believe the record exactly. We still take
+ * OidGenLock while setting the variable, just in case.
+ */
+ memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
+ LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
+ ShmemVariableCache->nextOid = nextOid;
+ ShmemVariableCache->oidCount = 0;
+ LWLockRelease(OidGenLock);
+ }
+ else if (info == XLOG_CHECKPOINT_SHUTDOWN)
+ {
+ CheckPoint checkPoint;
+ TimeLineID replayTLI;
+
+ memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
+ /* In a SHUTDOWN checkpoint, believe the counters exactly */
+ LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
+ ShmemVariableCache->nextXid = checkPoint.nextXid;
+ LWLockRelease(XidGenLock);
+ LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
+ ShmemVariableCache->nextOid = checkPoint.nextOid;
+ ShmemVariableCache->oidCount = 0;
+ LWLockRelease(OidGenLock);
+ MultiXactSetNextMXact(checkPoint.nextMulti,
+ checkPoint.nextMultiOffset);
+
+ MultiXactAdvanceOldest(checkPoint.oldestMulti,
+ checkPoint.oldestMultiDB);
+
+ /*
+ * No need to set oldestClogXid here as well; it'll be set when we
+ * redo an xl_clog_truncate if it changed since initialization.
+ */
+ SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
+
+ /*
+ * If we see a shutdown checkpoint while waiting for an end-of-backup
+ * record, the backup was canceled and the end-of-backup record will
+ * never arrive.
+ */
+ if (ArchiveRecoveryRequested &&
+ !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
+ XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
+ ereport(PANIC,
+ (errmsg("online backup was canceled, recovery cannot continue")));
+
+ /*
+ * If we see a shutdown checkpoint, we know that nothing was running
+ * on the primary at this point. So fake-up an empty running-xacts
+ * record and use that here and now. Recover additional standby state
+ * for prepared transactions.
+ */
+ if (standbyState >= STANDBY_INITIALIZED)
+ {
+ TransactionId *xids;
+ int nxids;
+ TransactionId oldestActiveXID;
+ TransactionId latestCompletedXid;
+ RunningTransactionsData running;
+
+ oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
+
+ /*
+ * Construct a RunningTransactions snapshot representing a shut
+ * down server, with only prepared transactions still alive. We're
+ * never overflowed at this point because all subxids are listed
+ * with their parent prepared transactions.
+ */
+ running.xcnt = nxids;
+ running.subxcnt = 0;
+ running.subxid_overflow = false;
+ running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
+ running.oldestRunningXid = oldestActiveXID;
+ latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
+ TransactionIdRetreat(latestCompletedXid);
+ Assert(TransactionIdIsNormal(latestCompletedXid));
+ running.latestCompletedXid = latestCompletedXid;
+ running.xids = xids;
+
+ ProcArrayApplyRecoveryInfo(&running);
+
+ StandbyRecoverPreparedTransactions();
+ }
+
+ /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
+ LWLockRelease(ControlFileLock);
+
+ /* Update shared-memory copy of checkpoint XID/epoch */
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->ckptFullXid = checkPoint.nextXid;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ /*
+ * We should've already switched to the new TLI before replaying this
+ * record.
+ */
+ (void) GetCurrentReplayRecPtr(&replayTLI);
+ if (checkPoint.ThisTimeLineID != replayTLI)
+ ereport(PANIC,
+ (errmsg("unexpected timeline ID %u (should be %u) in shutdown checkpoint record",
+ checkPoint.ThisTimeLineID, replayTLI)));
+
+ RecoveryRestartPoint(&checkPoint, record);
+ }
+ else if (info == XLOG_CHECKPOINT_ONLINE)
+ {
+ CheckPoint checkPoint;
+ TimeLineID replayTLI;
+
+ memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
+ /* In an ONLINE checkpoint, treat the XID counter as a minimum */
+ LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
+ if (FullTransactionIdPrecedes(ShmemVariableCache->nextXid,
+ checkPoint.nextXid))
+ ShmemVariableCache->nextXid = checkPoint.nextXid;
+ LWLockRelease(XidGenLock);
+
+ /*
+ * We ignore the nextOid counter in an ONLINE checkpoint, preferring
+ * to track OID assignment through XLOG_NEXTOID records. The nextOid
+ * counter is from the start of the checkpoint and might well be stale
+ * compared to later XLOG_NEXTOID records. We could try to take the
+ * maximum of the nextOid counter and our latest value, but since
+ * there's no particular guarantee about the speed with which the OID
+ * counter wraps around, that's a risky thing to do. In any case,
+ * users of the nextOid counter are required to avoid assignment of
+ * duplicates, so that a somewhat out-of-date value should be safe.
+ */
+
+ /* Handle multixact */
+ MultiXactAdvanceNextMXact(checkPoint.nextMulti,
+ checkPoint.nextMultiOffset);
+
+ /*
+ * NB: This may perform multixact truncation when replaying WAL
+ * generated by an older primary.
+ */
+ MultiXactAdvanceOldest(checkPoint.oldestMulti,
+ checkPoint.oldestMultiDB);
+ if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
+ checkPoint.oldestXid))
+ SetTransactionIdLimit(checkPoint.oldestXid,
+ checkPoint.oldestXidDB);
+ /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
+ LWLockRelease(ControlFileLock);
+
+ /* Update shared-memory copy of checkpoint XID/epoch */
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->ckptFullXid = checkPoint.nextXid;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ /* TLI should not change in an on-line checkpoint */
+ (void) GetCurrentReplayRecPtr(&replayTLI);
+ if (checkPoint.ThisTimeLineID != replayTLI)
+ ereport(PANIC,
+ (errmsg("unexpected timeline ID %u (should be %u) in online checkpoint record",
+ checkPoint.ThisTimeLineID, replayTLI)));
+
+ RecoveryRestartPoint(&checkPoint, record);
+ }
+ else if (info == XLOG_OVERWRITE_CONTRECORD)
+ {
+ /* nothing to do here, handled in xlogrecovery_redo() */
+ }
+ else if (info == XLOG_END_OF_RECOVERY)
+ {
+ xl_end_of_recovery xlrec;
+ TimeLineID replayTLI;
+
+ memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
+
+ /*
+ * For Hot Standby, we could treat this like a Shutdown Checkpoint,
+ * but this case is rarer and harder to test, so the benefit doesn't
+ * outweigh the potential extra cost of maintenance.
+ */
+
+ /*
+ * We should've already switched to the new TLI before replaying this
+ * record.
+ */
+ (void) GetCurrentReplayRecPtr(&replayTLI);
+ if (xlrec.ThisTimeLineID != replayTLI)
+ ereport(PANIC,
+ (errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record",
+ xlrec.ThisTimeLineID, replayTLI)));
+ }
+ else if (info == XLOG_NOOP)
+ {
+ /* nothing to do here */
+ }
+ else if (info == XLOG_SWITCH)
+ {
+ /* nothing to do here */
+ }
+ else if (info == XLOG_RESTORE_POINT)
+ {
+ /* nothing to do here, handled in xlogrecovery.c */
+ }
+ else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
+ {
+ /*
+ * XLOG_FPI records contain nothing else but one or more block
+ * references. Every block reference must include a full-page image
+ * even if full_page_writes was disabled when the record was generated
+ * - otherwise there would be no point in this record.
+ *
+ * XLOG_FPI_FOR_HINT records are generated when a page needs to be
+ * WAL-logged because of a hint bit update. They are only generated
+ * when checksums and/or wal_log_hints are enabled. They may include
+ * no full-page images if full_page_writes was disabled when they were
+ * generated. In this case there is nothing to do here.
+ *
+ * No recovery conflicts are generated by these generic records - if a
+ * resource manager needs to generate conflicts, it has to define a
+ * separate WAL record type and redo routine.
+ */
+ for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
+ {
+ Buffer buffer;
+
+ if (!XLogRecHasBlockImage(record, block_id))
+ {
+ if (info == XLOG_FPI)
+ elog(ERROR, "XLOG_FPI record did not contain a full-page image");
+ continue;
+ }
+
+ if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
+ elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+ else if (info == XLOG_BACKUP_END)
+ {
+ /* nothing to do here, handled in xlogrecovery_redo() */
+ }
+ else if (info == XLOG_PARAMETER_CHANGE)
+ {
+ xl_parameter_change xlrec;
+
+ /* Update our copy of the parameters in pg_control */
+ memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
+
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ ControlFile->MaxConnections = xlrec.MaxConnections;
+ ControlFile->max_worker_processes = xlrec.max_worker_processes;
+ ControlFile->max_wal_senders = xlrec.max_wal_senders;
+ ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
+ ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
+ ControlFile->wal_level = xlrec.wal_level;
+ ControlFile->wal_log_hints = xlrec.wal_log_hints;
+
+ /*
+ * Update minRecoveryPoint to ensure that if recovery is aborted, we
+ * recover back up to this point before allowing hot standby again.
+ * This is important if the max_* settings are decreased, to ensure
+ * you don't run queries against the WAL preceding the change. The
+ * local copies cannot be updated as long as crash recovery is
+ * happening and we expect all the WAL to be replayed.
+ */
+ if (InArchiveRecovery)
+ {
+ LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
+ LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+ }
+ if (LocalMinRecoveryPoint != InvalidXLogRecPtr && LocalMinRecoveryPoint < lsn)
+ {
+ TimeLineID replayTLI;
+
+ (void) GetCurrentReplayRecPtr(&replayTLI);
+ ControlFile->minRecoveryPoint = lsn;
+ ControlFile->minRecoveryPointTLI = replayTLI;
+ }
+
+ CommitTsParameterChange(xlrec.track_commit_timestamp,
+ ControlFile->track_commit_timestamp);
+ ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
+
+ UpdateControlFile();
+ LWLockRelease(ControlFileLock);
+
+ /* Check to see if any parameter change gives a problem on recovery */
+ CheckRequiredParameterValues();
+ }
+ else if (info == XLOG_FPW_CHANGE)
+ {
+ bool fpw;
+
+ memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
+
+ /*
+ * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
+ * do_pg_backup_start() and do_pg_backup_stop() can check whether
+ * full_page_writes has been disabled during online backup.
+ */
+ if (!fpw)
+ {
+ SpinLockAcquire(&XLogCtl->info_lck);
+ if (XLogCtl->lastFpwDisableRecPtr < record->ReadRecPtr)
+ XLogCtl->lastFpwDisableRecPtr = record->ReadRecPtr;
+ SpinLockRelease(&XLogCtl->info_lck);
+ }
+
+ /* Keep track of full_page_writes */
+ lastFullPageWrites = fpw;
+ }
+}
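+
+/*
+ * xlog_redo() is reached through the resource-manager dispatch table: the
+ * RM_XLOG_ID entry in rmgrlist.h names it as the redo routine for records
+ * carrying that rmgr ID (a sketch of the relevant entry, with the other
+ * callbacks elided):
+ *
+ *		PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, ...)
+ */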
+
+/*
+ * Return the (possible) sync flag used for opening a file, depending on the
+ * value of the GUC wal_sync_method.
+ */
+static int
+get_sync_bit(int method)
+{
+ int o_direct_flag = 0;
+
+ /* If fsync is disabled, never open in sync mode */
+ if (!enableFsync)
+ return 0;
+
+ /*
+ * Optimize writes by bypassing kernel cache with O_DIRECT when using
+ * O_SYNC/O_FSYNC and O_DSYNC. But only if archiving and streaming are
+ * disabled, otherwise the archive command or walsender process will read
+ * the WAL soon after writing it, which is guaranteed to cause a physical
+ * read if we bypassed the kernel cache. We also skip the
+ * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
+ * reason.
+ *
+ * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
+ * written by walreceiver is normally read by the startup process soon
+	 * after it's written. Also, walreceiver performs unaligned writes, which
+	 * don't work with O_DIRECT, so avoiding O_DIRECT there is required for
+	 * correctness too.
+ */
+ if (!XLogIsNeeded() && !AmWalReceiverProcess())
+ o_direct_flag = PG_O_DIRECT;
+
+ switch (method)
+ {
+ /*
+ * enum values for all sync options are defined even if they are
+ * not supported on the current platform. But if not, they are
+ * not included in the enum option array, and therefore will never
+ * be seen here.
+ */
+ case SYNC_METHOD_FSYNC:
+ case SYNC_METHOD_FSYNC_WRITETHROUGH:
+ case SYNC_METHOD_FDATASYNC:
+ return 0;
+#ifdef OPEN_SYNC_FLAG
+ case SYNC_METHOD_OPEN:
+ return OPEN_SYNC_FLAG | o_direct_flag;
+#endif
+#ifdef OPEN_DATASYNC_FLAG
+ case SYNC_METHOD_OPEN_DSYNC:
+ return OPEN_DATASYNC_FLAG | o_direct_flag;
+#endif
+ default:
+ /* can't happen (unless we are out of sync with option array) */
+ elog(ERROR, "unrecognized wal_sync_method: %d", method);
+ return 0; /* silence warning */
+ }
+}
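+
+/*
+ * The returned bit is OR'd into the flags when a WAL segment is opened;
+ * for instance, XLogFileOpen() does essentially this (sketch):
+ *
+ *		fd = BasicOpenFile(path, O_RDWR | PG_BINARY |
+ *						   get_sync_bit(sync_method));
+ */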
+
+/*
+ * GUC support
+ */
+void
+assign_xlog_sync_method(int new_sync_method, void *extra)
+{
+ if (sync_method != new_sync_method)
+ {
+ /*
+ * To ensure that no blocks escape unsynced, force an fsync on the
+ * currently open log segment (if any). Also, if the open flag is
+ * changing, close the log file so it will be reopened (with new flag
+ * bit) at next use.
+ */
+ if (openLogFile >= 0)
+ {
+ pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
+ if (pg_fsync(openLogFile) != 0)
+ {
+ char xlogfname[MAXFNAMELEN];
+ int save_errno;
+
+ save_errno = errno;
+ XLogFileName(xlogfname, openLogTLI, openLogSegNo,
+ wal_segment_size);
+ errno = save_errno;
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", xlogfname)));
+ }
+
+ pgstat_report_wait_end();
+ if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
+ XLogFileClose();
+ }
+ }
+}
+
+
+/*
+ * Issue appropriate kind of fsync (if any) for an XLOG output file.
+ *
+ * 'fd' is a file descriptor for the XLOG file to be fsync'd.
+ * 'segno' is for error reporting purposes.
+ */
+void
+issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
+{
+ char *msg = NULL;
+ instr_time start;
+
+ Assert(tli != 0);
+
+ /*
+ * Quick exit if fsync is disabled or write() has already synced the WAL
+ * file.
+ */
+ if (!enableFsync ||
+ sync_method == SYNC_METHOD_OPEN ||
+ sync_method == SYNC_METHOD_OPEN_DSYNC)
+ return;
+
+ /* Measure I/O timing to sync the WAL file */
+ if (track_wal_io_timing)
+ INSTR_TIME_SET_CURRENT(start);
+
+ pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
+ switch (sync_method)
+ {
+ case SYNC_METHOD_FSYNC:
+ if (pg_fsync_no_writethrough(fd) != 0)
+ msg = _("could not fsync file \"%s\": %m");
+ break;
+#ifdef HAVE_FSYNC_WRITETHROUGH
+ case SYNC_METHOD_FSYNC_WRITETHROUGH:
+ if (pg_fsync_writethrough(fd) != 0)
+ msg = _("could not fsync write-through file \"%s\": %m");
+ break;
+#endif
+#ifdef HAVE_FDATASYNC
+ case SYNC_METHOD_FDATASYNC:
+ if (pg_fdatasync(fd) != 0)
+ msg = _("could not fdatasync file \"%s\": %m");
+ break;
+#endif
+ case SYNC_METHOD_OPEN:
+ case SYNC_METHOD_OPEN_DSYNC:
+ /* not reachable */
+ Assert(false);
+ break;
+ default:
+ elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
+ break;
+ }
+
+ /* PANIC if failed to fsync */
+ if (msg)
+ {
+ char xlogfname[MAXFNAMELEN];
+ int save_errno = errno;
+
+ XLogFileName(xlogfname, tli, segno, wal_segment_size);
+ errno = save_errno;
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg(msg, xlogfname)));
+ }
+
+ pgstat_report_wait_end();
+
+ /*
+ * Increment the I/O timing and the number of times WAL files were synced.
+ */
+ if (track_wal_io_timing)
+ {
+ instr_time duration;
+
+ INSTR_TIME_SET_CURRENT(duration);
+ INSTR_TIME_SUBTRACT(duration, start);
+ PendingWalStats.wal_sync_time += INSTR_TIME_GET_MICROSEC(duration);
+ }
+
+ PendingWalStats.wal_sync++;
+}
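+
+/*
+ * Callers pass the descriptor of the currently open WAL segment; for
+ * instance, XLogWrite() issues, roughly:
+ *
+ *		issue_xlog_fsync(openLogFile, openLogSegNo, openLogTLI);
+ */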
+
+/*
+ * do_pg_backup_start is the workhorse of the user-visible pg_backup_start()
+ * function. It creates the necessary starting checkpoint and constructs the
+ * backup label and tablespace map.
+ *
+ * Input parameters are "backupidstr" (the backup label string) and "fast"
+ * (if true, we do the checkpoint in immediate mode to make it faster).
+ *
+ * The backup label and tablespace map contents are appended to *labelfile and
+ * *tblspcmapfile, and the caller is responsible for including them in the
+ * backup archive as 'backup_label' and 'tablespace_map'.
+ * tblspcmapfile is required mainly for tar format on Windows, as native
+ * Windows utilities are not able to create symlinks while extracting files
+ * from tar. However, for consistency and platform independence, we do it
+ * the same way everywhere.
+ *
+ * If "tablespaces" isn't NULL, it receives a list of tablespaceinfo structs
+ * describing the cluster's tablespaces.
+ *
+ * Returns the minimum WAL location that must be present to restore from this
+ * backup, and the corresponding timeline ID in *starttli_p.
+ *
+ * Every successfully started backup must be stopped by calling
+ * do_pg_backup_stop() or do_pg_abort_backup(). There can be many
+ * backups active at the same time.
+ *
+ * It is the responsibility of the caller of this function to verify the
+ * permissions of the calling user!
+ */
+XLogRecPtr
+do_pg_backup_start(const char *backupidstr, bool fast, TimeLineID *starttli_p,
+ StringInfo labelfile, List **tablespaces,
+ StringInfo tblspcmapfile)
+{
+ bool backup_started_in_recovery = false;
+ XLogRecPtr checkpointloc;
+ XLogRecPtr startpoint;
+ TimeLineID starttli;
+ pg_time_t stamp_time;
+ char strfbuf[128];
+ char xlogfilename[MAXFNAMELEN];
+ XLogSegNo _logSegNo;
+
+ backup_started_in_recovery = RecoveryInProgress();
+
+ /*
+	 * During recovery, we don't need to check WAL level, because if the WAL
+	 * level is not sufficient it's impossible to get here during recovery.
+ */
+ if (!backup_started_in_recovery && !XLogIsNeeded())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("WAL level not sufficient for making an online backup"),
+ errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
+
+ if (strlen(backupidstr) > MAXPGPATH)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("backup label too long (max %d bytes)",
+ MAXPGPATH)));
+
+ /*
+ * Mark backup active in shared memory. We must do full-page WAL writes
+ * during an on-line backup even if not doing so at other times, because
+ * it's quite possible for the backup dump to obtain a "torn" (partially
+ * written) copy of a database page if it reads the page concurrently with
+ * our write to the same page. This can be fixed as long as the first
+ * write to the page in the WAL sequence is a full-page write. Hence, we
+ * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
+ * are no dirty pages in shared memory that might get dumped while the
+ * backup is in progress without having a corresponding WAL record. (Once
+ * the backup is complete, we need not force full-page writes anymore,
+ * since we expect that any pages not modified during the backup interval
+ * must have been correctly captured by the backup.)
+ *
+ * Note that forcePageWrites has no effect during an online backup from
+ * the standby.
+ *
+ * We must hold all the insertion locks to change the value of
+ * forcePageWrites, to ensure adequate interlocking against
+ * XLogInsertRecord().
+ */
+ WALInsertLockAcquireExclusive();
+ XLogCtl->Insert.runningBackups++;
+ XLogCtl->Insert.forcePageWrites = true;
+ WALInsertLockRelease();
+
+ /* Ensure we release forcePageWrites if fail below */
+ PG_ENSURE_ERROR_CLEANUP(pg_backup_start_callback, (Datum) 0);
+ {
+ bool gotUniqueStartpoint = false;
+ DIR *tblspcdir;
+ struct dirent *de;
+ tablespaceinfo *ti;
+ int datadirpathlen;
+
+ /*
+ * Force an XLOG file switch before the checkpoint, to ensure that the
+ * WAL segment the checkpoint is written to doesn't contain pages with
+ * old timeline IDs. That would otherwise happen if you called
+ * pg_backup_start() right after restoring from a PITR archive: the
+ * first WAL segment containing the startup checkpoint has pages in
+ * the beginning with the old timeline ID. That can cause trouble at
+ * recovery: we won't have a history file covering the old timeline if
+ * pg_wal directory was not included in the base backup and the WAL
+ * archive was cleared too before starting the backup.
+ *
+ * This also ensures that we have emitted a WAL page header that has
+ * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
+ * Therefore, if a WAL archiver (such as pglesslog) is trying to
+ * compress out removable backup blocks, it won't remove any that
+ * occur after this point.
+ *
+ * During recovery, we skip forcing XLOG file switch, which means that
+ * the backup taken during recovery is not available for the special
+ * recovery case described above.
+ */
+ if (!backup_started_in_recovery)
+ RequestXLogSwitch(false);
+
+ do
+ {
+ bool checkpointfpw;
+
+ /*
+ * Force a CHECKPOINT. Aside from being necessary to prevent torn
+ * page problems, this guarantees that two successive backup runs
+ * will have different checkpoint positions and hence different
+ * history file names, even if nothing happened in between.
+ *
+ * During recovery, establish a restartpoint if possible. We use
+ * the last restartpoint as the backup starting checkpoint. This
+			 * means that two successive backup runs can have the same checkpoint
+ * positions.
+ *
+ * Since the fact that we are executing do_pg_backup_start()
+ * during recovery means that checkpointer is running, we can use
+ * RequestCheckpoint() to establish a restartpoint.
+ *
+ * We use CHECKPOINT_IMMEDIATE only if requested by user (via
+			 * passing fast = true). Otherwise this can take a while.
+ */
+ RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
+ (fast ? CHECKPOINT_IMMEDIATE : 0));
+
+ /*
+ * Now we need to fetch the checkpoint record location, and also
+ * its REDO pointer. The oldest point in WAL that would be needed
+ * to restore starting from the checkpoint is precisely the REDO
+ * pointer.
+ */
+ LWLockAcquire(ControlFileLock, LW_SHARED);
+ checkpointloc = ControlFile->checkPoint;
+ startpoint = ControlFile->checkPointCopy.redo;
+ starttli = ControlFile->checkPointCopy.ThisTimeLineID;
+ checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
+ LWLockRelease(ControlFileLock);
+
+ if (backup_started_in_recovery)
+ {
+ XLogRecPtr recptr;
+
+ /*
+ * Check to see if all WAL replayed during online backup
+ * (i.e., since last restartpoint used as backup starting
+ * checkpoint) contain full-page writes.
+ */
+ SpinLockAcquire(&XLogCtl->info_lck);
+ recptr = XLogCtl->lastFpwDisableRecPtr;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ if (!checkpointfpw || startpoint <= recptr)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("WAL generated with full_page_writes=off was replayed "
+ "since last restartpoint"),
+ errhint("This means that the backup being taken on the standby "
+ "is corrupt and should not be used. "
+ "Enable full_page_writes and run CHECKPOINT on the primary, "
+ "and then try an online backup again.")));
+
+ /*
+ * During recovery, since we don't use the end-of-backup WAL
+ * record and don't write the backup history file, the
+ * starting WAL location doesn't need to be unique. This means
+ * that two base backups started at the same time might use
+ * the same checkpoint as starting locations.
+ */
+ gotUniqueStartpoint = true;
+ }
+
+ /*
+ * If two base backups are started at the same time (in WAL sender
+ * processes), we need to make sure that they use different
+ * checkpoints as starting locations, because we use the starting
+ * WAL location as a unique identifier for the base backup in the
+ * end-of-backup WAL record and when we write the backup history
+			 * file. Perhaps it would be better to generate a separate unique ID
+			 * for each backup instead of forcing another checkpoint, but
+			 * taking a checkpoint right after another is not that expensive
+			 * either because only a few buffers have been dirtied yet.
+ */
+ WALInsertLockAcquireExclusive();
+ if (XLogCtl->Insert.lastBackupStart < startpoint)
+ {
+ XLogCtl->Insert.lastBackupStart = startpoint;
+ gotUniqueStartpoint = true;
+ }
+ WALInsertLockRelease();
+ } while (!gotUniqueStartpoint);
+
+ XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
+ XLogFileName(xlogfilename, starttli, _logSegNo, wal_segment_size);
+
+ /*
+ * Construct tablespace_map file.
+ */
+ datadirpathlen = strlen(DataDir);
+
+ /* Collect information about all tablespaces */
+ tblspcdir = AllocateDir("pg_tblspc");
+ while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
+ {
+ char fullpath[MAXPGPATH + 10];
+ char linkpath[MAXPGPATH];
+ char *relpath = NULL;
+ int rllen;
+ StringInfoData escapedpath;
+ char *s;
+
+ /* Skip anything that doesn't look like a tablespace */
+ if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
+ continue;
+
+ snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);
+
+ /*
+ * Skip anything that isn't a symlink/junction. For testing only,
+ * we sometimes use allow_in_place_tablespaces to create
+ * directories directly under pg_tblspc, which would fail below.
+ */
+ if (get_dirent_type(fullpath, de, false, ERROR) != PGFILETYPE_LNK)
+ continue;
+
+#if defined(HAVE_READLINK) || defined(WIN32)
+ rllen = readlink(fullpath, linkpath, sizeof(linkpath));
+ if (rllen < 0)
+ {
+ ereport(WARNING,
+ (errmsg("could not read symbolic link \"%s\": %m",
+ fullpath)));
+ continue;
+ }
+ else if (rllen >= sizeof(linkpath))
+ {
+ ereport(WARNING,
+ (errmsg("symbolic link \"%s\" target is too long",
+ fullpath)));
+ continue;
+ }
+ linkpath[rllen] = '\0';
+
+ /*
+ * Build a backslash-escaped version of the link path to include
+ * in the tablespace map file.
+ */
+ initStringInfo(&escapedpath);
+ for (s = linkpath; *s; s++)
+ {
+ if (*s == '\n' || *s == '\r' || *s == '\\')
+ appendStringInfoChar(&escapedpath, '\\');
+ appendStringInfoChar(&escapedpath, *s);
+ }
+
+ /*
+ * Relpath holds the relative path of the tablespace directory
+ * when it's located within PGDATA, or NULL if it's located
+ * elsewhere.
+ */
+ if (rllen > datadirpathlen &&
+ strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
+ IS_DIR_SEP(linkpath[datadirpathlen]))
+ relpath = linkpath + datadirpathlen + 1;
+
+ ti = palloc(sizeof(tablespaceinfo));
+ ti->oid = pstrdup(de->d_name);
+ ti->path = pstrdup(linkpath);
+ ti->rpath = relpath ? pstrdup(relpath) : NULL;
+ ti->size = -1;
+
+ if (tablespaces)
+ *tablespaces = lappend(*tablespaces, ti);
+
+ appendStringInfo(tblspcmapfile, "%s %s\n",
+ ti->oid, escapedpath.data);
+
+ pfree(escapedpath.data);
+#else
+
+ /*
+ * If the platform does not have symbolic links, it should not be
+ * possible to have tablespaces - clearly somebody else created
+ * them. Warn about it and ignore.
+ */
+ ereport(WARNING,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("tablespaces are not supported on this platform")));
+#endif
+ }
+ FreeDir(tblspcdir);
+
+ /*
+ * Construct backup label file.
+ */
+
+ /* Use the log timezone here, not the session timezone */
+ stamp_time = (pg_time_t) time(NULL);
+ pg_strftime(strfbuf, sizeof(strfbuf),
+ "%Y-%m-%d %H:%M:%S %Z",
+ pg_localtime(&stamp_time, log_timezone));
+ appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n",
+ LSN_FORMAT_ARGS(startpoint), xlogfilename);
+ appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n",
+ LSN_FORMAT_ARGS(checkpointloc));
+ appendStringInfo(labelfile, "BACKUP METHOD: streamed\n");
+ appendStringInfo(labelfile, "BACKUP FROM: %s\n",
+ backup_started_in_recovery ? "standby" : "primary");
+ appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
+ appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
+ appendStringInfo(labelfile, "START TIMELINE: %u\n", starttli);
+ }
+ PG_END_ENSURE_ERROR_CLEANUP(pg_backup_start_callback, (Datum) 0);
+
+ /*
+ * Mark that the start phase of the backup has successfully finished.
+ */
+ sessionBackupState = SESSION_BACKUP_RUNNING;
+
+ /*
+ * We're done. As a convenience, return the starting WAL location.
+ */
+ if (starttli_p)
+ *starttli_p = starttli;
+ return startpoint;
+}
+
+/* Error cleanup callback for pg_backup_start */
+static void
+pg_backup_start_callback(int code, Datum arg)
+{
+ /* Update backup counters and forcePageWrites on failure */
+ WALInsertLockAcquireExclusive();
+
+ Assert(XLogCtl->Insert.runningBackups > 0);
+ XLogCtl->Insert.runningBackups--;
+
+ if (XLogCtl->Insert.runningBackups == 0)
+ {
+ XLogCtl->Insert.forcePageWrites = false;
+ }
+ WALInsertLockRelease();
+}
+
+/*
+ * Utility routine to fetch the session-level status of a running backup.
+ */
+SessionBackupState
+get_backup_status(void)
+{
+ return sessionBackupState;
+}
+
+/*
+ * do_pg_backup_stop
+ *
+ * Utility function called at the end of an online backup. It cleans up the
+ * backup state and can optionally wait for WAL segments to be archived.
+ *
+ * Returns the last WAL location that must be present to restore from this
+ * backup, and the corresponding timeline ID in *stoptli_p.
+ *
+ * It is the responsibility of the caller of this function to verify the
+ * permissions of the calling user!
+ */
+XLogRecPtr
+do_pg_backup_stop(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
+{
+ bool backup_started_in_recovery = false;
+ XLogRecPtr startpoint;
+ XLogRecPtr stoppoint;
+ TimeLineID stoptli;
+ pg_time_t stamp_time;
+ char strfbuf[128];
+ char histfilepath[MAXPGPATH];
+ char startxlogfilename[MAXFNAMELEN];
+ char stopxlogfilename[MAXFNAMELEN];
+ char lastxlogfilename[MAXFNAMELEN];
+ char histfilename[MAXFNAMELEN];
+ char backupfrom[20];
+ XLogSegNo _logSegNo;
+ FILE *fp;
+ char ch;
+ int seconds_before_warning;
+ int waits = 0;
+ bool reported_waiting = false;
+ char *remaining;
+ char *ptr;
+ uint32 hi,
+ lo;
+
+ backup_started_in_recovery = RecoveryInProgress();
+
+ /*
+ * During recovery, we don't need to check the WAL level, because if the
+ * WAL level is not sufficient it's impossible to get here during recovery.
+ */
+ if (!backup_started_in_recovery && !XLogIsNeeded())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("WAL level not sufficient for making an online backup"),
+ errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
+
+ /*
+ * OK to update backup counters, forcePageWrites, and session-level lock.
+ *
+ * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them.
+ * Otherwise they can be updated inconsistently, which might cause
+ * do_pg_abort_backup() to fail.
+ */
+ WALInsertLockAcquireExclusive();
+
+ /*
+ * It is expected that each do_pg_backup_start() call is matched by
+ * exactly one do_pg_backup_stop() call.
+ */
+ Assert(XLogCtl->Insert.runningBackups > 0);
+ XLogCtl->Insert.runningBackups--;
+
+ if (XLogCtl->Insert.runningBackups == 0)
+ {
+ XLogCtl->Insert.forcePageWrites = false;
+ }
+
+ /*
+ * Clean up session-level lock.
+ *
+ * You might think that WALInsertLockRelease() can be called before
+ * cleaning up session-level lock because session-level lock doesn't need
+ * to be protected with WAL insertion lock. But since
+ * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
+ * cleaned up before it.
+ */
+ sessionBackupState = SESSION_BACKUP_NONE;
+
+ WALInsertLockRelease();
+
+ /*
+ * Read and parse the START WAL LOCATION line (this code is pretty crude,
+ * but we are not expecting any variability in the file format).
+ */
+ if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
+ &hi, &lo, startxlogfilename,
+ &ch) != 4 || ch != '\n')
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
+ startpoint = ((uint64) hi) << 32 | lo;
+ remaining = strchr(labelfile, '\n') + 1; /* %n is not portable enough */
+
+ /*
+ * Parse the BACKUP FROM line. If we are taking an online backup from the
+ * standby, we confirm that the standby has not been promoted during the
+ * backup.
+ */
+ ptr = strstr(remaining, "BACKUP FROM:");
+ if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
+ if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("the standby was promoted during online backup"),
+ errhint("This means that the backup being taken is corrupt "
+ "and should not be used. "
+ "Try taking another online backup.")));
+
+ /*
+ * During recovery, we don't write an end-of-backup record. We assume that
+ * pg_control was backed up last, so that its minimum recovery point can
+ * serve as the backup end location. Since we don't have an
+ * end-of-backup record, we use the pg_control value to check whether
+ * we've reached the end of the backup when starting recovery from this
+ * backup. We have no way of checking whether pg_control was in fact
+ * backed up last, however.
+ *
+ * We don't force a switch to new WAL file but it is still possible to
+ * wait for all the required files to be archived if waitforarchive is
+ * true. This is okay if we use the backup to start a standby and fetch
+ * the missing WAL using streaming replication. But in the case of an
+ * archive recovery, a user should set waitforarchive to true and wait for
+ * them to be archived to ensure that all the required files are
+ * available.
+ *
+ * We return the current minimum recovery point as the backup end
+ * location. Note that it can be greater than the exact backup end
+ * location if the minimum recovery point is updated after the backup of
+ * pg_control. This is harmless for current uses.
+ *
+ * XXX currently a backup history file is for informational and debug
+ * purposes only. It's not essential for an online backup. Furthermore,
+ * even if it's created, it will not be archived during recovery because
+ * an archiver is not invoked. So it doesn't seem worthwhile to write a
+ * backup history file during recovery.
+ */
+ if (backup_started_in_recovery)
+ {
+ XLogRecPtr recptr;
+
+ /*
+ * Check to see whether all the WAL replayed during the online backup
+ * contains full-page writes.
+ */
+ SpinLockAcquire(&XLogCtl->info_lck);
+ recptr = XLogCtl->lastFpwDisableRecPtr;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ if (startpoint <= recptr)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("WAL generated with full_page_writes=off was replayed "
+ "during online backup"),
+ errhint("This means that the backup being taken on the standby "
+ "is corrupt and should not be used. "
+ "Enable full_page_writes and run CHECKPOINT on the primary, "
+ "and then try an online backup again.")));
+
+ LWLockAcquire(ControlFileLock, LW_SHARED);
+ stoppoint = ControlFile->minRecoveryPoint;
+ stoptli = ControlFile->minRecoveryPointTLI;
+ LWLockRelease(ControlFileLock);
+ }
+ else
+ {
+ /*
+ * Write the backup-end xlog record
+ */
+ XLogBeginInsert();
+ XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
+ stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
+
+ /*
+ * Given that we're not in recovery, InsertTimeLineID is set and can't
+ * change, so we can read it without a lock.
+ */
+ stoptli = XLogCtl->InsertTimeLineID;
+
+ /*
+ * Force a switch to a new xlog segment file, so that the backup is
+ * valid as soon as the archiver moves out the current segment file.
+ */
+ RequestXLogSwitch(false);
+
+ XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
+ XLogFileName(stopxlogfilename, stoptli, _logSegNo, wal_segment_size);
+
+ /* Use the log timezone here, not the session timezone */
+ stamp_time = (pg_time_t) time(NULL);
+ pg_strftime(strfbuf, sizeof(strfbuf),
+ "%Y-%m-%d %H:%M:%S %Z",
+ pg_localtime(&stamp_time, log_timezone));
+
+ /*
+ * Write the backup history file
+ */
+ XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
+ BackupHistoryFilePath(histfilepath, stoptli, _logSegNo,
+ startpoint, wal_segment_size);
+ fp = AllocateFile(histfilepath, "w");
+ if (!fp)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create file \"%s\": %m",
+ histfilepath)));
+ fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
+ LSN_FORMAT_ARGS(startpoint), startxlogfilename);
+ fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
+ LSN_FORMAT_ARGS(stoppoint), stopxlogfilename);
+
+ /*
+ * Transfer the remaining lines, including the label and start timeline,
+ * to the history file.
+ */
+ fprintf(fp, "%s", remaining);
+ fprintf(fp, "STOP TIME: %s\n", strfbuf);
+ fprintf(fp, "STOP TIMELINE: %u\n", stoptli);
+ if (fflush(fp) || ferror(fp) || FreeFile(fp))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write file \"%s\": %m",
+ histfilepath)));
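+
+ /*
+ * The history file name (illustrative) is derived from the starting
+ * segment and the starting offset within it, e.g.
+ * 000000010000000000000002.00000028.backup; its contents mirror the
+ * backup label plus the STOP lines written above.
+ */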
+
+ /*
+ * Clean out any no-longer-needed history files. As a side effect,
+ * this will post a .ready file for the newly created history file,
+ * notifying the archiver that history file may be archived
+ * immediately.
+ */
+ CleanupBackupHistory();
+ }
+
+ /*
+ * If archiving is enabled, wait for all the required WAL files to be
+ * archived before returning. If archiving isn't enabled, the required WAL
+ * needs to be transported via streaming replication (hopefully with
+ * wal_keep_size set high enough), or some more exotic mechanism like
+ * polling and copying files from pg_wal with a script. We have no
+ * knowledge of those mechanisms, so it's up to the user to ensure that
+ * all the required WAL is obtained.
+ *
+ * We wait until both the last WAL file filled during backup and the
+ * history file have been archived, and assume that the alphabetic sorting
+ * property of the WAL files ensures any earlier WAL files are safely
+ * archived as well.
+ *
+ * We wait forever, since archive_command is supposed to work and we
+ * assume the admin wants the backup to work completely. If you don't
+ * wish to wait, then either waitforarchive should be passed in as false,
+ * or you can set statement_timeout. Also, some notices are issued to
+ * clue in anyone who might be doing this interactively.
+ */
+
+ if (waitforarchive &&
+ ((!backup_started_in_recovery && XLogArchivingActive()) ||
+ (backup_started_in_recovery && XLogArchivingAlways())))
+ {
+ XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
+ XLogFileName(lastxlogfilename, stoptli, _logSegNo, wal_segment_size);
+
+ XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
+ BackupHistoryFileName(histfilename, stoptli, _logSegNo,
+ startpoint, wal_segment_size);
+
+ seconds_before_warning = 60;
+ waits = 0;
+
+ while (XLogArchiveIsBusy(lastxlogfilename) ||
+ XLogArchiveIsBusy(histfilename))
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ if (!reported_waiting && waits > 5)
+ {
+ ereport(NOTICE,
+ (errmsg("base backup done, waiting for required WAL segments to be archived")));
+ reported_waiting = true;
+ }
+
+ (void) WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+ 1000L,
+ WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE);
+ ResetLatch(MyLatch);
+
+ if (++waits >= seconds_before_warning)
+ {
+ seconds_before_warning *= 2; /* This wraps in >10 years... */
+ ereport(WARNING,
+ (errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)",
+ waits),
+ errhint("Check that your archive_command is executing properly. "
+ "You can safely cancel this backup, "
+ "but the database backup will not be usable without all the WAL segments.")));
+ }
+ }
+
+ ereport(NOTICE,
+ (errmsg("all required WAL segments have been archived")));
+ }
+ else if (waitforarchive)
+ ereport(NOTICE,
+ (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
+
+ /*
+ * We're done. As a convenience, return the ending WAL location.
+ */
+ if (stoptli_p)
+ *stoptli_p = stoptli;
+ return stoppoint;
+}
+
+
+/*
+ * do_pg_abort_backup: abort a running backup
+ *
+ * This performs just the most basic steps of do_pg_backup_stop(), taking
+ * the system out of backup mode, thus making it much safer to call from
+ * an error handler.
+ *
+ * The caller can pass 'arg' as 'true' or 'false' to control whether a warning
+ * is emitted.
+ *
+ * NB: This gets used as a before_shmem_exit handler, hence the odd-looking
+ * signature.
+ */
+void
+do_pg_abort_backup(int code, Datum arg)
+{
+ bool emit_warning = DatumGetBool(arg);
+
+ /*
+ * Quick exit if session does not have a running backup.
+ */
+ if (sessionBackupState != SESSION_BACKUP_RUNNING)
+ return;
+
+ WALInsertLockAcquireExclusive();
+ Assert(XLogCtl->Insert.runningBackups > 0);
+ XLogCtl->Insert.runningBackups--;
+
+ if (XLogCtl->Insert.runningBackups == 0)
+ {
+ XLogCtl->Insert.forcePageWrites = false;
+ }
+
+ sessionBackupState = SESSION_BACKUP_NONE;
+ WALInsertLockRelease();
+
+ if (emit_warning)
+ ereport(WARNING,
+ (errmsg("aborting backup due to backend exiting before pg_backup_stop was called")));
+}
+
+/*
+ * Register a handler that will warn about unterminated backups at end of
+ * session, unless this has already been done.
+ */
+void
+register_persistent_abort_backup_handler(void)
+{
+ static bool already_done = false;
+
+ if (already_done)
+ return;
+ before_shmem_exit(do_pg_abort_backup, BoolGetDatum(true));
+ already_done = true;
+}
+
+/*
+ * Get latest WAL insert pointer
+ */
+XLogRecPtr
+GetXLogInsertRecPtr(void)
+{
+ XLogCtlInsert *Insert = &XLogCtl->Insert;
+ uint64 current_bytepos;
+
+ SpinLockAcquire(&Insert->insertpos_lck);
+ current_bytepos = Insert->CurrBytePos;
+ SpinLockRelease(&Insert->insertpos_lck);
+
+ return XLogBytePosToRecPtr(current_bytepos);
+}
+
+/*
+ * Get latest WAL write pointer
+ */
+XLogRecPtr
+GetXLogWriteRecPtr(void)
+{
+ SpinLockAcquire(&XLogCtl->info_lck);
+ LogwrtResult = XLogCtl->LogwrtResult;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ return LogwrtResult.Write;
+}
+
+/*
+ * Returns the redo pointer of the last checkpoint or restartpoint. This is
+ * the oldest point in WAL that we still need, if we have to restart recovery.
+ */
+void
+GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
+{
+ LWLockAcquire(ControlFileLock, LW_SHARED);
+ *oldrecptr = ControlFile->checkPointCopy.redo;
+ *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
+ LWLockRelease(ControlFileLock);
+}
+
+/* Thin wrapper around ShutdownWalRcv(). */
+void
+XLogShutdownWalRcv(void)
+{
+ ShutdownWalRcv();
+
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ XLogCtl->InstallXLogFileSegmentActive = false;
+ LWLockRelease(ControlFileLock);
+}
+
+/* Enable WAL file recycling and preallocation. */
+void
+SetInstallXLogFileSegmentActive(void)
+{
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ XLogCtl->InstallXLogFileSegmentActive = true;
+ LWLockRelease(ControlFileLock);
+}
+
+bool
+IsInstallXLogFileSegmentActive(void)
+{
+ bool result;
+
+ LWLockAcquire(ControlFileLock, LW_SHARED);
+ result = XLogCtl->InstallXLogFileSegmentActive;
+ LWLockRelease(ControlFileLock);
+
+ return result;
+}
+
+/*
+ * Update the WalWriterSleeping flag.
+ */
+void
+SetWalWriterSleeping(bool sleeping)
+{
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->WalWriterSleeping = sleeping;
+ SpinLockRelease(&XLogCtl->info_lck);
+}
diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c
new file mode 100644
index 0000000..6516a74
--- /dev/null
+++ b/src/backend/access/transam/xlogarchive.c
@@ -0,0 +1,762 @@
+/*-------------------------------------------------------------------------
+ *
+ * xlogarchive.c
+ * Functions for archiving WAL files and restoring from the archive.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/xlogarchive.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include "access/xlog.h"
+#include "access/xlog_internal.h"
+#include "access/xlogarchive.h"
+#include "common/archive.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/startup.h"
+#include "postmaster/pgarch.h"
+#include "replication/walsender.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+
+/*
+ * Attempt to retrieve the specified file from off-line archival storage.
+ * If successful, fill "path" with its complete path (note that this will be
+ * a temp file name that doesn't follow the normal naming convention), and
+ * return true.
+ *
+ * If not successful, fill "path" with the name of the normal on-line file
+ * (which may or may not actually exist, but we'll try to use it), and return
+ * false.
+ *
+ * For fixed-size files, the caller may pass the expected size as an
+ * additional crosscheck on successful recovery. If the file size is not
+ * known, set expectedSize = 0.
+ *
+ * When 'cleanupEnabled' is false, refrain from deleting any old WAL segments
+ * in the archive. This is used when fetching the initial checkpoint record,
+ * when we are not yet sure how far back we need the WAL.
+ */
+bool
+RestoreArchivedFile(char *path, const char *xlogfname,
+ const char *recovername, off_t expectedSize,
+ bool cleanupEnabled)
+{
+ char xlogpath[MAXPGPATH];
+ char *xlogRestoreCmd;
+ char lastRestartPointFname[MAXPGPATH];
+ int rc;
+ struct stat stat_buf;
+ XLogSegNo restartSegNo;
+ XLogRecPtr restartRedoPtr;
+ TimeLineID restartTli;
+
+ /*
+ * Ignore restore_command when not in archive recovery (meaning we are in
+ * crash recovery).
+ */
+ if (!ArchiveRecoveryRequested)
+ goto not_available;
+
+ /* In standby mode, restore_command might not be supplied */
+ if (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0)
+ goto not_available;
+
+ /*
+ * When doing archive recovery, we always prefer an archived log file even
+ * if a file of the same name exists in XLOGDIR. The reason is that the
+ * file in XLOGDIR could be an old, un-filled or partly-filled version
+ * that was copied and restored as part of backing up $PGDATA.
+ *
+ * We could try to optimize this slightly by checking the local copy
+ * lastchange timestamp against the archived copy, but we have no API to
+ * do this, nor can we guarantee that the lastchange timestamp was
+ * preserved correctly when we copied to archive. Our aim is robustness,
+ * so we elect not to do this.
+ *
+ * If we cannot obtain the log file from the archive, however, we will try
+ * to use the XLOGDIR file if it exists. This is so that we can make use
+ * of log segments that weren't yet transferred to the archive.
+ *
+ * Notice that we don't actually overwrite any files when we copy back
+ * from archive because the restore_command may inadvertently restore
+ * inappropriate xlogs, or they may be corrupt, so we may wish to fall
+ * back to the segments remaining in the current XLOGDIR later. The
+ * copy-from-archive filename is always the same, ensuring that we don't
+ * run out of disk space on long recoveries.
+ */
+ snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername);
+
+ /*
+ * Make sure there is no existing file named recovername.
+ */
+ if (stat(xlogpath, &stat_buf) != 0)
+ {
+ if (errno != ENOENT)
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m",
+ xlogpath)));
+ }
+ else
+ {
+ if (unlink(xlogpath) != 0)
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m",
+ xlogpath)));
+ }
+
+ /*
+ * Calculate the archive file cutoff point for use during log shipping
+ * replication. All files earlier than this point can be deleted from the
+ * archive, though there is no requirement to do so.
+ *
+ * If cleanup is not enabled, initialise this with the filename of
+ * InvalidXLogRecPtr, which will prevent the deletion of any WAL files
+ * from the archive because of the alphabetic sorting property of WAL
+ * filenames.
+ *
+ * Once we have successfully located the redo pointer of the checkpoint
+ * from which we start recovery, we never request a file prior to the redo
+ * pointer of the last restartpoint. When redo begins we know that we have
+ * successfully located it, so there is no need for additional status
+ * flags to signify the point when we can begin deleting WAL files from
+ * the archive.
+ */
+ if (cleanupEnabled)
+ {
+ GetOldestRestartPoint(&restartRedoPtr, &restartTli);
+ XLByteToSeg(restartRedoPtr, restartSegNo, wal_segment_size);
+ XLogFileName(lastRestartPointFname, restartTli, restartSegNo,
+ wal_segment_size);
+ /* we shouldn't need anything earlier than last restart point */
+ Assert(strcmp(lastRestartPointFname, xlogfname) <= 0);
+ }
+ else
+ XLogFileName(lastRestartPointFname, 0, 0L, wal_segment_size);
+
+ /* Build the restore command to execute */
+ xlogRestoreCmd = BuildRestoreCommand(recoveryRestoreCommand,
+ xlogpath, xlogfname,
+ lastRestartPointFname);
+ if (xlogRestoreCmd == NULL)
+ elog(ERROR, "could not build restore command \"%s\"",
+ recoveryRestoreCommand);
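+
+ /*
+ * Illustrative expansion (paths and names are hypothetical): with
+ * restore_command = 'cp /mnt/server/archivedir/%f "%p"', restoring
+ * segment 00000001000000A900000065 under the temporary name RECOVERYXLOG
+ * would execute
+ * cp /mnt/server/archivedir/00000001000000A900000065 "pg_wal/RECOVERYXLOG"
+ */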
+
+ ereport(DEBUG3,
+ (errmsg_internal("executing restore command \"%s\"",
+ xlogRestoreCmd)));
+
+ pgstat_report_wait_start(WAIT_EVENT_RESTORE_COMMAND);
+
+ /*
+ * PreRestoreCommand() informs the SIGTERM handler for the startup process
+ * that it should proc_exit() right away. This is done for the duration
+ * of the system() call because there isn't a good way to break out while
+ * it is executing. Since we might call proc_exit() in a signal handler,
+ * it is best to put any additional logic before or after the
+ * PreRestoreCommand()/PostRestoreCommand() section.
+ */
+ PreRestoreCommand();
+
+ /*
+ * Copy xlog from archival storage to XLOGDIR
+ */
+ rc = system(xlogRestoreCmd);
+
+ PostRestoreCommand();
+
+ pgstat_report_wait_end();
+ pfree(xlogRestoreCmd);
+
+ if (rc == 0)
+ {
+ /*
+ * command apparently succeeded, but let's make sure the file is
+ * really there now and has the correct size.
+ */
+ if (stat(xlogpath, &stat_buf) == 0)
+ {
+ if (expectedSize > 0 && stat_buf.st_size != expectedSize)
+ {
+ int elevel;
+
+ /*
+ * If we find a partial file in standby mode, we assume it's
+ * because it's just being copied to the archive, and keep
+ * trying.
+ *
+ * Otherwise treat a wrong-sized file as FATAL to ensure the
+ * DBA would notice it, but is that too strong? We could try
+ * to plow ahead with a local copy of the file ... but the
+ * problem is that there probably isn't one, and we'd
+ * incorrectly conclude we've reached the end of WAL and we're
+ * done recovering ...
+ */
+ if (StandbyMode && stat_buf.st_size < expectedSize)
+ elevel = DEBUG1;
+ else
+ elevel = FATAL;
+ ereport(elevel,
+ (errmsg("archive file \"%s\" has wrong size: %lld instead of %lld",
+ xlogfname,
+ (long long int) stat_buf.st_size,
+ (long long int) expectedSize)));
+ return false;
+ }
+ else
+ {
+ ereport(LOG,
+ (errmsg("restored log file \"%s\" from archive",
+ xlogfname)));
+ strcpy(path, xlogpath);
+ return true;
+ }
+ }
+ else
+ {
+ /* stat failed */
+ int elevel = (errno == ENOENT) ? LOG : FATAL;
+
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m", xlogpath),
+ errdetail("restore_command returned a zero exit status, but stat() failed.")));
+ }
+ }
+
+ /*
+ * Remember, we roll forward UNTIL the restore fails, so failure here is
+ * just part of the process... that makes it difficult to determine
+ * whether the restore failed because there isn't an archive to restore,
+ * or because the administrator has specified the restore program
+ * incorrectly. We have to assume the former.
+ *
+ * However, if the failure was due to any sort of signal, it's best to
+ * punt and abort recovery. (If we "return false" here, upper levels will
+ * assume that recovery is complete and start up the database!) It's
+ * essential to abort on child SIGINT and SIGQUIT, because per spec
+ * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
+ * those it's a good bet we should have gotten it too.
+ *
+ * On SIGTERM, assume we have received a fast shutdown request, and exit
+ * cleanly. It's pure chance whether we receive the SIGTERM first, or the
+ * child process. If we receive it first, the signal handler will call
+ * proc_exit, otherwise we do it here. If we or the child process received
+ * SIGTERM for any other reason than a fast shutdown request, postmaster
+ * will perform an immediate shutdown when it sees us exiting
+ * unexpectedly.
+ *
+ * We treat hard shell errors such as "command not found" as fatal, too.
+ */
+ if (wait_result_is_signal(rc, SIGTERM))
+ proc_exit(1);
+
+ ereport(wait_result_is_any_signal(rc, true) ? FATAL : DEBUG2,
+ (errmsg("could not restore file \"%s\" from archive: %s",
+ xlogfname, wait_result_to_str(rc))));
+
+not_available:
+
+ /*
+ * If an archived file is not available, there might still be a version of
+ * this file in XLOGDIR, so return that as the filename to open.
+ *
+ * In many recovery scenarios we expect this to fail also, but if so that
+ * just means we've reached the end of WAL.
+ */
+ snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
+ return false;
+}
+
+/*
+ * Attempt to execute an external shell command during recovery.
+ *
+ * 'command' is the shell command to be executed, 'commandName' is a
+ * human-readable name describing the command emitted in the logs. If
+ * 'failOnSignal' is true and the command is killed by a signal, a FATAL
+ * error is thrown. Otherwise a WARNING is emitted.
+ *
+ * This is currently used for recovery_end_command and archive_cleanup_command.
+ */
+void
+ExecuteRecoveryCommand(const char *command, const char *commandName,
+ bool failOnSignal, uint32 wait_event_info)
+{
+ char xlogRecoveryCmd[MAXPGPATH];
+ char lastRestartPointFname[MAXPGPATH];
+ char *dp;
+ char *endp;
+ const char *sp;
+ int rc;
+ XLogSegNo restartSegNo;
+ XLogRecPtr restartRedoPtr;
+ TimeLineID restartTli;
+
+ Assert(command && commandName);
+
+ /*
+ * Calculate the archive file cutoff point for use during log shipping
+ * replication. All files earlier than this point can be deleted from the
+ * archive, though there is no requirement to do so.
+ */
+ GetOldestRestartPoint(&restartRedoPtr, &restartTli);
+ XLByteToSeg(restartRedoPtr, restartSegNo, wal_segment_size);
+ XLogFileName(lastRestartPointFname, restartTli, restartSegNo,
+ wal_segment_size);
+
+ /*
+ * construct the command to be executed
+ */
+ dp = xlogRecoveryCmd;
+ endp = xlogRecoveryCmd + MAXPGPATH - 1;
+ *endp = '\0';
+
+ for (sp = command; *sp; sp++)
+ {
+ if (*sp == '%')
+ {
+ switch (sp[1])
+ {
+ case 'r':
+ /* %r: filename of last restartpoint */
+ sp++;
+ strlcpy(dp, lastRestartPointFname, endp - dp);
+ dp += strlen(dp);
+ break;
+ case '%':
+ /* convert %% to a single % */
+ sp++;
+ if (dp < endp)
+ *dp++ = *sp;
+ break;
+ default:
+ /* otherwise treat the % as not special */
+ if (dp < endp)
+ *dp++ = *sp;
+ break;
+ }
+ }
+ else
+ {
+ if (dp < endp)
+ *dp++ = *sp;
+ }
+ }
+ *dp = '\0';
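+
+ /*
+ * Illustrative example (the setting is hypothetical): with
+ * archive_cleanup_command = 'pg_archivecleanup /mnt/server/archivedir %r',
+ * the command executed here becomes
+ * "pg_archivecleanup /mnt/server/archivedir 000000010000000000000010",
+ * i.e. %r is replaced by lastRestartPointFname as computed above.
+ */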
+
+ ereport(DEBUG3,
+ (errmsg_internal("executing %s \"%s\"", commandName, command)));
+
+ /*
+ * execute the constructed command
+ */
+ pgstat_report_wait_start(wait_event_info);
+ rc = system(xlogRecoveryCmd);
+ pgstat_report_wait_end();
+
+ if (rc != 0)
+ {
+ /*
+ * If the failure was due to any sort of signal, it's best to punt and
+ * abort recovery. See comments in RestoreArchivedFile().
+ */
+ ereport((failOnSignal && wait_result_is_any_signal(rc, true)) ? FATAL : WARNING,
+ /*------
+ translator: First %s represents a postgresql.conf parameter name like
+ "recovery_end_command", the 2nd is the value of that parameter, the
+ third an already translated error message. */
+ (errmsg("%s \"%s\": %s", commandName,
+ command, wait_result_to_str(rc))));
+ }
+}
+
+
+/*
+ * A file was restored from the archive under a temporary filename (path),
+ * and now we want to keep it. Rename it under the permanent filename in
+ * pg_wal (xlogfname), replacing any existing file with the same name.
+ */
+void
+KeepFileRestoredFromArchive(const char *path, const char *xlogfname)
+{
+ char xlogfpath[MAXPGPATH];
+ bool reload = false;
+ struct stat statbuf;
+
+ snprintf(xlogfpath, MAXPGPATH, XLOGDIR "/%s", xlogfname);
+
+ if (stat(xlogfpath, &statbuf) == 0)
+ {
+ char oldpath[MAXPGPATH];
+
+#ifdef WIN32
+ static unsigned int deletedcounter = 1;
+
+ /*
+ * On Windows, if another process (e.g. a walsender process) holds the
+ * file open in FILE_SHARE_DELETE mode, unlink will succeed, but the
+ * file will still show up in directory listings until the last handle
+ * is closed, and we cannot rename the new file into its place until
+ * then. To avoid that problem, rename the old file to a temporary
+ * name first. Use a counter to create a unique filename, because the
+ * same file might be restored from the archive multiple times, and a
+ * walsender could still be holding onto an old deleted version of it.
+ */
+ snprintf(oldpath, MAXPGPATH, "%s.deleted%u",
+ xlogfpath, deletedcounter++);
+ if (rename(xlogfpath, oldpath) != 0)
+ {
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not rename file \"%s\" to \"%s\": %m",
+ xlogfpath, oldpath)));
+ }
+#else
+ /* same-size buffers, so this never truncates */
+ strlcpy(oldpath, xlogfpath, MAXPGPATH);
+#endif
+ if (unlink(oldpath) != 0)
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m",
+ xlogfpath)));
+ reload = true;
+ }
+
+ durable_rename(path, xlogfpath, ERROR);
+
+ /*
+ * Create .done file forcibly to prevent the restored segment from being
+ * archived again later.
+ */
+ if (XLogArchiveMode != ARCHIVE_MODE_ALWAYS)
+ XLogArchiveForceDone(xlogfname);
+ else
+ XLogArchiveNotify(xlogfname);
+
+ /*
+ * If the existing file was replaced, since walsenders might have it open,
+ * request them to reload a currently-open segment. This is only required
+ * for WAL segments; walsenders don't hold other files open, but there's
+ * no harm in doing this too often, and we don't know what kind of file
+ * we're dealing with here.
+ */
+ if (reload)
+ WalSndRqstFileReload();
+
+ /*
+ * Signal walsender that new WAL has arrived. Again, this isn't necessary
+ * if we restored something other than a WAL segment, but it does no harm
+ * either.
+ */
+ WalSndWakeup();
+}
+
+/*
+ * XLogArchiveNotify
+ *
+ * Create an archive notification file
+ *
+ * The name of the notification file is the message that will be picked up
+ * by the archiver, e.g. we write 0000000100000001000000C6.ready
+ * and the archiver then knows to archive XLOGDIR/0000000100000001000000C6,
+ * then when complete, rename it to 0000000100000001000000C6.done
+ */
+void
+XLogArchiveNotify(const char *xlog)
+{
+ char archiveStatusPath[MAXPGPATH];
+ FILE *fd;
+
+ /* insert an otherwise empty file called <XLOG>.ready */
+ StatusFilePath(archiveStatusPath, xlog, ".ready");
+ fd = AllocateFile(archiveStatusPath, "w");
+ if (fd == NULL)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not create archive status file \"%s\": %m",
+ archiveStatusPath)));
+ return;
+ }
+ if (FreeFile(fd))
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not write archive status file \"%s\": %m",
+ archiveStatusPath)));
+ return;
+ }
+
+ /*
+ * Timeline history files are given the highest archival priority to lower
+ * the chance that a promoted standby will choose a timeline that is
+ * already in use. However, the archiver ordinarily tries to gather
+ * multiple files to archive from each scan of the archive_status
+ * directory, which means that newly created timeline history files could
+ * be left unarchived for a while. To ensure that the archiver picks up
+ * timeline history files as soon as possible, we force the archiver to
+ * scan the archive_status directory the next time it looks for a file to
+ * archive.
+ */
+ if (IsTLHistoryFileName(xlog))
+ PgArchForceDirScan();
+
+ /* Notify archiver that it's got something to do */
+ if (IsUnderPostmaster)
+ PgArchWakeup();
+}
+
+/*
+ * Convenience routine to notify using the segment-number representation of the file name
+ */
+void
+XLogArchiveNotifySeg(XLogSegNo segno, TimeLineID tli)
+{
+ char xlog[MAXFNAMELEN];
+
+ Assert(tli != 0);
+
+ XLogFileName(xlog, tli, segno, wal_segment_size);
+ XLogArchiveNotify(xlog);
+}
+
+/*
+ * XLogArchiveForceDone
+ *
+ * Emit notification forcibly that an XLOG segment file has been successfully
+ * archived, by creating <XLOG>.done regardless of whether <XLOG>.ready
+ * exists or not.
+ */
+void
+XLogArchiveForceDone(const char *xlog)
+{
+ char archiveReady[MAXPGPATH];
+ char archiveDone[MAXPGPATH];
+ struct stat stat_buf;
+ FILE *fd;
+
+ /* Exit if already known done */
+ StatusFilePath(archiveDone, xlog, ".done");
+ if (stat(archiveDone, &stat_buf) == 0)
+ return;
+
+ /* If .ready exists, rename it to .done */
+ StatusFilePath(archiveReady, xlog, ".ready");
+ if (stat(archiveReady, &stat_buf) == 0)
+ {
+ (void) durable_rename(archiveReady, archiveDone, WARNING);
+ return;
+ }
+
+ /* insert an otherwise empty file called <XLOG>.done */
+ fd = AllocateFile(archiveDone, "w");
+ if (fd == NULL)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not create archive status file \"%s\": %m",
+ archiveDone)));
+ return;
+ }
+ if (FreeFile(fd))
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not write archive status file \"%s\": %m",
+ archiveDone)));
+ return;
+ }
+}
+
+/*
+ * XLogArchiveCheckDone
+ *
+ * This is called when we are ready to delete or recycle an old XLOG segment
+ * file or backup history file. If it is okay to delete it then return true.
+ * If it is not time to delete it, make sure a .ready file exists, and return
+ * false.
+ *
+ * If <XLOG>.done exists, then return true; else if <XLOG>.ready exists,
+ * then return false; else create <XLOG>.ready and return false.
+ *
+ * The reason we do things this way is so that if the original attempt to
+ * create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
+ */
+bool
+XLogArchiveCheckDone(const char *xlog)
+{
+ char archiveStatusPath[MAXPGPATH];
+ struct stat stat_buf;
+
+ /* The file is always deletable if archive_mode is "off". */
+ if (!XLogArchivingActive())
+ return true;
+
+ /*
+ * During archive recovery, the file is deletable if archive_mode is not
+ * "always".
+ */
+ if (!XLogArchivingAlways() &&
+ GetRecoveryState() == RECOVERY_STATE_ARCHIVE)
+ return true;
+
+ /*
+ * At this point of the logic, note that we are either a primary with
+ * archive_mode set to "on" or "always", or a standby with archive_mode
+ * set to "always".
+ */
+
+ /* First check for .done --- this means archiver is done with it */
+ StatusFilePath(archiveStatusPath, xlog, ".done");
+ if (stat(archiveStatusPath, &stat_buf) == 0)
+ return true;
+
+ /* check for .ready --- this means archiver is still busy with it */
+ StatusFilePath(archiveStatusPath, xlog, ".ready");
+ if (stat(archiveStatusPath, &stat_buf) == 0)
+ return false;
+
+ /* Race condition --- maybe archiver just finished, so recheck */
+ StatusFilePath(archiveStatusPath, xlog, ".done");
+ if (stat(archiveStatusPath, &stat_buf) == 0)
+ return true;
+
+ /* Retry creation of the .ready file */
+ XLogArchiveNotify(xlog);
+ return false;
+}
+
+/*
+ * XLogArchiveIsBusy
+ *
+ * Check to see if an XLOG segment file is still unarchived.
+ * This is almost but not quite the inverse of XLogArchiveCheckDone: in
+ * the first place we aren't chartered to recreate the .ready file, and
+ * in the second place we should consider that if the file is already gone
+ * then it's not busy. (This check is needed to handle the race condition
+ * that a checkpoint already deleted the no-longer-needed file.)
+ */
+bool
+XLogArchiveIsBusy(const char *xlog)
+{
+ char archiveStatusPath[MAXPGPATH];
+ struct stat stat_buf;
+
+ /* First check for .done --- this means archiver is done with it */
+ StatusFilePath(archiveStatusPath, xlog, ".done");
+ if (stat(archiveStatusPath, &stat_buf) == 0)
+ return false;
+
+ /* check for .ready --- this means archiver is still busy with it */
+ StatusFilePath(archiveStatusPath, xlog, ".ready");
+ if (stat(archiveStatusPath, &stat_buf) == 0)
+ return true;
+
+ /* Race condition --- maybe archiver just finished, so recheck */
+ StatusFilePath(archiveStatusPath, xlog, ".done");
+ if (stat(archiveStatusPath, &stat_buf) == 0)
+ return false;
+
+ /*
+ * Check to see if the WAL file has been removed by checkpoint, which
+ * implies it has already been archived, and explains why we can't see a
+ * status file for it.
+ */
+ snprintf(archiveStatusPath, MAXPGPATH, XLOGDIR "/%s", xlog);
+ if (stat(archiveStatusPath, &stat_buf) != 0 &&
+ errno == ENOENT)
+ return false;
+
+ return true;
+}
+
+/*
+ * XLogArchiveIsReadyOrDone
+ *
+ * Check to see if an XLOG segment file has a .ready or .done file.
+ * This is similar to XLogArchiveIsBusy(), but returns true if the file
+ * is already archived or is about to be archived.
+ *
+ * This is currently only used during recovery. During normal operation this
+ * would be racy: the file might get removed or marked with .ready as we're
+ * checking it, or immediately after we return.
+ */
+bool
+XLogArchiveIsReadyOrDone(const char *xlog)
+{
+ char archiveStatusPath[MAXPGPATH];
+ struct stat stat_buf;
+
+ /* First check for .done --- this means archiver is done with it */
+ StatusFilePath(archiveStatusPath, xlog, ".done");
+ if (stat(archiveStatusPath, &stat_buf) == 0)
+ return true;
+
+ /* check for .ready --- this means archiver is still busy with it */
+ StatusFilePath(archiveStatusPath, xlog, ".ready");
+ if (stat(archiveStatusPath, &stat_buf) == 0)
+ return true;
+
+ /* Race condition --- maybe archiver just finished, so recheck */
+ StatusFilePath(archiveStatusPath, xlog, ".done");
+ if (stat(archiveStatusPath, &stat_buf) == 0)
+ return true;
+
+ return false;
+}
+
+/*
+ * XLogArchiveIsReady
+ *
+ * Check to see if an XLOG segment file has an archive notification (.ready)
+ * file.
+ */
+bool
+XLogArchiveIsReady(const char *xlog)
+{
+ char archiveStatusPath[MAXPGPATH];
+ struct stat stat_buf;
+
+ StatusFilePath(archiveStatusPath, xlog, ".ready");
+ if (stat(archiveStatusPath, &stat_buf) == 0)
+ return true;
+
+ return false;
+}
+
+/*
+ * XLogArchiveCleanup
+ *
+ * Clean up archive notification file(s) for a particular xlog segment
+ */
+void
+XLogArchiveCleanup(const char *xlog)
+{
+ char archiveStatusPath[MAXPGPATH];
+
+ /* Remove the .done file */
+ StatusFilePath(archiveStatusPath, xlog, ".done");
+ unlink(archiveStatusPath);
+ /* should we complain about failure? */
+
+ /* Remove the .ready file if present --- normally it shouldn't be */
+ StatusFilePath(archiveStatusPath, xlog, ".ready");
+ unlink(archiveStatusPath);
+ /* should we complain about failure? */
+}
diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c
new file mode 100644
index 0000000..02bd919
--- /dev/null
+++ b/src/backend/access/transam/xlogfuncs.c
@@ -0,0 +1,648 @@
+/*-------------------------------------------------------------------------
+ *
+ * xlogfuncs.c
+ *
+ * PostgreSQL write-ahead log manager user interface functions
+ *
+ * This file contains WAL control and information functions.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/xlogfuncs.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "access/htup_details.h"
+#include "access/xlog_internal.h"
+#include "access/xlogrecovery.h"
+#include "access/xlogutils.h"
+#include "catalog/pg_type.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "replication/walreceiver.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/smgr.h"
+#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/numeric.h"
+#include "utils/pg_lsn.h"
+#include "utils/timestamp.h"
+#include "utils/tuplestore.h"
+
+/*
+ * Store label file and tablespace map during backups.
+ */
+static StringInfo label_file;
+static StringInfo tblspc_map_file;
+
+/*
+ * pg_backup_start: set up for taking an on-line backup dump
+ *
+ * Essentially what this does is to prepare the contents of a backup label
+ * file, which the caller must include in the backup dump. The label file
+ * contains the user-supplied label string (typically this would be used
+ * to tell where the backup dump will be stored) and the starting time and
+ * starting WAL location for the dump.
+ *
+ * Permission checking for this function is managed through the normal
+ * GRANT system.
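+ *
+ * Illustrative usage (the 'fast' argument name follows the SQL-level
+ * declaration; the label shown is hypothetical):
+ *
+ *   SELECT pg_backup_start('nightly', fast => true);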
+ */
+Datum
+pg_backup_start(PG_FUNCTION_ARGS)
+{
+ text *backupid = PG_GETARG_TEXT_PP(0);
+ bool fast = PG_GETARG_BOOL(1);
+ char *backupidstr;
+ XLogRecPtr startpoint;
+ SessionBackupState status = get_backup_status();
+ MemoryContext oldcontext;
+
+ backupidstr = text_to_cstring(backupid);
+
+ if (status == SESSION_BACKUP_RUNNING)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("a backup is already in progress in this session")));
+
+ /*
+ * Label file and tablespace map file need to be long-lived, since they
+ * are read in pg_backup_stop.
+ */
+ oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+ label_file = makeStringInfo();
+ tblspc_map_file = makeStringInfo();
+ MemoryContextSwitchTo(oldcontext);
+
+ register_persistent_abort_backup_handler();
+
+ startpoint = do_pg_backup_start(backupidstr, fast, NULL, label_file,
+ NULL, tblspc_map_file);
+
+ PG_RETURN_LSN(startpoint);
+}
+
+
+/*
+ * pg_backup_stop: finish taking an on-line backup.
+ *
+ * The first parameter (variable 'waitforarchive'), which is optional,
+ * allows the user to choose whether to wait for the WAL to be archived
+ * or to return as soon as the WAL record is written.
+ *
+ * Permission checking for this function is managed through the normal
+ * GRANT system.
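+ *
+ * Illustrative usage; the returned labelfile and spcmapfile columns are
+ * what a backup tool saves into the backup as backup_label and
+ * tablespace_map:
+ *
+ *   SELECT lsn, labelfile, spcmapfile FROM pg_backup_stop(true);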
+ */
+Datum
+pg_backup_stop(PG_FUNCTION_ARGS)
+{
+#define PG_STOP_BACKUP_V2_COLS 3
+ TupleDesc tupdesc;
+ Datum values[PG_STOP_BACKUP_V2_COLS];
+ bool nulls[PG_STOP_BACKUP_V2_COLS];
+
+ bool waitforarchive = PG_GETARG_BOOL(0);
+ XLogRecPtr stoppoint;
+ SessionBackupState status = get_backup_status();
+
+ /* Initialize attributes information in the tuple descriptor */
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ MemSet(values, 0, sizeof(values));
+ MemSet(nulls, 0, sizeof(nulls));
+
+ if (status != SESSION_BACKUP_RUNNING)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("backup is not in progress"),
+ errhint("Did you call pg_backup_start()?")));
+
+ /*
+ * Stop the backup. Return a copy of the backup label and tablespace map
+ * so they can be written to disk by the caller.
+ */
+ stoppoint = do_pg_backup_stop(label_file->data, waitforarchive, NULL);
+
+ values[0] = LSNGetDatum(stoppoint);
+ values[1] = CStringGetTextDatum(label_file->data);
+ values[2] = CStringGetTextDatum(tblspc_map_file->data);
+
+ /* Free structures allocated in TopMemoryContext */
+ pfree(label_file->data);
+ pfree(label_file);
+ label_file = NULL;
+ pfree(tblspc_map_file->data);
+ pfree(tblspc_map_file);
+ tblspc_map_file = NULL;
+
+ /* Returns the record as Datum */
+ PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
+}
+
+/*
+ * pg_switch_wal: switch to next xlog file
+ *
+ * Permission checking for this function is managed through the normal
+ * GRANT system.
+ */
+Datum
+pg_switch_wal(PG_FUNCTION_ARGS)
+{
+ XLogRecPtr switchpoint;
+
+ if (RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress"),
+ errhint("WAL control functions cannot be executed during recovery.")));
+
+ switchpoint = RequestXLogSwitch(false);
+
+ /*
+ * As a convenience, return the WAL location of the switch record
+ */
+ PG_RETURN_LSN(switchpoint);
+}
+
+/*
+ * pg_create_restore_point: a named point for restore
+ *
+ * Permission checking for this function is managed through the normal
+ * GRANT system.
+ */
+Datum
+pg_create_restore_point(PG_FUNCTION_ARGS)
+{
+ text *restore_name = PG_GETARG_TEXT_PP(0);
+ char *restore_name_str;
+ XLogRecPtr restorepoint;
+
+ if (RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress"),
+ errhint("WAL control functions cannot be executed during recovery.")));
+
+ if (!XLogIsNeeded())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("WAL level not sufficient for creating a restore point"),
+ errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
+
+ restore_name_str = text_to_cstring(restore_name);
+
+ if (strlen(restore_name_str) >= MAXFNAMELEN)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("value too long for restore point (maximum %d characters)", MAXFNAMELEN - 1)));
+
+ restorepoint = XLogRestorePoint(restore_name_str);
+
+ /*
+ * As a convenience, return the WAL location of the restore point record
+ */
+ PG_RETURN_LSN(restorepoint);
+}
+
+/*
+ * Report the current WAL write location (same format as pg_backup_start etc)
+ *
+ * This is useful for determining how much of WAL is visible to an external
+ * archiving process. Note that the data before this point is written out
+ * to the kernel, but is not necessarily synced to disk.
+ */
+Datum
+pg_current_wal_lsn(PG_FUNCTION_ARGS)
+{
+ XLogRecPtr current_recptr;
+
+ if (RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress"),
+ errhint("WAL control functions cannot be executed during recovery.")));
+
+ current_recptr = GetXLogWriteRecPtr();
+
+ PG_RETURN_LSN(current_recptr);
+}
+
+/*
+ * Report the current WAL insert location (same format as pg_backup_start etc)
+ *
+ * This function is mostly for debugging purposes.
+ */
+Datum
+pg_current_wal_insert_lsn(PG_FUNCTION_ARGS)
+{
+ XLogRecPtr current_recptr;
+
+ if (RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress"),
+ errhint("WAL control functions cannot be executed during recovery.")));
+
+ current_recptr = GetXLogInsertRecPtr();
+
+ PG_RETURN_LSN(current_recptr);
+}
+
+/*
+ * Report the current WAL flush location (same format as pg_backup_start etc)
+ *
+ * This function is mostly for debugging purposes.
+ */
+Datum
+pg_current_wal_flush_lsn(PG_FUNCTION_ARGS)
+{
+ XLogRecPtr current_recptr;
+
+ if (RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress"),
+ errhint("WAL control functions cannot be executed during recovery.")));
+
+ current_recptr = GetFlushRecPtr(NULL);
+
+ PG_RETURN_LSN(current_recptr);
+}
+
+/*
+ * Report the last WAL receive location (same format as pg_backup_start etc)
+ *
+ * This is useful for determining how much of WAL is guaranteed to be received
+ * and synced to disk by walreceiver.
+ */
+Datum
+pg_last_wal_receive_lsn(PG_FUNCTION_ARGS)
+{
+ XLogRecPtr recptr;
+
+ recptr = GetWalRcvFlushRecPtr(NULL, NULL);
+
+ if (recptr == 0)
+ PG_RETURN_NULL();
+
+ PG_RETURN_LSN(recptr);
+}
+
+/*
+ * Report the last WAL replay location (same format as pg_backup_start etc)
+ *
+ * This is useful for determining how much of WAL is visible to read-only
+ * connections during recovery.
+ */
+Datum
+pg_last_wal_replay_lsn(PG_FUNCTION_ARGS)
+{
+ XLogRecPtr recptr;
+
+ recptr = GetXLogReplayRecPtr(NULL);
+
+ if (recptr == 0)
+ PG_RETURN_NULL();
+
+ PG_RETURN_LSN(recptr);
+}
+
+/*
+ * Compute an xlog file name and decimal byte offset given a WAL location,
+ * such as is returned by pg_backup_stop() or pg_switch_wal().
+ *
+ * Note that a location exactly at a segment boundary is taken to be in
+ * the previous segment. This is usually the right thing, since the
+ * expected usage is to determine which xlog file(s) are ready to archive.
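+ *
+ * For example (illustrative, assuming 16MB segments and timeline 1),
+ *   SELECT * FROM pg_walfile_name_offset('0/1000000')
+ * returns ('000000010000000000000000', 0): the boundary location is
+ * attributed to the previous segment.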
+ */
+Datum
+pg_walfile_name_offset(PG_FUNCTION_ARGS)
+{
+ XLogSegNo xlogsegno;
+ uint32 xrecoff;
+ XLogRecPtr locationpoint = PG_GETARG_LSN(0);
+ char xlogfilename[MAXFNAMELEN];
+ Datum values[2];
+ bool isnull[2];
+ TupleDesc resultTupleDesc;
+ HeapTuple resultHeapTuple;
+ Datum result;
+
+ if (RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress"),
+ errhint("%s cannot be executed during recovery.",
+ "pg_walfile_name_offset()")));
+
+ /*
+ * Construct a tuple descriptor for the result row. This must match this
+ * function's pg_proc entry!
+ */
+ resultTupleDesc = CreateTemplateTupleDesc(2);
+ TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name",
+ TEXTOID, -1, 0);
+ TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset",
+ INT4OID, -1, 0);
+
+ resultTupleDesc = BlessTupleDesc(resultTupleDesc);
+
+ /*
+ * xlogfilename
+ */
+ XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size);
+ XLogFileName(xlogfilename, GetWALInsertionTimeLine(), xlogsegno,
+ wal_segment_size);
+
+ values[0] = CStringGetTextDatum(xlogfilename);
+ isnull[0] = false;
+
+ /*
+ * offset
+ */
+ xrecoff = XLogSegmentOffset(locationpoint, wal_segment_size);
+
+ values[1] = UInt32GetDatum(xrecoff);
+ isnull[1] = false;
+
+ /*
+ * Form the result tuple from the Datums prepared above.
+ */
+ resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull);
+
+ result = HeapTupleGetDatum(resultHeapTuple);
+
+ PG_RETURN_DATUM(result);
+}
+
+/*
+ * Compute an xlog file name given a WAL location,
+ * such as is returned by pg_backup_stop() or pg_switch_wal().
+ */
+Datum
+pg_walfile_name(PG_FUNCTION_ARGS)
+{
+ XLogSegNo xlogsegno;
+ XLogRecPtr locationpoint = PG_GETARG_LSN(0);
+ char xlogfilename[MAXFNAMELEN];
+
+ if (RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress"),
+ errhint("%s cannot be executed during recovery.",
+ "pg_walfile_name()")));
+
+ XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size);
+ XLogFileName(xlogfilename, GetWALInsertionTimeLine(), xlogsegno,
+ wal_segment_size);
+
+ PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
+}
+
+/*
+ * pg_wal_replay_pause - Request to pause recovery
+ *
+ * Permission checking for this function is managed through the normal
+ * GRANT system.
+ */
+Datum
+pg_wal_replay_pause(PG_FUNCTION_ARGS)
+{
+ if (!RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is not in progress"),
+ errhint("Recovery control functions can only be executed during recovery.")));
+
+ if (PromoteIsTriggered())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("standby promotion is ongoing"),
+ errhint("%s cannot be executed after promotion is triggered.",
+ "pg_wal_replay_pause()")));
+
+ SetRecoveryPause(true);
+
+ /* wake up the recovery process so that it can process the pause request */
+ WakeupRecovery();
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * pg_wal_replay_resume - resume recovery now
+ *
+ * Permission checking for this function is managed through the normal
+ * GRANT system.
+ */
+Datum
+pg_wal_replay_resume(PG_FUNCTION_ARGS)
+{
+ if (!RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is not in progress"),
+ errhint("Recovery control functions can only be executed during recovery.")));
+
+ if (PromoteIsTriggered())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("standby promotion is ongoing"),
+ errhint("%s cannot be executed after promotion is triggered.",
+ "pg_wal_replay_resume()")));
+
+ SetRecoveryPause(false);
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * pg_is_wal_replay_paused
+ */
+Datum
+pg_is_wal_replay_paused(PG_FUNCTION_ARGS)
+{
+ if (!RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is not in progress"),
+ errhint("Recovery control functions can only be executed during recovery.")));
+
+ PG_RETURN_BOOL(GetRecoveryPauseState() != RECOVERY_NOT_PAUSED);
+}
+
+/*
+ * pg_get_wal_replay_pause_state - Returns the recovery pause state.
+ *
+ * Returned values:
+ *
+ * 'not paused' - if pause is not requested
+ * 'pause requested' - if pause is requested but recovery is not yet paused
+ * 'paused' - if recovery is paused
+ */
+Datum
+pg_get_wal_replay_pause_state(PG_FUNCTION_ARGS)
+{
+ char *statestr = NULL;
+
+ if (!RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is not in progress"),
+ errhint("Recovery control functions can only be executed during recovery.")));
+
+ /* get the recovery pause state */
+ switch (GetRecoveryPauseState())
+ {
+ case RECOVERY_NOT_PAUSED:
+ statestr = "not paused";
+ break;
+ case RECOVERY_PAUSE_REQUESTED:
+ statestr = "pause requested";
+ break;
+ case RECOVERY_PAUSED:
+ statestr = "paused";
+ break;
+ }
+
+ Assert(statestr != NULL);
+ PG_RETURN_TEXT_P(cstring_to_text(statestr));
+}
+
+/*
+ * Returns timestamp of latest processed commit/abort record.
+ *
+ * When the server has been started normally without recovery, the function
+ * returns NULL.
+ */
+Datum
+pg_last_xact_replay_timestamp(PG_FUNCTION_ARGS)
+{
+ TimestampTz xtime;
+
+ xtime = GetLatestXTime();
+ if (xtime == 0)
+ PG_RETURN_NULL();
+
+ PG_RETURN_TIMESTAMPTZ(xtime);
+}
+
+/*
+ * Returns a bool with the current recovery mode, a global state.
+ */
+Datum
+pg_is_in_recovery(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_BOOL(RecoveryInProgress());
+}
+
+/*
+ * Compute the difference in bytes between two WAL locations.
+ */
+Datum
+pg_wal_lsn_diff(PG_FUNCTION_ARGS)
+{
+ Datum result;
+
+ result = DirectFunctionCall2(pg_lsn_mi,
+ PG_GETARG_DATUM(0),
+ PG_GETARG_DATUM(1));
+
+ PG_RETURN_NUMERIC(result);
+}
+
+/*
+ * Promotes a standby server.
+ *
+ * A result of "true" means that promotion has been completed if "wait" is
+ * "true", or initiated if "wait" is false.
+ */
+Datum
+pg_promote(PG_FUNCTION_ARGS)
+{
+ bool wait = PG_GETARG_BOOL(0);
+ int wait_seconds = PG_GETARG_INT32(1);
+ FILE *promote_file;
+ int i;
+
+ if (!RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is not in progress"),
+ errhint("Recovery control functions can only be executed during recovery.")));
+
+ if (wait_seconds <= 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+ errmsg("\"wait_seconds\" must not be negative or zero")));
+
+ /* create the promote signal file */
+ promote_file = AllocateFile(PROMOTE_SIGNAL_FILE, "w");
+ if (!promote_file)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create file \"%s\": %m",
+ PROMOTE_SIGNAL_FILE)));
+
+ if (FreeFile(promote_file))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write file \"%s\": %m",
+ PROMOTE_SIGNAL_FILE)));
+
+ /* signal the postmaster */
+ if (kill(PostmasterPid, SIGUSR1) != 0)
+ {
+ ereport(WARNING,
+ (errmsg("failed to send signal to postmaster: %m")));
+ (void) unlink(PROMOTE_SIGNAL_FILE);
+ PG_RETURN_BOOL(false);
+ }
+
+ /* return immediately if waiting was not requested */
+ if (!wait)
+ PG_RETURN_BOOL(true);
+
+ /* wait for the amount of time wanted until promotion */
+#define WAITS_PER_SECOND 10
+ for (i = 0; i < WAITS_PER_SECOND * wait_seconds; i++)
+ {
+ int rc;
+
+ ResetLatch(MyLatch);
+
+ if (!RecoveryInProgress())
+ PG_RETURN_BOOL(true);
+
+ CHECK_FOR_INTERRUPTS();
+
+ rc = WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+ 1000L / WAITS_PER_SECOND,
+ WAIT_EVENT_PROMOTE);
+
+ /*
+ * Emergency bailout if postmaster has died. This is to avoid the
+ * necessity for manual cleanup of all postmaster children.
+ */
+ if (rc & WL_POSTMASTER_DEATH)
+ PG_RETURN_BOOL(false);
+ }
+
+ ereport(WARNING,
+ (errmsg_plural("server did not promote within %d second",
+ "server did not promote within %d seconds",
+ wait_seconds,
+ wait_seconds)));
+ PG_RETURN_BOOL(false);
+}
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
new file mode 100644
index 0000000..35cc055
--- /dev/null
+++ b/src/backend/access/transam/xloginsert.c
@@ -0,0 +1,1318 @@
+/*-------------------------------------------------------------------------
+ *
+ * xloginsert.c
+ * Functions for constructing WAL records
+ *
+ * Constructing a WAL record begins with a call to XLogBeginInsert,
+ * followed by a number of XLogRegister* calls. The registered data is
+ * collected in private working memory, and finally assembled into a chain
+ * of XLogRecData structs by a call to XLogRecordAssemble(). See
+ * access/transam/README for details.
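+ *
+ * As an illustration, the typical call sequence looks like this (RM_FOO_ID,
+ * XLOG_FOO_OP and xlrec stand in for a real resource manager's names; this
+ * is a sketch, not code from any particular rmgr):
+ *
+ *		XLogBeginInsert();
+ *		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+ *		XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+ *		recptr = XLogInsert(RM_FOO_ID, XLOG_FOO_OP);
+ *		PageSetLSN(BufferGetPage(buffer), recptr);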
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/xloginsert.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#ifdef USE_LZ4
+#include <lz4.h>
+#endif
+
+#ifdef USE_ZSTD
+#include <zstd.h>
+#endif
+
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "access/xlog_internal.h"
+#include "access/xloginsert.h"
+#include "catalog/pg_control.h"
+#include "common/pg_lzcompress.h"
+#include "executor/instrument.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "replication/origin.h"
+#include "storage/bufmgr.h"
+#include "storage/proc.h"
+#include "utils/memutils.h"
+
+/*
+ * Guess the maximum buffer size required to store a compressed version of
+ * backup block image.
+ */
+#ifdef USE_LZ4
+#define LZ4_MAX_BLCKSZ LZ4_COMPRESSBOUND(BLCKSZ)
+#else
+#define LZ4_MAX_BLCKSZ 0
+#endif
+
+#ifdef USE_ZSTD
+#define ZSTD_MAX_BLCKSZ ZSTD_COMPRESSBOUND(BLCKSZ)
+#else
+#define ZSTD_MAX_BLCKSZ 0
+#endif
+
+#define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(BLCKSZ)
+
+/* Buffer size required to store a compressed version of backup block image */
+#define COMPRESS_BUFSIZE Max(Max(PGLZ_MAX_BLCKSZ, LZ4_MAX_BLCKSZ), ZSTD_MAX_BLCKSZ)
+
+/*
+ * For each block reference registered with XLogRegisterBuffer, we fill in
+ * a registered_buffer struct.
+ */
+typedef struct
+{
+ bool in_use; /* is this slot in use? */
+ uint8 flags; /* REGBUF_* flags */
+ RelFileNode rnode; /* identifies the relation and block */
+ ForkNumber forkno;
+ BlockNumber block;
+ Page page; /* page content */
+ uint32 rdata_len; /* total length of data in rdata chain */
+ XLogRecData *rdata_head; /* head of the chain of data registered with
+ * this block */
+ XLogRecData *rdata_tail; /* last entry in the chain, or &rdata_head if
+ * empty */
+
+ XLogRecData bkp_rdatas[2]; /* temporary rdatas used to hold references to
+ * backup block data in XLogRecordAssemble() */
+
+ /* buffer to store a compressed version of backup block image */
+ char compressed_page[COMPRESS_BUFSIZE];
+} registered_buffer;
+
+static registered_buffer *registered_buffers;
+static int max_registered_buffers; /* allocated size */
+static int max_registered_block_id = 0; /* highest block_id + 1 currently
+ * registered */
+
+/*
+ * A chain of XLogRecDatas to hold the "main data" of a WAL record, registered
+ * with XLogRegisterData(...).
+ */
+static XLogRecData *mainrdata_head;
+static XLogRecData *mainrdata_last = (XLogRecData *) &mainrdata_head;
+static uint32 mainrdata_len; /* total # of bytes in chain */
+
+/* flags for the in-progress insertion */
+static uint8 curinsert_flags = 0;
+
+/*
+ * These are used to hold the record header while constructing a record.
+ * 'hdr_scratch' is not a plain variable, but is palloc'd at initialization,
+ * because we want it to be MAXALIGNed and padding bytes zeroed.
+ *
+ * For simplicity, it's allocated large enough to hold the headers for any
+ * WAL record.
+ */
+static XLogRecData hdr_rdt;
+static char *hdr_scratch = NULL;
+
+#define SizeOfXlogOrigin (sizeof(RepOriginId) + sizeof(char))
+#define SizeOfXLogTransactionId (sizeof(TransactionId) + sizeof(char))
+
+#define HEADER_SCRATCH_SIZE \
+ (SizeOfXLogRecord + \
+ MaxSizeOfXLogRecordBlockHeader * (XLR_MAX_BLOCK_ID + 1) + \
+ SizeOfXLogRecordDataHeaderLong + SizeOfXlogOrigin + \
+ SizeOfXLogTransactionId)
+
+/*
+ * An array of XLogRecData structs, to hold registered data.
+ */
+static XLogRecData *rdatas;
+static int num_rdatas; /* entries currently used */
+static int max_rdatas; /* allocated size */
+
+static bool begininsert_called = false;
+
+/* Memory context to hold the registered buffer and data references. */
+static MemoryContext xloginsert_cxt;
+
+static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info,
+ XLogRecPtr RedoRecPtr, bool doPageWrites,
+ XLogRecPtr *fpw_lsn, int *num_fpi,
+ bool *topxid_included);
+static bool XLogCompressBackupBlock(char *page, uint16 hole_offset,
+ uint16 hole_length, char *dest, uint16 *dlen);
+
+/*
+ * Begin constructing a WAL record. This must be called before the
+ * XLogRegister* functions and XLogInsert().
+ */
+void
+XLogBeginInsert(void)
+{
+ Assert(max_registered_block_id == 0);
+ Assert(mainrdata_last == (XLogRecData *) &mainrdata_head);
+ Assert(mainrdata_len == 0);
+
+ /* cross-check on whether we should be here or not */
+ if (!XLogInsertAllowed())
+ elog(ERROR, "cannot make new WAL entries during recovery");
+
+ if (begininsert_called)
+ elog(ERROR, "XLogBeginInsert was already called");
+
+ begininsert_called = true;
+}
+
+/*
+ * Ensure that there are enough buffer and data slots in the working area,
+ * for subsequent XLogRegisterBuffer, XLogRegisterData and XLogRegisterBufData
+ * calls.
+ *
+ * There is always space for a small number of buffers and data chunks, enough
+ * for most record types. This function is for the exceptional cases that need
+ * more.
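+ *
+ * For example, log_newpage_range() below reserves the maximum number of
+ * block references up front, before registering batches of pages:
+ *
+ *		XLogEnsureRecordSpace(XLR_MAX_BLOCK_ID - 1, 0);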
+ */
+void
+XLogEnsureRecordSpace(int max_block_id, int ndatas)
+{
+ int nbuffers;
+
+ /*
+ * This must be called before entering a critical section, because
+ * allocating memory inside a critical section can fail. repalloc() will
+ * check the same, but better to check it here too so that we fail
+ * consistently even if the arrays happen to be large enough already.
+ */
+ Assert(CritSectionCount == 0);
+
+ /* the minimum values can't be decreased */
+ if (max_block_id < XLR_NORMAL_MAX_BLOCK_ID)
+ max_block_id = XLR_NORMAL_MAX_BLOCK_ID;
+ if (ndatas < XLR_NORMAL_RDATAS)
+ ndatas = XLR_NORMAL_RDATAS;
+
+ if (max_block_id > XLR_MAX_BLOCK_ID)
+ elog(ERROR, "maximum number of WAL record block references exceeded");
+ nbuffers = max_block_id + 1;
+
+ if (nbuffers > max_registered_buffers)
+ {
+ registered_buffers = (registered_buffer *)
+ repalloc(registered_buffers, sizeof(registered_buffer) * nbuffers);
+
+ /*
+ * At least the padding bytes in the structs must be zeroed, because
+ * they are included in WAL data, but initialize it all for tidiness.
+ */
+ MemSet(&registered_buffers[max_registered_buffers], 0,
+ (nbuffers - max_registered_buffers) * sizeof(registered_buffer));
+ max_registered_buffers = nbuffers;
+ }
+
+ if (ndatas > max_rdatas)
+ {
+ rdatas = (XLogRecData *) repalloc(rdatas, sizeof(XLogRecData) * ndatas);
+ max_rdatas = ndatas;
+ }
+}
+
+/*
+ * Reset WAL record construction buffers.
+ */
+void
+XLogResetInsertion(void)
+{
+ int i;
+
+ for (i = 0; i < max_registered_block_id; i++)
+ registered_buffers[i].in_use = false;
+
+ num_rdatas = 0;
+ max_registered_block_id = 0;
+ mainrdata_len = 0;
+ mainrdata_last = (XLogRecData *) &mainrdata_head;
+ curinsert_flags = 0;
+ begininsert_called = false;
+}
+
+/*
+ * Register a reference to a buffer with the WAL record being constructed.
+ * This must be called for every page that the WAL-logged operation modifies.
+ */
+void
+XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
+{
+ registered_buffer *regbuf;
+
+ /* NO_IMAGE doesn't make sense with FORCE_IMAGE */
+ Assert(!((flags & REGBUF_FORCE_IMAGE) && (flags & (REGBUF_NO_IMAGE))));
+ Assert(begininsert_called);
+
+ if (block_id >= max_registered_block_id)
+ {
+ if (block_id >= max_registered_buffers)
+ elog(ERROR, "too many registered buffers");
+ max_registered_block_id = block_id + 1;
+ }
+
+ regbuf = &registered_buffers[block_id];
+
+ BufferGetTag(buffer, &regbuf->rnode, &regbuf->forkno, &regbuf->block);
+ regbuf->page = BufferGetPage(buffer);
+ regbuf->flags = flags;
+ regbuf->rdata_tail = (XLogRecData *) &regbuf->rdata_head;
+ regbuf->rdata_len = 0;
+
+ /*
+ * Check that this page hasn't already been registered with some other
+ * block_id.
+ */
+#ifdef USE_ASSERT_CHECKING
+ {
+ int i;
+
+ for (i = 0; i < max_registered_block_id; i++)
+ {
+ registered_buffer *regbuf_old = &registered_buffers[i];
+
+ if (i == block_id || !regbuf_old->in_use)
+ continue;
+
+ Assert(!RelFileNodeEquals(regbuf_old->rnode, regbuf->rnode) ||
+ regbuf_old->forkno != regbuf->forkno ||
+ regbuf_old->block != regbuf->block);
+ }
+ }
+#endif
+
+ regbuf->in_use = true;
+}
+
+/*
+ * Like XLogRegisterBuffer, but for registering a block that's not in the
+ * shared buffer pool (i.e. when you don't have a Buffer for it).
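+ *
+ * See log_newpage() and XLogSaveBufferForHint() below for typical callers.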
+ */
+void
+XLogRegisterBlock(uint8 block_id, RelFileNode *rnode, ForkNumber forknum,
+ BlockNumber blknum, Page page, uint8 flags)
+{
+ registered_buffer *regbuf;
+
+ Assert(begininsert_called);
+
+ if (block_id >= max_registered_block_id)
+ max_registered_block_id = block_id + 1;
+
+ if (block_id >= max_registered_buffers)
+ elog(ERROR, "too many registered buffers");
+
+ regbuf = &registered_buffers[block_id];
+
+ regbuf->rnode = *rnode;
+ regbuf->forkno = forknum;
+ regbuf->block = blknum;
+ regbuf->page = page;
+ regbuf->flags = flags;
+ regbuf->rdata_tail = (XLogRecData *) &regbuf->rdata_head;
+ regbuf->rdata_len = 0;
+
+ /*
+ * Check that this page hasn't already been registered with some other
+ * block_id.
+ */
+#ifdef USE_ASSERT_CHECKING
+ {
+ int i;
+
+ for (i = 0; i < max_registered_block_id; i++)
+ {
+ registered_buffer *regbuf_old = &registered_buffers[i];
+
+ if (i == block_id || !regbuf_old->in_use)
+ continue;
+
+ Assert(!RelFileNodeEquals(regbuf_old->rnode, regbuf->rnode) ||
+ regbuf_old->forkno != regbuf->forkno ||
+ regbuf_old->block != regbuf->block);
+ }
+ }
+#endif
+
+ regbuf->in_use = true;
+}
+
+/*
+ * Add data to the WAL record that's being constructed.
+ *
+ * The data is appended to the "main chunk", available at replay with
+ * XLogRecGetData().
+ */
+void
+XLogRegisterData(char *data, int len)
+{
+ XLogRecData *rdata;
+
+ Assert(begininsert_called);
+
+ if (num_rdatas >= max_rdatas)
+ elog(ERROR, "too much WAL data");
+ rdata = &rdatas[num_rdatas++];
+
+ rdata->data = data;
+ rdata->len = len;
+
+ /*
+ * we use the mainrdata_last pointer to track the end of the chain, so no
+ * need to clear 'next' here.
+ */
+
+ mainrdata_last->next = rdata;
+ mainrdata_last = rdata;
+
+ mainrdata_len += len;
+}
+
+/*
+ * Add buffer-specific data to the WAL record that's being constructed.
+ *
+ * Block_id must reference a block previously registered with
+ * XLogRegisterBuffer(). If this is called more than once for the same
+ * block_id, the data is appended.
+ *
+ * The maximum amount of data that can be registered per block is 65535
+ * bytes. That should be plenty; if you need more than BLCKSZ bytes to
+ * reconstruct the changes to the page, you might as well just log a full
+ * copy of it. (the "main data" that's not associated with a block is not
+ * limited)
+ */
+void
+XLogRegisterBufData(uint8 block_id, char *data, int len)
+{
+ registered_buffer *regbuf;
+ XLogRecData *rdata;
+
+ Assert(begininsert_called);
+
+ /* find the registered buffer struct */
+ regbuf = &registered_buffers[block_id];
+ if (!regbuf->in_use)
+ elog(ERROR, "no block with id %d registered with WAL insertion",
+ block_id);
+
+ if (num_rdatas >= max_rdatas)
+ elog(ERROR, "too much WAL data");
+ rdata = &rdatas[num_rdatas++];
+
+ rdata->data = data;
+ rdata->len = len;
+
+ regbuf->rdata_tail->next = rdata;
+ regbuf->rdata_tail = rdata;
+ regbuf->rdata_len += len;
+}
+
+/*
+ * Set insert status flags for the upcoming WAL record.
+ *
+ * The flags that can be used here are:
+ * - XLOG_INCLUDE_ORIGIN, to determine if the replication origin should be
+ * included in the record.
+ * - XLOG_MARK_UNIMPORTANT, to signal that the record is not important for
+ * durability, which makes it possible to avoid triggering WAL archiving
+ * and other background activity.
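+ *
+ * For example, a caller that wants the replication origin included would do
+ * the following before XLogInsert() (a sketch; commit records in xact.c use
+ * this flag):
+ *
+ *		XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);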
+ */
+void
+XLogSetRecordFlags(uint8 flags)
+{
+ Assert(begininsert_called);
+ curinsert_flags |= flags;
+}
+
+/*
+ * Insert an XLOG record having the specified RMID and info bytes, with the
+ * body of the record being the data and buffer references registered earlier
+ * with XLogRegister* calls.
+ *
+ * Returns XLOG pointer to end of record (beginning of next record).
+ * This can be used as LSN for data pages affected by the logged action.
+ * (LSN is the XLOG point up to which the XLOG must be flushed to disk
+ * before the data page can be written out. This implements the basic
+ * WAL rule "write the log before the data".)
+ */
+XLogRecPtr
+XLogInsert(RmgrId rmid, uint8 info)
+{
+ XLogRecPtr EndPos;
+
+ /* XLogBeginInsert() must have been called. */
+ if (!begininsert_called)
+ elog(ERROR, "XLogBeginInsert was not called");
+
+ /*
+ * The caller can set rmgr bits, XLR_SPECIAL_REL_UPDATE and
+ * XLR_CHECK_CONSISTENCY; the rest are reserved for use by me.
+ */
+ if ((info & ~(XLR_RMGR_INFO_MASK |
+ XLR_SPECIAL_REL_UPDATE |
+ XLR_CHECK_CONSISTENCY)) != 0)
+ elog(PANIC, "invalid xlog info mask %02X", info);
+
+ TRACE_POSTGRESQL_WAL_INSERT(rmid, info);
+
+ /*
+ * In bootstrap mode, we don't actually log anything but XLOG resources;
+ * return a phony record pointer.
+ */
+ if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
+ {
+ XLogResetInsertion();
+ EndPos = SizeOfXLogLongPHD; /* start of 1st chkpt record */
+ return EndPos;
+ }
+
+ do
+ {
+ XLogRecPtr RedoRecPtr;
+ bool doPageWrites;
+ bool topxid_included = false;
+ XLogRecPtr fpw_lsn;
+ XLogRecData *rdt;
+ int num_fpi = 0;
+
+ /*
+ * Get values needed to decide whether to do full-page writes. Since
+ * we don't yet have an insertion lock, these could change under us,
+ * but XLogInsertRecord will recheck them once it has a lock.
+ */
+ GetFullPageWriteInfo(&RedoRecPtr, &doPageWrites);
+
+ rdt = XLogRecordAssemble(rmid, info, RedoRecPtr, doPageWrites,
+ &fpw_lsn, &num_fpi, &topxid_included);
+
+ EndPos = XLogInsertRecord(rdt, fpw_lsn, curinsert_flags, num_fpi,
+ topxid_included);
+ } while (EndPos == InvalidXLogRecPtr);
+
+ XLogResetInsertion();
+
+ return EndPos;
+}
+
+/*
+ * Assemble a WAL record from the registered data and buffers into an
+ * XLogRecData chain, ready for insertion with XLogInsertRecord().
+ *
+ * The record header fields are filled in, except for the xl_prev field. The
+ * calculated CRC does not include the record header yet.
+ *
+ * If there are any registered buffers, and a full-page image was not taken
+ * of all of them, *fpw_lsn is set to the lowest LSN among such pages. This
+ * signals that the assembled record is only good for insertion on the
+ * assumption that the RedoRecPtr and doPageWrites values were up-to-date.
+ *
+ * *topxid_included is set if the topmost transaction ID is logged with the
+ * current subtransaction.
+ */
+static XLogRecData *
+XLogRecordAssemble(RmgrId rmid, uint8 info,
+ XLogRecPtr RedoRecPtr, bool doPageWrites,
+ XLogRecPtr *fpw_lsn, int *num_fpi, bool *topxid_included)
+{
+ XLogRecData *rdt;
+ uint32 total_len = 0;
+ int block_id;
+ pg_crc32c rdata_crc;
+ registered_buffer *prev_regbuf = NULL;
+ XLogRecData *rdt_datas_last;
+ XLogRecord *rechdr;
+ char *scratch = hdr_scratch;
+
+ /*
+ * Note: this function can be called multiple times for the same record.
+ * All the modifications we do to the rdata chains below must handle that.
+ */
+
+ /* The record begins with the fixed-size header */
+ rechdr = (XLogRecord *) scratch;
+ scratch += SizeOfXLogRecord;
+
+ hdr_rdt.next = NULL;
+ rdt_datas_last = &hdr_rdt;
+ hdr_rdt.data = hdr_scratch;
+
+ /*
+	 * Enforce consistency checks for this record if the user is looking for
+	 * them. Do this at the beginning of this routine so that callers of
+	 * XLogInsert() can also pass XLR_CHECK_CONSISTENCY directly for a
+	 * record.
+ */
+ if (wal_consistency_checking[rmid])
+ info |= XLR_CHECK_CONSISTENCY;
+
+ /*
+ * Make an rdata chain containing all the data portions of all block
+ * references. This includes the data for full-page images. Also append
+ * the headers for the block references in the scratch buffer.
+ */
+ *fpw_lsn = InvalidXLogRecPtr;
+ for (block_id = 0; block_id < max_registered_block_id; block_id++)
+ {
+ registered_buffer *regbuf = &registered_buffers[block_id];
+ bool needs_backup;
+ bool needs_data;
+ XLogRecordBlockHeader bkpb;
+ XLogRecordBlockImageHeader bimg;
+ XLogRecordBlockCompressHeader cbimg = {0};
+ bool samerel;
+ bool is_compressed = false;
+ bool include_image;
+
+ if (!regbuf->in_use)
+ continue;
+
+ /* Determine if this block needs to be backed up */
+ if (regbuf->flags & REGBUF_FORCE_IMAGE)
+ needs_backup = true;
+ else if (regbuf->flags & REGBUF_NO_IMAGE)
+ needs_backup = false;
+ else if (!doPageWrites)
+ needs_backup = false;
+ else
+ {
+ /*
+ * We assume page LSN is first data on *every* page that can be
+ * passed to XLogInsert, whether it has the standard page layout
+ * or not.
+ */
+ XLogRecPtr page_lsn = PageGetLSN(regbuf->page);
+
+ needs_backup = (page_lsn <= RedoRecPtr);
+ if (!needs_backup)
+ {
+ if (*fpw_lsn == InvalidXLogRecPtr || page_lsn < *fpw_lsn)
+ *fpw_lsn = page_lsn;
+ }
+ }
+
+		/* Determine if the buffer data needs to be included */
+ if (regbuf->rdata_len == 0)
+ needs_data = false;
+ else if ((regbuf->flags & REGBUF_KEEP_DATA) != 0)
+ needs_data = true;
+ else
+ needs_data = !needs_backup;
+
+ bkpb.id = block_id;
+ bkpb.fork_flags = regbuf->forkno;
+ bkpb.data_length = 0;
+
+ if ((regbuf->flags & REGBUF_WILL_INIT) == REGBUF_WILL_INIT)
+ bkpb.fork_flags |= BKPBLOCK_WILL_INIT;
+
+ /*
+		 * If needs_backup is true or WAL checking is enabled for the current
+		 * resource manager, log a full-page write for the current block.
+ */
+ include_image = needs_backup || (info & XLR_CHECK_CONSISTENCY) != 0;
+
+ if (include_image)
+ {
+ Page page = regbuf->page;
+ uint16 compressed_len = 0;
+
+ /*
+ * The page needs to be backed up, so calculate its hole length
+ * and offset.
+ */
+ if (regbuf->flags & REGBUF_STANDARD)
+ {
+ /* Assume we can omit data between pd_lower and pd_upper */
+ uint16 lower = ((PageHeader) page)->pd_lower;
+ uint16 upper = ((PageHeader) page)->pd_upper;
+
+ if (lower >= SizeOfPageHeaderData &&
+ upper > lower &&
+ upper <= BLCKSZ)
+ {
+ bimg.hole_offset = lower;
+ cbimg.hole_length = upper - lower;
+ }
+ else
+ {
+ /* No "hole" to remove */
+ bimg.hole_offset = 0;
+ cbimg.hole_length = 0;
+ }
+ }
+ else
+ {
+ /* Not a standard page header, don't try to eliminate "hole" */
+ bimg.hole_offset = 0;
+ cbimg.hole_length = 0;
+ }
+
+ /*
+ * Try to compress a block image if wal_compression is enabled
+ */
+ if (wal_compression != WAL_COMPRESSION_NONE)
+ {
+ is_compressed =
+ XLogCompressBackupBlock(page, bimg.hole_offset,
+ cbimg.hole_length,
+ regbuf->compressed_page,
+ &compressed_len);
+ }
+
+ /*
+ * Fill in the remaining fields in the XLogRecordBlockHeader
+ * struct
+ */
+ bkpb.fork_flags |= BKPBLOCK_HAS_IMAGE;
+
+ /* Report a full page image constructed for the WAL record */
+ *num_fpi += 1;
+
+ /*
+ * Construct XLogRecData entries for the page content.
+ */
+ rdt_datas_last->next = &regbuf->bkp_rdatas[0];
+ rdt_datas_last = rdt_datas_last->next;
+
+ bimg.bimg_info = (cbimg.hole_length == 0) ? 0 : BKPIMAGE_HAS_HOLE;
+
+ /*
+ * If WAL consistency checking is enabled for the resource manager
+ * of this WAL record, a full-page image is included in the record
+ * for the block modified. During redo, the full-page is replayed
+ * only if BKPIMAGE_APPLY is set.
+ */
+ if (needs_backup)
+ bimg.bimg_info |= BKPIMAGE_APPLY;
+
+ if (is_compressed)
+ {
+ /* The current compression is stored in the WAL record */
+ bimg.length = compressed_len;
+
+ /* Set the compression method used for this block */
+ switch ((WalCompression) wal_compression)
+ {
+ case WAL_COMPRESSION_PGLZ:
+ bimg.bimg_info |= BKPIMAGE_COMPRESS_PGLZ;
+ break;
+
+ case WAL_COMPRESSION_LZ4:
+#ifdef USE_LZ4
+ bimg.bimg_info |= BKPIMAGE_COMPRESS_LZ4;
+#else
+ elog(ERROR, "LZ4 is not supported by this build");
+#endif
+ break;
+
+ case WAL_COMPRESSION_ZSTD:
+#ifdef USE_ZSTD
+ bimg.bimg_info |= BKPIMAGE_COMPRESS_ZSTD;
+#else
+ elog(ERROR, "zstd is not supported by this build");
+#endif
+ break;
+
+ case WAL_COMPRESSION_NONE:
+ Assert(false); /* cannot happen */
+ break;
+ /* no default case, so that compiler will warn */
+ }
+
+ rdt_datas_last->data = regbuf->compressed_page;
+ rdt_datas_last->len = compressed_len;
+ }
+ else
+ {
+ bimg.length = BLCKSZ - cbimg.hole_length;
+
+ if (cbimg.hole_length == 0)
+ {
+ rdt_datas_last->data = page;
+ rdt_datas_last->len = BLCKSZ;
+ }
+ else
+ {
+ /* must skip the hole */
+ rdt_datas_last->data = page;
+ rdt_datas_last->len = bimg.hole_offset;
+
+ rdt_datas_last->next = &regbuf->bkp_rdatas[1];
+ rdt_datas_last = rdt_datas_last->next;
+
+ rdt_datas_last->data =
+ page + (bimg.hole_offset + cbimg.hole_length);
+ rdt_datas_last->len =
+ BLCKSZ - (bimg.hole_offset + cbimg.hole_length);
+ }
+ }
+
+ total_len += bimg.length;
+ }
+
+ if (needs_data)
+ {
+ /*
+ * Link the caller-supplied rdata chain for this buffer to the
+ * overall list.
+ */
+ bkpb.fork_flags |= BKPBLOCK_HAS_DATA;
+ bkpb.data_length = regbuf->rdata_len;
+ total_len += regbuf->rdata_len;
+
+ rdt_datas_last->next = regbuf->rdata_head;
+ rdt_datas_last = regbuf->rdata_tail;
+ }
+
+ if (prev_regbuf && RelFileNodeEquals(regbuf->rnode, prev_regbuf->rnode))
+ {
+ samerel = true;
+ bkpb.fork_flags |= BKPBLOCK_SAME_REL;
+ }
+ else
+ samerel = false;
+ prev_regbuf = regbuf;
+
+ /* Ok, copy the header to the scratch buffer */
+ memcpy(scratch, &bkpb, SizeOfXLogRecordBlockHeader);
+ scratch += SizeOfXLogRecordBlockHeader;
+ if (include_image)
+ {
+ memcpy(scratch, &bimg, SizeOfXLogRecordBlockImageHeader);
+ scratch += SizeOfXLogRecordBlockImageHeader;
+ if (cbimg.hole_length != 0 && is_compressed)
+ {
+ memcpy(scratch, &cbimg,
+ SizeOfXLogRecordBlockCompressHeader);
+ scratch += SizeOfXLogRecordBlockCompressHeader;
+ }
+ }
+ if (!samerel)
+ {
+ memcpy(scratch, &regbuf->rnode, sizeof(RelFileNode));
+ scratch += sizeof(RelFileNode);
+ }
+ memcpy(scratch, &regbuf->block, sizeof(BlockNumber));
+ scratch += sizeof(BlockNumber);
+ }
+
+ /* followed by the record's origin, if any */
+ if ((curinsert_flags & XLOG_INCLUDE_ORIGIN) &&
+ replorigin_session_origin != InvalidRepOriginId)
+ {
+ *(scratch++) = (char) XLR_BLOCK_ID_ORIGIN;
+ memcpy(scratch, &replorigin_session_origin, sizeof(replorigin_session_origin));
+ scratch += sizeof(replorigin_session_origin);
+ }
+
+ /* followed by toplevel XID, if not already included in previous record */
+ if (IsSubxactTopXidLogPending())
+ {
+ TransactionId xid = GetTopTransactionIdIfAny();
+
+ /* Set the flag that the top xid is included in the WAL */
+ *topxid_included = true;
+
+ *(scratch++) = (char) XLR_BLOCK_ID_TOPLEVEL_XID;
+ memcpy(scratch, &xid, sizeof(TransactionId));
+ scratch += sizeof(TransactionId);
+ }
+
+ /* followed by main data, if any */
+ if (mainrdata_len > 0)
+ {
+ if (mainrdata_len > 255)
+ {
+ *(scratch++) = (char) XLR_BLOCK_ID_DATA_LONG;
+ memcpy(scratch, &mainrdata_len, sizeof(uint32));
+ scratch += sizeof(uint32);
+ }
+ else
+ {
+ *(scratch++) = (char) XLR_BLOCK_ID_DATA_SHORT;
+ *(scratch++) = (uint8) mainrdata_len;
+ }
+ rdt_datas_last->next = mainrdata_head;
+ rdt_datas_last = mainrdata_last;
+ total_len += mainrdata_len;
+ }
+ rdt_datas_last->next = NULL;
+
+ hdr_rdt.len = (scratch - hdr_scratch);
+ total_len += hdr_rdt.len;
+
+ /*
+ * Calculate CRC of the data
+ *
+ * Note that the record header isn't added into the CRC initially since we
+ * don't know the prev-link yet. Thus, the CRC will represent the CRC of
+ * the whole record in the order: rdata, then backup blocks, then record
+ * header.
+ */
+ INIT_CRC32C(rdata_crc);
+ COMP_CRC32C(rdata_crc, hdr_scratch + SizeOfXLogRecord, hdr_rdt.len - SizeOfXLogRecord);
+ for (rdt = hdr_rdt.next; rdt != NULL; rdt = rdt->next)
+ COMP_CRC32C(rdata_crc, rdt->data, rdt->len);
+
+ /*
+ * Fill in the fields in the record header. Prev-link is filled in later,
+ * once we know where in the WAL the record will be inserted. The CRC does
+ * not include the record header yet.
+ */
+ rechdr->xl_xid = GetCurrentTransactionIdIfAny();
+ rechdr->xl_tot_len = total_len;
+ rechdr->xl_info = info;
+ rechdr->xl_rmid = rmid;
+ rechdr->xl_prev = InvalidXLogRecPtr;
+ rechdr->xl_crc = rdata_crc;
+
+ return &hdr_rdt;
+}
+
+/*
+ * Create a compressed version of a backup block image.
+ *
+ * Returns false if compression fails (i.e., compressed result is actually
+ * bigger than original). Otherwise, returns true and sets 'dlen' to
+ * the length of compressed block image.
+ */
+static bool
+XLogCompressBackupBlock(char *page, uint16 hole_offset, uint16 hole_length,
+ char *dest, uint16 *dlen)
+{
+ int32 orig_len = BLCKSZ - hole_length;
+ int32 len = -1;
+ int32 extra_bytes = 0;
+ char *source;
+ PGAlignedBlock tmp;
+
+ if (hole_length != 0)
+ {
+ /* must skip the hole */
+ source = tmp.data;
+ memcpy(source, page, hole_offset);
+ memcpy(source + hole_offset,
+ page + (hole_offset + hole_length),
+ BLCKSZ - (hole_length + hole_offset));
+
+ /*
+ * Extra data needs to be stored in WAL record for the compressed
+ * version of block image if the hole exists.
+ */
+ extra_bytes = SizeOfXLogRecordBlockCompressHeader;
+ }
+ else
+ source = page;
+
+ switch ((WalCompression) wal_compression)
+ {
+ case WAL_COMPRESSION_PGLZ:
+ len = pglz_compress(source, orig_len, dest, PGLZ_strategy_default);
+ break;
+
+ case WAL_COMPRESSION_LZ4:
+#ifdef USE_LZ4
+ len = LZ4_compress_default(source, dest, orig_len,
+ COMPRESS_BUFSIZE);
+ if (len <= 0)
+ len = -1; /* failure */
+#else
+ elog(ERROR, "LZ4 is not supported by this build");
+#endif
+ break;
+
+ case WAL_COMPRESSION_ZSTD:
+#ifdef USE_ZSTD
+ len = ZSTD_compress(dest, COMPRESS_BUFSIZE, source, orig_len,
+ ZSTD_CLEVEL_DEFAULT);
+ if (ZSTD_isError(len))
+ len = -1; /* failure */
+#else
+ elog(ERROR, "zstd is not supported by this build");
+#endif
+ break;
+
+ case WAL_COMPRESSION_NONE:
+ Assert(false); /* cannot happen */
+ break;
+ /* no default case, so that compiler will warn */
+ }
+
+ /*
+	 * Even if compression reports success, recheck the actual size: the
+	 * compressed image is worth keeping only if the bytes saved exceed the
+	 * extra data needed to store the compressed version of the block image.
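+	 *
+	 * For instance (illustrative numbers only): with a 4096-byte hole on an
+	 * 8192-byte page, orig_len is 4096, so a compressed len of 4095 is
+	 * rejected, since 4095 plus SizeOfXLogRecordBlockCompressHeader is not
+	 * smaller than 4096.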
+ */
+ if (len >= 0 &&
+ len + extra_bytes < orig_len)
+ {
+ *dlen = (uint16) len; /* successful compression */
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Determine whether the buffer referenced has to be backed up.
+ *
+ * Since we don't yet have the insert lock, fullPageWrites and forcePageWrites
+ * could change later, so the result should be used for optimization purposes
+ * only.
+ */
+bool
+XLogCheckBufferNeedsBackup(Buffer buffer)
+{
+ XLogRecPtr RedoRecPtr;
+ bool doPageWrites;
+ Page page;
+
+ GetFullPageWriteInfo(&RedoRecPtr, &doPageWrites);
+
+ page = BufferGetPage(buffer);
+
+ if (doPageWrites && PageGetLSN(page) <= RedoRecPtr)
+ return true; /* buffer requires backup */
+
+ return false; /* buffer does not need to be backed up */
+}
+
+/*
+			Assert(!BufferIsValid(block->prefetch_buffer));
+ * this may be called for a variety of page types, not just heaps.
+ *
+ * Callable while holding just share lock on the buffer content.
+ *
+ * We can't use the plain backup block mechanism since that relies on the
+ * Buffer being exclusively locked. Because some modifications (setting LSN,
+ * hint bits) are allowed in a share-locked buffer, a plain image could lead
+ * to WAL checksum failures. So instead we copy the page and insert the
+ * copied data as normal record data.
+ *
+ * We only need to do something if the page has not yet been full-page
+ * written in this checkpoint round. The LSN of the inserted WAL record is
+ * returned if we had to write, InvalidXLogRecPtr otherwise.
+ *
+ * It is possible that multiple concurrent backends could attempt to write WAL
+ * records. In that case, multiple copies of the same block would be recorded
+ * in separate WAL records by different backends, though that is still OK from
+ * a correctness perspective.
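+ *
+ * A sketch of the expected call pattern (MarkBufferDirtyHint() in bufmgr.c
+ * is the usual entry point; managing the flag checked by the Assert below is
+ * the caller's job):
+ *
+ *		MyProc->delayChkptFlags |= DELAY_CHKPT_START;
+ *		lsn = XLogSaveBufferForHint(buffer, buffer_std);
+ *		MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;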
+ */
+XLogRecPtr
+XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
+{
+ XLogRecPtr recptr = InvalidXLogRecPtr;
+ XLogRecPtr lsn;
+ XLogRecPtr RedoRecPtr;
+
+ /*
+ * Ensure no checkpoint can change our view of RedoRecPtr.
+ */
+ Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) != 0);
+
+ /*
+ * Update RedoRecPtr so that we can make the right decision
+ */
+ RedoRecPtr = GetRedoRecPtr();
+
+ /*
+ * We assume page LSN is first data on *every* page that can be passed to
+ * XLogInsert, whether it has the standard page layout or not. Since we're
+ * only holding a share-lock on the page, we must take the buffer header
+ * lock when we look at the LSN.
+ */
+ lsn = BufferGetLSNAtomic(buffer);
+
+ if (lsn <= RedoRecPtr)
+ {
+ int flags = 0;
+ PGAlignedBlock copied_buffer;
+ char *origdata = (char *) BufferGetBlock(buffer);
+ RelFileNode rnode;
+ ForkNumber forkno;
+ BlockNumber blkno;
+
+ /*
+		 * Copy the buffer so we don't have to worry about concurrent hint bit
+		 * or LSN updates. We assume pd_lower/upper cannot be changed without
+		 * an exclusive lock, so the copied contents are not racy.
+ */
+ if (buffer_std)
+ {
+ /* Assume we can omit data between pd_lower and pd_upper */
+ Page page = BufferGetPage(buffer);
+ uint16 lower = ((PageHeader) page)->pd_lower;
+ uint16 upper = ((PageHeader) page)->pd_upper;
+
+ memcpy(copied_buffer.data, origdata, lower);
+ memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
+ }
+ else
+ memcpy(copied_buffer.data, origdata, BLCKSZ);
+
+ XLogBeginInsert();
+
+ if (buffer_std)
+ flags |= REGBUF_STANDARD;
+
+ BufferGetTag(buffer, &rnode, &forkno, &blkno);
+ XLogRegisterBlock(0, &rnode, forkno, blkno, copied_buffer.data, flags);
+
+ recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI_FOR_HINT);
+ }
+
+ return recptr;
+}
+
+/*
+ * Write a WAL record containing a full image of a page. Caller is responsible
+ * for writing the page to disk after calling this routine.
+ *
+ * Note: If you're using this function, you should be building pages in private
+ * memory and writing them directly to smgr. If you're using buffers, call
+ * log_newpage_buffer instead.
+ *
+ * If the page follows the standard page layout, with a PageHeader and unused
+ * space between pd_lower and pd_upper, set 'page_std' to true. That allows
+ * the unused space to be left out from the WAL record, making it smaller.
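+ *
+ * A sketch of the intended usage (names are illustrative; the caller builds
+ * the page in private memory and then writes it through smgr itself):
+ *
+ *		PageInit(page, BLCKSZ, 0);
+ *		... fill in the page ...
+ *		recptr = log_newpage(&rel->rd_node, MAIN_FORKNUM, blkno, page, true);
+ *		smgrwrite(...);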
+ */
+XLogRecPtr
+log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno,
+ Page page, bool page_std)
+{
+ int flags;
+ XLogRecPtr recptr;
+
+ flags = REGBUF_FORCE_IMAGE;
+ if (page_std)
+ flags |= REGBUF_STANDARD;
+
+ XLogBeginInsert();
+ XLogRegisterBlock(0, rnode, forkNum, blkno, page, flags);
+ recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI);
+
+ /*
+ * The page may be uninitialized. If so, we can't set the LSN because that
+ * would corrupt the page.
+ */
+ if (!PageIsNew(page))
+ {
+ PageSetLSN(page, recptr);
+ }
+
+ return recptr;
+}
+
+/*
+ * Like log_newpage(), but allows logging multiple pages in one operation.
+ * It is more efficient than calling log_newpage() for each page separately,
+ * because we can write multiple pages in a single WAL record.
+ */
+void
+log_newpages(RelFileNode *rnode, ForkNumber forkNum, int num_pages,
+ BlockNumber *blknos, Page *pages, bool page_std)
+{
+ int flags;
+ XLogRecPtr recptr;
+ int i;
+ int j;
+
+ flags = REGBUF_FORCE_IMAGE;
+ if (page_std)
+ flags |= REGBUF_STANDARD;
+
+ /*
+ * Iterate over all the pages. They are collected into batches of
+	 * XLR_MAX_BLOCK_ID pages, and a single WAL record is written for each
+ * batch.
+ */
+ XLogEnsureRecordSpace(XLR_MAX_BLOCK_ID - 1, 0);
+
+ i = 0;
+ while (i < num_pages)
+ {
+ int batch_start = i;
+ int nbatch;
+
+ XLogBeginInsert();
+
+ nbatch = 0;
+ while (nbatch < XLR_MAX_BLOCK_ID && i < num_pages)
+ {
+ XLogRegisterBlock(nbatch, rnode, forkNum, blknos[i], pages[i], flags);
+ i++;
+ nbatch++;
+ }
+
+ recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI);
+
+ for (j = batch_start; j < i; j++)
+ {
+ /*
+ * The page may be uninitialized. If so, we can't set the LSN
+ * because that would corrupt the page.
+ */
+ if (!PageIsNew(pages[j]))
+ {
+ PageSetLSN(pages[j], recptr);
+ }
+ }
+ }
+}
+
+/*
+ * Write a WAL record containing a full image of a page.
+ *
+ * Caller should initialize the buffer and mark it dirty before calling this
+ * function. This function will set the page LSN.
+ *
+ * If the page follows the standard page layout, with a PageHeader and unused
+ * space between pd_lower and pd_upper, set 'page_std' to true. That allows
+ * the unused space to be left out from the WAL record, making it smaller.
+ */
+XLogRecPtr
+log_newpage_buffer(Buffer buffer, bool page_std)
+{
+ Page page = BufferGetPage(buffer);
+ RelFileNode rnode;
+ ForkNumber forkNum;
+ BlockNumber blkno;
+
+ /* Shared buffers should be modified in a critical section. */
+ Assert(CritSectionCount > 0);
+
+ BufferGetTag(buffer, &rnode, &forkNum, &blkno);
+
+ return log_newpage(&rnode, forkNum, blkno, page, page_std);
+}
+
+/*
+ * WAL-log a range of blocks in a relation.
+ *
+ * An image of all pages with block numbers 'startblk' <= X < 'endblk' is
+ * written to the WAL. If the range is large, this is done in multiple WAL
+ * records.
+ *
+ * If all pages follow the standard page layout, with a PageHeader and unused
+ * space between pd_lower and pd_upper, set 'page_std' to true. That allows
+ * the unused space to be left out from the WAL records, making them smaller.
+ *
+ * NOTE: This function acquires exclusive-locks on the pages. Typically, this
+ * is used on a newly-built relation, and the caller is holding a
+ * AccessExclusiveLock on it, so no other backend can be accessing it at the
+ * same time. If that's not the case, you must ensure that this does not
+ * cause a deadlock through some other means.
+ */
+void
+log_newpage_range(Relation rel, ForkNumber forkNum,
+ BlockNumber startblk, BlockNumber endblk,
+ bool page_std)
+{
+ int flags;
+ BlockNumber blkno;
+
+ flags = REGBUF_FORCE_IMAGE;
+ if (page_std)
+ flags |= REGBUF_STANDARD;
+
+ /*
+ * Iterate over all the pages in the range. They are collected into
+	 * batches of XLR_MAX_BLOCK_ID pages, and a single WAL record is written
+ * for each batch.
+ */
+ XLogEnsureRecordSpace(XLR_MAX_BLOCK_ID - 1, 0);
+
+ blkno = startblk;
+ while (blkno < endblk)
+ {
+ Buffer bufpack[XLR_MAX_BLOCK_ID];
+ XLogRecPtr recptr;
+ int nbufs;
+ int i;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* Collect a batch of blocks. */
+ nbufs = 0;
+ while (nbufs < XLR_MAX_BLOCK_ID && blkno < endblk)
+ {
+ Buffer buf = ReadBufferExtended(rel, forkNum, blkno,
+ RBM_NORMAL, NULL);
+
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+ /*
+ * Completely empty pages are not WAL-logged. Writing a WAL record
+ * would change the LSN, and we don't want that. We want the page
+ * to stay empty.
+ */
+ if (!PageIsNew(BufferGetPage(buf)))
+ bufpack[nbufs++] = buf;
+ else
+ UnlockReleaseBuffer(buf);
+ blkno++;
+ }
+
+ /* Nothing more to do if all remaining blocks were empty. */
+ if (nbufs == 0)
+ break;
+
+ /* Write WAL record for this batch. */
+ XLogBeginInsert();
+
+ START_CRIT_SECTION();
+ for (i = 0; i < nbufs; i++)
+ {
+ XLogRegisterBuffer(i, bufpack[i], flags);
+ MarkBufferDirty(bufpack[i]);
+ }
+
+ recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI);
+
+ for (i = 0; i < nbufs; i++)
+ {
+ PageSetLSN(BufferGetPage(bufpack[i]), recptr);
+ UnlockReleaseBuffer(bufpack[i]);
+ }
+ END_CRIT_SECTION();
+ }
+}
+
+/*
+ * Allocate working buffers needed for WAL record construction.
+ */
+void
+InitXLogInsert(void)
+{
+ /* Initialize the working areas */
+ if (xloginsert_cxt == NULL)
+ {
+ xloginsert_cxt = AllocSetContextCreate(TopMemoryContext,
+ "WAL record construction",
+ ALLOCSET_DEFAULT_SIZES);
+ }
+
+ if (registered_buffers == NULL)
+ {
+ registered_buffers = (registered_buffer *)
+ MemoryContextAllocZero(xloginsert_cxt,
+ sizeof(registered_buffer) * (XLR_NORMAL_MAX_BLOCK_ID + 1));
+ max_registered_buffers = XLR_NORMAL_MAX_BLOCK_ID + 1;
+ }
+ if (rdatas == NULL)
+ {
+ rdatas = MemoryContextAlloc(xloginsert_cxt,
+ sizeof(XLogRecData) * XLR_NORMAL_RDATAS);
+ max_rdatas = XLR_NORMAL_RDATAS;
+ }
+
+ /*
+ * Allocate a buffer to hold the header information for a WAL record.
+ */
+ if (hdr_scratch == NULL)
+ hdr_scratch = MemoryContextAllocZero(xloginsert_cxt,
+ HEADER_SCRATCH_SIZE);
+}
diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c
new file mode 100644
index 0000000..b98b319
--- /dev/null
+++ b/src/backend/access/transam/xlogprefetcher.c
@@ -0,0 +1,1105 @@
+/*-------------------------------------------------------------------------
+ *
+ * xlogprefetcher.c
+ * Prefetching support for recovery.
+ *
+ * Portions Copyright (c) 2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/transam/xlogprefetcher.c
+ *
+ * This module provides a drop-in replacement for an XLogReader that tries to
+ * minimize I/O stalls by looking ahead in the WAL. If blocks that will be
+ * accessed in the near future are not already in the buffer pool, it initiates
+ * I/Os that might complete before the caller eventually needs the data. When
+ * referenced blocks are found in the buffer pool already, the buffer is
+ * recorded in the decoded record so that XLogReadBufferForRedo() can try to
+ * avoid a second buffer mapping table lookup.
+ *
+ * Currently, only the main fork is considered for prefetching, and
+ * prefetching is only effective on systems where PrefetchSharedBuffer() does
+ * something useful (mainly Linux).
+ *
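+ * A hedged usage sketch (xlogrecovery.c is the real caller; error handling
+ * omitted):
+ *
+ *		prefetcher = XLogPrefetcherAllocate(xlogreader);
+ *		...
+ *		record = XLogPrefetcherReadRecord(prefetcher, &errormsg);
+ *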
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/xlog.h"
+#include "access/xlogprefetcher.h"
+#include "access/xlogreader.h"
+#include "access/xlogutils.h"
+#include "catalog/pg_class.h"
+#include "catalog/pg_control.h"
+#include "catalog/storage_xlog.h"
+#include "commands/dbcommands_xlog.h"
+#include "utils/fmgrprotos.h"
+#include "utils/timestamp.h"
+#include "funcapi.h"
+#include "pgstat.h"
+#include "miscadmin.h"
+#include "port/atomics.h"
+#include "storage/bufmgr.h"
+#include "storage/shmem.h"
+#include "storage/smgr.h"
+#include "utils/guc.h"
+#include "utils/hsearch.h"
+
+/*
+ * Every time we process this much WAL, we'll update the values in
+ * pg_stat_recovery_prefetch.
+ */
+#define XLOGPREFETCHER_STATS_DISTANCE BLCKSZ
+
+/*
+ * To detect repeated access to the same block and skip useless extra system
+ * calls, we remember a small window of recently prefetched blocks.
+ */
+#define XLOGPREFETCHER_SEQ_WINDOW_SIZE 4
+
+/*
+ * When maintenance_io_concurrency is not saturated, we're prepared to look
+ * ahead up to N times that number of block references.
+ */
+#define XLOGPREFETCHER_DISTANCE_MULTIPLIER 4
+
+/* Define to log internal debugging messages. */
+/* #define XLOGPREFETCHER_DEBUG_LEVEL LOG */
+
+/* GUCs */
+int recovery_prefetch = RECOVERY_PREFETCH_TRY;
+
+#ifdef USE_PREFETCH
+#define RecoveryPrefetchEnabled() \
+ (recovery_prefetch != RECOVERY_PREFETCH_OFF && \
+ maintenance_io_concurrency > 0)
+#else
+#define RecoveryPrefetchEnabled() false
+#endif
+
+static int XLogPrefetchReconfigureCount = 0;
+
+/*
+ * Enum used to report whether an IO should be started.
+ */
+typedef enum
+{
+ LRQ_NEXT_NO_IO,
+ LRQ_NEXT_IO,
+ LRQ_NEXT_AGAIN
+} LsnReadQueueNextStatus;
+
+/*
+ * Type of callback that can decide which block to prefetch next. For now
+ * there is only one.
+ */
+typedef LsnReadQueueNextStatus (*LsnReadQueueNextFun) (uintptr_t lrq_private,
+ XLogRecPtr *lsn);
+
+/*
+ * A simple circular queue of LSNs, used to control the number of
+ * (potentially) inflight IOs. This stands in for a later more general IO
+ * control mechanism, which is why it has the apparently unnecessary
+ * indirection through a function pointer.
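+ *
+ * The ring holds at most size - 1 entries (one slot is kept as a gap, see
+ * lrq_alloc()), and its occupancy at any moment is inflight + completed, as
+ * maintained by lrq_prefetch() and lrq_complete_lsn() below.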
+ */
+typedef struct LsnReadQueue
+{
+ LsnReadQueueNextFun next;
+ uintptr_t lrq_private;
+ uint32 max_inflight;
+ uint32 inflight;
+ uint32 completed;
+ uint32 head;
+ uint32 tail;
+ uint32 size;
+ struct
+ {
+ bool io;
+ XLogRecPtr lsn;
+ } queue[FLEXIBLE_ARRAY_MEMBER];
+} LsnReadQueue;
+
+/*
+ * A prefetcher. This is a mechanism that wraps an XLogReader, prefetching
+ * blocks that will soon be referenced, to try to avoid IO stalls.
+ */
+struct XLogPrefetcher
+{
+ /* WAL reader and current reading state. */
+ XLogReaderState *reader;
+ DecodedXLogRecord *record;
+ int next_block_id;
+
+ /* When to publish stats. */
+ XLogRecPtr next_stats_shm_lsn;
+
+ /* Book-keeping to avoid accessing blocks that don't exist yet. */
+ HTAB *filter_table;
+ dlist_head filter_queue;
+
+ /* Book-keeping to avoid repeat prefetches. */
+ RelFileNode recent_rnode[XLOGPREFETCHER_SEQ_WINDOW_SIZE];
+ BlockNumber recent_block[XLOGPREFETCHER_SEQ_WINDOW_SIZE];
+ int recent_idx;
+
+ /* Book-keeping to disable prefetching temporarily. */
+ XLogRecPtr no_readahead_until;
+
+ /* IO depth manager. */
+ LsnReadQueue *streaming_read;
+
+ XLogRecPtr begin_ptr;
+
+ int reconfigure_count;
+};
+
+/*
+ * A temporary filter used to track block ranges that haven't been created
+ * yet, whole relations that haven't been created yet, and whole relations
+ * that (we assume) have already been dropped, or will be created by bulk WAL
+ * operators.
+ */
+typedef struct XLogPrefetcherFilter
+{
+ RelFileNode rnode;
+ XLogRecPtr filter_until_replayed;
+ BlockNumber filter_from_block;
+ dlist_node link;
+} XLogPrefetcherFilter;
+
+/*
+ * Counters exposed in shared memory for pg_stat_recovery_prefetch.
+ */
+typedef struct XLogPrefetchStats
+{
+ pg_atomic_uint64 reset_time; /* Time of last reset. */
+ pg_atomic_uint64 prefetch; /* Prefetches initiated. */
+ pg_atomic_uint64 hit; /* Blocks already in cache. */
+ pg_atomic_uint64 skip_init; /* Zero-inited blocks skipped. */
+ pg_atomic_uint64 skip_new; /* New/missing blocks filtered. */
+ pg_atomic_uint64 skip_fpw; /* FPWs skipped. */
+ pg_atomic_uint64 skip_rep; /* Repeat accesses skipped. */
+
+ /* Dynamic values */
+ int wal_distance; /* Number of WAL bytes ahead. */
+ int block_distance; /* Number of block references ahead. */
+ int io_depth; /* Number of I/Os in progress. */
+} XLogPrefetchStats;
+
+static inline void XLogPrefetcherAddFilter(XLogPrefetcher *prefetcher,
+ RelFileNode rnode,
+ BlockNumber blockno,
+ XLogRecPtr lsn);
+static inline bool XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher,
+ RelFileNode rnode,
+ BlockNumber blockno);
+static inline void XLogPrefetcherCompleteFilters(XLogPrefetcher *prefetcher,
+ XLogRecPtr replaying_lsn);
+static LsnReadQueueNextStatus XLogPrefetcherNextBlock(uintptr_t pgsr_private,
+ XLogRecPtr *lsn);
+
+static XLogPrefetchStats *SharedStats;
+
+static inline LsnReadQueue *
+lrq_alloc(uint32 max_distance,
+ uint32 max_inflight,
+ uintptr_t lrq_private,
+ LsnReadQueueNextFun next)
+{
+ LsnReadQueue *lrq;
+ uint32 size;
+
+ Assert(max_distance >= max_inflight);
+
+ size = max_distance + 1; /* full ring buffer has a gap */
+ lrq = palloc(offsetof(LsnReadQueue, queue) + sizeof(lrq->queue[0]) * size);
+ lrq->lrq_private = lrq_private;
+ lrq->max_inflight = max_inflight;
+ lrq->size = size;
+ lrq->next = next;
+ lrq->head = 0;
+ lrq->tail = 0;
+ lrq->inflight = 0;
+ lrq->completed = 0;
+
+ return lrq;
+}
+
+static inline void
+lrq_free(LsnReadQueue *lrq)
+{
+ pfree(lrq);
+}
+
+static inline uint32
+lrq_inflight(LsnReadQueue *lrq)
+{
+ return lrq->inflight;
+}
+
+static inline uint32
+lrq_completed(LsnReadQueue *lrq)
+{
+ return lrq->completed;
+}
+
+static inline void
+lrq_prefetch(LsnReadQueue *lrq)
+{
+ /* Try to start as many IOs as we can within our limits. */
+ while (lrq->inflight < lrq->max_inflight &&
+ lrq->inflight + lrq->completed < lrq->size - 1)
+ {
+ Assert(((lrq->head + 1) % lrq->size) != lrq->tail);
+ switch (lrq->next(lrq->lrq_private, &lrq->queue[lrq->head].lsn))
+ {
+ case LRQ_NEXT_AGAIN:
+ return;
+ case LRQ_NEXT_IO:
+ lrq->queue[lrq->head].io = true;
+ lrq->inflight++;
+ break;
+ case LRQ_NEXT_NO_IO:
+ lrq->queue[lrq->head].io = false;
+ lrq->completed++;
+ break;
+ }
+ lrq->head++;
+ if (lrq->head == lrq->size)
+ lrq->head = 0;
+ }
+}
+
+static inline void
+lrq_complete_lsn(LsnReadQueue *lrq, XLogRecPtr lsn)
+{
+ /*
+ * We know that LSNs before 'lsn' have been replayed, so we can now assume
+ * that any IOs that were started before then have finished.
+ */
+ while (lrq->tail != lrq->head &&
+ lrq->queue[lrq->tail].lsn < lsn)
+ {
+ if (lrq->queue[lrq->tail].io)
+ lrq->inflight--;
+ else
+ lrq->completed--;
+ lrq->tail++;
+ if (lrq->tail == lrq->size)
+ lrq->tail = 0;
+ }
+ if (RecoveryPrefetchEnabled())
+ lrq_prefetch(lrq);
+}
+
+size_t
+XLogPrefetchShmemSize(void)
+{
+ return sizeof(XLogPrefetchStats);
+}
+
+/*
+ * Reset all counters to zero.
+ */
+void
+XLogPrefetchResetStats(void)
+{
+ pg_atomic_write_u64(&SharedStats->reset_time, GetCurrentTimestamp());
+ pg_atomic_write_u64(&SharedStats->prefetch, 0);
+ pg_atomic_write_u64(&SharedStats->hit, 0);
+ pg_atomic_write_u64(&SharedStats->skip_init, 0);
+ pg_atomic_write_u64(&SharedStats->skip_new, 0);
+ pg_atomic_write_u64(&SharedStats->skip_fpw, 0);
+ pg_atomic_write_u64(&SharedStats->skip_rep, 0);
+}
+
+void
+XLogPrefetchShmemInit(void)
+{
+ bool found;
+
+ SharedStats = (XLogPrefetchStats *)
+ ShmemInitStruct("XLogPrefetchStats",
+ sizeof(XLogPrefetchStats),
+ &found);
+
+ if (!found)
+ {
+ pg_atomic_init_u64(&SharedStats->reset_time, GetCurrentTimestamp());
+ pg_atomic_init_u64(&SharedStats->prefetch, 0);
+ pg_atomic_init_u64(&SharedStats->hit, 0);
+ pg_atomic_init_u64(&SharedStats->skip_init, 0);
+ pg_atomic_init_u64(&SharedStats->skip_new, 0);
+ pg_atomic_init_u64(&SharedStats->skip_fpw, 0);
+ pg_atomic_init_u64(&SharedStats->skip_rep, 0);
+ }
+}
+
+/*
+ * Called when any GUC is changed that affects prefetching.
+ */
+void
+XLogPrefetchReconfigure(void)
+{
+ XLogPrefetchReconfigureCount++;
+}
+
+/*
+ * Increment a counter in shared memory. This is equivalent to *counter++ on a
+ * plain uint64 without any memory barrier or locking, except on platforms
+ * where readers can't read uint64 without possibly observing a torn value.
+ */
+static inline void
+XLogPrefetchIncrement(pg_atomic_uint64 *counter)
+{
+ Assert(AmStartupProcess() || !IsUnderPostmaster);
+ pg_atomic_write_u64(counter, pg_atomic_read_u64(counter) + 1);
+}
+
+/*
+ * Create a prefetcher that is ready to begin prefetching blocks referenced by
+ * WAL records.
+ */
+XLogPrefetcher *
+XLogPrefetcherAllocate(XLogReaderState *reader)
+{
+ XLogPrefetcher *prefetcher;
+ static HASHCTL hash_table_ctl = {
+ .keysize = sizeof(RelFileNode),
+ .entrysize = sizeof(XLogPrefetcherFilter)
+ };
+
+ prefetcher = palloc0(sizeof(XLogPrefetcher));
+
+ prefetcher->reader = reader;
+ prefetcher->filter_table = hash_create("XLogPrefetcherFilterTable", 1024,
+ &hash_table_ctl,
+ HASH_ELEM | HASH_BLOBS);
+ dlist_init(&prefetcher->filter_queue);
+
+ SharedStats->wal_distance = 0;
+ SharedStats->block_distance = 0;
+ SharedStats->io_depth = 0;
+
+ /* First usage will cause streaming_read to be allocated. */
+ prefetcher->reconfigure_count = XLogPrefetchReconfigureCount - 1;
+
+ return prefetcher;
+}
+
+/*
+ * Destroy a prefetcher and release all resources.
+ */
+void
+XLogPrefetcherFree(XLogPrefetcher *prefetcher)
+{
+ lrq_free(prefetcher->streaming_read);
+ hash_destroy(prefetcher->filter_table);
+ pfree(prefetcher);
+}
+
+/*
+ * Provide access to the reader.
+ */
+XLogReaderState *
+XLogPrefetcherGetReader(XLogPrefetcher *prefetcher)
+{
+ return prefetcher->reader;
+}
+
+/*
+ * Update the statistics visible in the pg_stat_recovery_prefetch view.
+ */
+void
+XLogPrefetcherComputeStats(XLogPrefetcher *prefetcher)
+{
+ uint32 io_depth;
+ uint32 completed;
+ int64 wal_distance;
+
+ /* How far ahead of replay are we now? */
+ if (prefetcher->reader->decode_queue_tail)
+ {
+ wal_distance =
+ prefetcher->reader->decode_queue_tail->lsn -
+ prefetcher->reader->decode_queue_head->lsn;
+ }
+ else
+ {
+ wal_distance = 0;
+ }
+
+ /* How many IOs are currently in flight and completed? */
+ io_depth = lrq_inflight(prefetcher->streaming_read);
+ completed = lrq_completed(prefetcher->streaming_read);
+
+ /* Update the instantaneous stats visible in pg_stat_recovery_prefetch. */
+ SharedStats->io_depth = io_depth;
+ SharedStats->block_distance = io_depth + completed;
+ SharedStats->wal_distance = wal_distance;
+
+ prefetcher->next_stats_shm_lsn =
+ prefetcher->reader->ReadRecPtr + XLOGPREFETCHER_STATS_DISTANCE;
+}
+
+/*
+ * A callback that examines the next block reference in the WAL, and possibly
+ * starts an IO so that a later read will be fast.
+ *
+ * Returns LRQ_NEXT_AGAIN if no more WAL data is available yet.
+ *
+ * Returns LRQ_NEXT_IO if the next block reference is for a main fork block
+ * that isn't in the buffer pool, and the kernel has been asked to start
+ * reading it to make a future read system call faster. An LSN is written to
+ * *lsn, and the I/O will be considered to have completed once that LSN is
+ * replayed.
+ *
+ * Returns LRQ_NEXT_NO_IO if we examined the next block reference and found
+ * that it was already in the buffer pool, or we decided for various reasons
+ * not to prefetch.
+ */
+static LsnReadQueueNextStatus
+XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
+{
+ XLogPrefetcher *prefetcher = (XLogPrefetcher *) pgsr_private;
+ XLogReaderState *reader = prefetcher->reader;
+ XLogRecPtr replaying_lsn = reader->ReadRecPtr;
+
+ /*
+ * We keep track of the record and block we're up to between calls with
+ * prefetcher->record and prefetcher->next_block_id.
+ */
+ for (;;)
+ {
+ DecodedXLogRecord *record;
+
+ /* Try to read a new future record, if we don't already have one. */
+ if (prefetcher->record == NULL)
+ {
+ bool nonblocking;
+
+ /*
+ * If there are already records or an error queued up that could
+ * be replayed, we don't want to block here. Otherwise, it's OK
+ * to block waiting for more data: presumably the caller has
+ * nothing else to do.
+ */
+ nonblocking = XLogReaderHasQueuedRecordOrError(reader);
+
+ /* Readahead is disabled until we replay past a certain point. */
+ if (nonblocking && replaying_lsn <= prefetcher->no_readahead_until)
+ return LRQ_NEXT_AGAIN;
+
+ record = XLogReadAhead(prefetcher->reader, nonblocking);
+ if (record == NULL)
+ {
+ /*
+ * We can't read any more, due to an error or lack of data in
+ * nonblocking mode. Don't try to read ahead again until
+ * we've replayed everything already decoded.
+ */
+ if (nonblocking && prefetcher->reader->decode_queue_tail)
+ prefetcher->no_readahead_until =
+ prefetcher->reader->decode_queue_tail->lsn;
+
+ return LRQ_NEXT_AGAIN;
+ }
+
+ /*
+ * If prefetching is disabled, we don't need to analyze the record
+ * or issue any prefetches. We just need to cause one record to
+ * be decoded.
+ */
+ if (!RecoveryPrefetchEnabled())
+ {
+ *lsn = InvalidXLogRecPtr;
+ return LRQ_NEXT_NO_IO;
+ }
+
+ /* We have a new record to process. */
+ prefetcher->record = record;
+ prefetcher->next_block_id = 0;
+ }
+ else
+ {
+ /* Continue to process from last call, or last loop. */
+ record = prefetcher->record;
+ }
+
+ /*
+ * Check for operations that require us to filter out block ranges, or
+ * pause readahead completely.
+ */
+ if (replaying_lsn < record->lsn)
+ {
+ uint8 rmid = record->header.xl_rmid;
+ uint8 record_type = record->header.xl_info & ~XLR_INFO_MASK;
+
+ if (rmid == RM_XLOG_ID)
+ {
+ if (record_type == XLOG_CHECKPOINT_SHUTDOWN ||
+ record_type == XLOG_END_OF_RECOVERY)
+ {
+ /*
+ * These records might change the TLI. Avoid potential
+ * bugs if we were to allow "read TLI" and "replay TLI" to
+ * differ without more analysis.
+ */
+ prefetcher->no_readahead_until = record->lsn;
+
+#ifdef XLOGPREFETCHER_DEBUG_LEVEL
+ elog(XLOGPREFETCHER_DEBUG_LEVEL,
+ "suppressing all readahead until %X/%X is replayed due to possible TLI change",
+ LSN_FORMAT_ARGS(record->lsn));
+#endif
+
+ /* Fall through so we move past this record. */
+ }
+ }
+ else if (rmid == RM_DBASE_ID)
+ {
+ /*
+ * When databases are created with the file-copy strategy,
+ * there are no WAL records to tell us about the creation of
+ * individual relations.
+ */
+ if (record_type == XLOG_DBASE_CREATE_FILE_COPY)
+ {
+ xl_dbase_create_file_copy_rec *xlrec =
+ (xl_dbase_create_file_copy_rec *) record->main_data;
+ RelFileNode rnode = {InvalidOid, xlrec->db_id, InvalidOid};
+
+ /*
+ * Don't try to prefetch anything in this database until
+ * it has been created, or we might confuse the blocks of
+ * different generations, if a database OID or relfilenode
+ * is reused. It's also more efficient than discovering
+ * that relations don't exist on disk yet with ENOENT
+ * errors.
+ */
+ XLogPrefetcherAddFilter(prefetcher, rnode, 0, record->lsn);
+
+#ifdef XLOGPREFETCHER_DEBUG_LEVEL
+ elog(XLOGPREFETCHER_DEBUG_LEVEL,
+ "suppressing prefetch in database %u until %X/%X is replayed due to raw file copy",
+ rnode.dbNode,
+ LSN_FORMAT_ARGS(record->lsn));
+#endif
+ }
+ }
+ else if (rmid == RM_SMGR_ID)
+ {
+ if (record_type == XLOG_SMGR_CREATE)
+ {
+ xl_smgr_create *xlrec = (xl_smgr_create *)
+ record->main_data;
+
+ if (xlrec->forkNum == MAIN_FORKNUM)
+ {
+ /*
+ * Don't prefetch anything for this whole relation
+ * until it has been created. Otherwise we might
+ * confuse the blocks of different generations, if a
+ * relfilenode is reused. This also avoids the need
+ * to discover the problem via extra syscalls that
+ * report ENOENT.
+ */
+ XLogPrefetcherAddFilter(prefetcher, xlrec->rnode, 0,
+ record->lsn);
+
+#ifdef XLOGPREFETCHER_DEBUG_LEVEL
+ elog(XLOGPREFETCHER_DEBUG_LEVEL,
+ "suppressing prefetch in relation %u/%u/%u until %X/%X is replayed, which creates the relation",
+ xlrec->rnode.spcNode,
+ xlrec->rnode.dbNode,
+ xlrec->rnode.relNode,
+ LSN_FORMAT_ARGS(record->lsn));
+#endif
+ }
+ }
+ else if (record_type == XLOG_SMGR_TRUNCATE)
+ {
+ xl_smgr_truncate *xlrec = (xl_smgr_truncate *)
+ record->main_data;
+
+ /*
+ * Don't consider prefetching anything in the truncated
+ * range until the truncation has been performed.
+ */
+ XLogPrefetcherAddFilter(prefetcher, xlrec->rnode,
+ xlrec->blkno,
+ record->lsn);
+
+#ifdef XLOGPREFETCHER_DEBUG_LEVEL
+ elog(XLOGPREFETCHER_DEBUG_LEVEL,
+ "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, which truncates the relation",
+ xlrec->rnode.spcNode,
+ xlrec->rnode.dbNode,
+ xlrec->rnode.relNode,
+ xlrec->blkno,
+ LSN_FORMAT_ARGS(record->lsn));
+#endif
+ }
+ }
+ }
+
+ /* Scan the block references, starting where we left off last time. */
+ while (prefetcher->next_block_id <= record->max_block_id)
+ {
+ int block_id = prefetcher->next_block_id++;
+ DecodedBkpBlock *block = &record->blocks[block_id];
+ SMgrRelation reln;
+ PrefetchBufferResult result;
+
+ if (!block->in_use)
+ continue;
+
+ Assert(!BufferIsValid(block->prefetch_buffer));;
+
+ /*
+ * Record the LSN of this record. When it's replayed,
+ * LsnReadQueue will consider any IOs submitted for earlier LSNs
+ * to be finished.
+ */
+ *lsn = record->lsn;
+
+ /* We don't try to prefetch anything but the main fork for now. */
+ if (block->forknum != MAIN_FORKNUM)
+ {
+ return LRQ_NEXT_NO_IO;
+ }
+
+ /*
+ * If there is a full page image attached, we won't be reading the
+ * page, so don't bother trying to prefetch.
+ */
+ if (block->has_image)
+ {
+ XLogPrefetchIncrement(&SharedStats->skip_fpw);
+ return LRQ_NEXT_NO_IO;
+ }
+
+ /* There is no point in reading a page that will be zeroed. */
+ if (block->flags & BKPBLOCK_WILL_INIT)
+ {
+ XLogPrefetchIncrement(&SharedStats->skip_init);
+ return LRQ_NEXT_NO_IO;
+ }
+
+ /* Should we skip prefetching this block due to a filter? */
+ if (XLogPrefetcherIsFiltered(prefetcher, block->rnode, block->blkno))
+ {
+ XLogPrefetchIncrement(&SharedStats->skip_new);
+ return LRQ_NEXT_NO_IO;
+ }
+
+ /* There is no point in repeatedly prefetching the same block. */
+ for (int i = 0; i < XLOGPREFETCHER_SEQ_WINDOW_SIZE; ++i)
+ {
+ if (block->blkno == prefetcher->recent_block[i] &&
+ RelFileNodeEquals(block->rnode, prefetcher->recent_rnode[i]))
+ {
+ /*
+ * XXX If we also remembered where it was, we could set
+ * recent_buffer so that recovery could skip smgropen()
+ * and a buffer table lookup.
+ */
+ XLogPrefetchIncrement(&SharedStats->skip_rep);
+ return LRQ_NEXT_NO_IO;
+ }
+ }
+ prefetcher->recent_rnode[prefetcher->recent_idx] = block->rnode;
+ prefetcher->recent_block[prefetcher->recent_idx] = block->blkno;
+ prefetcher->recent_idx =
+ (prefetcher->recent_idx + 1) % XLOGPREFETCHER_SEQ_WINDOW_SIZE;
+
+ /*
+ * We could try to have a fast path for repeated references to the
+ * same relation (with some scheme to handle invalidations
+ * safely), but for now we'll call smgropen() every time.
+ */
+ reln = smgropen(block->rnode, InvalidBackendId);
+
+ /*
+ * If the relation file doesn't exist on disk, for example because
+ * we're replaying after a crash and the file will be created and
+ * then unlinked by WAL that hasn't been replayed yet, suppress
+ * further prefetching in the relation until this record is
+ * replayed.
+ */
+ if (!smgrexists(reln, MAIN_FORKNUM))
+ {
+#ifdef XLOGPREFETCHER_DEBUG_LEVEL
+ elog(XLOGPREFETCHER_DEBUG_LEVEL,
+ "suppressing all prefetch in relation %u/%u/%u until %X/%X is replayed, because the relation does not exist on disk",
+ reln->smgr_rnode.node.spcNode,
+ reln->smgr_rnode.node.dbNode,
+ reln->smgr_rnode.node.relNode,
+ LSN_FORMAT_ARGS(record->lsn));
+#endif
+ XLogPrefetcherAddFilter(prefetcher, block->rnode, 0,
+ record->lsn);
+ XLogPrefetchIncrement(&SharedStats->skip_new);
+ return LRQ_NEXT_NO_IO;
+ }
+
+ /*
+ * If the relation isn't big enough to contain the referenced
+ * block yet, suppress prefetching of this block and higher until
+ * this record is replayed.
+ */
+ if (block->blkno >= smgrnblocks(reln, block->forknum))
+ {
+#ifdef XLOGPREFETCHER_DEBUG_LEVEL
+ elog(XLOGPREFETCHER_DEBUG_LEVEL,
+ "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, because the relation is too small",
+ reln->smgr_rnode.node.spcNode,
+ reln->smgr_rnode.node.dbNode,
+ reln->smgr_rnode.node.relNode,
+ block->blkno,
+ LSN_FORMAT_ARGS(record->lsn));
+#endif
+ XLogPrefetcherAddFilter(prefetcher, block->rnode, block->blkno,
+ record->lsn);
+ XLogPrefetchIncrement(&SharedStats->skip_new);
+ return LRQ_NEXT_NO_IO;
+ }
+
+ /* Try to initiate prefetching. */
+ result = PrefetchSharedBuffer(reln, block->forknum, block->blkno);
+ if (BufferIsValid(result.recent_buffer))
+ {
+ /* Cache hit, nothing to do. */
+ XLogPrefetchIncrement(&SharedStats->hit);
+ block->prefetch_buffer = result.recent_buffer;
+ return LRQ_NEXT_NO_IO;
+ }
+ else if (result.initiated_io)
+ {
+ /* Cache miss, I/O (presumably) started. */
+ XLogPrefetchIncrement(&SharedStats->prefetch);
+ block->prefetch_buffer = InvalidBuffer;
+ return LRQ_NEXT_IO;
+ }
+ else
+ {
+ /*
+ * This shouldn't be possible, because we already determined
+ * that the relation exists on disk and is big enough.
+ * Something must be wrong with the cache invalidation for
+ * smgrexists() or smgrnblocks(), or the file was unlinked or
+ * truncated beneath our feet.
+ */
+ elog(ERROR,
+ "could not prefetch relation %u/%u/%u block %u",
+ reln->smgr_rnode.node.spcNode,
+ reln->smgr_rnode.node.dbNode,
+ reln->smgr_rnode.node.relNode,
+ block->blkno);
+ }
+ }
+
+ /*
+ * Several callsites need to be able to read exactly one record
+ * without any internal readahead. Examples: xlog.c reading
+ * checkpoint records with emode set to PANIC, which might otherwise
+ * cause XLogPageRead() to panic on some future page, and xlog.c
+ * determining where to start writing WAL next, which depends on the
+ * contents of the reader's internal buffer after reading one record.
+ * Therefore, don't even think about prefetching until the first
+ * record after XLogPrefetcherBeginRead() has been consumed.
+ */
+ if (prefetcher->reader->decode_queue_tail &&
+ prefetcher->reader->decode_queue_tail->lsn == prefetcher->begin_ptr)
+ return LRQ_NEXT_AGAIN;
+
+ /* Advance to the next record. */
+ prefetcher->record = NULL;
+ }
+ pg_unreachable();
+}
+
+/*
+ * Expose statistics about recovery prefetching.
+ */
+Datum
+pg_stat_get_recovery_prefetch(PG_FUNCTION_ARGS)
+{
+#define PG_STAT_GET_RECOVERY_PREFETCH_COLS 10
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ Datum values[PG_STAT_GET_RECOVERY_PREFETCH_COLS];
+ bool nulls[PG_STAT_GET_RECOVERY_PREFETCH_COLS];
+
+ InitMaterializedSRF(fcinfo, 0);
+
+ for (int i = 0; i < PG_STAT_GET_RECOVERY_PREFETCH_COLS; ++i)
+ nulls[i] = false;
+
+ values[0] = TimestampTzGetDatum(pg_atomic_read_u64(&SharedStats->reset_time));
+ values[1] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->prefetch));
+ values[2] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->hit));
+ values[3] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->skip_init));
+ values[4] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->skip_new));
+ values[5] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->skip_fpw));
+ values[6] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->skip_rep));
+ values[7] = Int32GetDatum(SharedStats->wal_distance);
+ values[8] = Int32GetDatum(SharedStats->block_distance);
+ values[9] = Int32GetDatum(SharedStats->io_depth);
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
+
+ return (Datum) 0;
+}
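+
+/*
+ * Editor's note (not upstream code): these counters back the
+ * pg_stat_recovery_prefetch system view, so on a standby they can be
+ * inspected with a plain query, e.g.:
+ *
+ * SELECT stats_reset, prefetch, hit, skip_fpw FROM pg_stat_recovery_prefetch;
+ */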
+
+/*
+ * Don't prefetch any blocks >= 'blockno' from a given 'rnode', until 'lsn'
+ * has been replayed.
+ */
+static inline void
+XLogPrefetcherAddFilter(XLogPrefetcher *prefetcher, RelFileNode rnode,
+ BlockNumber blockno, XLogRecPtr lsn)
+{
+ XLogPrefetcherFilter *filter;
+ bool found;
+
+ filter = hash_search(prefetcher->filter_table, &rnode, HASH_ENTER, &found);
+ if (!found)
+ {
+ /*
+ * Don't allow any prefetching of this block or higher until replayed.
+ */
+ filter->filter_until_replayed = lsn;
+ filter->filter_from_block = blockno;
+ dlist_push_head(&prefetcher->filter_queue, &filter->link);
+ }
+ else
+ {
+ /*
+ * We were already filtering this rnode. Extend the filter's lifetime
+ * to cover this WAL record, but leave the lower of the block numbers
+ * there because we don't want to have to track individual blocks.
+ */
+ filter->filter_until_replayed = lsn;
+ dlist_delete(&filter->link);
+ dlist_push_head(&prefetcher->filter_queue, &filter->link);
+ filter->filter_from_block = Min(filter->filter_from_block, blockno);
+ }
+}
+
+/*
+ * Have we replayed any records that caused us to begin filtering a block
+ * range? That means that relations should have been created, extended or
+ * dropped as required, so we can stop filtering out accesses to a given
+ * relfilenode.
+ */
+static inline void
+XLogPrefetcherCompleteFilters(XLogPrefetcher *prefetcher, XLogRecPtr replaying_lsn)
+{
+ while (unlikely(!dlist_is_empty(&prefetcher->filter_queue)))
+ {
+ XLogPrefetcherFilter *filter = dlist_tail_element(XLogPrefetcherFilter,
+ link,
+ &prefetcher->filter_queue);
+
+ if (filter->filter_until_replayed >= replaying_lsn)
+ break;
+
+ dlist_delete(&filter->link);
+ hash_search(prefetcher->filter_table, filter, HASH_REMOVE, NULL);
+ }
+}
+
+/*
+ * Check if a given block should be skipped due to a filter.
+ */
+static inline bool
+XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileNode rnode,
+ BlockNumber blockno)
+{
+ /*
+ * Test for empty queue first, because we expect it to be empty most of
+ * the time and we can avoid the hash table lookup in that case.
+ */
+ if (unlikely(!dlist_is_empty(&prefetcher->filter_queue)))
+ {
+ XLogPrefetcherFilter *filter;
+
+ /* See if the block range is filtered. */
+ filter = hash_search(prefetcher->filter_table, &rnode, HASH_FIND, NULL);
+ if (filter && filter->filter_from_block <= blockno)
+ {
+#ifdef XLOGPREFETCHER_DEBUG_LEVEL
+ elog(XLOGPREFETCHER_DEBUG_LEVEL,
+ "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (blocks >= %u filtered)",
+ rnode.spcNode, rnode.dbNode, rnode.relNode, blockno,
+ LSN_FORMAT_ARGS(filter->filter_until_replayed),
+ filter->filter_from_block);
+#endif
+ return true;
+ }
+
+ /* See if the whole database is filtered. */
+ rnode.relNode = InvalidOid;
+ rnode.spcNode = InvalidOid;
+ filter = hash_search(prefetcher->filter_table, &rnode, HASH_FIND, NULL);
+ if (filter)
+ {
+#ifdef XLOGPREFETCHER_DEBUG_LEVEL
+ elog(XLOGPREFETCHER_DEBUG_LEVEL,
+ "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (whole database)",
+ rnode.spcNode, rnode.dbNode, rnode.relNode, blockno,
+ LSN_FORMAT_ARGS(filter->filter_until_replayed));
+#endif
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/*
+ * A wrapper for XLogBeginRead() that also resets the prefetcher.
+ */
+void
+XLogPrefetcherBeginRead(XLogPrefetcher *prefetcher, XLogRecPtr recPtr)
+{
+ /* Force a reconfigure on the next read, which forgets any in-flight IO. */
+ prefetcher->reconfigure_count--;
+
+ /* Book-keeping to avoid readahead on first read. */
+ prefetcher->begin_ptr = recPtr;
+
+ prefetcher->no_readahead_until = 0;
+
+ /* This will forget about any queued up records in the decoder. */
+ XLogBeginRead(prefetcher->reader, recPtr);
+}
+
+/*
+ * A wrapper for XLogReadRecord() that provides the same interface, but also
+ * tries to initiate I/O for blocks referenced in future WAL records.
+ */
+XLogRecord *
+XLogPrefetcherReadRecord(XLogPrefetcher *prefetcher, char **errmsg)
+{
+ DecodedXLogRecord *record;
+ XLogRecPtr replayed_up_to;
+
+ /*
+ * See if it's time to reset the prefetching machinery, because a relevant
+ * GUC was changed.
+ */
+ if (unlikely(XLogPrefetchReconfigureCount != prefetcher->reconfigure_count))
+ {
+ uint32 max_distance;
+ uint32 max_inflight;
+
+ if (prefetcher->streaming_read)
+ lrq_free(prefetcher->streaming_read);
+
+ if (RecoveryPrefetchEnabled())
+ {
+ Assert(maintenance_io_concurrency > 0);
+ max_inflight = maintenance_io_concurrency;
+ max_distance = max_inflight * XLOGPREFETCHER_DISTANCE_MULTIPLIER;
+ }
+ else
+ {
+ max_inflight = 1;
+ max_distance = 1;
+ }
+
+ prefetcher->streaming_read = lrq_alloc(max_distance,
+ max_inflight,
+ (uintptr_t) prefetcher,
+ XLogPrefetcherNextBlock);
+
+ prefetcher->reconfigure_count = XLogPrefetchReconfigureCount;
+ }
+
+ /*
+ * Release last returned record, if there is one, as it's now been
+ * replayed.
+ */
+ replayed_up_to = XLogReleasePreviousRecord(prefetcher->reader);
+
+ /*
+ * Can we drop any filters yet? If we were waiting for a relation to be
+ * created or extended, it is now OK to access blocks in the covered
+ * range.
+ */
+ XLogPrefetcherCompleteFilters(prefetcher, replayed_up_to);
+
+ /*
+ * All IO initiated by earlier WAL is now completed. This might trigger
+ * further prefetching.
+ */
+ lrq_complete_lsn(prefetcher->streaming_read, replayed_up_to);
+
+ /*
+ * If there's nothing queued yet, then start prefetching to cause at least
+ * one record to be queued.
+ */
+ if (!XLogReaderHasQueuedRecordOrError(prefetcher->reader))
+ {
+ Assert(lrq_inflight(prefetcher->streaming_read) == 0);
+ Assert(lrq_completed(prefetcher->streaming_read) == 0);
+ lrq_prefetch(prefetcher->streaming_read);
+ }
+
+ /* Read the next record. */
+ record = XLogNextRecord(prefetcher->reader, errmsg);
+ if (!record)
+ return NULL;
+
+ /*
+ * The record we just got is the "current" one, for the benefit of the
+ * XLogRecXXX() macros.
+ */
+ Assert(record == prefetcher->reader->record);
+
+ /*
+ * If maintenance_io_concurrency is set very low, we might have started
+ * prefetching some but not all of the blocks referenced in the record
+ * we're about to return. Forget about the rest of the blocks in this
+ * record by dropping the prefetcher's reference to it.
+ */
+ if (record == prefetcher->record)
+ prefetcher->record = NULL;
+
+ /*
+ * See if it's time to compute some statistics, because enough WAL has
+ * been processed.
+ */
+ if (unlikely(record->lsn >= prefetcher->next_stats_shm_lsn))
+ XLogPrefetcherComputeStats(prefetcher);
+
+ Assert(record == prefetcher->reader->record);
+
+ return &record->header;
+}
+
+bool
+check_recovery_prefetch(int *new_value, void **extra, GucSource source)
+{
+#ifndef USE_PREFETCH
+ if (*new_value == RECOVERY_PREFETCH_ON)
+ {
+ GUC_check_errdetail("recovery_prefetch is not supported on platforms that lack posix_fadvise().");
+ return false;
+ }
+#endif
+
+ return true;
+}
+
+void
+assign_recovery_prefetch(int new_value, void *extra)
+{
+ /* Reconfigure prefetching, because a setting it depends on changed. */
+ recovery_prefetch = new_value;
+ if (AmStartupProcess())
+ XLogPrefetchReconfigure();
+}
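+
+/*
+ * Editor's sketch (not upstream code): a minimal recovery-side read loop
+ * using the prefetching wrapper above. It assumes a prefetcher was
+ * created elsewhere in this module (with XLogPrefetcherAllocate()) around
+ * an already-initialized XLogReaderState; the guard macro keeps this
+ * illustration out of any real build.
+ */
+#ifdef XLOGPREFETCHER_USAGE_SKETCH
+static void
+replay_from(XLogPrefetcher *prefetcher, XLogRecPtr start_lsn)
+{
+ char *errormsg = NULL;
+ XLogRecord *record;
+
+ /* Position the reader and reset all prefetching state. */
+ XLogPrefetcherBeginRead(prefetcher, start_lsn);
+
+ /* Each call may also initiate I/O for blocks in future records. */
+ while ((record = XLogPrefetcherReadRecord(prefetcher, &errormsg)) != NULL)
+ {
+ /* ... redo the record here ... */
+ }
+
+ if (errormsg)
+ elog(LOG, "stopped reading WAL: %s", errormsg);
+}
+#endif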
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
new file mode 100644
index 0000000..c15da9d
--- /dev/null
+++ b/src/backend/access/transam/xlogreader.c
@@ -0,0 +1,2165 @@
+/*-------------------------------------------------------------------------
+ *
+ * xlogreader.c
+ * Generic XLog reading facility
+ *
+ * Portions Copyright (c) 2013-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/access/transam/xlogreader.c
+ *
+ * NOTES
+ * See xlogreader.h for more notes on this facility.
+ *
+ * This file is compiled as both front-end and backend code, so it
+ * may not use ereport, server-defined static variables, etc.
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+#ifdef USE_LZ4
+#include <lz4.h>
+#endif
+#ifdef USE_ZSTD
+#include <zstd.h>
+#endif
+
+#include "access/transam.h"
+#include "access/xlog_internal.h"
+#include "access/xlogreader.h"
+#include "access/xlogrecord.h"
+#include "catalog/pg_control.h"
+#include "common/pg_lzcompress.h"
+#include "replication/origin.h"
+
+#ifndef FRONTEND
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "utils/memutils.h"
+#else
+#include "common/logging.h"
+#endif
+
+static void report_invalid_record(XLogReaderState *state, const char *fmt,...)
+ pg_attribute_printf(2, 3);
+static void allocate_recordbuf(XLogReaderState *state, uint32 reclength);
+static int ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr,
+ int reqLen);
+static void XLogReaderInvalReadState(XLogReaderState *state);
+static XLogPageReadResult XLogDecodeNextRecord(XLogReaderState *state, bool non_blocking);
+static bool ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr,
+ XLogRecPtr PrevRecPtr, XLogRecord *record, bool randAccess);
+static bool ValidXLogRecord(XLogReaderState *state, XLogRecord *record,
+ XLogRecPtr recptr);
+static void ResetDecoder(XLogReaderState *state);
+static void WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt,
+ int segsize, const char *waldir);
+
+/* size of the buffer allocated for error message. */
+#define MAX_ERRORMSG_LEN 1000
+
+/*
+ * Default size; large enough that typical users of XLogReader won't often need
+ * to use the 'oversized' memory allocation code path.
+ */
+#define DEFAULT_DECODE_BUFFER_SIZE (64 * 1024)
+
+/*
+ * Construct a string in state->errormsg_buf explaining what's wrong with
+ * the current record being read.
+ */
+static void
+report_invalid_record(XLogReaderState *state, const char *fmt,...)
+{
+ va_list args;
+
+ fmt = _(fmt);
+
+ va_start(args, fmt);
+ vsnprintf(state->errormsg_buf, MAX_ERRORMSG_LEN, fmt, args);
+ va_end(args);
+
+ state->errormsg_deferred = true;
+}
+
+/*
+ * Set the size of the decoding buffer. A pointer to a caller-supplied memory
+ * region may also be passed in, in which case non-oversized records will be
+ * decoded there.
+ */
+void
+XLogReaderSetDecodeBuffer(XLogReaderState *state, void *buffer, size_t size)
+{
+ Assert(state->decode_buffer == NULL);
+
+ state->decode_buffer = buffer;
+ state->decode_buffer_size = size;
+ state->decode_buffer_tail = buffer;
+ state->decode_buffer_head = buffer;
+}
+
+/*
+ * Allocate and initialize a new XLogReader.
+ *
+ * Returns NULL if the xlogreader couldn't be allocated.
+ */
+XLogReaderState *
+XLogReaderAllocate(int wal_segment_size, const char *waldir,
+ XLogReaderRoutine *routine, void *private_data)
+{
+ XLogReaderState *state;
+
+ state = (XLogReaderState *)
+ palloc_extended(sizeof(XLogReaderState),
+ MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO);
+ if (!state)
+ return NULL;
+
+ /* initialize caller-provided support functions */
+ state->routine = *routine;
+
+ /*
+ * Permanently allocate readBuf. We do it this way, rather than just
+ * making a static array, for two reasons: (1) no need to waste the
+ * storage in most instantiations of the backend; (2) a static char array
+ * isn't guaranteed to have any particular alignment, whereas
+ * palloc_extended() will provide MAXALIGN'd storage.
+ */
+ state->readBuf = (char *) palloc_extended(XLOG_BLCKSZ,
+ MCXT_ALLOC_NO_OOM);
+ if (!state->readBuf)
+ {
+ pfree(state);
+ return NULL;
+ }
+
+ /* Initialize segment info. */
+ WALOpenSegmentInit(&state->seg, &state->segcxt, wal_segment_size,
+ waldir);
+
+ /* system_identifier initialized to zeroes above */
+ state->private_data = private_data;
+ /* ReadRecPtr, EndRecPtr and readLen initialized to zeroes above */
+ state->errormsg_buf = palloc_extended(MAX_ERRORMSG_LEN + 1,
+ MCXT_ALLOC_NO_OOM);
+ if (!state->errormsg_buf)
+ {
+ pfree(state->readBuf);
+ pfree(state);
+ return NULL;
+ }
+ state->errormsg_buf[0] = '\0';
+
+ /*
+ * Allocate an initial readRecordBuf of minimal size, which can later be
+ * enlarged if necessary.
+ */
+ allocate_recordbuf(state, 0);
+ return state;
+}
+
+void
+XLogReaderFree(XLogReaderState *state)
+{
+ if (state->seg.ws_file != -1)
+ state->routine.segment_close(state);
+
+ if (state->decode_buffer && state->free_decode_buffer)
+ pfree(state->decode_buffer);
+
+ pfree(state->errormsg_buf);
+ if (state->readRecordBuf)
+ pfree(state->readRecordBuf);
+ pfree(state->readBuf);
+ pfree(state);
+}
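+
+/*
+ * Editor's sketch (not upstream code): the typical lifecycle of an
+ * XLogReader in a frontend tool. The XL_ROUTINE() initializer is the
+ * real helper from xlogreader.h; the my_* callbacks are hypothetical
+ * caller-provided implementations, declared here only to keep the
+ * sketch self-contained.
+ */
+#ifdef XLOGREADER_USAGE_SKETCH
+extern int my_page_read(XLogReaderState *state, XLogRecPtr targetPagePtr,
+ int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
+extern void my_segment_open(XLogReaderState *state, XLogSegNo nextSegNo,
+ TimeLineID *tli_p);
+extern void my_segment_close(XLogReaderState *state);
+
+static void
+dump_records(XLogRecPtr start, int wal_segment_size, const char *waldir)
+{
+ XLogReaderState *reader;
+ XLogRecord *record;
+ char *errormsg = NULL;
+
+ reader = XLogReaderAllocate(wal_segment_size, waldir,
+ XL_ROUTINE(.page_read = my_page_read,
+ .segment_open = my_segment_open,
+ .segment_close = my_segment_close),
+ NULL);
+ if (reader == NULL)
+ return; /* out of memory */
+
+ XLogBeginRead(reader, start);
+ while ((record = XLogReadRecord(reader, &errormsg)) != NULL)
+ {
+ /* ... inspect the record via the XLogRecGetXXX() macros ... */
+ }
+ if (errormsg)
+ pg_log_error("%s", errormsg);
+
+ XLogReaderFree(reader);
+}
+#endif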
+
+/*
+ * Allocate readRecordBuf to fit a record of at least the given length.
+ *
+ * readRecordBufSize is set to the new buffer size.
+ *
+ * To avoid useless small increases, round its size to a multiple of
+ * XLOG_BLCKSZ, and make sure it's at least 5*Max(BLCKSZ, XLOG_BLCKSZ) to start
+ * with. (That is enough for all "normal" records, but very large commit or
+ * abort records might need more space.)
+ *
+ * Note: This routine should *never* be called with an unvalidated
+ * xl_tot_len; the record header must be fully validated first.
+ */
+static void
+allocate_recordbuf(XLogReaderState *state, uint32 reclength)
+{
+ uint32 newSize = reclength;
+
+ newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ);
+ newSize = Max(newSize, 5 * Max(BLCKSZ, XLOG_BLCKSZ));
+
+ if (state->readRecordBuf)
+ pfree(state->readRecordBuf);
+ state->readRecordBuf = (char *) palloc(newSize);
+ state->readRecordBufSize = newSize;
+}
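+
+/*
+ * Editor's note (not upstream code): a worked example of the rounding in
+ * allocate_recordbuf(), assuming the default BLCKSZ = XLOG_BLCKSZ = 8192.
+ * reclength = 100 first rounds up to 8192 and is then raised to the floor
+ * of 5 * 8192 = 40960 bytes; reclength = 100000 rounds up to the next
+ * block multiple, 106496 (13 * 8192), which already exceeds the floor.
+ */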
+
+/*
+ * Initialize the passed segment structs.
+ */
+static void
+WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt,
+ int segsize, const char *waldir)
+{
+ seg->ws_file = -1;
+ seg->ws_segno = 0;
+ seg->ws_tli = 0;
+
+ segcxt->ws_segsize = segsize;
+ if (waldir)
+ snprintf(segcxt->ws_dir, MAXPGPATH, "%s", waldir);
+}
+
+/*
+ * Begin reading WAL at 'RecPtr'.
+ *
+ * 'RecPtr' should point to the beginning of a valid WAL record. Pointing at
+ * the beginning of a page is also OK, if there is a new record right after
+ * the page header, i.e. not a continuation.
+ *
+ * This does not make any attempt to read the WAL yet, and hence cannot fail.
+ * If the starting address is not correct, the first call to XLogReadRecord()
+ * will error out.
+ */
+void
+XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr)
+{
+ Assert(!XLogRecPtrIsInvalid(RecPtr));
+
+ ResetDecoder(state);
+
+ /* Begin at the passed-in record pointer. */
+ state->EndRecPtr = RecPtr;
+ state->NextRecPtr = RecPtr;
+ state->ReadRecPtr = InvalidXLogRecPtr;
+ state->DecodeRecPtr = InvalidXLogRecPtr;
+}
+
+/*
+ * Release the last record that was returned by XLogNextRecord(), if any, to
+ * free up space. Returns the LSN past the end of the record.
+ */
+XLogRecPtr
+XLogReleasePreviousRecord(XLogReaderState *state)
+{
+ DecodedXLogRecord *record;
+ XLogRecPtr next_lsn;
+
+ if (!state->record)
+ return InvalidXLogRecPtr;
+
+ /*
+ * Remove it from the decoded record queue. It must be the oldest item
+ * decoded, decode_queue_head.
+ */
+ record = state->record;
+ next_lsn = record->next_lsn;
+ Assert(record == state->decode_queue_head);
+ state->record = NULL;
+ state->decode_queue_head = record->next;
+
+ /* It might also be the newest item decoded, decode_queue_tail. */
+ if (state->decode_queue_tail == record)
+ state->decode_queue_tail = NULL;
+
+ /* Release the space. */
+ if (unlikely(record->oversized))
+ {
+ /* It's not in the decode buffer, so free it to release space. */
+ pfree(record);
+ }
+ else
+ {
+ /* It must be the head (oldest) record in the decode buffer. */
+ Assert(state->decode_buffer_head == (char *) record);
+
+ /*
+ * We need to update head to point to the next record that is in the
+ * decode buffer, if any, being careful to skip oversized ones
+ * (they're not in the decode buffer).
+ */
+ record = record->next;
+ while (unlikely(record && record->oversized))
+ record = record->next;
+
+ if (record)
+ {
+ /* Adjust head to release space up to the next record. */
+ state->decode_buffer_head = (char *) record;
+ }
+ else
+ {
+ /*
+ * Otherwise we might as well just reset head and tail to the
+ * start of the buffer space, because we're empty. This means
+ * we'll keep overwriting the same piece of memory if we're not
+ * doing any prefetching.
+ */
+ state->decode_buffer_head = state->decode_buffer;
+ state->decode_buffer_tail = state->decode_buffer;
+ }
+ }
+
+ return next_lsn;
+}
+
+/*
+ * Attempt to read an XLOG record.
+ *
+ * XLogBeginRead() or XLogFindNextRecord() and then XLogReadAhead() must be
+ * called before the first call to XLogNextRecord(). This function returns
+ * records and errors that were put into an internal queue by XLogReadAhead().
+ *
+ * On success, a record is returned.
+ *
+ * The returned record (or *errormsg) points to an internal buffer that's
+ * valid until the next call to XLogNextRecord.
+ */
+DecodedXLogRecord *
+XLogNextRecord(XLogReaderState *state, char **errormsg)
+{
+ /* Release the last record returned by XLogNextRecord(). */
+ XLogReleasePreviousRecord(state);
+
+ if (state->decode_queue_head == NULL)
+ {
+ *errormsg = NULL;
+ if (state->errormsg_deferred)
+ {
+ if (state->errormsg_buf[0] != '\0')
+ *errormsg = state->errormsg_buf;
+ state->errormsg_deferred = false;
+ }
+
+ /*
+ * state->EndRecPtr is expected to have been set by the last call to
+ * XLogBeginRead() or XLogNextRecord(), and is the location of the
+ * error.
+ */
+ Assert(!XLogRecPtrIsInvalid(state->EndRecPtr));
+
+ return NULL;
+ }
+
+ /*
+ * Record this as the most recent record returned, so that we'll release
+ * it next time. This also exposes it to the traditional
+ * XLogRecXXX(xlogreader) macros, which work with the decoder rather than
+ * the record for historical reasons.
+ */
+ state->record = state->decode_queue_head;
+
+ /*
+ * Update the pointers to the beginning and one-past-the-end of this
+ * record, again for the benefit of historical code that expected the
+ * decoder to track this rather than accessing these fields of the record
+ * itself.
+ */
+ state->ReadRecPtr = state->record->lsn;
+ state->EndRecPtr = state->record->next_lsn;
+
+ *errormsg = NULL;
+
+ return state->record;
+}
+
+/*
+ * Attempt to read an XLOG record.
+ *
+ * XLogBeginRead() or XLogFindNextRecord() must be called before the first call
+ * to XLogReadRecord().
+ *
+ * If the page_read callback fails to read the requested data, NULL is
+ * returned. The callback is expected to have reported the error; errormsg
+ * is set to NULL.
+ *
+ * If the reading fails for some other reason, NULL is also returned, and
+ * *errormsg is set to a string with details of the failure.
+ *
+ * The returned pointer (or *errormsg) points to an internal buffer that's
+ * valid until the next call to XLogReadRecord.
+ */
+XLogRecord *
+XLogReadRecord(XLogReaderState *state, char **errormsg)
+{
+ DecodedXLogRecord *decoded;
+
+ /*
+ * Release last returned record, if there is one. We need to do this so
+ * that we can check for empty decode queue accurately.
+ */
+ XLogReleasePreviousRecord(state);
+
+ /*
+ * Call XLogReadAhead() in blocking mode to make sure there is something
+ * in the queue, though we don't use the result.
+ */
+ if (!XLogReaderHasQueuedRecordOrError(state))
+ XLogReadAhead(state, false /* nonblocking */ );
+
+ /* Consume the head record or error. */
+ decoded = XLogNextRecord(state, errormsg);
+ if (decoded)
+ {
+ /*
+ * This function returns a pointer to the record's header, not the
+ * actual decoded record. The caller will access the decoded record
+ * through the XLogRecGetXXX() macros, which reach the decoded
+ * record through xlogreader->record.
+ */
+ Assert(state->record == decoded);
+ return &decoded->header;
+ }
+
+ return NULL;
+}
+
+/*
+ * Allocate space for a decoded record. The only member of the returned
+ * object that is initialized is the 'oversized' flag, indicating that the
+ * decoded record wouldn't fit in the decode buffer and must eventually be
+ * freed explicitly.
+ *
+ * The caller is responsible for adjusting decode_buffer_tail with the real
+ * size after successfully decoding a record into this space. This way, if
+ * decoding fails, then there is nothing to undo unless the 'oversized' flag
+ * was set and pfree() must be called.
+ *
+ * Return NULL if there is no space in the decode buffer and allow_oversized
+ * is false, or if memory allocation fails for an oversized buffer.
+ */
+static DecodedXLogRecord *
+XLogReadRecordAlloc(XLogReaderState *state, size_t xl_tot_len, bool allow_oversized)
+{
+ size_t required_space = DecodeXLogRecordRequiredSpace(xl_tot_len);
+ DecodedXLogRecord *decoded = NULL;
+
+ /* Allocate a circular decode buffer if we don't have one already. */
+ if (unlikely(state->decode_buffer == NULL))
+ {
+ if (state->decode_buffer_size == 0)
+ state->decode_buffer_size = DEFAULT_DECODE_BUFFER_SIZE;
+ state->decode_buffer = palloc(state->decode_buffer_size);
+ state->decode_buffer_head = state->decode_buffer;
+ state->decode_buffer_tail = state->decode_buffer;
+ state->free_decode_buffer = true;
+ }
+
+ /* Try to allocate space in the circular decode buffer. */
+ if (state->decode_buffer_tail >= state->decode_buffer_head)
+ {
+ /* Empty, or tail is to the right of head. */
+ if (state->decode_buffer_tail + required_space <=
+ state->decode_buffer + state->decode_buffer_size)
+ {
+ /* There is space between tail and end. */
+ decoded = (DecodedXLogRecord *) state->decode_buffer_tail;
+ decoded->oversized = false;
+ return decoded;
+ }
+ else if (state->decode_buffer + required_space <
+ state->decode_buffer_head)
+ {
+ /* There is space between start and head. */
+ decoded = (DecodedXLogRecord *) state->decode_buffer;
+ decoded->oversized = false;
+ return decoded;
+ }
+ }
+ else
+ {
+ /* Tail is to the left of head. */
+ if (state->decode_buffer_tail + required_space <
+ state->decode_buffer_head)
+ {
+ /* There is space between tail and head. */
+ decoded = (DecodedXLogRecord *) state->decode_buffer_tail;
+ decoded->oversized = false;
+ return decoded;
+ }
+ }
+
+ /* Not enough space in the decode buffer. Are we allowed to allocate? */
+ if (allow_oversized)
+ {
+ decoded = palloc(required_space);
+ decoded->oversized = true;
+ return decoded;
+ }
+
+ return NULL;
+}
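+
+/*
+ * Editor's note (not upstream code): the circular-buffer cases handled
+ * above, pictured. '#' marks space occupied by queued decoded records:
+ *
+ * tail >= head (empty, or one contiguous occupied run in the middle):
+ *
+ * [ . . # # # # . . ]   free space after tail, and before head
+ *       ^head   ^tail
+ *
+ * tail < head (the occupied run wraps past the end of the buffer):
+ *
+ * [ # # . . . # # # ]   only the gap between tail and head is free
+ *       ^tail  ^head
+ */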
+
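+/*
+ * Try to read, validate and decode the next record, appending it to the
+ * decode queue. Returns XLREAD_SUCCESS on success, XLREAD_FAIL on error
+ * (with the error queued for the caller), or XLREAD_WOULDBLOCK if
+ * 'nonblocking' is true and data or decode-buffer space is not yet
+ * available.
+ */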
+static XLogPageReadResult
+XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking)
+{
+ XLogRecPtr RecPtr;
+ XLogRecord *record;
+ XLogRecPtr targetPagePtr;
+ bool randAccess;
+ uint32 len,
+ total_len;
+ uint32 targetRecOff;
+ uint32 pageHeaderSize;
+ bool assembled;
+ bool gotheader;
+ int readOff;
+ DecodedXLogRecord *decoded;
+ char *errormsg; /* not used */
+
+ /*
+ * randAccess indicates whether to verify the previous-record pointer of
+ * the record we're reading. We only do this if we're reading
+ * sequentially, which is what we initially assume.
+ */
+ randAccess = false;
+
+ /* reset error state */
+ state->errormsg_buf[0] = '\0';
+ decoded = NULL;
+
+ state->abortedRecPtr = InvalidXLogRecPtr;
+ state->missingContrecPtr = InvalidXLogRecPtr;
+
+ RecPtr = state->NextRecPtr;
+
+ if (state->DecodeRecPtr != InvalidXLogRecPtr)
+ {
+ /* read the record after the one we just read */
+
+ /*
+ * NextRecPtr is pointing to end+1 of the previous WAL record. If
+ * we're at a page boundary, no more records can fit on the current
+ * page. We must skip over the page header, but we can't do that until
+ * we've read in the page, since the header size is variable.
+ */
+ }
+ else
+ {
+ /*
+ * Caller supplied a position to start at.
+ *
+ * In this case, NextRecPtr should already be pointing to a valid
+ * record starting position.
+ */
+ Assert(XRecOffIsValid(RecPtr));
+ randAccess = true;
+ }
+
+restart:
+ state->nonblocking = nonblocking;
+ state->currRecPtr = RecPtr;
+ assembled = false;
+
+ targetPagePtr = RecPtr - (RecPtr % XLOG_BLCKSZ);
+ targetRecOff = RecPtr % XLOG_BLCKSZ;
+
+ /*
+ * Read the page containing the record into state->readBuf. Request enough
+ * bytes to cover the whole record header, or at least the part of it that
+ * fits on the same page.
+ */
+ readOff = ReadPageInternal(state, targetPagePtr,
+ Min(targetRecOff + SizeOfXLogRecord, XLOG_BLCKSZ));
+ if (readOff == XLREAD_WOULDBLOCK)
+ return XLREAD_WOULDBLOCK;
+ else if (readOff < 0)
+ goto err;
+
+ /*
+ * ReadPageInternal always returns at least the page header, so we can
+ * examine it now.
+ */
+ pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf);
+ if (targetRecOff == 0)
+ {
+ /*
+ * At page start, so skip over page header.
+ */
+ RecPtr += pageHeaderSize;
+ targetRecOff = pageHeaderSize;
+ }
+ else if (targetRecOff < pageHeaderSize)
+ {
+ report_invalid_record(state, "invalid record offset at %X/%X",
+ LSN_FORMAT_ARGS(RecPtr));
+ goto err;
+ }
+
+ if ((((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
+ targetRecOff == pageHeaderSize)
+ {
+ report_invalid_record(state, "contrecord is requested by %X/%X",
+ LSN_FORMAT_ARGS(RecPtr));
+ goto err;
+ }
+
+ /* ReadPageInternal has verified the page header */
+ Assert(pageHeaderSize <= readOff);
+
+ /*
+ * Read the record length.
+ *
+ * NB: Even though we use an XLogRecord pointer here, the whole record
+ * header might not fit on this page. xl_tot_len is the first field of the
+ * struct, so it must be on this page (the records are MAXALIGNed), but we
+ * cannot access any other fields until we've verified that we got the
+ * whole header.
+ */
+ record = (XLogRecord *) (state->readBuf + RecPtr % XLOG_BLCKSZ);
+ total_len = record->xl_tot_len;
+
+ /*
+ * If the whole record header is on this page, validate it immediately.
+ * Otherwise do just a basic sanity check on xl_tot_len, and validate the
+ * rest of the header after reading it from the next page. The xl_tot_len
+ * check is necessary here to ensure that we enter the "Need to reassemble
+ * record" code path below; otherwise we might fail to apply
+ * ValidXLogRecordHeader at all.
+ */
+ if (targetRecOff <= XLOG_BLCKSZ - SizeOfXLogRecord)
+ {
+ if (!ValidXLogRecordHeader(state, RecPtr, state->DecodeRecPtr, record,
+ randAccess))
+ goto err;
+ gotheader = true;
+ }
+ else
+ {
+ /* There may be no next page if it's too small. */
+ if (total_len < SizeOfXLogRecord)
+ {
+ report_invalid_record(state,
+ "invalid record length at %X/%X: wanted %u, got %u",
+ LSN_FORMAT_ARGS(RecPtr),
+ (uint32) SizeOfXLogRecord, total_len);
+ goto err;
+ }
+ /* We'll validate the header once we have the next page. */
+ gotheader = false;
+ }
+
+ /*
+ * Try to find space to decode this record, if we can do so without
+ * calling palloc. If we can't, we'll try again below after we've
+ * validated that total_len isn't garbage bytes from a recycled WAL page.
+ */
+ decoded = XLogReadRecordAlloc(state,
+ total_len,
+ false /* allow_oversized */ );
+ if (decoded == NULL && nonblocking)
+ {
+ /*
+ * There is no space in the circular decode buffer, and the caller is
+ * only reading ahead. The caller should consume existing records to
+ * make space.
+ */
+ return XLREAD_WOULDBLOCK;
+ }
+
+ len = XLOG_BLCKSZ - RecPtr % XLOG_BLCKSZ;
+ if (total_len > len)
+ {
+ /* Need to reassemble record */
+ char *contdata;
+ XLogPageHeader pageHeader;
+ char *buffer;
+ uint32 gotlen;
+
+ assembled = true;
+
+ /*
+ * We always have space for a couple of pages, enough to validate a
+ * boundary-spanning record header.
+ */
+ Assert(state->readRecordBufSize >= XLOG_BLCKSZ * 2);
+ Assert(state->readRecordBufSize >= len);
+
+ /* Copy the first fragment of the record from the first page. */
+ memcpy(state->readRecordBuf,
+ state->readBuf + RecPtr % XLOG_BLCKSZ, len);
+ buffer = state->readRecordBuf + len;
+ gotlen = len;
+
+ do
+ {
+ /* Calculate pointer to beginning of next page */
+ targetPagePtr += XLOG_BLCKSZ;
+
+ /* Wait for the next page to become available */
+ readOff = ReadPageInternal(state, targetPagePtr,
+ Min(total_len - gotlen + SizeOfXLogShortPHD,
+ XLOG_BLCKSZ));
+
+ if (readOff == XLREAD_WOULDBLOCK)
+ return XLREAD_WOULDBLOCK;
+ else if (readOff < 0)
+ goto err;
+
+ Assert(SizeOfXLogShortPHD <= readOff);
+
+ pageHeader = (XLogPageHeader) state->readBuf;
+
+ /*
+ * If we were expecting a continuation record and got an
+ * "overwrite contrecord" flag, that means the continuation record
+ * was overwritten with a different record. Restart the read by
+ * assuming the address to read is the location where we found
+ * this flag; but keep track of the LSN of the record we were
+ * reading, for later verification.
+ */
+ if (pageHeader->xlp_info & XLP_FIRST_IS_OVERWRITE_CONTRECORD)
+ {
+ state->overwrittenRecPtr = RecPtr;
+ RecPtr = targetPagePtr;
+ goto restart;
+ }
+
+ /* Check that the continuation on next page looks valid */
+ if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD))
+ {
+ report_invalid_record(state,
+ "there is no contrecord flag at %X/%X",
+ LSN_FORMAT_ARGS(RecPtr));
+ goto err;
+ }
+
+ /*
+ * Cross-check that xlp_rem_len agrees with how much of the record
+ * we expect there to be left.
+ */
+ if (pageHeader->xlp_rem_len == 0 ||
+ total_len != (pageHeader->xlp_rem_len + gotlen))
+ {
+ report_invalid_record(state,
+ "invalid contrecord length %u (expected %lld) at %X/%X",
+ pageHeader->xlp_rem_len,
+ ((long long) total_len) - gotlen,
+ LSN_FORMAT_ARGS(RecPtr));
+ goto err;
+ }
+
+ /* Append the continuation from this page to the buffer */
+ pageHeaderSize = XLogPageHeaderSize(pageHeader);
+
+ if (readOff < pageHeaderSize)
+ readOff = ReadPageInternal(state, targetPagePtr,
+ pageHeaderSize);
+
+ Assert(pageHeaderSize <= readOff);
+
+ contdata = (char *) state->readBuf + pageHeaderSize;
+ len = XLOG_BLCKSZ - pageHeaderSize;
+ if (pageHeader->xlp_rem_len < len)
+ len = pageHeader->xlp_rem_len;
+
+ if (readOff < pageHeaderSize + len)
+ readOff = ReadPageInternal(state, targetPagePtr,
+ pageHeaderSize + len);
+
+ memcpy(buffer, (char *) contdata, len);
+ buffer += len;
+ gotlen += len;
+
+ /* If we just reassembled the record header, validate it. */
+ if (!gotheader)
+ {
+ record = (XLogRecord *) state->readRecordBuf;
+ if (!ValidXLogRecordHeader(state, RecPtr, state->DecodeRecPtr,
+ record, randAccess))
+ goto err;
+ gotheader = true;
+ }
+
+ /*
+ * We might need a bigger buffer. We have validated the record
+ * header, in the case that it split over a page boundary. We've
+ * also cross-checked total_len against xlp_rem_len on the second
+ * page, and verified xlp_pageaddr on both.
+ */
+ if (total_len > state->readRecordBufSize)
+ {
+ char save_copy[XLOG_BLCKSZ * 2];
+
+ /*
+ * Save and restore the data we already had. It can't be more
+ * than two pages.
+ */
+ Assert(gotlen <= lengthof(save_copy));
+ Assert(gotlen <= state->readRecordBufSize);
+ memcpy(save_copy, state->readRecordBuf, gotlen);
+ allocate_recordbuf(state, total_len);
+ memcpy(state->readRecordBuf, save_copy, gotlen);
+ buffer = state->readRecordBuf + gotlen;
+ }
+ } while (gotlen < total_len);
+ Assert(gotheader);
+
+ record = (XLogRecord *) state->readRecordBuf;
+ if (!ValidXLogRecord(state, record, RecPtr))
+ goto err;
+
+ pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf);
+ state->DecodeRecPtr = RecPtr;
+ state->NextRecPtr = targetPagePtr + pageHeaderSize
+ + MAXALIGN(pageHeader->xlp_rem_len);
+ }
+ else
+ {
+ /* Wait for the record data to become available */
+ readOff = ReadPageInternal(state, targetPagePtr,
+ Min(targetRecOff + total_len, XLOG_BLCKSZ));
+ if (readOff == XLREAD_WOULDBLOCK)
+ return XLREAD_WOULDBLOCK;
+ else if (readOff < 0)
+ goto err;
+
+ /* Record does not cross a page boundary */
+ if (!ValidXLogRecord(state, record, RecPtr))
+ goto err;
+
+ state->NextRecPtr = RecPtr + MAXALIGN(total_len);
+
+ state->DecodeRecPtr = RecPtr;
+ }
+
+ /*
+ * Special processing if it's an XLOG SWITCH record
+ */
+ if (record->xl_rmid == RM_XLOG_ID &&
+ (record->xl_info & ~XLR_INFO_MASK) == XLOG_SWITCH)
+ {
+ /* Pretend it extends to end of segment */
+ state->NextRecPtr += state->segcxt.ws_segsize - 1;
+ state->NextRecPtr -= XLogSegmentOffset(state->NextRecPtr, state->segcxt.ws_segsize);
+ }
+
+ /*
+ * If we got here without a DecodedXLogRecord, it means we needed to
+ * validate total_len before trusting it, but by now we've done that.
+ */
+ if (decoded == NULL)
+ {
+ Assert(!nonblocking);
+ decoded = XLogReadRecordAlloc(state,
+ total_len,
+ true /* allow_oversized */ );
+ /* allocation should always happen under allow_oversized */
+ Assert(decoded != NULL);
+ }
+
+ if (DecodeXLogRecord(state, decoded, record, RecPtr, &errormsg))
+ {
+ /* Record the location of the next record. */
+ decoded->next_lsn = state->NextRecPtr;
+
+ /*
+ * If it's in the decode buffer, mark the decode buffer space as
+ * occupied.
+ */
+ if (!decoded->oversized)
+ {
+ /* The new decode buffer head must be MAXALIGNed. */
+ Assert(decoded->size == MAXALIGN(decoded->size));
+ if ((char *) decoded == state->decode_buffer)
+ state->decode_buffer_tail = state->decode_buffer + decoded->size;
+ else
+ state->decode_buffer_tail += decoded->size;
+ }
+
+ /* Insert it into the queue of decoded records. */
+ Assert(state->decode_queue_tail != decoded);
+ if (state->decode_queue_tail)
+ state->decode_queue_tail->next = decoded;
+ state->decode_queue_tail = decoded;
+ if (!state->decode_queue_head)
+ state->decode_queue_head = decoded;
+ return XLREAD_SUCCESS;
+ }
+
+err:
+ if (assembled)
+ {
+ /*
+ * We get here when a record that spans multiple pages needs to be
+ * assembled, but something went wrong -- perhaps a contrecord piece
+ * was lost. If caller is WAL replay, it will know where the aborted
+ * record was and where to direct followup WAL to be written, marking
+ * the next piece with XLP_FIRST_IS_OVERWRITE_CONTRECORD, which will
+ * in turn signal downstream WAL consumers that the broken WAL record
+ * is to be ignored.
+ */
+ state->abortedRecPtr = RecPtr;
+ state->missingContrecPtr = targetPagePtr;
+
+ /*
+ * If we got here without reporting an error, make sure an error is
+ * queued so that XLogPrefetcherReadRecord() doesn't bring us back a
+ * second time and clobber the above state.
+ */
+ state->errormsg_deferred = true;
+ }
+
+ if (decoded && decoded->oversized)
+ pfree(decoded);
+
+ /*
+ * Invalidate the read state. We might read from a different source after
+ * failure.
+ */
+ XLogReaderInvalReadState(state);
+
+ /*
+ * If an error was written to errmsg_buf, it'll be returned to the caller
+ * of XLogReadRecord() after all successfully decoded records from the
+ * read queue.
+ */
+
+ return XLREAD_FAIL;
+}
+
+/*
+ * Try to decode the next available record, and return it. The record will
+ * also be returned to XLogNextRecord(), which must be called to 'consume'
+ * each record.
+ *
+ * If nonblocking is true, may return NULL due to lack of data or WAL decoding
+ * space.
+ */
+DecodedXLogRecord *
+XLogReadAhead(XLogReaderState *state, bool nonblocking)
+{
+ XLogPageReadResult result;
+
+ if (state->errormsg_deferred)
+ return NULL;
+
+ result = XLogDecodeNextRecord(state, nonblocking);
+ if (result == XLREAD_SUCCESS)
+ {
+ Assert(state->decode_queue_tail != NULL);
+ return state->decode_queue_tail;
+ }
+
+ return NULL;
+}
+
+/*
+ * Read a single xlog page including at least [pageptr, reqLen] of valid data
+ * via the page_read() callback.
+ *
+ * Returns XLREAD_FAIL if the required page cannot be read for some
+ * reason; errormsg_buf is set in that case (unless the error occurs in the
+ * page_read callback).
+ *
+ * Returns XLREAD_WOULDBLOCK if the requested data can't be read without
+ * waiting. This can be returned only if the installed page_read callback
+ * respects the state->nonblocking flag, and cannot read the requested data
+ * immediately.
+ *
+ * We fetch the page from a reader-local cache if we know we have the required
+ * data and if there hasn't been any error since caching the data.
+ */
+static int
+ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen)
+{
+ int readLen;
+ uint32 targetPageOff;
+ XLogSegNo targetSegNo;
+ XLogPageHeader hdr;
+
+ Assert((pageptr % XLOG_BLCKSZ) == 0);
+
+ XLByteToSeg(pageptr, targetSegNo, state->segcxt.ws_segsize);
+ targetPageOff = XLogSegmentOffset(pageptr, state->segcxt.ws_segsize);
+
+ /* check whether we have all the requested data already */
+ if (targetSegNo == state->seg.ws_segno &&
+ targetPageOff == state->segoff && reqLen <= state->readLen)
+ return state->readLen;
+
+ /*
+ * Invalidate contents of internal buffer before read attempt. Just set
+ * the length to 0, rather than a full XLogReaderInvalReadState(), so we
+ * don't forget the segment we last successfully read.
+ */
+ state->readLen = 0;
+
+ /*
+ * Data is not in our buffer.
+ *
+ * Every time we actually read the segment, even if we looked at parts of
+ * it before, we need to do verification as the page_read callback might
+ * now be rereading data from a different source.
+ *
+ * Whenever switching to a new WAL segment, we read the first page of the
+ * file and validate its header, even if that's not where the target
+ * record is. This is so that we can check the additional identification
+ * info that is present in the first page's "long" header.
+ */
+ if (targetSegNo != state->seg.ws_segno && targetPageOff != 0)
+ {
+ XLogRecPtr targetSegmentPtr = pageptr - targetPageOff;
+
+ readLen = state->routine.page_read(state, targetSegmentPtr, XLOG_BLCKSZ,
+ state->currRecPtr,
+ state->readBuf);
+ if (readLen == XLREAD_WOULDBLOCK)
+ return XLREAD_WOULDBLOCK;
+ else if (readLen < 0)
+ goto err;
+
+ /* we can be sure to have enough WAL available; we scrolled back */
+ Assert(readLen == XLOG_BLCKSZ);
+
+ if (!XLogReaderValidatePageHeader(state, targetSegmentPtr,
+ state->readBuf))
+ goto err;
+ }
+
+ /*
+ * First, read the requested data length, but at least a short page header
+ * so that we can validate it.
+ */
+ readLen = state->routine.page_read(state, pageptr, Max(reqLen, SizeOfXLogShortPHD),
+ state->currRecPtr,
+ state->readBuf);
+ if (readLen == XLREAD_WOULDBLOCK)
+ return XLREAD_WOULDBLOCK;
+ else if (readLen < 0)
+ goto err;
+
+ Assert(readLen <= XLOG_BLCKSZ);
+
+ /* Do we have enough data to check the header length? */
+ if (readLen <= SizeOfXLogShortPHD)
+ goto err;
+
+ Assert(readLen >= reqLen);
+
+ hdr = (XLogPageHeader) state->readBuf;
+
+ /* still not enough */
+ if (readLen < XLogPageHeaderSize(hdr))
+ {
+ readLen = state->routine.page_read(state, pageptr, XLogPageHeaderSize(hdr),
+ state->currRecPtr,
+ state->readBuf);
+ if (readLen == XLREAD_WOULDBLOCK)
+ return XLREAD_WOULDBLOCK;
+ else if (readLen < 0)
+ goto err;
+ }
+
+ /*
+ * Now that we know we have the full header, validate it.
+ */
+ if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr))
+ goto err;
+
+ /* update read state information */
+ state->seg.ws_segno = targetSegNo;
+ state->segoff = targetPageOff;
+ state->readLen = readLen;
+
+ return readLen;
+
+err:
+ XLogReaderInvalReadState(state);
+
+ return XLREAD_FAIL;
+}
+
+/*
+ * Invalidate the xlogreader's read state to force a re-read.
+ */
+static void
+XLogReaderInvalReadState(XLogReaderState *state)
+{
+ state->seg.ws_segno = 0;
+ state->segoff = 0;
+ state->readLen = 0;
+}
+
+/*
+ * Validate an XLOG record header.
+ *
+ * This is just a convenience subroutine to avoid duplicated code in
+ * XLogReadRecord. It's not intended for use from anywhere else.
+ */
+static bool
+ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr,
+ XLogRecPtr PrevRecPtr, XLogRecord *record,
+ bool randAccess)
+{
+ if (record->xl_tot_len < SizeOfXLogRecord)
+ {
+ report_invalid_record(state,
+ "invalid record length at %X/%X: wanted %u, got %u",
+ LSN_FORMAT_ARGS(RecPtr),
+ (uint32) SizeOfXLogRecord, record->xl_tot_len);
+ return false;
+ }
+ if (!RmgrIdIsValid(record->xl_rmid))
+ {
+ report_invalid_record(state,
+ "invalid resource manager ID %u at %X/%X",
+ record->xl_rmid, LSN_FORMAT_ARGS(RecPtr));
+ return false;
+ }
+ if (randAccess)
+ {
+ /*
+ * We can't exactly verify the prev-link, but surely it should be less
+ * than the record's own address.
+ */
+ if (!(record->xl_prev < RecPtr))
+ {
+ report_invalid_record(state,
+ "record with incorrect prev-link %X/%X at %X/%X",
+ LSN_FORMAT_ARGS(record->xl_prev),
+ LSN_FORMAT_ARGS(RecPtr));
+ return false;
+ }
+ }
+ else
+ {
+ /*
+ * Record's prev-link should exactly match our previous location. This
+ * check guards against torn WAL pages where a stale but valid-looking
+ * WAL record starts on a sector boundary.
+ */
+ if (record->xl_prev != PrevRecPtr)
+ {
+ report_invalid_record(state,
+ "record with incorrect prev-link %X/%X at %X/%X",
+ LSN_FORMAT_ARGS(record->xl_prev),
+ LSN_FORMAT_ARGS(RecPtr));
+ return false;
+ }
+ }
+
+ return true;
+}
+
+
+/*
+ * CRC-check an XLOG record. We do not believe the contents of an XLOG
+ * record (other than to the minimal extent of computing the amount of
+ * data to read in) until we've checked the CRCs.
+ *
+ * We assume all of the record (that is, xl_tot_len bytes) has been read
+ * into memory at *record. Also, ValidXLogRecordHeader() has accepted the
+ * record's header, which means in particular that xl_tot_len is at least
+ * SizeOfXLogRecord.
+ */
+static bool
+ValidXLogRecord(XLogReaderState *state, XLogRecord *record, XLogRecPtr recptr)
+{
+ pg_crc32c crc;
+
+ Assert(record->xl_tot_len >= SizeOfXLogRecord);
+
+ /* Calculate the CRC */
+ INIT_CRC32C(crc);
+ COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
+ /* include the record header last */
+ COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
+ FIN_CRC32C(crc);
+
+ if (!EQ_CRC32C(record->xl_crc, crc))
+ {
+ report_invalid_record(state,
+ "incorrect resource manager data checksum in record at %X/%X",
+ LSN_FORMAT_ARGS(recptr));
+ return false;
+ }
+
+ return true;
+}
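+
+/*
+ * Editor's note (not upstream code): the header is mixed into the CRC
+ * last, and only up to offsetof(XLogRecord, xl_crc), so the stored CRC
+ * field itself is excluded without having to zero it first. WAL insertion
+ * computes the CRC in the same order, so the two results are comparable.
+ */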
+
+/*
+ * Validate a page header.
+ *
+ * Check if 'phdr' is valid as the header of the XLog page at position
+ * 'recptr'.
+ */
+bool
+XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr,
+ char *phdr)
+{
+ XLogRecPtr recaddr;
+ XLogSegNo segno;
+ int32 offset;
+ XLogPageHeader hdr = (XLogPageHeader) phdr;
+
+ Assert((recptr % XLOG_BLCKSZ) == 0);
+
+ XLByteToSeg(recptr, segno, state->segcxt.ws_segsize);
+ offset = XLogSegmentOffset(recptr, state->segcxt.ws_segsize);
+
+ XLogSegNoOffsetToRecPtr(segno, offset, state->segcxt.ws_segsize, recaddr);
+
+ if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
+ {
+ char fname[MAXFNAMELEN];
+
+ XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize);
+
+ report_invalid_record(state,
+ "invalid magic number %04X in log segment %s, offset %u",
+ hdr->xlp_magic,
+ fname,
+ offset);
+ return false;
+ }
+
+ if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0)
+ {
+ char fname[MAXFNAMELEN];
+
+ XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize);
+
+ report_invalid_record(state,
+ "invalid info bits %04X in log segment %s, offset %u",
+ hdr->xlp_info,
+ fname,
+ offset);
+ return false;
+ }
+
+ if (hdr->xlp_info & XLP_LONG_HEADER)
+ {
+ XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr;
+
+ if (state->system_identifier &&
+ longhdr->xlp_sysid != state->system_identifier)
+ {
+ report_invalid_record(state,
+ "WAL file is from different database system: WAL file database system identifier is %llu, pg_control database system identifier is %llu",
+ (unsigned long long) longhdr->xlp_sysid,
+ (unsigned long long) state->system_identifier);
+ return false;
+ }
+ else if (longhdr->xlp_seg_size != state->segcxt.ws_segsize)
+ {
+ report_invalid_record(state,
+ "WAL file is from different database system: incorrect segment size in page header");
+ return false;
+ }
+ else if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ)
+ {
+ report_invalid_record(state,
+ "WAL file is from different database system: incorrect XLOG_BLCKSZ in page header");
+ return false;
+ }
+ }
+ else if (offset == 0)
+ {
+ char fname[MAXFNAMELEN];
+
+ XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize);
+
+ /* hmm, first page of file doesn't have a long header? */
+ report_invalid_record(state,
+ "invalid info bits %04X in log segment %s, offset %u",
+ hdr->xlp_info,
+ fname,
+ offset);
+ return false;
+ }
+
+ /*
+ * Check that the address on the page agrees with what we expected. This
+ * check typically fails when an old WAL segment is recycled, and hasn't
+ * yet been overwritten with new data.
+ */
+ if (hdr->xlp_pageaddr != recaddr)
+ {
+ char fname[MAXFNAMELEN];
+
+ XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize);
+
+ report_invalid_record(state,
+ "unexpected pageaddr %X/%X in log segment %s, offset %u",
+ LSN_FORMAT_ARGS(hdr->xlp_pageaddr),
+ fname,
+ offset);
+ return false;
+ }
+
+ /*
+ * Since child timelines are always assigned a TLI greater than their
+ * immediate parent's TLI, we should never see TLI go backwards across
+ * successive pages of a consistent WAL sequence.
+ *
+ * Sometimes we re-read a segment that's already been (partially) read. So
+ * we only verify TLIs for pages that are later than the last remembered
+ * LSN.
+ */
+ if (recptr > state->latestPagePtr)
+ {
+ if (hdr->xlp_tli < state->latestPageTLI)
+ {
+ char fname[MAXFNAMELEN];
+
+ XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize);
+
+ report_invalid_record(state,
+ "out-of-sequence timeline ID %u (after %u) in log segment %s, offset %u",
+ hdr->xlp_tli,
+ state->latestPageTLI,
+ fname,
+ offset);
+ return false;
+ }
+ }
+ state->latestPagePtr = recptr;
+ state->latestPageTLI = hdr->xlp_tli;
+
+ return true;
+}
+
+/*
+ * Forget about an error produced by XLogReaderValidatePageHeader().
+ */
+void
+XLogReaderResetError(XLogReaderState *state)
+{
+ state->errormsg_buf[0] = '\0';
+ state->errormsg_deferred = false;
+}
+
+/*
+ * Find the first record with an lsn >= RecPtr.
+ *
+ * This is different from XLogBeginRead() in that RecPtr doesn't need to point
+ * to a valid record boundary. Useful for checking whether RecPtr is a valid
+ * xlog address for reading, and to find the first valid address after some
+ * address when dumping records for debugging purposes.
+ *
+ * This positions the reader, like XLogBeginRead(), so that the next call to
+ * XLogReadRecord() will read the next valid record.
+ */
+XLogRecPtr
+XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr)
+{
+ XLogRecPtr tmpRecPtr;
+ XLogRecPtr found = InvalidXLogRecPtr;
+ XLogPageHeader header;
+ char *errormsg;
+
+ Assert(!XLogRecPtrIsInvalid(RecPtr));
+
+ /* Make sure ReadPageInternal() can't return XLREAD_WOULDBLOCK. */
+ state->nonblocking = false;
+
+ /*
+ * skip over potential continuation data, keeping in mind that it may span
+ * multiple pages
+ */
+ tmpRecPtr = RecPtr;
+ while (true)
+ {
+ XLogRecPtr targetPagePtr;
+ int targetRecOff;
+ uint32 pageHeaderSize;
+ int readLen;
+
+ /*
+ * Compute targetRecOff. It should typically be equal to or greater
+ * than the short page-header size, since a valid record can't start
+ * anywhere before that, except when the caller has explicitly
+ * specified an offset that falls within the header or when we are
+ * skipping a multi-page continuation record. That doesn't matter,
+ * though, because ReadPageInternal() is prepared to handle it and
+ * will read at least a short page-header's worth of data.
+ */
+ targetRecOff = tmpRecPtr % XLOG_BLCKSZ;
+
+ /* scroll back to page boundary */
+ targetPagePtr = tmpRecPtr - targetRecOff;
+
+ /* Read the page containing the record */
+ readLen = ReadPageInternal(state, targetPagePtr, targetRecOff);
+ if (readLen < 0)
+ goto err;
+
+ header = (XLogPageHeader) state->readBuf;
+
+ pageHeaderSize = XLogPageHeaderSize(header);
+
+ /* make sure we have enough data for the page header */
+ readLen = ReadPageInternal(state, targetPagePtr, pageHeaderSize);
+ if (readLen < 0)
+ goto err;
+
+ /* skip over potential continuation data */
+ if (header->xlp_info & XLP_FIRST_IS_CONTRECORD)
+ {
+ /*
+ * If the length of the remaining continuation data is more than
+ * what can fit in this page, the continuation record crosses over
+ * this page. Read the next page and try again. xlp_rem_len in the
+ * next page header will contain the remaining length of the
+ * continuation data
+ *
+ * Note that record headers are MAXALIGN'ed
+ */
+ if (MAXALIGN(header->xlp_rem_len) >= (XLOG_BLCKSZ - pageHeaderSize))
+ tmpRecPtr = targetPagePtr + XLOG_BLCKSZ;
+ else
+ {
+ /*
+ * The previous continuation record ends in this page. Set
+ * tmpRecPtr to point to the first valid record
+ */
+ tmpRecPtr = targetPagePtr + pageHeaderSize
+ + MAXALIGN(header->xlp_rem_len);
+ break;
+ }
+ }
+ else
+ {
+ tmpRecPtr = targetPagePtr + pageHeaderSize;
+ break;
+ }
+ }
+
+ /*
+ * we know now that tmpRecPtr is an address pointing to a valid XLogRecord
+ * because either we're at the first record after the beginning of a page
+ * or we just jumped over the remaining data of a continuation.
+ */
+ XLogBeginRead(state, tmpRecPtr);
+ while (XLogReadRecord(state, &errormsg) != NULL)
+ {
+ /* past the record we've found, break out */
+ if (RecPtr <= state->ReadRecPtr)
+ {
+ /* Rewind the reader to the beginning of the last record. */
+ found = state->ReadRecPtr;
+ XLogBeginRead(state, found);
+ return found;
+ }
+ }
+
+err:
+ XLogReaderInvalReadState(state);
+
+ return InvalidXLogRecPtr;
+}
+
+/*
+ * Helper function to ease writing of XLogReaderRoutine->page_read callbacks.
+ * If this function is used, caller must supply a segment_open callback in
+ * 'state', as that is used here.
+ *
+ * Read 'count' bytes into 'buf', starting at location 'startptr', from WAL
+ * fetched from timeline 'tli'.
+ *
+ * Returns true if succeeded, false if an error occurs, in which case
+ * 'errinfo' receives error details.
+ *
+ * XXX probably this should be improved to suck data directly from the
+ * WAL buffers when possible.
+ */
+bool
+WALRead(XLogReaderState *state,
+ char *buf, XLogRecPtr startptr, Size count, TimeLineID tli,
+ WALReadError *errinfo)
+{
+ char *p;
+ XLogRecPtr recptr;
+ Size nbytes;
+
+ p = buf;
+ recptr = startptr;
+ nbytes = count;
+
+ while (nbytes > 0)
+ {
+ uint32 startoff;
+ int segbytes;
+ int readbytes;
+
+ startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize);
+
+ /*
+ * If the data we want is not in a segment we have open, close what we
+ * have (if anything) and open the next one, using the caller-provided
+ * segment_open callback.
+ */
+ if (state->seg.ws_file < 0 ||
+ !XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) ||
+ tli != state->seg.ws_tli)
+ {
+ XLogSegNo nextSegNo;
+
+ if (state->seg.ws_file >= 0)
+ state->routine.segment_close(state);
+
+ XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize);
+ state->routine.segment_open(state, nextSegNo, &tli);
+
+ /* This shouldn't happen -- indicates a bug in segment_open */
+ Assert(state->seg.ws_file >= 0);
+
+ /* Update the current segment info. */
+ state->seg.ws_tli = tli;
+ state->seg.ws_segno = nextSegNo;
+ }
+
+ /* How many bytes are within this segment? */
+ if (nbytes > (state->segcxt.ws_segsize - startoff))
+ segbytes = state->segcxt.ws_segsize - startoff;
+ else
+ segbytes = nbytes;
+
+#ifndef FRONTEND
+ pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
+#endif
+
+ /* Reset errno first; eases reporting non-errno-affecting errors */
+ errno = 0;
+ readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff);
+
+#ifndef FRONTEND
+ pgstat_report_wait_end();
+#endif
+
+ if (readbytes <= 0)
+ {
+ errinfo->wre_errno = errno;
+ errinfo->wre_req = segbytes;
+ errinfo->wre_read = readbytes;
+ errinfo->wre_off = startoff;
+ errinfo->wre_seg = state->seg;
+ return false;
+ }
+
+ /* Update state for read */
+ recptr += readbytes;
+ nbytes -= readbytes;
+ p += readbytes;
+ }
+
+ return true;
+}
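+
+/*
+ * Illustration only (not part of this file): a minimal page_read callback
+ * built on WALRead().  The callback name and the error handling are
+ * hypothetical; real callers also clamp the request to the known flush
+ * point and handle timeline switches, all of which is elided here.
+ *
+ *    static int
+ *    my_page_read(XLogReaderState *state, XLogRecPtr targetPagePtr,
+ *                 int reqLen, XLogRecPtr targetRecPtr, char *readBuf)
+ *    {
+ *        WALReadError errinfo;
+ *
+ *        if (!WALRead(state, readBuf, targetPagePtr, XLOG_BLCKSZ,
+ *                     state->seg.ws_tli, &errinfo))
+ *            return -1;        /* inspect errinfo.wre_errno and friends */
+ *        return XLOG_BLCKSZ;
+ *    }
+ */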
+
+/* ----------------------------------------
+ * Functions for decoding the data and block references in a record.
+ * ----------------------------------------
+ */
+
+/*
+ * Private function to reset the state, forgetting all decoded records, if we
+ * are asked to move to a new read position.
+ */
+static void
+ResetDecoder(XLogReaderState *state)
+{
+ DecodedXLogRecord *r;
+
+ /* Reset the decoded record queue, freeing any oversized records. */
+ while ((r = state->decode_queue_head) != NULL)
+ {
+ state->decode_queue_head = r->next;
+ if (r->oversized)
+ pfree(r);
+ }
+ state->decode_queue_tail = NULL;
+ state->decode_queue_head = NULL;
+ state->record = NULL;
+
+ /* Reset the decode buffer to empty. */
+ state->decode_buffer_tail = state->decode_buffer;
+ state->decode_buffer_head = state->decode_buffer;
+
+ /* Clear error state. */
+ state->errormsg_buf[0] = '\0';
+ state->errormsg_deferred = false;
+}
+
+/*
+ * Compute the maximum possible amount of space that could be required to
+ * decode a record, given xl_tot_len from the record's header. This is the
+ * amount of output buffer space that we need to decode a record, though we
+ * might not finish up using it all.
+ *
+ * This computation is pessimistic and assumes the maximum possible number of
+ * blocks, due to lack of better information.
+ */
+size_t
+DecodeXLogRecordRequiredSpace(size_t xl_tot_len)
+{
+ size_t size = 0;
+
+ /* Account for the fixed size part of the decoded record struct. */
+ size += offsetof(DecodedXLogRecord, blocks[0]);
+ /* Account for the flexible blocks array of maximum possible size. */
+ size += sizeof(DecodedBkpBlock) * (XLR_MAX_BLOCK_ID + 1);
+ /* Account for all the raw main and block data. */
+ size += xl_tot_len;
+ /* We might insert padding before main_data. */
+ size += (MAXIMUM_ALIGNOF - 1);
+ /* We might insert padding before each block's data. */
+ size += (MAXIMUM_ALIGNOF - 1) * (XLR_MAX_BLOCK_ID + 1);
+ /* We might insert padding at the end. */
+ size += (MAXIMUM_ALIGNOF - 1);
+
+ return size;
+}
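+
+/*
+ * Worked example (a sketch, assuming the usual XLR_MAX_BLOCK_ID = 32 and
+ * MAXIMUM_ALIGNOF = 8): for a record with xl_tot_len = 100, the bound is
+ *
+ *    offsetof(DecodedXLogRecord, blocks[0])    fixed-size header part
+ *    + 33 * sizeof(DecodedBkpBlock)            worst-case blocks array
+ *    + 100                                     raw main and block data
+ *    + 7 + 33 * 7 + 7                          worst-case alignment padding
+ *
+ * i.e. a constant amount of slack on top of the payload itself.
+ */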
+
+/*
+ * Decode a record. "decoded" must point to a MAXALIGNed memory area that has
+ * space for at least DecodeXLogRecordRequiredSpace(record) bytes. On
+ * success, decoded->size contains the actual space occupied by the decoded
+ * record, which may turn out to be less.
+ *
+ * Only the decoded->oversized member must be initialized by the caller; it
+ * will not be modified.  All other members will be initialized as required.
+ *
+ * On error, a human-readable error message is returned in *errormsg, and
+ * the return value is false.
+ */
+bool
+DecodeXLogRecord(XLogReaderState *state,
+ DecodedXLogRecord *decoded,
+ XLogRecord *record,
+ XLogRecPtr lsn,
+ char **errormsg)
+{
+ /*
+ * read next _size bytes from record buffer, but check for overrun first.
+ */
+#define COPY_HEADER_FIELD(_dst, _size) \
+ do { \
+ if (remaining < _size) \
+ goto shortdata_err; \
+ memcpy(_dst, ptr, _size); \
+ ptr += _size; \
+ remaining -= _size; \
+ } while(0)
+
+ char *ptr;
+ char *out;
+ uint32 remaining;
+ uint32 datatotal;
+ RelFileNode *rnode = NULL;
+ uint8 block_id;
+
+ decoded->header = *record;
+ decoded->lsn = lsn;
+ decoded->next = NULL;
+ decoded->record_origin = InvalidRepOriginId;
+ decoded->toplevel_xid = InvalidTransactionId;
+ decoded->main_data = NULL;
+ decoded->main_data_len = 0;
+ decoded->max_block_id = -1;
+ ptr = (char *) record;
+ ptr += SizeOfXLogRecord;
+ remaining = record->xl_tot_len - SizeOfXLogRecord;
+
+ /* Decode the headers */
+ datatotal = 0;
+ while (remaining > datatotal)
+ {
+ COPY_HEADER_FIELD(&block_id, sizeof(uint8));
+
+ if (block_id == XLR_BLOCK_ID_DATA_SHORT)
+ {
+ /* XLogRecordDataHeaderShort */
+ uint8 main_data_len;
+
+ COPY_HEADER_FIELD(&main_data_len, sizeof(uint8));
+
+ decoded->main_data_len = main_data_len;
+ datatotal += main_data_len;
+ break; /* by convention, the main data fragment is
+ * always last */
+ }
+ else if (block_id == XLR_BLOCK_ID_DATA_LONG)
+ {
+ /* XLogRecordDataHeaderLong */
+ uint32 main_data_len;
+
+ COPY_HEADER_FIELD(&main_data_len, sizeof(uint32));
+ decoded->main_data_len = main_data_len;
+ datatotal += main_data_len;
+ break; /* by convention, the main data fragment is
+ * always last */
+ }
+ else if (block_id == XLR_BLOCK_ID_ORIGIN)
+ {
+ COPY_HEADER_FIELD(&decoded->record_origin, sizeof(RepOriginId));
+ }
+ else if (block_id == XLR_BLOCK_ID_TOPLEVEL_XID)
+ {
+ COPY_HEADER_FIELD(&decoded->toplevel_xid, sizeof(TransactionId));
+ }
+ else if (block_id <= XLR_MAX_BLOCK_ID)
+ {
+ /* XLogRecordBlockHeader */
+ DecodedBkpBlock *blk;
+ uint8 fork_flags;
+
+ /* mark any intervening block IDs as not in use */
+ for (int i = decoded->max_block_id + 1; i < block_id; ++i)
+ decoded->blocks[i].in_use = false;
+
+ if (block_id <= decoded->max_block_id)
+ {
+ report_invalid_record(state,
+ "out-of-order block_id %u at %X/%X",
+ block_id,
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+ decoded->max_block_id = block_id;
+
+ blk = &decoded->blocks[block_id];
+ blk->in_use = true;
+ blk->apply_image = false;
+
+ COPY_HEADER_FIELD(&fork_flags, sizeof(uint8));
+ blk->forknum = fork_flags & BKPBLOCK_FORK_MASK;
+ blk->flags = fork_flags;
+ blk->has_image = ((fork_flags & BKPBLOCK_HAS_IMAGE) != 0);
+ blk->has_data = ((fork_flags & BKPBLOCK_HAS_DATA) != 0);
+
+ blk->prefetch_buffer = InvalidBuffer;
+
+ COPY_HEADER_FIELD(&blk->data_len, sizeof(uint16));
+ /* cross-check that the HAS_DATA flag is set iff data_length > 0 */
+ if (blk->has_data && blk->data_len == 0)
+ {
+ report_invalid_record(state,
+ "BKPBLOCK_HAS_DATA set, but no data included at %X/%X",
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+ if (!blk->has_data && blk->data_len != 0)
+ {
+ report_invalid_record(state,
+ "BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%X",
+ (unsigned int) blk->data_len,
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+ datatotal += blk->data_len;
+
+ if (blk->has_image)
+ {
+ COPY_HEADER_FIELD(&blk->bimg_len, sizeof(uint16));
+ COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16));
+ COPY_HEADER_FIELD(&blk->bimg_info, sizeof(uint8));
+
+ blk->apply_image = ((blk->bimg_info & BKPIMAGE_APPLY) != 0);
+
+ if (BKPIMAGE_COMPRESSED(blk->bimg_info))
+ {
+ if (blk->bimg_info & BKPIMAGE_HAS_HOLE)
+ COPY_HEADER_FIELD(&blk->hole_length, sizeof(uint16));
+ else
+ blk->hole_length = 0;
+ }
+ else
+ blk->hole_length = BLCKSZ - blk->bimg_len;
+ datatotal += blk->bimg_len;
+
+ /*
+ * cross-check that hole_offset > 0, hole_length > 0 and
+ * bimg_len < BLCKSZ if the HAS_HOLE flag is set.
+ */
+ if ((blk->bimg_info & BKPIMAGE_HAS_HOLE) &&
+ (blk->hole_offset == 0 ||
+ blk->hole_length == 0 ||
+ blk->bimg_len == BLCKSZ))
+ {
+ report_invalid_record(state,
+ "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X",
+ (unsigned int) blk->hole_offset,
+ (unsigned int) blk->hole_length,
+ (unsigned int) blk->bimg_len,
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+
+ /*
+ * cross-check that hole_offset == 0 and hole_length == 0 if
+ * the HAS_HOLE flag is not set.
+ */
+ if (!(blk->bimg_info & BKPIMAGE_HAS_HOLE) &&
+ (blk->hole_offset != 0 || blk->hole_length != 0))
+ {
+ report_invalid_record(state,
+ "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X",
+ (unsigned int) blk->hole_offset,
+ (unsigned int) blk->hole_length,
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+
+ /*
+ * Cross-check that bimg_len < BLCKSZ if it is compressed.
+ */
+ if (BKPIMAGE_COMPRESSED(blk->bimg_info) &&
+ blk->bimg_len == BLCKSZ)
+ {
+ report_invalid_record(state,
+ "BKPIMAGE_COMPRESSED set, but block image length %u at %X/%X",
+ (unsigned int) blk->bimg_len,
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+
+ /*
+ * cross-check that bimg_len == BLCKSZ if neither HAS_HOLE nor
+ * COMPRESSED is set.
+ */
+ if (!(blk->bimg_info & BKPIMAGE_HAS_HOLE) &&
+ !BKPIMAGE_COMPRESSED(blk->bimg_info) &&
+ blk->bimg_len != BLCKSZ)
+ {
+ report_invalid_record(state,
+ "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_COMPRESSED set, but block image length is %u at %X/%X",
+ (unsigned int) blk->bimg_len,
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+ }
+ if (!(fork_flags & BKPBLOCK_SAME_REL))
+ {
+ COPY_HEADER_FIELD(&blk->rnode, sizeof(RelFileNode));
+ rnode = &blk->rnode;
+ }
+ else
+ {
+ if (rnode == NULL)
+ {
+ report_invalid_record(state,
+ "BKPBLOCK_SAME_REL set but no previous rel at %X/%X",
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+
+ blk->rnode = *rnode;
+ }
+ COPY_HEADER_FIELD(&blk->blkno, sizeof(BlockNumber));
+ }
+ else
+ {
+ report_invalid_record(state,
+ "invalid block_id %u at %X/%X",
+ block_id, LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+ }
+
+ if (remaining != datatotal)
+ goto shortdata_err;
+
+ /*
+ * Ok, we've parsed the fragment headers, and verified that the total
+ * length of the payload in the fragments is equal to the amount of data
+ * left. Copy the data of each fragment to contiguous space after the
+ * blocks array, inserting alignment padding before the data fragments so
+ * they can be cast to struct pointers by REDO routines.
+ */
+ out = ((char *) decoded) +
+ offsetof(DecodedXLogRecord, blocks) +
+ sizeof(decoded->blocks[0]) * (decoded->max_block_id + 1);
+
+ /* block data first */
+ for (block_id = 0; block_id <= decoded->max_block_id; block_id++)
+ {
+ DecodedBkpBlock *blk = &decoded->blocks[block_id];
+
+ if (!blk->in_use)
+ continue;
+
+ Assert(blk->has_image || !blk->apply_image);
+
+ if (blk->has_image)
+ {
+ /* no need to align image */
+ blk->bkp_image = out;
+ memcpy(out, ptr, blk->bimg_len);
+ ptr += blk->bimg_len;
+ out += blk->bimg_len;
+ }
+ if (blk->has_data)
+ {
+ out = (char *) MAXALIGN(out);
+ blk->data = out;
+ memcpy(blk->data, ptr, blk->data_len);
+ ptr += blk->data_len;
+ out += blk->data_len;
+ }
+ }
+
+ /* and finally, the main data */
+ if (decoded->main_data_len > 0)
+ {
+ out = (char *) MAXALIGN(out);
+ decoded->main_data = out;
+ memcpy(decoded->main_data, ptr, decoded->main_data_len);
+ ptr += decoded->main_data_len;
+ out += decoded->main_data_len;
+ }
+
+ /* Report the actual size we used. */
+ decoded->size = MAXALIGN(out - (char *) decoded);
+ Assert(DecodeXLogRecordRequiredSpace(record->xl_tot_len) >=
+ decoded->size);
+
+ return true;
+
+shortdata_err:
+ report_invalid_record(state,
+ "record with invalid length at %X/%X",
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+err:
+ *errormsg = state->errormsg_buf;
+
+ return false;
+}
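+
+/*
+ * For orientation, the layout produced by DecodeXLogRecord is, from the
+ * start of the decoded buffer (a sketch; padding shown only where it can
+ * be inserted):
+ *
+ *    fixed-size fields of DecodedXLogRecord
+ *    blocks[0 .. max_block_id]                 DecodedBkpBlock array
+ *    block 0: backup image (unaligned), then MAXALIGN'd block data
+ *    ... repeated for each block in use ...
+ *    MAXALIGN'd main data
+ */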
+
+/*
+ * Returns information about the block that a block reference refers to.
+ *
+ * This is like XLogRecGetBlockTagExtended, except that the block reference
+ * must exist and there's no access to prefetch_buffer.
+ */
+void
+XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id,
+ RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum)
+{
+ if (!XLogRecGetBlockTagExtended(record, block_id, rnode, forknum, blknum,
+ NULL))
+ {
+#ifndef FRONTEND
+ elog(ERROR, "could not locate backup block with ID %d in WAL record",
+ block_id);
+#else
+ pg_fatal("could not locate backup block with ID %d in WAL record",
+ block_id);
+#endif
+ }
+}
+
+/*
+ * Returns information about the block that a block reference refers to,
+ * optionally including the buffer that the block may already be in.
+ *
+ * If the WAL record contains a block reference with the given ID, *rnode,
+ * *forknum, *blknum and *prefetch_buffer are filled in (if not NULL), and
+ * returns true. Otherwise returns false.
+ */
+bool
+XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id,
+ RelFileNode *rnode, ForkNumber *forknum,
+ BlockNumber *blknum,
+ Buffer *prefetch_buffer)
+{
+ DecodedBkpBlock *bkpb;
+
+ if (!XLogRecHasBlockRef(record, block_id))
+ return false;
+
+ bkpb = &record->record->blocks[block_id];
+ if (rnode)
+ *rnode = bkpb->rnode;
+ if (forknum)
+ *forknum = bkpb->forknum;
+ if (blknum)
+ *blknum = bkpb->blkno;
+ if (prefetch_buffer)
+ *prefetch_buffer = bkpb->prefetch_buffer;
+ return true;
+}
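+
+/*
+ * Sketch of a caller that also wants the prefetched buffer; everything
+ * around the call itself is hypothetical:
+ *
+ *    RelFileNode rnode;
+ *    ForkNumber  forknum;
+ *    BlockNumber blkno;
+ *    Buffer      prefetch_buffer;
+ *
+ *    if (XLogRecGetBlockTagExtended(record, block_id, &rnode, &forknum,
+ *                                   &blkno, &prefetch_buffer))
+ *        ... operate on the referenced block ...
+ */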
+
+/*
+ * Returns the data associated with a block reference, or NULL if there is
+ * no data (e.g. because a full-page image was taken instead). The returned
+ * pointer points to a MAXALIGNed buffer.
+ */
+char *
+XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len)
+{
+ DecodedBkpBlock *bkpb;
+
+ if (block_id > record->record->max_block_id ||
+ !record->record->blocks[block_id].in_use)
+ return NULL;
+
+ bkpb = &record->record->blocks[block_id];
+
+ if (!bkpb->has_data)
+ {
+ if (len)
+ *len = 0;
+ return NULL;
+ }
+ else
+ {
+ if (len)
+ *len = bkpb->data_len;
+ return bkpb->data;
+ }
+}
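+
+/*
+ * Typical use from a redo routine (a sketch; "xl_my_info" is a made-up
+ * struct): because the returned buffer is MAXALIGN'ed, it is safe to cast
+ * it to a struct pointer.
+ *
+ *    Size        datalen;
+ *    xl_my_info *info = (xl_my_info *)
+ *        XLogRecGetBlockData(record, 0, &datalen);
+ *
+ *    if (info == NULL)
+ *        ...    no data; a full-page image may have been logged instead
+ */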
+
+/*
+ * Restore a full-page image from a backup block attached to an XLOG record.
+ *
+ * Returns true if the full-page image is restored.  On failure, returns
+ * false after storing an error message for the caller to consume.
+ */
+bool
+RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
+{
+ DecodedBkpBlock *bkpb;
+ char *ptr;
+ PGAlignedBlock tmp;
+
+ if (block_id > record->record->max_block_id ||
+ !record->record->blocks[block_id].in_use)
+ {
+ report_invalid_record(record,
+ "could not restore image at %X/%X with invalid block %d specified",
+ LSN_FORMAT_ARGS(record->ReadRecPtr),
+ block_id);
+ return false;
+ }
+ if (!record->record->blocks[block_id].has_image)
+ {
+ report_invalid_record(record, "could not restore image at %X/%X with invalid state, block %d",
+ LSN_FORMAT_ARGS(record->ReadRecPtr),
+ block_id);
+ return false;
+ }
+
+ bkpb = &record->record->blocks[block_id];
+ ptr = bkpb->bkp_image;
+
+ if (BKPIMAGE_COMPRESSED(bkpb->bimg_info))
+ {
+ /* If a backup block image is compressed, decompress it */
+ bool decomp_success = true;
+
+ if ((bkpb->bimg_info & BKPIMAGE_COMPRESS_PGLZ) != 0)
+ {
+ if (pglz_decompress(ptr, bkpb->bimg_len, tmp.data,
+ BLCKSZ - bkpb->hole_length, true) < 0)
+ decomp_success = false;
+ }
+ else if ((bkpb->bimg_info & BKPIMAGE_COMPRESS_LZ4) != 0)
+ {
+#ifdef USE_LZ4
+ if (LZ4_decompress_safe(ptr, tmp.data,
+ bkpb->bimg_len, BLCKSZ - bkpb->hole_length) <= 0)
+ decomp_success = false;
+#else
+ report_invalid_record(record, "could not restore image at %X/%X compressed with %s not supported by build, block %d",
+ LSN_FORMAT_ARGS(record->ReadRecPtr),
+ "LZ4",
+ block_id);
+ return false;
+#endif
+ }
+ else if ((bkpb->bimg_info & BKPIMAGE_COMPRESS_ZSTD) != 0)
+ {
+#ifdef USE_ZSTD
+ size_t decomp_result = ZSTD_decompress(tmp.data,
+ BLCKSZ - bkpb->hole_length,
+ ptr, bkpb->bimg_len);
+
+ if (ZSTD_isError(decomp_result))
+ decomp_success = false;
+#else
+ report_invalid_record(record, "could not restore image at %X/%X compressed with %s not supported by build, block %d",
+ LSN_FORMAT_ARGS(record->ReadRecPtr),
+ "zstd",
+ block_id);
+ return false;
+#endif
+ }
+ else
+ {
+ report_invalid_record(record, "could not restore image at %X/%X compressed with unknown method, block %d",
+ LSN_FORMAT_ARGS(record->ReadRecPtr),
+ block_id);
+ return false;
+ }
+
+ if (!decomp_success)
+ {
+ report_invalid_record(record, "could not decompress image at %X/%X, block %d",
+ LSN_FORMAT_ARGS(record->ReadRecPtr),
+ block_id);
+ return false;
+ }
+
+ ptr = tmp.data;
+ }
+
+ /* generate page, taking into account hole if necessary */
+ if (bkpb->hole_length == 0)
+ {
+ memcpy(page, ptr, BLCKSZ);
+ }
+ else
+ {
+ memcpy(page, ptr, bkpb->hole_offset);
+ /* must zero-fill the hole */
+ MemSet(page + bkpb->hole_offset, 0, bkpb->hole_length);
+ memcpy(page + (bkpb->hole_offset + bkpb->hole_length),
+ ptr + bkpb->hole_offset,
+ BLCKSZ - (bkpb->hole_offset + bkpb->hole_length));
+ }
+
+ return true;
+}
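+
+/*
+ * Sketch of a typical call, roughly what XLogReadBufferForRedo does
+ * internally (buffer lookup and locking elided):
+ *
+ *    char *page = BufferGetPage(buffer);
+ *
+ *    if (!RestoreBlockImage(record, block_id, page))
+ *        elog(ERROR, "%s", record->errormsg_buf);
+ */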
+
+#ifndef FRONTEND
+
+/*
+ * Extract the FullTransactionId from a WAL record.
+ */
+FullTransactionId
+XLogRecGetFullXid(XLogReaderState *record)
+{
+ TransactionId xid,
+ next_xid;
+ uint32 epoch;
+
+ /*
+ * This function is only safe during replay, because it depends on the
+ * replay state. See AdvanceNextFullTransactionIdPastXid() for more.
+ */
+ Assert(AmStartupProcess() || !IsUnderPostmaster);
+
+ xid = XLogRecGetXid(record);
+ next_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+ epoch = EpochFromFullTransactionId(ShmemVariableCache->nextXid);
+
+ /*
+ * If xid is numerically greater than next_xid, it has to be from the last
+ * epoch.
+ */
+ if (unlikely(xid > next_xid))
+ --epoch;
+
+ return FullTransactionIdFromEpochAndXid(epoch, xid);
+}
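+
+/*
+ * A concrete illustration of the epoch adjustment above: suppose nextXid
+ * is (epoch 5, xid 1000).  A record carrying xid 900 must belong to epoch
+ * 5, because xids at or above nextXid cannot have been used yet in this
+ * epoch.  A record carrying xid 4000000000, which is greater than
+ * next_xid, must therefore date from epoch 4.
+ */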
+
+#endif
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
new file mode 100644
index 0000000..166f7b7
--- /dev/null
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -0,0 +1,4699 @@
+/*-------------------------------------------------------------------------
+ *
+ * xlogrecovery.c
+ * Functions for WAL recovery, standby mode
+ *
+ * This source file contains functions controlling WAL recovery.
+ * InitWalRecovery() initializes the system for crash or archive recovery,
+ * or standby mode, depending on configuration options and the state of
+ * the control file and possible backup label file. PerformWalRecovery()
+ * performs the actual WAL replay, calling the rmgr-specific redo routines.
+ * EndWalRecovery() performs end-of-recovery checks and cleanup actions,
+ * and prepares information needed to initialize the WAL for writes. In
+ * addition to these three main functions, there are a bunch of functions
+ * for interrogating recovery state and controlling the recovery process.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/xlogrecovery.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <ctype.h>
+#include <math.h>
+#include <time.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "access/timeline.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "access/xlog_internal.h"
+#include "access/xlogarchive.h"
+#include "access/xlogprefetcher.h"
+#include "access/xlogreader.h"
+#include "access/xlogrecovery.h"
+#include "access/xlogutils.h"
+#include "backup/basebackup.h"
+#include "catalog/pg_control.h"
+#include "commands/tablespace.h"
+#include "common/file_utils.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/bgwriter.h"
+#include "postmaster/startup.h"
+#include "replication/walreceiver.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/pmsignal.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/spin.h"
+#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/ps_status.h"
+#include "utils/pg_rusage.h"
+
+/* Unsupported old recovery command file names (relative to $PGDATA) */
+#define RECOVERY_COMMAND_FILE "recovery.conf"
+#define RECOVERY_COMMAND_DONE "recovery.done"
+
+/*
+ * GUC support
+ */
+const struct config_enum_entry recovery_target_action_options[] = {
+ {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
+ {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
+ {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
+ {NULL, 0, false}
+};
+
+/* options formerly taken from recovery.conf for archive recovery */
+char *recoveryRestoreCommand = NULL;
+char *recoveryEndCommand = NULL;
+char *archiveCleanupCommand = NULL;
+RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
+bool recoveryTargetInclusive = true;
+int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
+TransactionId recoveryTargetXid;
+char *recovery_target_time_string;
+TimestampTz recoveryTargetTime;
+const char *recoveryTargetName;
+XLogRecPtr recoveryTargetLSN;
+int recovery_min_apply_delay = 0;
+
+/* options formerly taken from recovery.conf for XLOG streaming */
+char *PrimaryConnInfo = NULL;
+char *PrimarySlotName = NULL;
+char *PromoteTriggerFile = NULL;
+bool wal_receiver_create_temp_slot = false;
+
+/*
+ * recoveryTargetTimeLineGoal: what the user requested, if any
+ *
+ * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
+ *
+ * recoveryTargetTLI: the currently understood target timeline; changes
+ *
+ * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
+ * the timelines of its known parents, newest first (so recoveryTargetTLI is
+ * always the first list member). Only these TLIs are expected to be seen in
+ * the WAL segments we read, and indeed only these TLIs will be considered as
+ * candidate WAL files to open at all.
+ *
+ * curFileTLI: the TLI appearing in the name of the current input WAL file.
+ * (This is not necessarily the same as the timeline from which we are
+ * replaying WAL, which StartupXLOG calls replayTLI, because we could be
+ * scanning data that was copied from an ancestor timeline when the current
+ * file was created.) During a sequential scan we do not allow this value
+ * to decrease.
+ */
+RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
+TimeLineID recoveryTargetTLIRequested = 0;
+TimeLineID recoveryTargetTLI = 0;
+static List *expectedTLEs;
+static TimeLineID curFileTLI;
+
+/*
+ * When ArchiveRecoveryRequested is set, archive recovery was requested,
+ * i.e. signal files were present. When InArchiveRecovery is set, we are
+ * currently recovering using offline XLOG archives. These variables are only
+ * valid in the startup process.
+ *
+ * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
+ * currently performing crash recovery using only XLOG files in pg_wal, but
+ * will switch to using offline XLOG archives as soon as we reach the end of
+ * WAL in pg_wal.
+ */
+bool ArchiveRecoveryRequested = false;
+bool InArchiveRecovery = false;
+
+/*
+ * When StandbyModeRequested is set, standby mode was requested, i.e.
+ * standby.signal file was present. When StandbyMode is set, we are currently
+ * in standby mode. These variables are only valid in the startup process.
+ * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
+ */
+static bool StandbyModeRequested = false;
+bool StandbyMode = false;
+
+/* was a signal file present at startup? */
+static bool standby_signal_file_found = false;
+static bool recovery_signal_file_found = false;
+
+/*
+ * CheckPointLoc is the position of the checkpoint record that determines
+ * where to start the replay. It comes from the backup label file or the
+ * control file.
+ *
+ * RedoStartLSN is the checkpoint's REDO location, also from the backup label
+ * file or the control file. In standby mode, XLOG streaming usually starts
+ * from the position where an invalid record was found. But if we fail to
+ * read even the initial checkpoint record, we use the REDO location instead
+ * of the checkpoint location as the start position of XLOG streaming.
+ * Otherwise we would have to jump backwards to the REDO location after
+ * reading the checkpoint record, because the REDO record can precede the
+ * checkpoint record.
+ */
+static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr;
+static TimeLineID CheckPointTLI = 0;
+static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
+static TimeLineID RedoStartTLI = 0;
+
+/*
+ * Local copy of SharedHotStandbyActive variable. False actually means "not
+ * known, need to check the shared state".
+ */
+static bool LocalHotStandbyActive = false;
+
+/*
+ * Local copy of SharedPromoteIsTriggered variable. False actually means "not
+ * known, need to check the shared state".
+ */
+static bool LocalPromoteIsTriggered = false;
+
+/* Has the recovery code requested a walreceiver wakeup? */
+static bool doRequestWalReceiverReply;
+
+/* XLogReader object used to parse the WAL records */
+static XLogReaderState *xlogreader = NULL;
+
+/* XLogPrefetcher object used to consume WAL records with read-ahead */
+static XLogPrefetcher *xlogprefetcher = NULL;
+
+/* Parameters passed down from ReadRecord to the XLogPageRead callback. */
+typedef struct XLogPageReadPrivate
+{
+ int emode;
+ bool fetching_ckpt; /* are we fetching a checkpoint record? */
+ bool randAccess;
+ TimeLineID replayTLI;
+} XLogPageReadPrivate;
+
+/* flag to tell XLogPageRead that we have started replaying */
+static bool InRedo = false;
+
+/*
+ * Codes indicating where we got a WAL file from during recovery, or where
+ * to attempt to get one.
+ */
+typedef enum
+{
+ XLOG_FROM_ANY = 0, /* request to read WAL from any source */
+ XLOG_FROM_ARCHIVE, /* restored using restore_command */
+ XLOG_FROM_PG_WAL, /* existing file in pg_wal */
+ XLOG_FROM_STREAM /* streamed from primary */
+} XLogSource;
+
+/* human-readable names for XLogSources, for debugging output */
+static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
+
+/*
+ * readFile is -1 or a kernel FD for the log file segment that's currently
+ * open for reading. readSegNo identifies the segment. readOff is the offset
+ * of the page just read, readLen indicates how much of it has been read into
+ * readBuf, and readSource indicates where we got the currently open file from.
+ *
+ * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
+ * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
+ * worthwhile, since the XLOG is not read by general-purpose sessions.
+ */
+static int readFile = -1;
+static XLogSegNo readSegNo = 0;
+static uint32 readOff = 0;
+static uint32 readLen = 0;
+static XLogSource readSource = XLOG_FROM_ANY;
+
+/*
+ * Keeps track of which source we're currently reading from. This is
+ * different from readSource in that this is always set, even when we don't
+ * currently have a WAL file open. If lastSourceFailed is set, our last
+ * attempt to read from currentSource failed, and we should try another source
+ * next.
+ *
+ * pendingWalRcvRestart is set when a config change occurs that requires a
+ * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
+ */
+static XLogSource currentSource = XLOG_FROM_ANY;
+static bool lastSourceFailed = false;
+static bool pendingWalRcvRestart = false;
+
+/*
+ * These variables track when we last obtained some WAL data to process,
+ * and where we got it from. (XLogReceiptSource is initially the same as
+ * readSource, but readSource gets reset to zero when we don't have data
+ * to process right now. It is also different from currentSource, which
+ * also changes when we try to read from a source and fail, while
+ * XLogReceiptSource tracks where we last successfully read some WAL.)
+ */
+static TimestampTz XLogReceiptTime = 0;
+static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
+
+/* Local copy of WalRcv->flushedUpto */
+static XLogRecPtr flushedUpto = 0;
+static TimeLineID receiveTLI = 0;
+
+/*
+ * Copy of minRecoveryPoint and backupEndPoint from the control file.
+ *
+ * In order to reach consistency, we must replay the WAL up to
+ * minRecoveryPoint. If backupEndRequired is true, we must also reach
+ * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
+ * to backupStartPoint.
+ *
+ * Note: In archive recovery, after consistency has been reached, the
+ * functions in xlog.c will start updating minRecoveryPoint in the control
+ * file. But this copy of minRecoveryPoint variable reflects the value at the
+ * beginning of recovery, and is *not* updated after consistency is reached.
+ */
+static XLogRecPtr minRecoveryPoint;
+static TimeLineID minRecoveryPointTLI;
+
+static XLogRecPtr backupStartPoint;
+static XLogRecPtr backupEndPoint;
+static bool backupEndRequired = false;
+
+/*
+ * Have we reached a consistent database state? In crash recovery, we have
+ * to replay all the WAL, so reachedConsistency is never set. During archive
+ * recovery, the database is consistent once minRecoveryPoint is reached.
+ *
+ * Consistent state means that the system is internally consistent, all
+ * the WAL has been replayed up to a certain point, and importantly, there
+ * is no trace of later actions on disk.
+ */
+bool reachedConsistency = false;
+
+/* BLCKSZ-sized buffers dedicated to WAL consistency checks */
+static char *replay_image_masked = NULL;
+static char *primary_image_masked = NULL;
+
+
+/*
+ * Shared-memory state for WAL recovery.
+ */
+typedef struct XLogRecoveryCtlData
+{
+ /*
+ * SharedHotStandbyActive indicates if we allow hot standby queries to be
+ * run. Protected by info_lck.
+ */
+ bool SharedHotStandbyActive;
+
+ /*
+ * SharedPromoteIsTriggered indicates if a standby promotion has been
+ * triggered. Protected by info_lck.
+ */
+ bool SharedPromoteIsTriggered;
+
+ /*
+ * recoveryWakeupLatch is used to wake up the startup process to continue
+ * WAL replay, if it is waiting for WAL to arrive or failover trigger file
+ * to appear.
+ *
+ * Note that the startup process also uses another latch, its procLatch,
+ * to wait for recovery conflicts. It might seem better to get rid of
+ * recoveryWakeupLatch and signal the startup process via its procLatch
+ * instead, which would comport better with possible generic signal
+ * handlers using that latch. But we should not do that, because the
+ * startup process doesn't expect to be woken up by the walreceiver
+ * process or a SIGHUP signal handler while it's waiting for a recovery
+ * conflict. The separate latches, recoveryWakeupLatch and procLatch,
+ * should be used for inter-process communication for WAL replay and
+ * recovery conflicts, respectively.
+ */
+ Latch recoveryWakeupLatch;
+
+ /*
+ * Last record successfully replayed.
+ */
+ XLogRecPtr lastReplayedReadRecPtr; /* start position */
+ XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
+ TimeLineID lastReplayedTLI; /* timeline */
+
+ /*
+ * When we're currently replaying a record, ie. in a redo function,
+ * replayEndRecPtr points to the end+1 of the record being replayed,
+ * otherwise it's equal to lastReplayedEndRecPtr.
+ */
+ XLogRecPtr replayEndRecPtr;
+ TimeLineID replayEndTLI;
+ /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
+ TimestampTz recoveryLastXTime;
+
+ /*
+ * timestamp of when we started replaying the current chunk of WAL data,
+ * only relevant for replication or archive recovery
+ */
+ TimestampTz currentChunkStartTime;
+ /* Recovery pause state */
+ RecoveryPauseState recoveryPauseState;
+ ConditionVariable recoveryNotPausedCV;
+
+ slock_t info_lck; /* locks shared variables shown above */
+} XLogRecoveryCtlData;
+
+static XLogRecoveryCtlData *XLogRecoveryCtl = NULL;
+
+/*
+ * abortedRecPtr is the start pointer of a broken record at end of WAL when
+ * recovery completes; missingContrecPtr is the location of the first
+ * contrecord that went missing. See CreateOverwriteContrecordRecord for
+ * details.
+ */
+static XLogRecPtr abortedRecPtr;
+static XLogRecPtr missingContrecPtr;
+
+/*
+ * If recoveryStopsBefore/After returns true, it saves information about
+ * the stop point here.
+ */
+static TransactionId recoveryStopXid;
+static TimestampTz recoveryStopTime;
+static XLogRecPtr recoveryStopLSN;
+static char recoveryStopName[MAXFNAMELEN];
+static bool recoveryStopAfter;
+
+/* prototypes for local functions */
+static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
+
+static void EnableStandbyMode(void);
+static void readRecoverySignalFile(void);
+static void validateRecoveryParameters(void);
+static bool read_backup_label(XLogRecPtr *checkPointLoc,
+ TimeLineID *backupLabelTLI,
+ bool *backupEndRequired, bool *backupFromStandby);
+static bool read_tablespace_map(List **tablespaces);
+
+static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
+static void CheckRecoveryConsistency(void);
+static void rm_redo_error_callback(void *arg);
+#ifdef WAL_DEBUG
+static void xlog_outrec(StringInfo buf, XLogReaderState *record);
+#endif
+static void xlog_block_info(StringInfo buf, XLogReaderState *record);
+static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
+ TimeLineID prevTLI, TimeLineID replayTLI);
+static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
+static void verifyBackupPageConsistency(XLogReaderState *record);
+
+static bool recoveryStopsBefore(XLogReaderState *record);
+static bool recoveryStopsAfter(XLogReaderState *record);
+static char *getRecoveryStopReason(void);
+static void recoveryPausesHere(bool endOfRecovery);
+static bool recoveryApplyDelay(XLogReaderState *record);
+static void ConfirmRecoveryPaused(void);
+
+static XLogRecord *ReadRecord(XLogPrefetcher *xlogprefetcher,
+ int emode, bool fetching_ckpt,
+ TimeLineID replayTLI);
+
+static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
+ int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
+static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,
+ bool randAccess,
+ bool fetching_ckpt,
+ XLogRecPtr tliRecPtr,
+ TimeLineID replayTLI,
+ XLogRecPtr replayLSN,
+ bool nonblocking);
+static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
+static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
+ int whichChkpt, bool report, TimeLineID replayTLI);
+static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
+static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
+ XLogSource source, bool notfoundOk);
+static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source);
+
+static bool CheckForStandbyTrigger(void);
+static void SetPromoteIsTriggered(void);
+static bool HotStandbyActiveInReplay(void);
+
+static void SetCurrentChunkStartTime(TimestampTz xtime);
+static void SetLatestXTime(TimestampTz xtime);
+
+/*
+ * Initialization of shared memory for WAL recovery
+ */
+Size
+XLogRecoveryShmemSize(void)
+{
+ Size size;
+
+ /* XLogRecoveryCtl */
+ size = sizeof(XLogRecoveryCtlData);
+
+ return size;
+}
+
+void
+XLogRecoveryShmemInit(void)
+{
+ bool found;
+
+ XLogRecoveryCtl = (XLogRecoveryCtlData *)
+ ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
+ if (found)
+ return;
+ memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
+
+ SpinLockInit(&XLogRecoveryCtl->info_lck);
+ InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
+ ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV);
+}
+
+/*
+ * A thin wrapper to enable StandbyMode and do other preparatory work as
+ * needed.
+ */
+static void
+EnableStandbyMode(void)
+{
+ StandbyMode = true;
+
+ /*
+ * To avoid server log bloat, we don't report recovery progress in a
+ * standby as it will always be in recovery unless promoted. We disable
+ * startup progress timeout in standby mode to avoid calling
+ * startup_progress_timeout_handler() unnecessarily.
+ */
+ disable_startup_progress_timeout();
+}
+
+/*
+ * Prepare the system for WAL recovery, if needed.
+ *
+ * This is called by StartupXLOG() which coordinates the server startup
+ * sequence. This function analyzes the control file and the backup label
+ * file, if any, and figures out whether we need to perform crash recovery or
+ * archive recovery, and how far we need to replay the WAL to reach a
+ * consistent state.
+ *
+ * This doesn't yet change the on-disk state, except for creating the symlinks
+ * from the tablespace map file if any, and for fetching WAL files needed to
+ * find the checkpoint record. On entry, the caller has already read the
+ * control file into memory, and passes it as argument. This function updates
+ * it to reflect the recovery state, and the caller is expected to write it
+ * back to disk after initializing other subsystems, but before calling
+ * PerformWalRecovery().
+ *
+ * This initializes some global variables like ArchiveRecoveryRequested,
+ * StandbyModeRequested and InRecovery.
+ */
+void
+InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
+ bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
+{
+ XLogPageReadPrivate *private;
+ struct stat st;
+ bool wasShutdown;
+ XLogRecord *record;
+ DBState dbstate_at_startup;
+ bool haveTblspcMap = false;
+ bool haveBackupLabel = false;
+ CheckPoint checkPoint;
+ bool backupFromStandby = false;
+
+ dbstate_at_startup = ControlFile->state;
+
+ /*
+ * Initialize on the assumption we want to recover to the latest timeline
+ * that's active according to pg_control.
+ */
+ if (ControlFile->minRecoveryPointTLI >
+ ControlFile->checkPointCopy.ThisTimeLineID)
+ recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
+ else
+ recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
+
+ /*
+ * Check for signal files, and if so set up state for offline recovery
+ */
+ readRecoverySignalFile();
+ validateRecoveryParameters();
+
+ if (ArchiveRecoveryRequested)
+ {
+ if (StandbyModeRequested)
+ ereport(LOG,
+ (errmsg("entering standby mode")));
+ else if (recoveryTarget == RECOVERY_TARGET_XID)
+ ereport(LOG,
+ (errmsg("starting point-in-time recovery to XID %u",
+ recoveryTargetXid)));
+ else if (recoveryTarget == RECOVERY_TARGET_TIME)
+ ereport(LOG,
+ (errmsg("starting point-in-time recovery to %s",
+ timestamptz_to_str(recoveryTargetTime))));
+ else if (recoveryTarget == RECOVERY_TARGET_NAME)
+ ereport(LOG,
+ (errmsg("starting point-in-time recovery to \"%s\"",
+ recoveryTargetName)));
+ else if (recoveryTarget == RECOVERY_TARGET_LSN)
+ ereport(LOG,
+ (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
+ LSN_FORMAT_ARGS(recoveryTargetLSN))));
+ else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
+ ereport(LOG,
+ (errmsg("starting point-in-time recovery to earliest consistent point")));
+ else
+ ereport(LOG,
+ (errmsg("starting archive recovery")));
+ }
+
+ /*
+ * Take ownership of the wakeup latch if we're going to sleep during
+ * recovery.
+ */
+ if (ArchiveRecoveryRequested)
+ OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
+
+ private = palloc0(sizeof(XLogPageReadPrivate));
+ xlogreader =
+ XLogReaderAllocate(wal_segment_size, NULL,
+ XL_ROUTINE(.page_read = &XLogPageRead,
+ .segment_open = NULL,
+ .segment_close = wal_segment_close),
+ private);
+ if (!xlogreader)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed while allocating a WAL reading processor.")));
+ xlogreader->system_identifier = ControlFile->system_identifier;
+
+ /*
+ * Set the WAL decode buffer size. This limits how far ahead we can read
+ * in the WAL.
+ */
+ XLogReaderSetDecodeBuffer(xlogreader, NULL, wal_decode_buffer_size);
+
+ /* Create a WAL prefetcher. */
+ xlogprefetcher = XLogPrefetcherAllocate(xlogreader);
+
+ /*
+ * Allocate two page buffers dedicated to WAL consistency checks. We do
+ * it this way, rather than just making static arrays, for two reasons:
+ * (1) no need to waste the storage in most instantiations of the backend;
+ * (2) a static char array isn't guaranteed to have any particular
+ * alignment, whereas palloc() will provide MAXALIGN'd storage.
+ */
+ replay_image_masked = (char *) palloc(BLCKSZ);
+ primary_image_masked = (char *) palloc(BLCKSZ);
+
+ if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired,
+ &backupFromStandby))
+ {
+ List *tablespaces = NIL;
+
+ /*
+ * Archive recovery was requested, and thanks to the backup label
+ * file, we know how far we need to replay to reach consistency. Enter
+ * archive recovery directly.
+ */
+ InArchiveRecovery = true;
+ if (StandbyModeRequested)
+ EnableStandbyMode();
+
+ /*
+ * When a backup_label file is present, we want to roll forward from
+ * the checkpoint it identifies, rather than using pg_control.
+ */
+ record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, 0, true,
+ CheckPointTLI);
+ if (record != NULL)
+ {
+ memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
+ wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
+ ereport(DEBUG1,
+ (errmsg_internal("checkpoint record is at %X/%X",
+ LSN_FORMAT_ARGS(CheckPointLoc))));
+ InRecovery = true; /* force recovery even if SHUTDOWNED */
+
+ /*
+ * Make sure that REDO location exists. This may not be the case
+ * if there was a crash during an online backup, which left a
+ * backup_label around that references a WAL segment that's
+ * already been archived.
+ */
+ if (checkPoint.redo < CheckPointLoc)
+ {
+ XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
+ if (!ReadRecord(xlogprefetcher, LOG, false,
+ checkPoint.ThisTimeLineID))
+ ereport(FATAL,
+ (errmsg("could not find redo location referenced by checkpoint record"),
+ errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
+ "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
+ "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
+ DataDir, DataDir, DataDir)));
+ }
+ }
+ else
+ {
+ ereport(FATAL,
+ (errmsg("could not locate required checkpoint record"),
+ errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
+ "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
+ "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
+ DataDir, DataDir, DataDir)));
+ wasShutdown = false; /* keep compiler quiet */
+ }
+
+ /* Read the tablespace_map file if present and create symlinks. */
+ if (read_tablespace_map(&tablespaces))
+ {
+ ListCell *lc;
+
+ foreach(lc, tablespaces)
+ {
+ tablespaceinfo *ti = lfirst(lc);
+ char *linkloc;
+
+ linkloc = psprintf("pg_tblspc/%s", ti->oid);
+
+ /*
+ * Remove the existing symlink, if any, and create the symlink
+ * under PGDATA.
+ */
+ remove_tablespace_symlink(linkloc);
+
+ if (symlink(ti->path, linkloc) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create symbolic link \"%s\": %m",
+ linkloc)));
+
+ pfree(ti->oid);
+ pfree(ti->path);
+ pfree(ti);
+ }
+
+ /* tell the caller to delete it later */
+ haveTblspcMap = true;
+ }
+
+ /* tell the caller to delete it later */
+ haveBackupLabel = true;
+ }
+ else
+ {
+ /*
+ * If a tablespace_map file is present without a backup_label file, it
+ * serves no purpose. There is no harm in retaining it, but it is better
+ * to get rid of the map file so that the data directory contains no
+ * redundant file and no source of confusion. It seems prudent, though,
+ * to just rename the file out of the way rather than delete it
+ * completely; we also ignore any error from the rename operation, since
+ * a stray map file without a backup_label file is harmless anyway.
+ */
+ if (stat(TABLESPACE_MAP, &st) == 0)
+ {
+ unlink(TABLESPACE_MAP_OLD);
+ if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
+ ereport(LOG,
+ (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
+ TABLESPACE_MAP, BACKUP_LABEL_FILE),
+ errdetail("File \"%s\" was renamed to \"%s\".",
+ TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
+ else
+ ereport(LOG,
+ (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
+ TABLESPACE_MAP, BACKUP_LABEL_FILE),
+ errdetail("Could not rename file \"%s\" to \"%s\": %m.",
+ TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
+ }
+
+ /*
+ * It's possible that archive recovery was requested, but we don't
+ * know how far we need to replay the WAL before we reach consistency.
+ * This can happen for example if a base backup is taken from a
+ * running server using an atomic filesystem snapshot, without calling
+ * pg_backup_start/stop. Or if you just kill a running primary server
+ * and put it into archive recovery by creating a recovery signal
+ * file.
+ *
+ * Our strategy in that case is to perform crash recovery first,
+ * replaying all the WAL present in pg_wal, and only enter archive
+ * recovery after that.
+ *
+ * But usually we already know how far we need to replay the WAL (up
+ * to minRecoveryPoint, up to backupEndPoint, or until we see an
+ * end-of-backup record), and we can enter archive recovery directly.
+ */
+ if (ArchiveRecoveryRequested &&
+ (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
+ ControlFile->backupEndRequired ||
+ ControlFile->backupEndPoint != InvalidXLogRecPtr ||
+ ControlFile->state == DB_SHUTDOWNED))
+ {
+ InArchiveRecovery = true;
+ if (StandbyModeRequested)
+ EnableStandbyMode();
+ }
+
+ /* Get the last valid checkpoint record. */
+ CheckPointLoc = ControlFile->checkPoint;
+ CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
+ RedoStartLSN = ControlFile->checkPointCopy.redo;
+ RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
+ record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, 1, true,
+ CheckPointTLI);
+ if (record != NULL)
+ {
+ ereport(DEBUG1,
+ (errmsg_internal("checkpoint record is at %X/%X",
+ LSN_FORMAT_ARGS(CheckPointLoc))));
+ }
+ else
+ {
+ /*
+ * We used to attempt to go back to a secondary checkpoint record
+ * here, but only when not in standby mode. We now just fail if we
+ * can't read the last checkpoint because this allows us to
+ * simplify processing around checkpoints.
+ */
+ ereport(PANIC,
+ (errmsg("could not locate a valid checkpoint record")));
+ }
+ memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
+ wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
+ }
+
+ /*
+ * If the location of the checkpoint record is not on the expected
+ * timeline in the history of the requested timeline, we cannot proceed:
+ * the backup is not part of the history of the requested timeline.
+ */
+ Assert(expectedTLEs); /* was initialized by reading checkpoint
+ * record */
+ if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) !=
+ CheckPointTLI)
+ {
+ XLogRecPtr switchpoint;
+
+ /*
+ * tliSwitchPoint will throw an error if the checkpoint's timeline is
+ * not in expectedTLEs at all.
+ */
+ switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
+ ereport(FATAL,
+ (errmsg("requested timeline %u is not a child of this server's history",
+ recoveryTargetTLI),
+ errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
+ LSN_FORMAT_ARGS(ControlFile->checkPoint),
+ ControlFile->checkPointCopy.ThisTimeLineID,
+ LSN_FORMAT_ARGS(switchpoint))));
+ }
+
+ /*
+ * The min recovery point should be part of the requested timeline's
+ * history, too.
+ */
+ if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
+ tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
+ ControlFile->minRecoveryPointTLI)
+ ereport(FATAL,
+ (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
+ recoveryTargetTLI,
+ LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
+ ControlFile->minRecoveryPointTLI)));
+
+ ereport(DEBUG1,
+ (errmsg_internal("redo record is at %X/%X; shutdown %s",
+ LSN_FORMAT_ARGS(checkPoint.redo),
+ wasShutdown ? "true" : "false")));
+ ereport(DEBUG1,
+ (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
+ U64FromFullTransactionId(checkPoint.nextXid),
+ checkPoint.nextOid)));
+ ereport(DEBUG1,
+ (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
+ checkPoint.nextMulti, checkPoint.nextMultiOffset)));
+ ereport(DEBUG1,
+ (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
+ checkPoint.oldestXid, checkPoint.oldestXidDB)));
+ ereport(DEBUG1,
+ (errmsg_internal("oldest MultiXactId: %u, in database %u",
+ checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
+ ereport(DEBUG1,
+ (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
+ checkPoint.oldestCommitTsXid,
+ checkPoint.newestCommitTsXid)));
+ if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
+ ereport(PANIC,
+ (errmsg("invalid next transaction ID")));
+
+ /* sanity check */
+ if (checkPoint.redo > CheckPointLoc)
+ ereport(PANIC,
+ (errmsg("invalid redo in checkpoint record")));
+
+ /*
+ * Check whether we need to force recovery from WAL. If it appears to
+ * have been a clean shutdown and we did not have a recovery signal file,
+ * then assume no recovery needed.
+ */
+ if (checkPoint.redo < CheckPointLoc)
+ {
+ if (wasShutdown)
+ ereport(PANIC,
+ (errmsg("invalid redo record in shutdown checkpoint")));
+ InRecovery = true;
+ }
+ else if (ControlFile->state != DB_SHUTDOWNED)
+ InRecovery = true;
+ else if (ArchiveRecoveryRequested)
+ {
+ /* force recovery due to presence of recovery signal file */
+ InRecovery = true;
+ }
+
+ /*
+ * If recovery is needed, update our in-memory copy of pg_control to show
+ * that we are recovering and to show the selected checkpoint as the place
+ * we are starting from. We also mark pg_control with any minimum recovery
+ * stop point obtained from a backup history file.
+ *
+ * We don't write the changes to disk yet, though. Only do that after
+ * initializing various subsystems.
+ */
+ if (InRecovery)
+ {
+ if (InArchiveRecovery)
+ {
+ ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
+ }
+ else
+ {
+ ereport(LOG,
+ (errmsg("database system was not properly shut down; "
+ "automatic recovery in progress")));
+ if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
+ ereport(LOG,
+ (errmsg("crash recovery starts in timeline %u "
+ "and has target timeline %u",
+ ControlFile->checkPointCopy.ThisTimeLineID,
+ recoveryTargetTLI)));
+ ControlFile->state = DB_IN_CRASH_RECOVERY;
+ }
+ ControlFile->checkPoint = CheckPointLoc;
+ ControlFile->checkPointCopy = checkPoint;
+ if (InArchiveRecovery)
+ {
+ /* initialize minRecoveryPoint if not set yet */
+ if (ControlFile->minRecoveryPoint < checkPoint.redo)
+ {
+ ControlFile->minRecoveryPoint = checkPoint.redo;
+ ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
+ }
+ }
+
+ /*
+ * Set backupStartPoint if we're starting recovery from a base backup.
+ *
+ * Also set backupEndPoint and use minRecoveryPoint as the backup end
+ * location if we're starting recovery from a base backup which was
+ * taken from a standby. In this case, the database system status in
+ * pg_control must indicate that the database was already in recovery.
+ * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
+ * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
+ * before reaching this point; e.g. because restore_command or
+ * primary_conninfo were faulty.
+ *
+ * Any other state indicates that the backup somehow became corrupted
+ * and we can't sensibly continue with recovery.
+ */
+ if (haveBackupLabel)
+ {
+ ControlFile->backupStartPoint = checkPoint.redo;
+ ControlFile->backupEndRequired = backupEndRequired;
+
+ if (backupFromStandby)
+ {
+ if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
+ dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
+ ereport(FATAL,
+ (errmsg("backup_label contains data inconsistent with control file"),
+ errhint("This means that the backup is corrupted and you will "
+ "have to use another backup for recovery.")));
+ ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
+ }
+ }
+ }
+
+ /* remember these, so that we know when we have reached consistency */
+ backupStartPoint = ControlFile->backupStartPoint;
+ backupEndRequired = ControlFile->backupEndRequired;
+ backupEndPoint = ControlFile->backupEndPoint;
+ if (InArchiveRecovery)
+ {
+ minRecoveryPoint = ControlFile->minRecoveryPoint;
+ minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+ }
+ else
+ {
+ minRecoveryPoint = InvalidXLogRecPtr;
+ minRecoveryPointTLI = 0;
+ }
+
+ /*
+ * Start recovery assuming that the final record isn't lost.
+ */
+ abortedRecPtr = InvalidXLogRecPtr;
+ missingContrecPtr = InvalidXLogRecPtr;
+
+ *wasShutdown_ptr = wasShutdown;
+ *haveBackupLabel_ptr = haveBackupLabel;
+ *haveTblspcMap_ptr = haveTblspcMap;
+}
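+
+/*
+ * A rough summary of the start-point decision made above (illustrative,
+ * not exhaustive):
+ *
+ *    backup_label present -> roll forward from the checkpoint it names and
+ *                            enter archive recovery directly
+ *    no backup_label      -> use pg_control's checkpoint; enter archive
+ *                            recovery directly only if pg_control shows a
+ *                            known consistency point or a clean shutdown,
+ *                            else perform crash recovery first
+ */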
+
+/*
+ * See if there are any recovery signal files and if so, set state for
+ * recovery.
+ *
+ * See if there is a recovery command file (recovery.conf), and if so
+ * throw an ERROR since as of PG12 we no longer recognize that.
+ */
+static void
+readRecoverySignalFile(void)
+{
+ struct stat stat_buf;
+
+ if (IsBootstrapProcessingMode())
+ return;
+
+ /*
+ * Check for old recovery API file: recovery.conf
+ */
+ if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("using recovery command file \"%s\" is not supported",
+ RECOVERY_COMMAND_FILE)));
+
+ /*
+ * Remove unused .done file, if present. Ignore if absent.
+ */
+ unlink(RECOVERY_COMMAND_DONE);
+
+ /*
+ * Check for recovery signal files and if found, fsync them since they
+ * represent server state information. We don't sweat too much about the
+ * possibility of fsync failure, however.
+ *
+ * If both files are present, the standby signal file takes precedence.
+ * If neither is present, we won't enter archive recovery.
+ */
+ if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
+ {
+ int fd;
+
+ fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY,
+ S_IRUSR | S_IWUSR);
+ if (fd >= 0)
+ {
+ (void) pg_fsync(fd);
+ close(fd);
+ }
+ standby_signal_file_found = true;
+ }
+ else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
+ {
+ int fd;
+
+ fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY,
+ S_IRUSR | S_IWUSR);
+ if (fd >= 0)
+ {
+ (void) pg_fsync(fd);
+ close(fd);
+ }
+ recovery_signal_file_found = true;
+ }
+
+ StandbyModeRequested = false;
+ ArchiveRecoveryRequested = false;
+ if (standby_signal_file_found)
+ {
+ StandbyModeRequested = true;
+ ArchiveRecoveryRequested = true;
+ }
+ else if (recovery_signal_file_found)
+ {
+ StandbyModeRequested = false;
+ ArchiveRecoveryRequested = true;
+ }
+ else
+ return;
+
+ /*
+ * We don't support standby mode in standalone backends; that requires
+ * other processes such as the WAL receiver to be alive.
+ */
+ if (StandbyModeRequested && !IsUnderPostmaster)
+ ereport(FATAL,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("standby mode is not supported by single-user servers")));
+}
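+
+/*
+ * To summarize the effect of the signal files checked above:
+ *
+ *    standby.signal present  -> StandbyModeRequested and
+ *                               ArchiveRecoveryRequested are both set
+ *    recovery.signal only    -> ArchiveRecoveryRequested is set
+ *    neither file present    -> archive recovery is not requested
+ */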
+
+static void
+validateRecoveryParameters(void)
+{
+ if (!ArchiveRecoveryRequested)
+ return;
+
+ /*
+ * Check for compulsory parameters
+ */
+ if (StandbyModeRequested)
+ {
+ if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
+ (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
+ ereport(WARNING,
+ (errmsg("specified neither primary_conninfo nor restore_command"),
+ errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
+ }
+ else
+ {
+ if (recoveryRestoreCommand == NULL ||
+ strcmp(recoveryRestoreCommand, "") == 0)
+ ereport(FATAL,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("must specify restore_command when standby mode is not enabled")));
+ }
+
+ /*
+ * Override any inconsistent requests. Note that this is a change of
+ * behaviour in 9.5; prior to this we simply ignored a request to pause if
+ * hot_standby = off, which was surprising behaviour.
+ */
+ if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
+ !EnableHotStandby)
+ recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
+
+ /*
+ * Final parsing of recovery_target_time string; see also
+ * check_recovery_target_time().
+ */
+ if (recoveryTarget == RECOVERY_TARGET_TIME)
+ {
+ recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
+ CStringGetDatum(recovery_target_time_string),
+ ObjectIdGetDatum(InvalidOid),
+ Int32GetDatum(-1)));
+ }
+
+ /*
+ * If user specified recovery_target_timeline, validate it or compute the
+ * "latest" value. We can't do this until after we've gotten the restore
+ * command and set InArchiveRecovery, because we need to fetch timeline
+ * history files from the archive.
+ */
+ if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
+ {
+ TimeLineID rtli = recoveryTargetTLIRequested;
+
+ /* Timeline 1 does not have a history file; all others should */
+ if (rtli != 1 && !existsTimeLineHistory(rtli))
+ ereport(FATAL,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("recovery target timeline %u does not exist",
+ rtli)));
+ recoveryTargetTLI = rtli;
+ }
+ else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
+ {
+ /* We start the "latest" search from pg_control's timeline */
+ recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
+ }
+ else
+ {
+ /*
+ * else we just use the recoveryTargetTLI as already read from
+ * ControlFile
+ */
+ Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
+ }
+}
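+
+/*
+ * For illustration only: a parameter set that satisfies the checks above
+ * for point-in-time recovery without standby mode might look like this in
+ * postgresql.conf (all values are examples, not recommendations):
+ *
+ *    restore_command = 'cp /mnt/server/archivedir/%f "%p"'
+ *    recovery_target_time = '2024-01-01 12:00:00+00'
+ *    recovery_target_action = 'promote'
+ *
+ * together with a recovery.signal file in the data directory.
+ */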
+
+/*
+ * read_backup_label: check to see if a backup_label file is present
+ *
+ * If we see a backup_label during recovery, we assume that we are recovering
+ * from a backup dump file, and we therefore roll forward from the checkpoint
+ * identified by the label file, NOT what pg_control says. This avoids the
+ * problem that pg_control might have been archived one or more checkpoints
+ * later than the start of the dump, and so if we rely on it as the start
+ * point, we will fail to restore a consistent database state.
+ *
+ * Returns true if a backup_label was found (and fills the checkpoint
+ * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
+ * returns false if not. If this backup_label came from a streamed backup,
+ * *backupEndRequired is set to true. If this backup_label was created during
+ * recovery, *backupFromStandby is set to true.
+ *
+ * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
+ * and TLI read from the backup file.
+ */
+static bool
+read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
+ bool *backupEndRequired, bool *backupFromStandby)
+{
+ char startxlogfilename[MAXFNAMELEN];
+ TimeLineID tli_from_walseg,
+ tli_from_file;
+ FILE *lfp;
+ char ch;
+ char backuptype[20];
+ char backupfrom[20];
+ char backuplabel[MAXPGPATH];
+ char backuptime[128];
+ uint32 hi,
+ lo;
+
+ /* suppress possible uninitialized-variable warnings */
+ *checkPointLoc = InvalidXLogRecPtr;
+ *backupLabelTLI = 0;
+ *backupEndRequired = false;
+ *backupFromStandby = false;
+
+ /*
+ * See if label file is present
+ */
+ lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
+ if (!lfp)
+ {
+ if (errno != ENOENT)
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m",
+ BACKUP_LABEL_FILE)));
+ return false; /* it's not there, all is fine */
+ }
+
+ /*
+ * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
+ * is pretty crude, but we are not expecting any variability in the file
+ * format).
+ */
+ if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
+ &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
+ ereport(FATAL,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
+ RedoStartLSN = ((uint64) hi) << 32 | lo;
+ RedoStartTLI = tli_from_walseg;
+ if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
+ &hi, &lo, &ch) != 3 || ch != '\n')
+ ereport(FATAL,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
+ *checkPointLoc = ((uint64) hi) << 32 | lo;
+ *backupLabelTLI = tli_from_walseg;
+
+ /*
+ * BACKUP METHOD lets us know if this was a typical backup ("streamed",
+ * which could mean either pg_basebackup or the pg_backup_start/stop
+ * method was used) or if this label came from somewhere else (the only
+ * other option today being from pg_rewind). If this was a streamed
+ * backup then we know that we need to play through until we get to the
+ * end of the WAL which was generated during the backup (at which point we
+ * will have reached consistency and backupEndRequired will be reset to be
+ * false).
+ */
+ if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
+ {
+ if (strcmp(backuptype, "streamed") == 0)
+ *backupEndRequired = true;
+ }
+
+ /*
+ * BACKUP FROM lets us know if this was from a primary or a standby. If
+ * it was from a standby, we'll double-check that the control file state
+ * matches that of a standby.
+ */
+ if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
+ {
+ if (strcmp(backupfrom, "standby") == 0)
+ *backupFromStandby = true;
+ }
+
+ /*
+	 * Parse START TIME and LABEL. These fields are not mandatory for
+	 * recovery, but checking for their presence is useful for debugging and
+	 * for the sanity checks that follow. Also cope with the fact that the
+	 * result buffers have a pre-allocated size: if the backup_label file was
+	 * generated with strings longer than the maximums assumed here, parsing
+	 * will be incorrect. That's fine, as only minor consistency checks are
+	 * done afterwards.
+ */
+ if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
+ ereport(DEBUG1,
+ (errmsg_internal("backup time %s in file \"%s\"",
+ backuptime, BACKUP_LABEL_FILE)));
+
+ if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
+ ereport(DEBUG1,
+ (errmsg_internal("backup label %s in file \"%s\"",
+ backuplabel, BACKUP_LABEL_FILE)));
+
+ /*
+ * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
+ * it as a sanity check if present.
+ */
+ if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
+ {
+ if (tli_from_walseg != tli_from_file)
+ ereport(FATAL,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
+ errdetail("Timeline ID parsed is %u, but expected %u.",
+ tli_from_file, tli_from_walseg)));
+
+ ereport(DEBUG1,
+ (errmsg_internal("backup timeline %u in file \"%s\"",
+ tli_from_file, BACKUP_LABEL_FILE)));
+ }
+
+ if (ferror(lfp) || FreeFile(lfp))
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m",
+ BACKUP_LABEL_FILE)));
+
+ return true;
+}
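+
+/*
+ * Editorial note (illustrative, not part of the upstream sources): a
+ * typical backup_label file, as parsed by the fscanf() calls above, looks
+ * like this (values are examples only):
+ *
+ *     START WAL LOCATION: 0/2000028 (file 000000010000000000000002)
+ *     CHECKPOINT LOCATION: 0/2000060
+ *     BACKUP METHOD: streamed
+ *     BACKUP FROM: primary
+ *     START TIME: 2023-01-01 12:00:00 UTC
+ *     LABEL: pg_basebackup base backup
+ *     START TIMELINE: 1
+ */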
+
+/*
+ * read_tablespace_map: check to see if a tablespace_map file is present
+ *
+ * If we see a tablespace_map file during recovery, we assume that we are
+ * recovering from a backup dump file, and we therefore need to create symlinks
+ * as per the information present in tablespace_map file.
+ *
+ * Returns true if a tablespace_map file was found (and fills *tablespaces
+ * with a tablespaceinfo struct for each tablespace listed in the file);
+ * returns false if not.
+ */
+static bool
+read_tablespace_map(List **tablespaces)
+{
+ tablespaceinfo *ti;
+ FILE *lfp;
+ char str[MAXPGPATH];
+ int ch,
+ i,
+ n;
+ bool was_backslash;
+
+ /*
+ * See if tablespace_map file is present
+ */
+ lfp = AllocateFile(TABLESPACE_MAP, "r");
+ if (!lfp)
+ {
+ if (errno != ENOENT)
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m",
+ TABLESPACE_MAP)));
+ return false; /* it's not there, all is fine */
+ }
+
+ /*
+ * Read and parse the link name and path lines from tablespace_map file
+ * (this code is pretty crude, but we are not expecting any variability in
+ * the file format). De-escape any backslashes that were inserted.
+ */
+ i = 0;
+ was_backslash = false;
+ while ((ch = fgetc(lfp)) != EOF)
+ {
+ if (!was_backslash && (ch == '\n' || ch == '\r'))
+ {
+ if (i == 0)
+ continue; /* \r immediately followed by \n */
+
+ /*
+ * The de-escaped line should contain an OID followed by exactly
+ * one space followed by a path. The path might start with
+ * spaces, so don't be too liberal about parsing.
+ */
+ str[i] = '\0';
+ n = 0;
+ while (str[n] && str[n] != ' ')
+ n++;
+ if (n < 1 || n >= i - 1)
+ ereport(FATAL,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
+ str[n++] = '\0';
+
+ ti = palloc0(sizeof(tablespaceinfo));
+ ti->oid = pstrdup(str);
+ ti->path = pstrdup(str + n);
+ *tablespaces = lappend(*tablespaces, ti);
+
+ i = 0;
+ continue;
+ }
+ else if (!was_backslash && ch == '\\')
+ was_backslash = true;
+ else
+ {
+ if (i < sizeof(str) - 1)
+ str[i++] = ch;
+ was_backslash = false;
+ }
+ }
+
+ if (i != 0 || was_backslash) /* last line not terminated? */
+ ereport(FATAL,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
+
+ if (ferror(lfp) || FreeFile(lfp))
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m",
+ TABLESPACE_MAP)));
+
+ return true;
+}
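+
+/*
+ * Editorial note (illustrative, not part of the upstream sources): each
+ * tablespace_map line is "<OID> <path>", e.g. (values are examples only):
+ *
+ *     16385 /mnt/fast_ssd/pgdata_ts
+ *
+ * Backslashes escape newlines, carriage returns and backslashes embedded
+ * in the path; the parser above de-escapes them and splits each line at
+ * the first space (OIDs never contain spaces, but paths may).
+ */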
+
+/*
+ * Finish WAL recovery.
+ *
+ * This does not close the 'xlogreader' yet, because in some cases the caller
+ * still wants to re-read the last checkpoint record by calling
+ * ReadCheckpointRecord().
+ *
+ * Returns the position of the last valid or applied record, after which new
+ * WAL should be appended, information about why recovery was ended, and some
+ * other things. See the EndOfWalRecoveryInfo struct for details.
+ */
+EndOfWalRecoveryInfo *
+FinishWalRecovery(void)
+{
+ EndOfWalRecoveryInfo *result = palloc(sizeof(EndOfWalRecoveryInfo));
+ XLogRecPtr lastRec;
+ TimeLineID lastRecTLI;
+ XLogRecPtr endOfLog;
+
+ /*
+ * Kill WAL receiver, if it's still running, before we continue to write
+	 * the startup checkpoint and aborted-contrecord records. If it were
+	 * still alive when we start writing WAL, it could overwrite these
+	 * records and subsequent ones.
+ */
+ XLogShutdownWalRcv();
+
+ /*
+ * We are now done reading the xlog from stream. Turn off streaming
+ * recovery to force fetching the files (which would be required at end of
+ * recovery, e.g., timeline history file) from archive or pg_wal.
+ *
+ * Note that standby mode must be turned off after killing WAL receiver,
+ * i.e., calling XLogShutdownWalRcv().
+ */
+ Assert(!WalRcvStreaming());
+ StandbyMode = false;
+
+ /*
+ * Determine where to start writing WAL next.
+ *
+ * Re-fetch the last valid or last applied record, so we can identify the
+ * exact endpoint of what we consider the valid portion of WAL. There may
+ * be an incomplete continuation record after that, in which case
+ * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
+ * write a special OVERWRITE_CONTRECORD message to mark that the rest of
+ * it is intentionally missing. See CreateOverwriteContrecordRecord().
+ *
+ * An important side-effect of this is to load the last page into
+ * xlogreader. The caller uses it to initialize the WAL for writing.
+ */
+ if (!InRecovery)
+ {
+ lastRec = CheckPointLoc;
+ lastRecTLI = CheckPointTLI;
+ }
+ else
+ {
+ lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr;
+ lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
+ }
+ XLogPrefetcherBeginRead(xlogprefetcher, lastRec);
+ (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
+ endOfLog = xlogreader->EndRecPtr;
+
+ /*
+ * Remember the TLI in the filename of the XLOG segment containing the
+ * end-of-log. It could be different from the timeline that endOfLog
+ * nominally belongs to, if there was a timeline switch in that segment,
+ * and we were reading the old WAL from a segment belonging to a higher
+ * timeline.
+ */
+ result->endOfLogTLI = xlogreader->seg.ws_tli;
+
+ if (ArchiveRecoveryRequested)
+ {
+ /*
+ * We are no longer in archive recovery state.
+ *
+ * We are now done reading the old WAL. Turn off archive fetching if
+ * it was active.
+ */
+ Assert(InArchiveRecovery);
+ InArchiveRecovery = false;
+
+ /*
+ * If the ending log segment is still open, close it (to avoid
+ * problems on Windows with trying to rename or delete an open file).
+ */
+ if (readFile >= 0)
+ {
+ close(readFile);
+ readFile = -1;
+ }
+ }
+
+ /*
+ * Copy the last partial block to the caller, for initializing the WAL
+ * buffer for appending new WAL.
+ */
+ if (endOfLog % XLOG_BLCKSZ != 0)
+ {
+ char *page;
+ int len;
+ XLogRecPtr pageBeginPtr;
+
+ pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
+ Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
+
+ /* Copy the valid part of the last block */
+ len = endOfLog % XLOG_BLCKSZ;
+ page = palloc(len);
+ memcpy(page, xlogreader->readBuf, len);
+
+ result->lastPageBeginPtr = pageBeginPtr;
+ result->lastPage = page;
+ }
+ else
+ {
+ /* There is no partial block to copy. */
+ result->lastPageBeginPtr = endOfLog;
+ result->lastPage = NULL;
+ }
+
+ /*
+ * Create a comment for the history file to explain why and where timeline
+ * changed.
+ */
+ result->recoveryStopReason = getRecoveryStopReason();
+
+ result->lastRec = lastRec;
+ result->lastRecTLI = lastRecTLI;
+ result->endOfLog = endOfLog;
+
+ result->abortedRecPtr = abortedRecPtr;
+ result->missingContrecPtr = missingContrecPtr;
+
+ result->standby_signal_file_found = standby_signal_file_found;
+ result->recovery_signal_file_found = recovery_signal_file_found;
+
+ return result;
+}
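+
+/*
+ * Editorial note (a minimal sketch of the caller's side, not upstream
+ * code): StartupXLOG() in xlog.c consumes the result roughly like this:
+ *
+ *     EndOfWalRecoveryInfo *endOfRecoveryInfo = FinishWalRecovery();
+ *
+ *     // seed the WAL insertion buffers with the last partial page
+ *     if (endOfRecoveryInfo->lastPage)
+ *         memcpy(page_for_wal_buffers, endOfRecoveryInfo->lastPage,
+ *                endOfRecoveryInfo->endOfLog -
+ *                endOfRecoveryInfo->lastPageBeginPtr);
+ *
+ * so that new WAL is appended exactly at endOfLog.
+ */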
+
+/*
+ * Clean up the WAL reader and leftovers from restoring WAL from archive
+ */
+void
+ShutdownWalRecovery(void)
+{
+ char recoveryPath[MAXPGPATH];
+
+ /* Final update of pg_stat_recovery_prefetch. */
+ XLogPrefetcherComputeStats(xlogprefetcher);
+
+ /* Shut down xlogreader */
+ if (readFile >= 0)
+ {
+ close(readFile);
+ readFile = -1;
+ }
+ XLogReaderFree(xlogreader);
+ XLogPrefetcherFree(xlogprefetcher);
+
+ if (ArchiveRecoveryRequested)
+ {
+ /*
+ * Since there might be a partial WAL segment named RECOVERYXLOG, get
+ * rid of it.
+ */
+ snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
+ unlink(recoveryPath); /* ignore any error */
+
+ /* Get rid of any remaining recovered timeline-history file, too */
+ snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
+ unlink(recoveryPath); /* ignore any error */
+ }
+
+ /*
+ * We don't need the latch anymore. It's not strictly necessary to disown
+ * it, but let's do it for the sake of tidiness.
+ */
+ if (ArchiveRecoveryRequested)
+ DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
+}
+
+/*
+ * Perform WAL recovery.
+ *
+ * If the system was shut down cleanly, this is never called.
+ */
+void
+PerformWalRecovery(void)
+{
+ XLogRecord *record;
+ bool reachedRecoveryTarget = false;
+ TimeLineID replayTLI;
+
+ /*
+ * Initialize shared variables for tracking progress of WAL replay, as if
+ * we had just replayed the record before the REDO location (or the
+ * checkpoint record itself, if it's a shutdown checkpoint).
+ */
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ if (RedoStartLSN < CheckPointLoc)
+ {
+ XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr;
+ XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN;
+ XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI;
+ }
+ else
+ {
+ XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
+ XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
+ XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI;
+ }
+ XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
+ XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI;
+ XLogRecoveryCtl->recoveryLastXTime = 0;
+ XLogRecoveryCtl->currentChunkStartTime = 0;
+ XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ /* Also ensure XLogReceiptTime has a sane value */
+ XLogReceiptTime = GetCurrentTimestamp();
+
+ /*
+ * Let postmaster know we've started redo now, so that it can launch the
+ * archiver if necessary.
+ */
+ if (IsUnderPostmaster)
+ SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
+
+ /*
+ * Allow read-only connections immediately if we're consistent already.
+ */
+ CheckRecoveryConsistency();
+
+ /*
+ * Find the first record that logically follows the checkpoint --- it
+ * might physically precede it, though.
+ */
+ if (RedoStartLSN < CheckPointLoc)
+ {
+ /* back up to find the record */
+ replayTLI = RedoStartTLI;
+ XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN);
+ record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
+ }
+ else
+ {
+ /* just have to read next record after CheckPoint */
+ Assert(xlogreader->ReadRecPtr == CheckPointLoc);
+ replayTLI = CheckPointTLI;
+ record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
+ }
+
+ if (record != NULL)
+ {
+ TimestampTz xtime;
+ PGRUsage ru0;
+
+ pg_rusage_init(&ru0);
+
+ InRedo = true;
+
+ RmgrStartup();
+
+ ereport(LOG,
+ (errmsg("redo starts at %X/%X",
+ LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
+
+ /* Prepare to report progress of the redo phase. */
+ if (!StandbyMode)
+ begin_startup_progress_phase();
+
+ /*
+ * main redo apply loop
+ */
+ do
+ {
+ if (!StandbyMode)
+ ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
+ LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));
+
+#ifdef WAL_DEBUG
+ if (XLOG_DEBUG ||
+ (record->xl_rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
+ (record->xl_rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
+ {
+ StringInfoData buf;
+
+ initStringInfo(&buf);
+ appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
+ LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
+ LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
+ xlog_outrec(&buf, xlogreader);
+ appendStringInfoString(&buf, " - ");
+ xlog_outdesc(&buf, xlogreader);
+ elog(LOG, "%s", buf.data);
+ pfree(buf.data);
+ }
+#endif
+
+ /* Handle interrupt signals of startup process */
+ HandleStartupProcInterrupts();
+
+ /*
+ * Pause WAL replay, if requested by a hot-standby session via
+ * SetRecoveryPause().
+ *
+ * Note that we intentionally don't take the info_lck spinlock
+ * here. We might therefore read a slightly stale value of the
+ * recoveryPause flag, but it can't be very stale (no worse than
+ * the last spinlock we did acquire). Since a pause request is a
+ * pretty asynchronous thing anyway, possibly responding to it one
+ * WAL record later than we otherwise would is a minor issue, so
+ * it doesn't seem worth adding another spinlock cycle to prevent
+ * that.
+ */
+ if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
+ RECOVERY_NOT_PAUSED)
+ recoveryPausesHere(false);
+
+ /*
+ * Have we reached our recovery target?
+ */
+ if (recoveryStopsBefore(xlogreader))
+ {
+ reachedRecoveryTarget = true;
+ break;
+ }
+
+ /*
+ * If we've been asked to lag the primary, wait on latch until
+ * enough time has passed.
+ */
+ if (recoveryApplyDelay(xlogreader))
+ {
+ /*
+ * We test for paused recovery again here. If user sets
+ * delayed apply, it may be because they expect to pause
+ * recovery in case of problems, so we must test again here
+ * otherwise pausing during the delay-wait wouldn't work.
+ */
+ if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
+ RECOVERY_NOT_PAUSED)
+ recoveryPausesHere(false);
+ }
+
+ /*
+ * Apply the record
+ */
+ ApplyWalRecord(xlogreader, record, &replayTLI);
+
+ /* Exit loop if we reached inclusive recovery target */
+ if (recoveryStopsAfter(xlogreader))
+ {
+ reachedRecoveryTarget = true;
+ break;
+ }
+
+ /* Else, try to fetch the next WAL record */
+ record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
+ } while (record != NULL);
+
+ /*
+ * end of main redo apply loop
+ */
+
+ if (reachedRecoveryTarget)
+ {
+ if (!reachedConsistency)
+ ereport(FATAL,
+ (errmsg("requested recovery stop point is before consistent recovery point")));
+
+ /*
+ * This is the last point where we can restart recovery with a new
+ * recovery target, if we shutdown and begin again. After this,
+ * Resource Managers may choose to do permanent corrective actions
+ * at end of recovery.
+ */
+ switch (recoveryTargetAction)
+ {
+ case RECOVERY_TARGET_ACTION_SHUTDOWN:
+
+ /*
+ * exit with special return code to request shutdown of
+ * postmaster. Log messages issued from postmaster.
+ */
+ proc_exit(3);
+
+ case RECOVERY_TARGET_ACTION_PAUSE:
+ SetRecoveryPause(true);
+ recoveryPausesHere(true);
+
+ /* drop into promote */
+
+ case RECOVERY_TARGET_ACTION_PROMOTE:
+ break;
+ }
+ }
+
+ RmgrCleanup();
+
+ ereport(LOG,
+ (errmsg("redo done at %X/%X system usage: %s",
+ LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
+ pg_rusage_show(&ru0))));
+ xtime = GetLatestXTime();
+ if (xtime)
+ ereport(LOG,
+ (errmsg("last completed transaction was at log time %s",
+ timestamptz_to_str(xtime))));
+
+ InRedo = false;
+ }
+ else
+ {
+ /* there are no WAL records following the checkpoint */
+ ereport(LOG,
+ (errmsg("redo is not required")));
+ }
+
+ /*
+ * This check is intentionally after the above log messages that indicate
+ * how far recovery went.
+ */
+ if (ArchiveRecoveryRequested &&
+ recoveryTarget != RECOVERY_TARGET_UNSET &&
+ !reachedRecoveryTarget)
+ ereport(FATAL,
+ (errmsg("recovery ended before configured recovery target was reached")));
+}
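+
+/*
+ * Editorial note (illustrative, not part of the upstream sources): the
+ * target handling above is driven by recovery settings such as (values
+ * are examples only):
+ *
+ *     recovery_target_lsn    = '0/3000148'
+ *     recovery_target_action = 'pause'     # or 'promote' / 'shutdown'
+ *
+ * With 'shutdown' the startup process exits with code 3, telling the
+ * postmaster to shut down; with 'pause' replay blocks in
+ * recoveryPausesHere() until pg_wal_replay_resume() is called.
+ */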
+
+/*
+ * Subroutine of PerformWalRecovery, to apply one WAL record.
+ */
+static void
+ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
+{
+ ErrorContextCallback errcallback;
+ bool switchedTLI = false;
+
+ /* Setup error traceback support for ereport() */
+ errcallback.callback = rm_redo_error_callback;
+ errcallback.arg = (void *) xlogreader;
+ errcallback.previous = error_context_stack;
+ error_context_stack = &errcallback;
+
+ /*
+ * ShmemVariableCache->nextXid must be beyond record's xid.
+ */
+ AdvanceNextFullTransactionIdPastXid(record->xl_xid);
+
+ /*
+ * Before replaying this record, check if this record causes the current
+ * timeline to change. The record is already considered to be part of the
+ * new timeline, so we update replayTLI before replaying it. That's
+ * important so that replayEndTLI, which is recorded as the minimum
+ * recovery point's TLI if recovery stops after this record, is set
+ * correctly.
+ */
+ if (record->xl_rmid == RM_XLOG_ID)
+ {
+ TimeLineID newReplayTLI = *replayTLI;
+ TimeLineID prevReplayTLI = *replayTLI;
+ uint8 info = record->xl_info & ~XLR_INFO_MASK;
+
+ if (info == XLOG_CHECKPOINT_SHUTDOWN)
+ {
+ CheckPoint checkPoint;
+
+ memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
+ newReplayTLI = checkPoint.ThisTimeLineID;
+ prevReplayTLI = checkPoint.PrevTimeLineID;
+ }
+ else if (info == XLOG_END_OF_RECOVERY)
+ {
+ xl_end_of_recovery xlrec;
+
+ memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
+ newReplayTLI = xlrec.ThisTimeLineID;
+ prevReplayTLI = xlrec.PrevTimeLineID;
+ }
+
+ if (newReplayTLI != *replayTLI)
+ {
+ /* Check that it's OK to switch to this TLI */
+ checkTimeLineSwitch(xlogreader->EndRecPtr,
+ newReplayTLI, prevReplayTLI, *replayTLI);
+
+ /* Following WAL records should be run with new TLI */
+ *replayTLI = newReplayTLI;
+ switchedTLI = true;
+ }
+ }
+
+ /*
+ * Update shared replayEndRecPtr before replaying this record, so that
+ * XLogFlush will update minRecoveryPoint correctly.
+ */
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr;
+ XLogRecoveryCtl->replayEndTLI = *replayTLI;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ /*
+ * If we are attempting to enter Hot Standby mode, process XIDs we see
+ */
+ if (standbyState >= STANDBY_INITIALIZED &&
+ TransactionIdIsValid(record->xl_xid))
+ RecordKnownAssignedTransactionIds(record->xl_xid);
+
+ /*
+ * Some XLOG record types that are related to recovery are processed
+ * directly here, rather than in xlog_redo()
+ */
+ if (record->xl_rmid == RM_XLOG_ID)
+ xlogrecovery_redo(xlogreader, *replayTLI);
+
+ /* Now apply the WAL record itself */
+ GetRmgr(record->xl_rmid).rm_redo(xlogreader);
+
+ /*
+ * After redo, check whether the backup pages associated with the WAL
+ * record are consistent with the existing pages. This check is done only
+ * if consistency check is enabled for this record.
+ */
+ if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
+ verifyBackupPageConsistency(xlogreader);
+
+ /* Pop the error context stack */
+ error_context_stack = errcallback.previous;
+
+ /*
+ * Update lastReplayedEndRecPtr after this record has been successfully
+ * replayed.
+ */
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
+ XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
+ XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ /*
+ * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
+ * receiver so that it notices the updated lastReplayedEndRecPtr and sends
+ * a reply to the primary.
+ */
+ if (doRequestWalReceiverReply)
+ {
+ doRequestWalReceiverReply = false;
+ WalRcvForceReply();
+ }
+
+ /* Allow read-only connections if we're consistent now */
+ CheckRecoveryConsistency();
+
+ /* Is this a timeline switch? */
+ if (switchedTLI)
+ {
+ /*
+ * Before we continue on the new timeline, clean up any (possibly
+ * bogus) future WAL segments on the old timeline.
+ */
+ RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI);
+
+ /*
+ * Wake up any walsenders to notice that we are on a new timeline.
+ */
+ if (AllowCascadeReplication())
+ WalSndWakeup();
+
+ /* Reset the prefetcher. */
+ XLogPrefetchReconfigure();
+ }
+}
+
+/*
+ * Some XLOG RM record types that are directly related to WAL recovery are
+ * handled here rather than in xlog_redo().
+ */
+static void
+xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
+{
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+ XLogRecPtr lsn = record->EndRecPtr;
+
+ Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
+
+ if (info == XLOG_OVERWRITE_CONTRECORD)
+ {
+ /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
+ xl_overwrite_contrecord xlrec;
+
+ memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
+ if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
+ elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
+ LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
+ LSN_FORMAT_ARGS(record->overwrittenRecPtr));
+
+ /* We have safely skipped the aborted record */
+ abortedRecPtr = InvalidXLogRecPtr;
+ missingContrecPtr = InvalidXLogRecPtr;
+
+ ereport(LOG,
+ (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
+ LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
+ timestamptz_to_str(xlrec.overwrite_time))));
+
+ /* Verifying the record should only happen once */
+ record->overwrittenRecPtr = InvalidXLogRecPtr;
+ }
+ else if (info == XLOG_BACKUP_END)
+ {
+ XLogRecPtr startpoint;
+
+ memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
+
+ if (backupStartPoint == startpoint)
+ {
+ /*
+ * We have reached the end of base backup, the point where
+ * pg_backup_stop() was done. The data on disk is now consistent
+ * (assuming we have also reached minRecoveryPoint). Set
+ * backupEndPoint to the current LSN, so that the next call to
+ * CheckRecoveryConsistency() will notice it and do the
+ * end-of-backup processing.
+ */
+ elog(DEBUG1, "end of backup record reached");
+
+ backupEndPoint = lsn;
+ }
+ else
+ elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
+ LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint));
+ }
+}
+
+/*
+ * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
+ * directories.
+ *
+ * Replay of database creation XLOG records for databases that were later
+ * dropped can create fake directories in pg_tblspc. By the time consistency
+ * is reached these directories should have been removed; here we verify
+ * that this did indeed happen. This is to be called at the point where
+ * consistent state is reached.
+ *
+ * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
+ * useful for testing purposes, and also allows for an escape hatch in case
+ * things go south.
+ */
+static void
+CheckTablespaceDirectory(void)
+{
+ DIR *dir;
+ struct dirent *de;
+
+ dir = AllocateDir("pg_tblspc");
+ while ((de = ReadDir(dir, "pg_tblspc")) != NULL)
+ {
+ char path[MAXPGPATH + 10];
+
+ /* Skip entries of non-oid names */
+ if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
+ continue;
+
+ snprintf(path, sizeof(path), "pg_tblspc/%s", de->d_name);
+
+ if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
+ ereport(allow_in_place_tablespaces ? WARNING : PANIC,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("unexpected directory entry \"%s\" found in %s",
+ de->d_name, "pg_tblspc/"),
+ errdetail("All directory entries in pg_tblspc/ should be symbolic links."),
+ errhint("Remove those directories, or set allow_in_place_tablespaces to ON transiently to let recovery complete.")));
+ }
+}
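+
+/*
+ * Editorial note (illustrative, not part of the upstream sources): on a
+ * healthy cluster, every OID-named entry in pg_tblspc is a symlink, e.g.:
+ *
+ *     $ ls -l $PGDATA/pg_tblspc
+ *     16385 -> /mnt/fast_ssd/pgdata_ts
+ *
+ * A plain directory named like an OID here (aside from in-place
+ * tablespaces created for testing) indicates leftover state from replayed
+ * database-creation records, which is what the check above catches.
+ */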
+
+/*
+ * Checks if recovery has reached a consistent state. When consistency is
+ * reached and we have a valid starting standby snapshot, tell postmaster
+ * that it can start accepting read-only connections.
+ */
+static void
+CheckRecoveryConsistency(void)
+{
+ XLogRecPtr lastReplayedEndRecPtr;
+ TimeLineID lastReplayedTLI;
+
+ /*
+ * During crash recovery, we don't reach a consistent state until we've
+ * replayed all the WAL.
+ */
+ if (XLogRecPtrIsInvalid(minRecoveryPoint))
+ return;
+
+ Assert(InArchiveRecovery);
+
+ /*
+ * assume that we are called in the startup process, and hence don't need
+ * a lock to read lastReplayedEndRecPtr
+ */
+ lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
+ lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
+
+ /*
+ * Have we reached the point where our base backup was completed?
+ */
+ if (!XLogRecPtrIsInvalid(backupEndPoint) &&
+ backupEndPoint <= lastReplayedEndRecPtr)
+ {
+ elog(DEBUG1, "end of backup reached");
+
+ /*
+ * We have reached the end of base backup, as indicated by pg_control.
+ * Update the control file accordingly.
+ */
+ ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
+ backupStartPoint = InvalidXLogRecPtr;
+ backupEndPoint = InvalidXLogRecPtr;
+ backupEndRequired = false;
+ }
+
+ /*
+ * Have we passed our safe starting point? Note that minRecoveryPoint is
+ * known to be incorrectly set if recovering from a backup, until the
+ * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
+ * All we know prior to that is that we're not consistent yet.
+ */
+ if (!reachedConsistency && !backupEndRequired &&
+ minRecoveryPoint <= lastReplayedEndRecPtr)
+ {
+ /*
+ * Check to see if the XLOG sequence contained any unresolved
+ * references to uninitialized pages.
+ */
+ XLogCheckInvalidPages();
+
+ /*
+ * Check that pg_tblspc doesn't contain any real directories. Replay
+	 * of Database/CREATE_* records may have created fictitious tablespace
+ * directories that should have been removed by the time consistency
+ * was reached.
+ */
+ CheckTablespaceDirectory();
+
+ reachedConsistency = true;
+ ereport(LOG,
+ (errmsg("consistent recovery state reached at %X/%X",
+ LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
+ }
+
+ /*
+ * Have we got a valid starting snapshot that will allow queries to be
+ * run? If so, we can tell postmaster that the database is consistent now,
+ * enabling connections.
+ */
+ if (standbyState == STANDBY_SNAPSHOT_READY &&
+ !LocalHotStandbyActive &&
+ reachedConsistency &&
+ IsUnderPostmaster)
+ {
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ XLogRecoveryCtl->SharedHotStandbyActive = true;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ LocalHotStandbyActive = true;
+
+ SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
+ }
+}
+
+/*
+ * Error context callback for errors occurring during rm_redo().
+ */
+static void
+rm_redo_error_callback(void *arg)
+{
+ XLogReaderState *record = (XLogReaderState *) arg;
+ StringInfoData buf;
+
+ initStringInfo(&buf);
+ xlog_outdesc(&buf, record);
+ xlog_block_info(&buf, record);
+
+ /* translator: %s is a WAL record description */
+ errcontext("WAL redo at %X/%X for %s",
+ LSN_FORMAT_ARGS(record->ReadRecPtr),
+ buf.data);
+
+ pfree(buf.data);
+}
+
+/*
+ * Returns a string describing an XLogRecord, consisting of its identity
+ * optionally followed by a colon, a space, and a further description.
+ */
+void
+xlog_outdesc(StringInfo buf, XLogReaderState *record)
+{
+ RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
+ uint8 info = XLogRecGetInfo(record);
+ const char *id;
+
+ appendStringInfoString(buf, rmgr.rm_name);
+ appendStringInfoChar(buf, '/');
+
+ id = rmgr.rm_identify(info);
+ if (id == NULL)
+ appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
+ else
+ appendStringInfo(buf, "%s: ", id);
+
+ rmgr.rm_desc(buf, record);
+}
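+
+/*
+ * Editorial note (illustrative, not part of the upstream sources): the
+ * resulting description has the form "rmgr/IDENTIFIER: detail", where the
+ * detail text varies by resource manager, for example:
+ *
+ *     Heap/INSERT: off 2 flags 0x00
+ */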
+
+#ifdef WAL_DEBUG
+
+static void
+xlog_outrec(StringInfo buf, XLogReaderState *record)
+{
+ appendStringInfo(buf, "prev %X/%X; xid %u",
+ LSN_FORMAT_ARGS(XLogRecGetPrev(record)),
+ XLogRecGetXid(record));
+
+ appendStringInfo(buf, "; len %u",
+ XLogRecGetDataLen(record));
+
+ xlog_block_info(buf, record);
+}
+#endif /* WAL_DEBUG */
+
+/*
+ * Returns a string giving information about all the blocks in an
+ * XLogRecord.
+ */
+static void
+xlog_block_info(StringInfo buf, XLogReaderState *record)
+{
+ int block_id;
+
+ /* decode block references */
+ for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
+ {
+ RelFileNode rnode;
+ ForkNumber forknum;
+ BlockNumber blk;
+
+ if (!XLogRecGetBlockTagExtended(record, block_id,
+ &rnode, &forknum, &blk, NULL))
+ continue;
+
+ if (forknum != MAIN_FORKNUM)
+ appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
+ block_id,
+ rnode.spcNode, rnode.dbNode, rnode.relNode,
+ forknum,
+ blk);
+ else
+ appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
+ block_id,
+ rnode.spcNode, rnode.dbNode, rnode.relNode,
+ blk);
+ if (XLogRecHasBlockImage(record, block_id))
+ appendStringInfoString(buf, " FPW");
+ }
+}
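+
+/*
+ * Editorial note (illustrative, not part of the upstream sources): given
+ * the format strings above, a record touching one main-fork block with a
+ * full-page image renders as (values are examples only):
+ *
+ *     ; blkref #0: rel 1663/13010/16384, blk 7 FPW
+ */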
+
+
+/*
+ * Check that it's OK to switch to new timeline during recovery.
+ *
+ * 'lsn' is the address of the shutdown checkpoint record we're about to
+ * replay. (Currently, the timeline can only change at a shutdown checkpoint.)
+ */
+static void
+checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
+ TimeLineID replayTLI)
+{
+ /* Check that the record agrees on what the current (old) timeline is */
+ if (prevTLI != replayTLI)
+ ereport(PANIC,
+ (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
+ prevTLI, replayTLI)));
+
+ /*
+ * The new timeline better be in the list of timelines we expect to see,
+ * according to the timeline history. It should also not decrease.
+ */
+ if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
+ ereport(PANIC,
+ (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
+ newTLI, replayTLI)));
+
+ /*
+ * If we have not yet reached min recovery point, and we're about to
+ * switch to a timeline greater than the timeline of the min recovery
+ * point: trouble. After switching to the new timeline, we could not
+ * possibly visit the min recovery point on the correct timeline anymore.
+ * This can happen if there is a newer timeline in the archive that
+ * branched before the timeline the min recovery point is on, and you
+ * attempt to do PITR to the new timeline.
+ */
+ if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
+ lsn < minRecoveryPoint &&
+ newTLI > minRecoveryPointTLI)
+ ereport(PANIC,
+ (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
+ newTLI,
+ LSN_FORMAT_ARGS(minRecoveryPoint),
+ minRecoveryPointTLI)));
+
+ /* Looks good */
+}
+
+
+/*
+ * Extract timestamp from WAL record.
+ *
+ * If the record contains a timestamp, returns true, and saves the timestamp
+ * in *recordXtime. If the record type has no timestamp, returns false.
+ * Currently, only transaction commit/abort records and restore points contain
+ * timestamps.
+ */
+static bool
+getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
+{
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+ uint8 xact_info = info & XLOG_XACT_OPMASK;
+ uint8 rmid = XLogRecGetRmid(record);
+
+ if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
+ {
+ *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
+ return true;
+ }
+ if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
+ xact_info == XLOG_XACT_COMMIT_PREPARED))
+ {
+ *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
+ return true;
+ }
+ if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
+ xact_info == XLOG_XACT_ABORT_PREPARED))
+ {
+ *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Checks whether the current buffer page and backup page stored in the
+ * WAL record are consistent or not. Before comparing the two pages, a
+ * masking can be applied to the pages to ignore certain areas like hint bits,
+ * unused space between pd_lower and pd_upper among other things. This
+ * function should be called once WAL replay has been completed for a
+ * given record.
+ */
+static void
+verifyBackupPageConsistency(XLogReaderState *record)
+{
+ RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
+ RelFileNode rnode;
+ ForkNumber forknum;
+ BlockNumber blkno;
+ int block_id;
+
+ /* Records with no backup blocks have no need for consistency checks. */
+ if (!XLogRecHasAnyBlockRefs(record))
+ return;
+
+ Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
+
+ for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
+ {
+ Buffer buf;
+ Page page;
+
+ if (!XLogRecGetBlockTagExtended(record, block_id,
+ &rnode, &forknum, &blkno, NULL))
+ {
+ /*
+ * WAL record doesn't contain a block reference with the given id.
+ * Do nothing.
+ */
+ continue;
+ }
+
+ Assert(XLogRecHasBlockImage(record, block_id));
+
+ if (XLogRecBlockImageApply(record, block_id))
+ {
+ /*
+ * WAL record has already applied the page, so bypass the
+ * consistency check as that would result in comparing the full
+ * page stored in the record with itself.
+ */
+ continue;
+ }
+
+ /*
+ * Read the contents from the current buffer and store it in a
+ * temporary page.
+ */
+ buf = XLogReadBufferExtended(rnode, forknum, blkno,
+ RBM_NORMAL_NO_LOG,
+ InvalidBuffer);
+ if (!BufferIsValid(buf))
+ continue;
+
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ page = BufferGetPage(buf);
+
+ /*
+ * Take a copy of the local page where WAL has been applied to have a
+ * comparison base before masking it...
+ */
+ memcpy(replay_image_masked, page, BLCKSZ);
+
+		/* No need for this page anymore now that we have a copy. */
+ UnlockReleaseBuffer(buf);
+
+ /*
+ * If the block LSN is already ahead of this WAL record, we can't
+ * expect contents to match. This can happen if recovery is
+ * restarted.
+ */
+ if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
+ continue;
+
+ /*
+		 * Read the contents from the backup copy stored in the WAL record
+		 * into a temporary page. There is no need to allocate a new page
+		 * here; a local buffer is fine to hold its contents, and a mask can
+		 * be applied directly to it.
+ */
+ if (!RestoreBlockImage(record, block_id, primary_image_masked))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg_internal("%s", record->errormsg_buf)));
+
+ /*
+ * If masking function is defined, mask both the primary and replay
+ * images
+ */
+ if (rmgr.rm_mask != NULL)
+ {
+ rmgr.rm_mask(replay_image_masked, blkno);
+ rmgr.rm_mask(primary_image_masked, blkno);
+ }
+
+ /* Time to compare the primary and replay images. */
+ if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
+ {
+ elog(FATAL,
+ "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
+ rnode.spcNode, rnode.dbNode, rnode.relNode,
+ forknum, blkno);
+ }
+ }
+}
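+
+/*
+ * Editorial note (illustrative, not part of the upstream sources): this
+ * check only runs for records written while wal_consistency_checking was
+ * enabled on the primary, e.g. in postgresql.conf:
+ *
+ *     wal_consistency_checking = 'heap,btree'   # or 'all'
+ *
+ * which makes WAL insertion attach full-page images with
+ * XLR_CHECK_CONSISTENCY set, at a significant WAL-volume cost.
+ */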
+
+/*
+ * For point-in-time recovery, this function decides whether we want to
+ * stop applying the XLOG before the current record.
+ *
+ * Returns true if we are stopping, false otherwise. If stopping, some
+ * information is saved in recoveryStopXid et al for use in annotating the
+ * new timeline's history file.
+ */
+static bool
+recoveryStopsBefore(XLogReaderState *record)
+{
+ bool stopsHere = false;
+ uint8 xact_info;
+ bool isCommit;
+ TimestampTz recordXtime = 0;
+ TransactionId recordXid;
+
+ /*
+ * Ignore recovery target settings when not in archive recovery (meaning
+ * we are in crash recovery).
+ */
+ if (!ArchiveRecoveryRequested)
+ return false;
+
+ /* Check if we should stop as soon as reaching consistency */
+ if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
+ {
+ ereport(LOG,
+ (errmsg("recovery stopping after reaching consistency")));
+
+ recoveryStopAfter = false;
+ recoveryStopXid = InvalidTransactionId;
+ recoveryStopLSN = InvalidXLogRecPtr;
+ recoveryStopTime = 0;
+ recoveryStopName[0] = '\0';
+ return true;
+ }
+
+ /* Check if target LSN has been reached */
+ if (recoveryTarget == RECOVERY_TARGET_LSN &&
+ !recoveryTargetInclusive &&
+ record->ReadRecPtr >= recoveryTargetLSN)
+ {
+ recoveryStopAfter = false;
+ recoveryStopXid = InvalidTransactionId;
+ recoveryStopLSN = record->ReadRecPtr;
+ recoveryStopTime = 0;
+ recoveryStopName[0] = '\0';
+ ereport(LOG,
+ (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
+ LSN_FORMAT_ARGS(recoveryStopLSN))));
+ return true;
+ }
+
+ /* Otherwise we only consider stopping before COMMIT or ABORT records. */
+ if (XLogRecGetRmid(record) != RM_XACT_ID)
+ return false;
+
+ xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
+
+ if (xact_info == XLOG_XACT_COMMIT)
+ {
+ isCommit = true;
+ recordXid = XLogRecGetXid(record);
+ }
+ else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
+ {
+ xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
+ xl_xact_parsed_commit parsed;
+
+ isCommit = true;
+ ParseCommitRecord(XLogRecGetInfo(record),
+ xlrec,
+ &parsed);
+ recordXid = parsed.twophase_xid;
+ }
+ else if (xact_info == XLOG_XACT_ABORT)
+ {
+ isCommit = false;
+ recordXid = XLogRecGetXid(record);
+ }
+ else if (xact_info == XLOG_XACT_ABORT_PREPARED)
+ {
+ xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
+ xl_xact_parsed_abort parsed;
+
+ isCommit = false;
+ ParseAbortRecord(XLogRecGetInfo(record),
+ xlrec,
+ &parsed);
+ recordXid = parsed.twophase_xid;
+ }
+ else
+ return false;
+
+ if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
+ {
+ /*
+		 * There can be only one transaction end record with this exact
+		 * transaction ID.
+		 *
+		 * When testing for an xid, we MUST test for equality only, since
+		 * transactions are numbered in the order they start, not the order
+		 * they complete. A higher-numbered xid will complete before a
+		 * lower-numbered one about 50% of the time...
+ */
+ stopsHere = (recordXid == recoveryTargetXid);
+ }
+
+ /*
+ * Note: we must fetch recordXtime regardless of recoveryTarget setting.
+ * We don't expect getRecordTimestamp ever to fail, since we already know
+ * this is a commit or abort record; but test its result anyway.
+ */
+ if (getRecordTimestamp(record, &recordXtime) &&
+ recoveryTarget == RECOVERY_TARGET_TIME)
+ {
+ /*
+ * There can be many transactions that share the same commit time, so
+ * we stop after the last one, if we are inclusive, or stop at the
+ * first one if we are exclusive
+ */
+ if (recoveryTargetInclusive)
+ stopsHere = (recordXtime > recoveryTargetTime);
+ else
+ stopsHere = (recordXtime >= recoveryTargetTime);
+ }
+
+ if (stopsHere)
+ {
+ recoveryStopAfter = false;
+ recoveryStopXid = recordXid;
+ recoveryStopTime = recordXtime;
+ recoveryStopLSN = InvalidXLogRecPtr;
+ recoveryStopName[0] = '\0';
+
+ if (isCommit)
+ {
+ ereport(LOG,
+ (errmsg("recovery stopping before commit of transaction %u, time %s",
+ recoveryStopXid,
+ timestamptz_to_str(recoveryStopTime))));
+ }
+ else
+ {
+ ereport(LOG,
+ (errmsg("recovery stopping before abort of transaction %u, time %s",
+ recoveryStopXid,
+ timestamptz_to_str(recoveryStopTime))));
+ }
+ }
+
+ return stopsHere;
+}
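+
+/*
+ * Editorial note (illustrative, not part of the upstream sources): this
+ * function implements the "exclusive" side of targets such as (values are
+ * examples only):
+ *
+ *     recovery_target_xid       = '12345'
+ *     recovery_target_inclusive = off
+ *
+ * i.e. replay stops just before the commit/abort record of the target
+ * transaction is applied; the inclusive case is handled in
+ * recoveryStopsAfter() below.
+ */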
+
+/*
+ * Same as recoveryStopsBefore, but called after applying the record.
+ *
+ * We also track the timestamp of the latest applied COMMIT/ABORT
+ * record in XLogRecoveryCtl->recoveryLastXTime.
+ */
+static bool
+recoveryStopsAfter(XLogReaderState *record)
+{
+ uint8 info;
+ uint8 xact_info;
+ uint8 rmid;
+ TimestampTz recordXtime = 0;
+
+ /*
+ * Ignore recovery target settings when not in archive recovery (meaning
+ * we are in crash recovery).
+ */
+ if (!ArchiveRecoveryRequested)
+ return false;
+
+ info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+ rmid = XLogRecGetRmid(record);
+
+ /*
+ * There can be many restore points that share the same name; we stop at
+ * the first one.
+ */
+ if (recoveryTarget == RECOVERY_TARGET_NAME &&
+ rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
+ {
+ xl_restore_point *recordRestorePointData;
+
+ recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
+
+ if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
+ {
+ recoveryStopAfter = true;
+ recoveryStopXid = InvalidTransactionId;
+ recoveryStopLSN = InvalidXLogRecPtr;
+ (void) getRecordTimestamp(record, &recoveryStopTime);
+ strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
+
+ ereport(LOG,
+ (errmsg("recovery stopping at restore point \"%s\", time %s",
+ recoveryStopName,
+ timestamptz_to_str(recoveryStopTime))));
+ return true;
+ }
+ }
+
+ /* Check if the target LSN has been reached */
+ if (recoveryTarget == RECOVERY_TARGET_LSN &&
+ recoveryTargetInclusive &&
+ record->ReadRecPtr >= recoveryTargetLSN)
+ {
+ recoveryStopAfter = true;
+ recoveryStopXid = InvalidTransactionId;
+ recoveryStopLSN = record->ReadRecPtr;
+ recoveryStopTime = 0;
+ recoveryStopName[0] = '\0';
+ ereport(LOG,
+ (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
+ LSN_FORMAT_ARGS(recoveryStopLSN))));
+ return true;
+ }
+
+ if (rmid != RM_XACT_ID)
+ return false;
+
+ xact_info = info & XLOG_XACT_OPMASK;
+
+ if (xact_info == XLOG_XACT_COMMIT ||
+ xact_info == XLOG_XACT_COMMIT_PREPARED ||
+ xact_info == XLOG_XACT_ABORT ||
+ xact_info == XLOG_XACT_ABORT_PREPARED)
+ {
+ TransactionId recordXid;
+
+ /* Update the last applied transaction timestamp */
+ if (getRecordTimestamp(record, &recordXtime))
+ SetLatestXTime(recordXtime);
+
+ /* Extract the XID of the committed/aborted transaction */
+ if (xact_info == XLOG_XACT_COMMIT_PREPARED)
+ {
+ xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
+ xl_xact_parsed_commit parsed;
+
+ ParseCommitRecord(XLogRecGetInfo(record),
+ xlrec,
+ &parsed);
+ recordXid = parsed.twophase_xid;
+ }
+ else if (xact_info == XLOG_XACT_ABORT_PREPARED)
+ {
+ xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
+ xl_xact_parsed_abort parsed;
+
+ ParseAbortRecord(XLogRecGetInfo(record),
+ xlrec,
+ &parsed);
+ recordXid = parsed.twophase_xid;
+ }
+ else
+ recordXid = XLogRecGetXid(record);
+
+ /*
+		 * There can be only one transaction end record with this exact
+		 * transaction ID.
+		 *
+		 * When testing for an xid, we MUST test for equality only, since
+		 * transactions are numbered in the order they start, not the order
+		 * they complete. A higher-numbered xid will complete before a
+		 * lower-numbered one about 50% of the time...
+ */
+ if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
+ recordXid == recoveryTargetXid)
+ {
+ recoveryStopAfter = true;
+ recoveryStopXid = recordXid;
+ recoveryStopTime = recordXtime;
+ recoveryStopLSN = InvalidXLogRecPtr;
+ recoveryStopName[0] = '\0';
+
+ if (xact_info == XLOG_XACT_COMMIT ||
+ xact_info == XLOG_XACT_COMMIT_PREPARED)
+ {
+ ereport(LOG,
+ (errmsg("recovery stopping after commit of transaction %u, time %s",
+ recoveryStopXid,
+ timestamptz_to_str(recoveryStopTime))));
+ }
+ else if (xact_info == XLOG_XACT_ABORT ||
+ xact_info == XLOG_XACT_ABORT_PREPARED)
+ {
+ ereport(LOG,
+ (errmsg("recovery stopping after abort of transaction %u, time %s",
+ recoveryStopXid,
+ timestamptz_to_str(recoveryStopTime))));
+ }
+ return true;
+ }
+ }
+
+ /* Check if we should stop as soon as reaching consistency */
+ if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
+ {
+ ereport(LOG,
+ (errmsg("recovery stopping after reaching consistency")));
+
+ recoveryStopAfter = true;
+ recoveryStopXid = InvalidTransactionId;
+ recoveryStopTime = 0;
+ recoveryStopLSN = InvalidXLogRecPtr;
+ recoveryStopName[0] = '\0';
+ return true;
+ }
+
+ return false;
+}
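+
+/*
+ * Editorial note (illustrative, not part of the upstream sources): the
+ * restore-point branch above pairs with a named point created earlier on
+ * the primary, e.g.:
+ *
+ *     SELECT pg_create_restore_point('before_schema_change');
+ *
+ * and later targeted with recovery_target_name = 'before_schema_change'.
+ */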
+
+/*
+ * Create a comment for the history file to explain why and where
+ * timeline changed.
+ */
+static char *
+getRecoveryStopReason(void)
+{
+ char reason[200];
+
+ if (recoveryTarget == RECOVERY_TARGET_XID)
+ snprintf(reason, sizeof(reason),
+ "%s transaction %u",
+ recoveryStopAfter ? "after" : "before",
+ recoveryStopXid);
+ else if (recoveryTarget == RECOVERY_TARGET_TIME)
+ snprintf(reason, sizeof(reason),
+ "%s %s\n",
+ recoveryStopAfter ? "after" : "before",
+ timestamptz_to_str(recoveryStopTime));
+ else if (recoveryTarget == RECOVERY_TARGET_LSN)
+ snprintf(reason, sizeof(reason),
+ "%s LSN %X/%X\n",
+ recoveryStopAfter ? "after" : "before",
+ LSN_FORMAT_ARGS(recoveryStopLSN));
+ else if (recoveryTarget == RECOVERY_TARGET_NAME)
+ snprintf(reason, sizeof(reason),
+ "at restore point \"%s\"",
+ recoveryStopName);
+ else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
+ snprintf(reason, sizeof(reason), "reached consistency");
+ else
+ snprintf(reason, sizeof(reason), "no recovery target specified");
+
+ return pstrdup(reason);
+}
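+
+/*
+ * Editorial note (illustrative, not part of the upstream sources): the
+ * string built here ends up as the comment field of the new timeline's
+ * history file, whose tab-separated lines have the form
+ * "parentTLI  switchpoint  reason", e.g. (values are examples only):
+ *
+ *     1	0/3000148	before transaction 12345
+ */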
+
+/*
+ * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
+ *
+ * endOfRecovery is true if the recovery target is reached and
+ * the paused state starts at the end of recovery because of
+ * recovery_target_action=pause, and false otherwise.
+ */
+static void
+recoveryPausesHere(bool endOfRecovery)
+{
+ /* Don't pause unless users can connect! */
+ if (!LocalHotStandbyActive)
+ return;
+
+ /* Don't pause after standby promotion has been triggered */
+ if (LocalPromoteIsTriggered)
+ return;
+
+ if (endOfRecovery)
+ ereport(LOG,
+ (errmsg("pausing at the end of recovery"),
+ errhint("Execute pg_wal_replay_resume() to promote.")));
+ else
+ ereport(LOG,
+ (errmsg("recovery has paused"),
+ errhint("Execute pg_wal_replay_resume() to continue.")));
+
+ /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
+ while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
+ {
+ HandleStartupProcInterrupts();
+ if (CheckForStandbyTrigger())
+ return;
+
+ /*
+		 * If a recovery pause is requested, mark it as paused. While we are
+		 * in the loop, the user might resume and pause again, so set this
+		 * every time.
+ */
+ ConfirmRecoveryPaused();
+
+ /*
+ * We wait on a condition variable that will wake us as soon as the
+ * pause ends, but we use a timeout so we can check the above exit
+ * condition periodically too.
+ */
+ ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
+ WAIT_EVENT_RECOVERY_PAUSE);
+ }
+ ConditionVariableCancelSleep();
+}
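+
+/*
+ * Editorial note (illustrative, not part of the upstream sources): from a
+ * hot-standby session the pause machinery above is driven with:
+ *
+ *     SELECT pg_wal_replay_pause();
+ *     SELECT pg_get_wal_replay_pause_state();   -- 'not paused',
+ *                                                -- 'pause requested',
+ *                                                -- or 'paused'
+ *     SELECT pg_wal_replay_resume();
+ */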
+
+/*
+ * When recovery_min_apply_delay is set, we wait long enough to make sure
+ * certain record types are applied at least that interval behind the primary.
+ *
+ * Returns true if we waited.
+ *
+ * Note that the delay is calculated between the WAL record log time and
+ * the current time on standby. We would prefer to keep track of when this
+ * standby received each WAL record, which would allow a more consistent
+ * approach and one not affected by time synchronisation issues, but that
+ * is significantly more effort and complexity for little actual gain in
+ * usability.
+ */
+static bool
+recoveryApplyDelay(XLogReaderState *record)
+{
+ uint8 xact_info;
+ TimestampTz xtime;
+ TimestampTz delayUntil;
+ long msecs;
+
+ /* nothing to do if no delay configured */
+ if (recovery_min_apply_delay <= 0)
+ return false;
+
+ /* no delay is applied on a database not yet consistent */
+ if (!reachedConsistency)
+ return false;
+
+ /* nothing to do if crash recovery is requested */
+ if (!ArchiveRecoveryRequested)
+ return false;
+
+ /*
+ * Is it a COMMIT record?
+ *
+ * We deliberately choose not to delay aborts since they have no effect on
+ * MVCC. We already allow replay of records that don't have a timestamp,
+ * so there is already opportunity for issues caused by early conflicts on
+ * standbys.
+ */
+ if (XLogRecGetRmid(record) != RM_XACT_ID)
+ return false;
+
+ xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
+
+ if (xact_info != XLOG_XACT_COMMIT &&
+ xact_info != XLOG_XACT_COMMIT_PREPARED)
+ return false;
+
+ if (!getRecordTimestamp(record, &xtime))
+ return false;
+
+ delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
+
+ /*
+ * Exit without arming the latch if it's already past time to apply this
+ * record
+ */
+ msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
+ if (msecs <= 0)
+ return false;
+
+ while (true)
+ {
+ ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
+
+ /*
+ * This might change recovery_min_apply_delay or the trigger file's
+ * location.
+ */
+ HandleStartupProcInterrupts();
+
+ if (CheckForStandbyTrigger())
+ break;
+
+ /*
+ * Recalculate delayUntil as recovery_min_apply_delay could have
+ * changed while waiting in this loop.
+ */
+ delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
+
+ /*
+ * Wait for difference between GetCurrentTimestamp() and delayUntil.
+ */
+ msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
+ delayUntil);
+
+ if (msecs <= 0)
+ break;
+
+ elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
+
+ (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+ msecs,
+ WAIT_EVENT_RECOVERY_APPLY_DELAY);
+ }
+ return true;
+}
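+
+/*
+ * Editorial note (illustrative, not part of the upstream sources): the
+ * delay above is configured as, e.g.:
+ *
+ *     recovery_min_apply_delay = '5min'
+ *
+ * Only commit (and prepared-commit) records are delayed, so a delayed
+ * standby lags the primary by roughly that interval in terms of visible
+ * transactions, not in terms of raw WAL.
+ */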
+
+/*
+ * Get the current state of the recovery pause request.
+ */
+RecoveryPauseState
+GetRecoveryPauseState(void)
+{
+ RecoveryPauseState state;
+
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ state = XLogRecoveryCtl->recoveryPauseState;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ return state;
+}
+
+/*
+ * Set the recovery pause state.
+ *
+ * If recovery pause is requested then sets the recovery pause state to
+ * 'pause requested' if it is not already 'paused'. Otherwise, sets it
+ * to 'not paused' to resume the recovery. The recovery pause will be
+ * confirmed by ConfirmRecoveryPaused().
+ */
+void
+SetRecoveryPause(bool recoveryPause)
+{
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+
+ if (!recoveryPause)
+ XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
+ else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
+ XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
+
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ if (!recoveryPause)
+ ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV);
+}
+
+/*
+ * Confirm the recovery pause by setting the recovery pause state to
+ * RECOVERY_PAUSED.
+ */
+static void
+ConfirmRecoveryPaused(void)
+{
+ /* If recovery pause is requested then set it paused */
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
+ XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+}
+
+
+/*
+ * Attempt to read the next XLOG record.
+ *
+ * Before first call, the reader needs to be positioned to the first record
+ * by calling XLogPrefetcherBeginRead().
+ *
+ * If no valid record is available, returns NULL, or fails if emode is PANIC.
+ * (emode must be either PANIC or LOG.) In standby mode, retries until a valid
+ * record is available.
+ */
+static XLogRecord *
+ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
+ bool fetching_ckpt, TimeLineID replayTLI)
+{
+ XLogRecord *record;
+ XLogReaderState *xlogreader = XLogPrefetcherGetReader(xlogprefetcher);
+ XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
+
+ /* Pass through parameters to XLogPageRead */
+ private->fetching_ckpt = fetching_ckpt;
+ private->emode = emode;
+ private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
+ private->replayTLI = replayTLI;
+
+ /* This is the first attempt to read this page. */
+ lastSourceFailed = false;
+
+ for (;;)
+ {
+ char *errormsg;
+
+ record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
+ if (record == NULL)
+ {
+ /*
+ * When we find that WAL ends in an incomplete record, keep track
+ * of that record. After recovery is done, we'll write a record to
+ * indicate to downstream WAL readers that that portion is to be
+ * ignored.
+ *
+ * However, when ArchiveRecoveryRequested = true, we're going to
+ * switch to a new timeline at the end of recovery. We will only
+ * copy WAL over to the new timeline up to the end of the last
+ * complete record, so if we did this, we would later create an
+ * overwrite contrecord in the wrong place, breaking everything.
+ */
+ if (!ArchiveRecoveryRequested &&
+ !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
+ {
+ abortedRecPtr = xlogreader->abortedRecPtr;
+ missingContrecPtr = xlogreader->missingContrecPtr;
+ }
+
+ if (readFile >= 0)
+ {
+ close(readFile);
+ readFile = -1;
+ }
+
+ /*
+ * We only end up here without a message when XLogPageRead()
+ * failed - in that case we already logged something. In
+ * StandbyMode that only happens if we have been triggered, so we
+ * shouldn't loop anymore in that case.
+ */
+ if (errormsg)
+ ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
+ (errmsg_internal("%s", errormsg) /* already translated */ ));
+ }
+
+ /*
+ * Check page TLI is one of the expected values.
+ */
+ else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
+ {
+ char fname[MAXFNAMELEN];
+ XLogSegNo segno;
+ int32 offset;
+
+ XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
+ offset = XLogSegmentOffset(xlogreader->latestPagePtr,
+ wal_segment_size);
+ XLogFileName(fname, xlogreader->seg.ws_tli, segno,
+ wal_segment_size);
+ ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
+ (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
+ xlogreader->latestPageTLI,
+ fname,
+ offset)));
+ record = NULL;
+ }
+
+ if (record)
+ {
+ /* Great, got a record */
+ return record;
+ }
+ else
+ {
+ /* No valid record available from this source */
+ lastSourceFailed = true;
+
+ /*
+ * If archive recovery was requested, but we were still doing
+ * crash recovery, switch to archive recovery and retry using the
+ * offline archive. We have now replayed all the valid WAL in
+ * pg_wal, so we are presumably now consistent.
+ *
+ * We require that there's at least some valid WAL present in
+ * pg_wal, however (!fetching_ckpt). We could recover using the
+ * WAL from the archive, even if pg_wal is completely empty, but
+ * we'd have no idea how far we'd have to replay to reach
+ * consistency. So err on the safe side and give up.
+ */
+ if (!InArchiveRecovery && ArchiveRecoveryRequested &&
+ !fetching_ckpt)
+ {
+ ereport(DEBUG1,
+ (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
+ InArchiveRecovery = true;
+ if (StandbyModeRequested)
+ EnableStandbyMode();
+
+ SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI);
+ minRecoveryPoint = xlogreader->EndRecPtr;
+ minRecoveryPointTLI = replayTLI;
+
+ CheckRecoveryConsistency();
+
+ /*
+ * Before we retry, reset lastSourceFailed and currentSource
+ * so that we will check the archive next.
+ */
+ lastSourceFailed = false;
+ currentSource = XLOG_FROM_ANY;
+
+ continue;
+ }
+
+ /* In standby mode, loop back to retry. Otherwise, give up. */
+ if (StandbyMode && !CheckForStandbyTrigger())
+ continue;
+ else
+ return NULL;
+ }
+ }
+}
+
+/*
+ * Read the XLOG page containing RecPtr into readBuf (if not read already).
+ * Returns number of bytes read, if the page is read successfully, or
+ * XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed, but
+ * only if they have not been previously reported.
+ *
+ * While prefetching, xlogreader->nonblocking may be set. In that case,
+ * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
+ *
+ * This is responsible for restoring files from archive as needed, as well
+ * as for waiting for the requested WAL record to arrive in standby mode.
+ *
+ * 'emode' specifies the log level used for reporting "file not found" or
+ * "end of WAL" situations in archive recovery, or in standby mode when a
+ * trigger file is found. If set to WARNING or below, XLogPageRead() returns
+ * XLREAD_FAIL in those situations; on higher log levels the ereport() won't
+ * return.
+ *
+ * In standby mode, if after a successful return of XLogPageRead() the
+ * caller finds the record it's interested in to be broken, it should
+ * ereport the error with the level determined by
+ * emode_for_corrupt_record(), and then set lastSourceFailed
+ * and call XLogPageRead() again with the same arguments. This lets
+ * XLogPageRead() try fetching the record from another source, or sleep
+ * and retry.
+ */
+static int
+XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
+ XLogRecPtr targetRecPtr, char *readBuf)
+{
+ XLogPageReadPrivate *private =
+ (XLogPageReadPrivate *) xlogreader->private_data;
+ int emode = private->emode;
+ uint32 targetPageOff;
+ XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
+ int r;
+
+ XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
+ targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
+
+ /*
+ * See if we need to switch to a new segment because the requested record
+ * is not in the currently open one.
+ */
+ if (readFile >= 0 &&
+ !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
+ {
+ /*
+ * Request a restartpoint if we've replayed too much xlog since the
+ * last one.
+ */
+ if (ArchiveRecoveryRequested && IsUnderPostmaster)
+ {
+ if (XLogCheckpointNeeded(readSegNo))
+ {
+ (void) GetRedoRecPtr();
+ if (XLogCheckpointNeeded(readSegNo))
+ RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
+ }
+ }
+
+ close(readFile);
+ readFile = -1;
+ readSource = XLOG_FROM_ANY;
+ }
+
+ XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
+
+retry:
+ /* See if we need to retrieve more data */
+ if (readFile < 0 ||
+ (readSource == XLOG_FROM_STREAM &&
+ flushedUpto < targetPagePtr + reqLen))
+ {
+ if (readFile >= 0 &&
+ xlogreader->nonblocking &&
+ readSource == XLOG_FROM_STREAM &&
+ flushedUpto < targetPagePtr + reqLen)
+ return XLREAD_WOULDBLOCK;
+
+ switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
+ private->randAccess,
+ private->fetching_ckpt,
+ targetRecPtr,
+ private->replayTLI,
+ xlogreader->EndRecPtr,
+ xlogreader->nonblocking))
+ {
+ case XLREAD_WOULDBLOCK:
+ return XLREAD_WOULDBLOCK;
+ case XLREAD_FAIL:
+ if (readFile >= 0)
+ close(readFile);
+ readFile = -1;
+ readLen = 0;
+ readSource = XLOG_FROM_ANY;
+ return XLREAD_FAIL;
+ case XLREAD_SUCCESS:
+ break;
+ }
+ }
+
+ /*
+ * At this point, we have the right segment open and if we're streaming we
+ * know the requested record is in it.
+ */
+ Assert(readFile != -1);
+
+ /*
+ * If the current segment is being streamed from the primary, calculate
+ * how much of the current page we have received already. We know the
+ * requested record has been received, but this is for the benefit of
+ * future calls, to allow quick exit at the top of this function.
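+ * If the flush pointer has already moved past this page, the whole page
+ * has been received; otherwise only the part up to flushedUpto is valid.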
+ */
+ if (readSource == XLOG_FROM_STREAM)
+ {
+ if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
+ readLen = XLOG_BLCKSZ;
+ else
+ readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
+ targetPageOff;
+ }
+ else
+ readLen = XLOG_BLCKSZ;
+
+ /* Read the requested page */
+ readOff = targetPageOff;
+
+ pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
+ r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
+ if (r != XLOG_BLCKSZ)
+ {
+ char fname[MAXFNAMELEN];
+ int save_errno = errno;
+
+ pgstat_report_wait_end();
+ XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
+ if (r < 0)
+ {
+ errno = save_errno;
+ ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
+ (errcode_for_file_access(),
+ errmsg("could not read from log segment %s, offset %u: %m",
+ fname, readOff)));
+ }
+ else
+ ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not read from log segment %s, offset %u: read %d of %zu",
+ fname, readOff, r, (Size) XLOG_BLCKSZ)));
+ goto next_record_is_invalid;
+ }
+ pgstat_report_wait_end();
+
+ Assert(targetSegNo == readSegNo);
+ Assert(targetPageOff == readOff);
+ Assert(reqLen <= readLen);
+
+ xlogreader->seg.ws_tli = curFileTLI;
+
+ /*
+ * Check the page header immediately, so that we can retry immediately if
+ * it's not valid. This may seem unnecessary, because ReadPageInternal()
+ * validates the page header anyway, and would propagate the failure up to
+ * ReadRecord(), which would retry. However, there's a corner case with
+ * continuation records, if a record is split across two pages such that
+ * we would need to read the two pages from different sources. For
+ * example, imagine a scenario where a streaming replica is started up,
+ * and replay reaches a record that's split across two WAL segments. The
+ * first page is only available locally, in pg_wal, because it's already
+ * been recycled on the primary. The second page, however, is not present
+ * in pg_wal, and we should stream it from the primary. There is a
+ * recycled WAL segment present in pg_wal, with garbage contents, however.
+ * We would read the first page from the local WAL segment, but when
+ * reading the second page, we would read the bogus, recycled, WAL
+ * segment. If we didn't catch that case here, we would never recover,
+ * because ReadRecord() would retry reading the whole record from the
+ * beginning.
+ *
+ * Of course, this only catches errors in the page header, which is what
+ * happens in the case of a recycled WAL segment. Other kinds of errors or
+ * corruption still have the same problem. But this at least fixes the
+ * common case, which can happen as part of normal operation.
+ *
+ * Validating the page header is cheap enough that doing it twice
+ * shouldn't be a big deal from a performance point of view.
+ *
+ * When not in standby mode, an invalid page header should cause recovery
+ * to end, not retry reading the page, so we don't need to validate the
+ * page header here for the retry. Instead, ReadPageInternal() is
+ * responsible for the validation.
+ */
+ if (StandbyMode &&
+ !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
+ {
+ /*
+ * Emit this error right now then retry this page immediately. Use
+ * errmsg_internal() because the message was already translated.
+ */
+ if (xlogreader->errormsg_buf[0])
+ ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
+ (errmsg_internal("%s", xlogreader->errormsg_buf)));
+
+ /* reset any error XLogReaderValidatePageHeader() might have set */
+ XLogReaderResetError(xlogreader);
+ goto next_record_is_invalid;
+ }
+
+ return readLen;
+
+next_record_is_invalid:
+
+ /*
+ * If we're reading ahead, give up fast. Retries and error reporting will
+ * be handled by a later read when recovery catches up to this point.
+ */
+ if (xlogreader->nonblocking)
+ return XLREAD_WOULDBLOCK;
+
+ lastSourceFailed = true;
+
+ if (readFile >= 0)
+ close(readFile);
+ readFile = -1;
+ readLen = 0;
+ readSource = XLOG_FROM_ANY;
+
+ /* In standby-mode, keep trying */
+ if (StandbyMode)
+ goto retry;
+ else
+ return XLREAD_FAIL;
+}
+
+/*
+ * Open the WAL segment containing WAL location 'RecPtr'.
+ *
+ * The segment can be fetched via restore_command, or via walreceiver having
+ * streamed the record, or it can already be present in pg_wal. Checking
+ * pg_wal is mainly for crash recovery, but it will be polled in standby mode
+ * too, in case someone copies a new segment directly to pg_wal. That is not
+ * documented or recommended, though.
+ *
+ * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
+ * prepare to read WAL starting from RedoStartLSN after this.
+ *
+ * 'RecPtr' might not point to the beginning of the record we're interested
+ * in, it might also point to the page or segment header. In that case,
+ * 'tliRecPtr' is the position of the WAL record we're interested in. It is
+ * used to decide which timeline to stream the requested WAL from.
+ *
+ * 'replayLSN' is the current replay LSN, so that if we scan for new
+ * timelines, we can reject a switch to a timeline that branched off before
+ * this point.
+ *
+ * If the record is not immediately available and we're not in standby
+ * mode, the function returns XLREAD_FAIL. In standby mode, it waits for
+ * the record to become available.
+ *
+ * When the requested record becomes available, the function opens the file
+ * containing it (if not open already), and returns XLREAD_SUCCESS. When end
+ * of standby mode is triggered by the user, and there is no more WAL
+ * available, returns XLREAD_FAIL.
+ *
+ * If nonblocking is true, then give up immediately if we can't satisfy the
+ * request, returning XLREAD_WOULDBLOCK instead of waiting.
+ */
+static XLogPageReadResult
+WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
+ bool fetching_ckpt, XLogRecPtr tliRecPtr,
+ TimeLineID replayTLI, XLogRecPtr replayLSN,
+ bool nonblocking)
+{
+ static TimestampTz last_fail_time = 0;
+ TimestampTz now;
+ bool streaming_reply_sent = false;
+
+ /*-------
+ * Standby mode is implemented by a state machine:
+ *
+ * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
+ * pg_wal (XLOG_FROM_PG_WAL)
+ * 2. Check trigger file
+ * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
+ * 4. Rescan timelines
+ * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
+ *
+ * Failure to read from the current source advances the state machine to
+ * the next state.
+ *
+ * 'currentSource' indicates the current state. There are no currentSource
+ * values for "check trigger", "rescan timelines", and "sleep" states,
+ * those actions are taken when reading from the previous source fails, as
+ * part of advancing to the next state.
+ *
+ * If standby mode is turned off while reading WAL from stream, we move
+ * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
+ * the files (which would be required at end of recovery, e.g., timeline
+ * history file) from archive or pg_wal. We don't need to kill WAL receiver
+ * here because it's already stopped when standby mode is turned off at
+ * the end of recovery.
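+ *
+ * For example, a standby whose streaming connection drops cycles
+ * XLOG_FROM_STREAM -> (rescan timelines, sleep) -> XLOG_FROM_ARCHIVE ->
+ * (check trigger) -> XLOG_FROM_STREAM until WAL becomes available again
+ * from some source or a promotion is triggered.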
+ *-------
+ */
+ if (!InArchiveRecovery)
+ currentSource = XLOG_FROM_PG_WAL;
+ else if (currentSource == XLOG_FROM_ANY ||
+ (!StandbyMode && currentSource == XLOG_FROM_STREAM))
+ {
+ lastSourceFailed = false;
+ currentSource = XLOG_FROM_ARCHIVE;
+ }
+
+ for (;;)
+ {
+ XLogSource oldSource = currentSource;
+ bool startWalReceiver = false;
+
+ /*
+ * First check if we failed to read from the current source, and
+ * advance the state machine if so. The failure to read might've
+ * happened outside this function, e.g when a CRC check fails on a
+ * record, or within this loop.
+ */
+ if (lastSourceFailed)
+ {
+ /*
+ * Don't allow any retry loops to occur during nonblocking
+ * readahead. Let the caller process everything that has been
+ * decoded already first.
+ */
+ if (nonblocking)
+ return XLREAD_WOULDBLOCK;
+
+ switch (currentSource)
+ {
+ case XLOG_FROM_ARCHIVE:
+ case XLOG_FROM_PG_WAL:
+
+ /*
+ * Check to see if the trigger file exists. Note that we
+ * do this only after failure, so when you create the
+ * trigger file, we still finish replaying as much as we
+ * can from archive and pg_wal before failover.
+ */
+ if (StandbyMode && CheckForStandbyTrigger())
+ {
+ XLogShutdownWalRcv();
+ return XLREAD_FAIL;
+ }
+
+ /*
+ * Not in standby mode, and we've now tried the archive
+ * and pg_wal.
+ */
+ if (!StandbyMode)
+ return XLREAD_FAIL;
+
+ /*
+ * Move to XLOG_FROM_STREAM state, and set to start a
+ * walreceiver if necessary.
+ */
+ currentSource = XLOG_FROM_STREAM;
+ startWalReceiver = true;
+ break;
+
+ case XLOG_FROM_STREAM:
+
+ /*
+ * Failure while streaming. Most likely, we got here
+ * because streaming replication was terminated, or
+ * promotion was triggered. But we also get here if we
+ * find an invalid record in the WAL streamed from the
+ * primary, in which case something is seriously wrong.
+ * There's little chance that the problem will just go
+ * away, but PANIC is not good for availability either,
+ * especially in hot standby mode. So, we treat that the
+ * same as disconnection, and retry from archive/pg_wal
+ * again. The WAL in the archive should be identical to
+ * what was streamed, so it's unlikely that it helps, but
+ * one can hope...
+ */
+
+ /*
+ * We should be able to move to XLOG_FROM_STREAM only in
+ * standby mode.
+ */
+ Assert(StandbyMode);
+
+ /*
+ * Before we leave XLOG_FROM_STREAM state, make sure that
+ * walreceiver is not active, so that it won't overwrite
+ * WAL that we restore from archive.
+ */
+ XLogShutdownWalRcv();
+
+ /*
+ * Before we sleep, re-scan for possible new timelines if
+ * we were requested to recover to the latest timeline.
+ */
+ if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
+ {
+ if (rescanLatestTimeLine(replayTLI, replayLSN))
+ {
+ currentSource = XLOG_FROM_ARCHIVE;
+ break;
+ }
+ }
+
+ /*
+ * XLOG_FROM_STREAM is the last state in our state
+ * machine, so we've exhausted all the options for
+ * obtaining the requested WAL. We're going to loop back
+ * and retry from the archive, but if it hasn't been long
+ * since last attempt, sleep wal_retrieve_retry_interval
+ * milliseconds to avoid busy-waiting.
+ */
+ now = GetCurrentTimestamp();
+ if (!TimestampDifferenceExceeds(last_fail_time, now,
+ wal_retrieve_retry_interval))
+ {
+ long wait_time;
+
+ wait_time = wal_retrieve_retry_interval -
+ TimestampDifferenceMilliseconds(last_fail_time, now);
+
+ elog(LOG, "waiting for WAL to become available at %X/%X",
+ LSN_FORMAT_ARGS(RecPtr));
+
+ /* Do background tasks that might benefit us later. */
+ KnownAssignedTransactionIdsIdleMaintenance();
+
+ (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
+ WL_LATCH_SET | WL_TIMEOUT |
+ WL_EXIT_ON_PM_DEATH,
+ wait_time,
+ WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
+ ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
+ now = GetCurrentTimestamp();
+
+ /* Handle interrupt signals of startup process */
+ HandleStartupProcInterrupts();
+ }
+ last_fail_time = now;
+ currentSource = XLOG_FROM_ARCHIVE;
+ break;
+
+ default:
+ elog(ERROR, "unexpected WAL source %d", currentSource);
+ }
+ }
+ else if (currentSource == XLOG_FROM_PG_WAL)
+ {
+ /*
+ * We just successfully read a file in pg_wal. We prefer files in
+ * the archive over ones in pg_wal, so try the next file again
+ * from the archive first.
+ */
+ if (InArchiveRecovery)
+ currentSource = XLOG_FROM_ARCHIVE;
+ }
+
+ if (currentSource != oldSource)
+ elog(DEBUG2, "switched WAL source from %s to %s after %s",
+ xlogSourceNames[oldSource], xlogSourceNames[currentSource],
+ lastSourceFailed ? "failure" : "success");
+
+ /*
+ * We've now handled possible failure. Try to read from the chosen
+ * source.
+ */
+ lastSourceFailed = false;
+
+ switch (currentSource)
+ {
+ case XLOG_FROM_ARCHIVE:
+ case XLOG_FROM_PG_WAL:
+
+ /*
+ * WAL receiver must not be running when reading WAL from
+ * archive or pg_wal.
+ */
+ Assert(!WalRcvStreaming());
+
+ /* Close any old file we might have open. */
+ if (readFile >= 0)
+ {
+ close(readFile);
+ readFile = -1;
+ }
+ /* Reset curFileTLI if random fetch. */
+ if (randAccess)
+ curFileTLI = 0;
+
+ /*
+ * Try to restore the file from archive, or read an existing
+ * file from pg_wal.
+ */
+ readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
+ currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
+ currentSource);
+ if (readFile >= 0)
+ return XLREAD_SUCCESS; /* success! */
+
+ /*
+ * Nope, not found in archive or pg_wal.
+ */
+ lastSourceFailed = true;
+ break;
+
+ case XLOG_FROM_STREAM:
+ {
+ bool havedata;
+
+ /*
+ * We should be able to move to XLOG_FROM_STREAM only in
+ * standby mode.
+ */
+ Assert(StandbyMode);
+
+ /*
+ * First, shutdown walreceiver if its restart has been
+ * requested -- but no point if we're already slated for
+ * starting it.
+ */
+ if (pendingWalRcvRestart && !startWalReceiver)
+ {
+ XLogShutdownWalRcv();
+
+ /*
+ * Re-scan for possible new timelines if we were
+ * requested to recover to the latest timeline.
+ */
+ if (recoveryTargetTimeLineGoal ==
+ RECOVERY_TARGET_TIMELINE_LATEST)
+ rescanLatestTimeLine(replayTLI, replayLSN);
+
+ startWalReceiver = true;
+ }
+ pendingWalRcvRestart = false;
+
+ /*
+ * Launch walreceiver if needed.
+ *
+ * If fetching_ckpt is true, RecPtr points to the initial
+ * checkpoint location. In that case, we use RedoStartLSN
+ * as the streaming start position instead of RecPtr, so
+ * that when we later jump backwards to start redo at
+ * RedoStartLSN, we will have the logs streamed already.
+ */
+ if (startWalReceiver &&
+ PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
+ {
+ XLogRecPtr ptr;
+ TimeLineID tli;
+
+ if (fetching_ckpt)
+ {
+ ptr = RedoStartLSN;
+ tli = RedoStartTLI;
+ }
+ else
+ {
+ ptr = RecPtr;
+
+ /*
+ * Use the record begin position to determine the
+ * TLI, rather than the position we're reading.
+ */
+ tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
+
+ if (curFileTLI > 0 && tli < curFileTLI)
+ elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
+ LSN_FORMAT_ARGS(tliRecPtr),
+ tli, curFileTLI);
+ }
+ curFileTLI = tli;
+ SetInstallXLogFileSegmentActive();
+ RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
+ PrimarySlotName,
+ wal_receiver_create_temp_slot);
+ flushedUpto = 0;
+ }
+
+ /*
+ * Check if WAL receiver is active or wait to start up.
+ */
+ if (!WalRcvStreaming())
+ {
+ lastSourceFailed = true;
+ break;
+ }
+
+ /*
+ * Walreceiver is active, so see if new data has arrived.
+ *
+ * We only advance XLogReceiptTime when we obtain fresh
+ * WAL from walreceiver and observe that we had already
+ * processed everything before the most recent "chunk"
+ * that it flushed to disk. In steady state where we are
+ * keeping up with the incoming data, XLogReceiptTime will
+ * be updated on each cycle. When we are behind,
+ * XLogReceiptTime will not advance, so the grace time
+ * allotted to conflicting queries will decrease.
+ */
+ if (RecPtr < flushedUpto)
+ havedata = true;
+ else
+ {
+ XLogRecPtr latestChunkStart;
+
+ flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
+ if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
+ {
+ havedata = true;
+ if (latestChunkStart <= RecPtr)
+ {
+ XLogReceiptTime = GetCurrentTimestamp();
+ SetCurrentChunkStartTime(XLogReceiptTime);
+ }
+ }
+ else
+ havedata = false;
+ }
+ if (havedata)
+ {
+ /*
+ * Great, streamed far enough. Open the file if it's
+ * not open already. Also read the timeline history
+ * file if we haven't initialized timeline history
+ * yet; it should be streamed over and present in
+ * pg_wal by now. Use XLOG_FROM_STREAM so that source
+ * info is set correctly and XLogReceiptTime isn't
+ * changed.
+ *
+ * NB: We must set readTimeLineHistory based on
+ * recoveryTargetTLI, not receiveTLI. Normally they'll
+ * be the same, but if recovery_target_timeline is
+ * 'latest' and archiving is configured, then it's
+ * possible that we managed to retrieve one or more
+ * new timeline history files from the archive,
+ * updating recoveryTargetTLI.
+ */
+ if (readFile < 0)
+ {
+ if (!expectedTLEs)
+ expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
+ readFile = XLogFileRead(readSegNo, PANIC,
+ receiveTLI,
+ XLOG_FROM_STREAM, false);
+ Assert(readFile >= 0);
+ }
+ else
+ {
+ /* just make sure source info is correct... */
+ readSource = XLOG_FROM_STREAM;
+ XLogReceiptSource = XLOG_FROM_STREAM;
+ return XLREAD_SUCCESS;
+ }
+ break;
+ }
+
+ /* In nonblocking mode, return rather than sleeping. */
+ if (nonblocking)
+ return XLREAD_WOULDBLOCK;
+
+ /*
+ * Data not here yet. Check for trigger, then wait for
+ * walreceiver to wake us up when new WAL arrives.
+ */
+ if (CheckForStandbyTrigger())
+ {
+ /*
+ * Note that we don't return XLREAD_FAIL immediately
+ * here. After being triggered, we still want to
+ * replay all the WAL that was already streamed. It's
+ * in pg_wal now, so we just treat this as a failure,
+ * and the state machine will move on to replay the
+ * streamed WAL from pg_wal, and then recheck the
+ * trigger and exit replay.
+ */
+ lastSourceFailed = true;
+ break;
+ }
+
+ /*
+ * Since we have replayed everything we have received so
+ * far and are about to start waiting for more WAL, let's
+ * tell the upstream server our replay location now so
+ * that pg_stat_replication doesn't show stale
+ * information.
+ */
+ if (!streaming_reply_sent)
+ {
+ WalRcvForceReply();
+ streaming_reply_sent = true;
+ }
+
+ /* Do any background tasks that might benefit us later. */
+ KnownAssignedTransactionIdsIdleMaintenance();
+
+ /* Update pg_stat_recovery_prefetch before sleeping. */
+ XLogPrefetcherComputeStats(xlogprefetcher);
+
+ /*
+ * Wait for more WAL to arrive. Time out after 5 seconds
+ * to react to a trigger file promptly and to check if the
+ * WAL receiver is still active.
+ */
+ (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
+ WL_LATCH_SET | WL_TIMEOUT |
+ WL_EXIT_ON_PM_DEATH,
+ 5000L, WAIT_EVENT_RECOVERY_WAL_STREAM);
+ ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
+ break;
+ }
+
+ default:
+ elog(ERROR, "unexpected WAL source %d", currentSource);
+ }
+
+ /*
+ * Check for recovery pause here so that we can confirm more quickly
+ * that a requested pause has actually taken effect.
+ */
+ if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
+ RECOVERY_NOT_PAUSED)
+ recoveryPausesHere(false);
+
+ /*
+ * This possibly-long loop needs to handle interrupts of startup
+ * process.
+ */
+ HandleStartupProcInterrupts();
+ }
+
+ return XLREAD_FAIL; /* not reached */
+}
+
+
+/*
+ * Determine what log level should be used to report a corrupt WAL record
+ * in the current WAL page, previously read by XLogPageRead().
+ *
+ * 'emode' is the error mode that would be used to report a file-not-found
+ * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
+ * we're retrying the exact same record that we've tried previously, only
+ * complain the first time to keep the noise down. However, we only do this
+ * when reading from pg_wal, because we don't expect any invalid records in
+ * the archive or in records streamed from the primary. Files in the archive
+ * should be complete, and when streaming we should never hit the end of WAL,
+ * because we stop and wait for more WAL to arrive before replaying it.
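+ *
+ * The net effect is that a LOG-level complaint about a given record read
+ * from pg_wal is emitted only once; retries of the same record are demoted
+ * to DEBUG1.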
+ *
+ * NOTE: This function remembers the RecPtr value it was last called with,
+ * to suppress repeated messages about the same record. Only call this when
+ * you are about to ereport(), or you might cause a later message to be
+ * erroneously suppressed.
+ */
+static int
+emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
+{
+ static XLogRecPtr lastComplaint = 0;
+
+ if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
+ {
+ if (RecPtr == lastComplaint)
+ emode = DEBUG1;
+ else
+ lastComplaint = RecPtr;
+ }
+ return emode;
+}
+
+
+/*
+ * Subroutine to try to fetch and validate a prior checkpoint record.
+ *
+ * whichChkpt identifies the checkpoint (merely for reporting purposes).
+ * 1 for "primary", 0 for "other" (backup_label)
+ */
+static XLogRecord *
+ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
+ int whichChkpt, bool report, TimeLineID replayTLI)
+{
+ XLogRecord *record;
+ uint8 info;
+
+ Assert(xlogreader != NULL);
+
+ if (!XRecOffIsValid(RecPtr))
+ {
+ if (!report)
+ return NULL;
+
+ switch (whichChkpt)
+ {
+ case 1:
+ ereport(LOG,
+ (errmsg("invalid primary checkpoint link in control file")));
+ break;
+ default:
+ ereport(LOG,
+ (errmsg("invalid checkpoint link in backup_label file")));
+ break;
+ }
+ return NULL;
+ }
+
+ XLogPrefetcherBeginRead(xlogprefetcher, RecPtr);
+ record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
+
+ if (record == NULL)
+ {
+ if (!report)
+ return NULL;
+
+ switch (whichChkpt)
+ {
+ case 1:
+ ereport(LOG,
+ (errmsg("invalid primary checkpoint record")));
+ break;
+ default:
+ ereport(LOG,
+ (errmsg("invalid checkpoint record")));
+ break;
+ }
+ return NULL;
+ }
+ if (record->xl_rmid != RM_XLOG_ID)
+ {
+ switch (whichChkpt)
+ {
+ case 1:
+ ereport(LOG,
+ (errmsg("invalid resource manager ID in primary checkpoint record")));
+ break;
+ default:
+ ereport(LOG,
+ (errmsg("invalid resource manager ID in checkpoint record")));
+ break;
+ }
+ return NULL;
+ }
+ info = record->xl_info & ~XLR_INFO_MASK;
+ if (info != XLOG_CHECKPOINT_SHUTDOWN &&
+ info != XLOG_CHECKPOINT_ONLINE)
+ {
+ switch (whichChkpt)
+ {
+ case 1:
+ ereport(LOG,
+ (errmsg("invalid xl_info in primary checkpoint record")));
+ break;
+ default:
+ ereport(LOG,
+ (errmsg("invalid xl_info in checkpoint record")));
+ break;
+ }
+ return NULL;
+ }
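+
+ /*
+ * Both checkpoint record types carry a fixed-size CheckPoint payload,
+ * so any other total length indicates a corrupt record.
+ */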
+ if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
+ {
+ switch (whichChkpt)
+ {
+ case 1:
+ ereport(LOG,
+ (errmsg("invalid length of primary checkpoint record")));
+ break;
+ default:
+ ereport(LOG,
+ (errmsg("invalid length of checkpoint record")));
+ break;
+ }
+ return NULL;
+ }
+ return record;
+}
+
+/*
+ * Scan for new timelines that might have appeared in the archive since we
+ * started recovery.
+ *
+ * If there are any, the function changes recovery target TLI to the latest
+ * one and returns 'true'.
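+ *
+ * If the newest timeline is unusable (the current timeline is not in its
+ * history, or it forked off before the current replay position), the target
+ * is left unchanged and 'false' is returned.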
+ */
+static bool
+rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
+{
+ List *newExpectedTLEs;
+ bool found;
+ ListCell *cell;
+ TimeLineID newtarget;
+ TimeLineID oldtarget = recoveryTargetTLI;
+ TimeLineHistoryEntry *currentTle = NULL;
+
+ newtarget = findNewestTimeLine(recoveryTargetTLI);
+ if (newtarget == recoveryTargetTLI)
+ {
+ /* No new timelines found */
+ return false;
+ }
+
+ /*
+ * Determine the list of expected TLIs for the new TLI
+ */
+
+ newExpectedTLEs = readTimeLineHistory(newtarget);
+
+ /*
+ * If the current timeline is not part of the history of the new timeline,
+ * we cannot proceed to it.
+ */
+ found = false;
+ foreach(cell, newExpectedTLEs)
+ {
+ currentTle = (TimeLineHistoryEntry *) lfirst(cell);
+
+ if (currentTle->tli == recoveryTargetTLI)
+ {
+ found = true;
+ break;
+ }
+ }
+ if (!found)
+ {
+ ereport(LOG,
+ (errmsg("new timeline %u is not a child of database system timeline %u",
+ newtarget,
+ replayTLI)));
+ return false;
+ }
+
+ /*
+ * The current timeline was found in the history file, but check that the
+ * next timeline was forked off from it *after* the current recovery
+ * location.
+ */
+ if (currentTle->end < replayLSN)
+ {
+ ereport(LOG,
+ (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
+ newtarget,
+ replayTLI,
+ LSN_FORMAT_ARGS(replayLSN))));
+ return false;
+ }
+
+ /* The new timeline history seems valid. Switch target */
+ recoveryTargetTLI = newtarget;
+ list_free_deep(expectedTLEs);
+ expectedTLEs = newExpectedTLEs;
+
+ /*
+ * As in StartupXLOG(), try to ensure we have all the history files
+ * between the old target and new target in pg_wal.
+ */
+ restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
+
+ ereport(LOG,
+ (errmsg("new target timeline is %u",
+ recoveryTargetTLI)));
+
+ return true;
+}
+
+
+/*
+ * Open a logfile segment for reading (during recovery).
+ *
+ * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
+ * Otherwise, it's assumed to be already available in pg_wal.
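+ *
+ * Returns an open file descriptor on success, or -1 if the segment could
+ * not be obtained. A missing file PANICs instead, unless 'notfoundOk' is
+ * true.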
+ */
+static int
+XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
+ XLogSource source, bool notfoundOk)
+{
+ char xlogfname[MAXFNAMELEN];
+ char activitymsg[MAXFNAMELEN + 16];
+ char path[MAXPGPATH];
+ int fd;
+
+ XLogFileName(xlogfname, tli, segno, wal_segment_size);
+
+ switch (source)
+ {
+ case XLOG_FROM_ARCHIVE:
+ /* Report recovery progress in PS display */
+ snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
+ xlogfname);
+ set_ps_display(activitymsg);
+
+ if (!RestoreArchivedFile(path, xlogfname,
+ "RECOVERYXLOG",
+ wal_segment_size,
+ InRedo))
+ return -1;
+ break;
+
+ case XLOG_FROM_PG_WAL:
+ case XLOG_FROM_STREAM:
+ XLogFilePath(path, tli, segno, wal_segment_size);
+ break;
+
+ default:
+ elog(ERROR, "invalid XLogFileRead source %d", source);
+ }
+
+ /*
+ * If the segment was fetched from archival storage, replace the existing
+ * xlog segment (if any) with the archival version.
+ */
+ if (source == XLOG_FROM_ARCHIVE)
+ {
+ Assert(!IsInstallXLogFileSegmentActive());
+ KeepFileRestoredFromArchive(path, xlogfname);
+
+ /*
+ * Set path to point at the new file in pg_wal.
+ */
+ snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
+ }
+
+ fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
+ if (fd >= 0)
+ {
+ /* Success! */
+ curFileTLI = tli;
+
+ /* Report recovery progress in PS display */
+ snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
+ xlogfname);
+ set_ps_display(activitymsg);
+
+ /* Track source of data in assorted state variables */
+ readSource = source;
+ XLogReceiptSource = source;
+ /* In FROM_STREAM case, caller tracks receipt time, not me */
+ if (source != XLOG_FROM_STREAM)
+ XLogReceiptTime = GetCurrentTimestamp();
+
+ return fd;
+ }
+ if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+ return -1;
+}
+
+/*
+ * Open a logfile segment for reading (during recovery).
+ *
+ * This version searches for the segment with any TLI listed in expectedTLEs.
+ */
+static int
+XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source)
+{
+ char path[MAXPGPATH];
+ ListCell *cell;
+ int fd;
+ List *tles;
+
+ /*
+ * Loop looking for a suitable timeline ID: we might need to read any of
+ * the timelines listed in expectedTLEs.
+ *
+ * We expect curFileTLI on entry to be the TLI of the preceding file in
+ * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
+ * to go backwards; this prevents us from picking up the wrong file when a
+ * parent timeline extends to higher segment numbers than the child we
+ * want to read.
+ *
+ * If we haven't read the timeline history file yet, read it now, so that
+ * we know which TLIs to scan. We don't save the list in expectedTLEs,
+ * however, unless we actually find a valid segment. That way if there is
+ * neither a timeline history file nor a WAL segment in the archive, and
+ * streaming replication is set up, we'll read the timeline history file
+ * streamed from the primary when we start streaming, instead of
+ * recovering with a dummy history generated here.
+ */
+ if (expectedTLEs)
+ tles = expectedTLEs;
+ else
+ tles = readTimeLineHistory(recoveryTargetTLI);
+
+ foreach(cell, tles)
+ {
+ TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
+ TimeLineID tli = hent->tli;
+
+ if (tli < curFileTLI)
+ break; /* don't bother looking at too-old TLIs */
+
+ /*
+ * Skip this timeline if the segment we want to read does not
+ * belong to it.
+ */
+ if (hent->begin != InvalidXLogRecPtr)
+ {
+ XLogSegNo beginseg = 0;
+
+ XLByteToSeg(hent->begin, beginseg, wal_segment_size);
+
+ /*
+ * A segment that doesn't belong to this timeline is either
+ * older than the segment the timeline started at, or newer
+ * than the one it ended at. It's sufficient to check only the
+ * starting segment here: since the timelines are scanned in
+ * descending order in this loop, any segment newer than a
+ * timeline's ending segment belongs to a newer timeline and
+ * has already been tried above. So there's no need to check
+ * the ending segment here.
+ */
+ if (segno < beginseg)
+ continue;
+ }
+
+ if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
+ {
+ fd = XLogFileRead(segno, emode, tli,
+ XLOG_FROM_ARCHIVE, true);
+ if (fd != -1)
+ {
+ elog(DEBUG1, "got WAL segment from archive");
+ if (!expectedTLEs)
+ expectedTLEs = tles;
+ return fd;
+ }
+ }
+
+ if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
+ {
+ fd = XLogFileRead(segno, emode, tli,
+ XLOG_FROM_PG_WAL, true);
+ if (fd != -1)
+ {
+ if (!expectedTLEs)
+ expectedTLEs = tles;
+ return fd;
+ }
+ }
+ }
+
+ /* Couldn't find it. For simplicity, complain about front timeline */
+ XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
+ errno = ENOENT;
+ ereport(emode,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+ return -1;
+}
+
+/*
+ * Set flag to signal the walreceiver to restart. (The startup process calls
+ * this on noticing a relevant configuration change.)
+ */
+void
+StartupRequestWalReceiverRestart(void)
+{
+ if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
+ {
+ ereport(LOG,
+ (errmsg("WAL receiver process shutdown requested")));
+
+ pendingWalRcvRestart = true;
+ }
+}
+
+
+/*
+ * Has a standby promotion already been triggered?
+ *
+ * Unlike CheckForStandbyTrigger(), this works in any process
+ * that's connected to shared memory.
+ */
+bool
+PromoteIsTriggered(void)
+{
+ /*
+ * We check shared state each time only until a standby promotion is
+ * triggered. We can't trigger a promotion again, so there's no need to
+ * keep checking after the shared variable has once been seen true.
+ */
+ if (LocalPromoteIsTriggered)
+ return true;
+
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ return LocalPromoteIsTriggered;
+}
+
+static void
+SetPromoteIsTriggered(void)
+{
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ XLogRecoveryCtl->SharedPromoteIsTriggered = true;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ /*
+ * Mark the recovery pause state as 'not paused' because the paused state
+ * ends and promotion continues if a promotion is triggered while recovery
+ * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
+ * return 'paused' while a promotion is ongoing.
+ */
+ SetRecoveryPause(false);
+
+ LocalPromoteIsTriggered = true;
+}
+
+/*
+ * Check to see whether the user-specified trigger file exists and whether a
+ * promote request has arrived. If either condition holds, return true.
+ */
+static bool
+CheckForStandbyTrigger(void)
+{
+ struct stat stat_buf;
+
+ if (LocalPromoteIsTriggered)
+ return true;
+
+ if (IsPromoteSignaled() && CheckPromoteSignal())
+ {
+ ereport(LOG, (errmsg("received promote request")));
+ RemovePromoteSignalFiles();
+ ResetPromoteSignaled();
+ SetPromoteIsTriggered();
+ return true;
+ }
+
+ if (PromoteTriggerFile == NULL || strcmp(PromoteTriggerFile, "") == 0)
+ return false;
+
+ if (stat(PromoteTriggerFile, &stat_buf) == 0)
+ {
+ ereport(LOG,
+ (errmsg("promote trigger file found: %s", PromoteTriggerFile)));
+ unlink(PromoteTriggerFile);
+ SetPromoteIsTriggered();
+ return true;
+ }
+ else if (errno != ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not stat promote trigger file \"%s\": %m",
+ PromoteTriggerFile)));
+
+ return false;
+}
+
+/*
+ * Remove the files signaling a standby promotion request.
+ */
+void
+RemovePromoteSignalFiles(void)
+{
+ unlink(PROMOTE_SIGNAL_FILE);
+}
+
+/*
+ * Check to see if a promote request has arrived.
+ */
+bool
+CheckPromoteSignal(void)
+{
+ struct stat stat_buf;
+
+ if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+ return true;
+
+ return false;
+}
+
+/*
+ * Wake up startup process to replay newly arrived WAL, or to notice that
+ * failover has been requested.
+ */
+void
+WakeupRecovery(void)
+{
+ SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
+}
+
+/*
+ * Schedule a walreceiver wakeup in the main recovery loop.
+ */
+void
+XLogRequestWalReceiverReply(void)
+{
+ doRequestWalReceiverReply = true;
+}
+
+/*
+ * Is HotStandby active yet? This is only important in special backends
+ * since normal backends won't ever be able to connect until this returns
+ * true. Postmaster knows this by way of signal, not via shared memory.
+ *
+ * Unlike testing standbyState, this works in any process that's connected to
+ * shared memory. (And note that standbyState alone doesn't tell the truth
+ * anyway.)
+ */
+bool
+HotStandbyActive(void)
+{
+ /*
+ * We check shared state each time only until Hot Standby is active. We
+ * can't de-activate Hot Standby, so there's no need to keep checking
+ * after the shared variable has once been seen true.
+ */
+ if (LocalHotStandbyActive)
+ return true;
+ else
+ {
+ /* spinlock is essential on machines with weak memory ordering! */
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ return LocalHotStandbyActive;
+ }
+}
+
+/*
+ * Like HotStandbyActive(), but to be used only in WAL replay code,
+ * where we don't need to ask any other process what the state is.
+ */
+static bool
+HotStandbyActiveInReplay(void)
+{
+ Assert(AmStartupProcess() || !IsPostmasterEnvironment);
+ return LocalHotStandbyActive;
+}
+
+/*
+ * Get latest redo apply position.
+ *
+ * Exported to allow WALReceiver to read the pointer directly.
+ */
+XLogRecPtr
+GetXLogReplayRecPtr(TimeLineID *replayTLI)
+{
+ XLogRecPtr recptr;
+ TimeLineID tli;
+
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ recptr = XLogRecoveryCtl->lastReplayedEndRecPtr;
+ tli = XLogRecoveryCtl->lastReplayedTLI;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ if (replayTLI)
+ *replayTLI = tli;
+ return recptr;
+}
+
+
+/*
+ * Get the position of the last applied record, or of the record being applied.
+ *
+ * This is different from GetXLogReplayRecPtr() in that if a WAL
+ * record is currently being applied, this includes that record.
+ */
+XLogRecPtr
+GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
+{
+ XLogRecPtr recptr;
+ TimeLineID tli;
+
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ recptr = XLogRecoveryCtl->replayEndRecPtr;
+ tli = XLogRecoveryCtl->replayEndTLI;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ if (replayEndTLI)
+ *replayEndTLI = tli;
+ return recptr;
+}
+
+/*
+ * Save timestamp of latest processed commit/abort record.
+ *
+ * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
+ * seen by processes other than the startup process. Note in particular
+ * that CreateRestartPoint is executed in the checkpointer.
+ */
+static void
+SetLatestXTime(TimestampTz xtime)
+{
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ XLogRecoveryCtl->recoveryLastXTime = xtime;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+}
+
+/*
+ * Fetch timestamp of latest processed commit/abort record.
+ */
+TimestampTz
+GetLatestXTime(void)
+{
+ TimestampTz xtime;
+
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ xtime = XLogRecoveryCtl->recoveryLastXTime;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ return xtime;
+}
+
+/*
+ * Save timestamp of the next chunk of WAL records to apply.
+ *
+ * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
+ * seen by all backends.
+ */
+static void
+SetCurrentChunkStartTime(TimestampTz xtime)
+{
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ XLogRecoveryCtl->currentChunkStartTime = xtime;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+}
+
+/*
+ * Fetch timestamp of the current chunk of WAL records being applied.
+ */
+TimestampTz
+GetCurrentChunkReplayStartTime(void)
+{
+ TimestampTz xtime;
+
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ xtime = XLogRecoveryCtl->currentChunkStartTime;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ return xtime;
+}
+
+/*
+ * Returns time of receipt of current chunk of XLOG data, as well as
+ * whether it was received from streaming replication or from archives.
+ */
+void
+GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
+{
+ /*
+ * This must be executed in the startup process, since we don't export the
+ * relevant state to shared memory.
+ */
+ Assert(InRecovery);
+
+ *rtime = XLogReceiptTime;
+ *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
+}
+
+/*
+ * Note that the text field supplied is a parameter name and does not
+ * require translation.
+ */
+void
+RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
+{
+ if (currValue < minValue)
+ {
+ if (HotStandbyActiveInReplay())
+ {
+ bool warned_for_promote = false;
+
+ ereport(WARNING,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("hot standby is not possible because of insufficient parameter settings"),
+ errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
+ param_name,
+ currValue,
+ minValue)));
+
+ SetRecoveryPause(true);
+
+ ereport(LOG,
+ (errmsg("recovery has paused"),
+ errdetail("If recovery is unpaused, the server will shut down."),
+ errhint("You can then restart the server after making the necessary configuration changes.")));
+
+ while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
+ {
+ HandleStartupProcInterrupts();
+
+ if (CheckForStandbyTrigger())
+ {
+ if (!warned_for_promote)
+ ereport(WARNING,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("promotion is not possible because of insufficient parameter settings"),
+
+ /*
+ * Repeat the detail from above so it's easy to find
+ * in the log.
+ */
+ errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
+ param_name,
+ currValue,
+ minValue),
+ errhint("Restart the server after making the necessary configuration changes.")));
+ warned_for_promote = true;
+ }
+
+ /*
+ * If a recovery pause has been requested, enter the paused
+ * state. While we are in this loop, the user might resume and
+ * pause again, so set this every time.
+ */
+ ConfirmRecoveryPaused();
+
+ /*
+ * We wait on a condition variable that will wake us as soon
+ * as the pause ends, but we use a timeout so we can check the
+ * above conditions periodically too.
+ */
+ ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
+ WAIT_EVENT_RECOVERY_PAUSE);
+ }
+ ConditionVariableCancelSleep();
+ }
+
+ ereport(FATAL,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("recovery aborted because of insufficient parameter settings"),
+ /* Repeat the detail from above so it's easy to find in the log. */
+ errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
+ param_name,
+ currValue,
+ minValue),
+ errhint("You can restart the server after making the necessary configuration changes.")));
+ }
+}
diff --git a/src/backend/access/transam/xlogstats.c b/src/backend/access/transam/xlogstats.c
new file mode 100644
index 0000000..5141817
--- /dev/null
+++ b/src/backend/access/transam/xlogstats.c
@@ -0,0 +1,96 @@
+/*-------------------------------------------------------------------------
+ *
+ * xlogstats.c
+ * Functions for WAL Statistics
+ *
+ * Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/access/transam/xlogstats.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/xlogreader.h"
+#include "access/xlogstats.h"
+
+/*
+ * Calculate the size of a record, split into !FPI and FPI parts.
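+ *
+ * For example, an 8192-byte record carrying a single 6000-byte full-page
+ * image comes back as *rec_len = 2192 and *fpi_len = 6000.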
+ */
+void
+XLogRecGetLen(XLogReaderState *record, uint32 *rec_len,
+ uint32 *fpi_len)
+{
+ int block_id;
+
+ /*
+ * Calculate the amount of FPI data in the record.
+ *
+ * XXX: We peek into xlogreader's private decoded backup blocks for the
+ * bimg_len indicating the length of FPI data.
+ */
+ *fpi_len = 0;
+ for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
+ {
+ if (!XLogRecHasBlockRef(record, block_id))
+ continue;
+
+ if (XLogRecHasBlockImage(record, block_id))
+ *fpi_len += XLogRecGetBlock(record, block_id)->bimg_len;
+ }
+
+ /*
+ * Calculate the length of the record as the total length - the length of
+ * all the block images.
+ */
+ *rec_len = XLogRecGetTotalLen(record) - *fpi_len;
+}
+
+/*
+ * Store per-rmgr and per-record statistics for a given record.
+ */
+void
+XLogRecStoreStats(XLogStats *stats, XLogReaderState *record)
+{
+ RmgrId rmid;
+ uint8 recid;
+ uint32 rec_len;
+ uint32 fpi_len;
+
+ Assert(stats != NULL && record != NULL);
+
+ stats->count++;
+
+ rmid = XLogRecGetRmid(record);
+
+ XLogRecGetLen(record, &rec_len, &fpi_len);
+
+ /* Update per-rmgr statistics */
+
+ stats->rmgr_stats[rmid].count++;
+ stats->rmgr_stats[rmid].rec_len += rec_len;
+ stats->rmgr_stats[rmid].fpi_len += fpi_len;
+
+ /*
+ * Update per-record statistics, where the record is identified by a
+ * combination of the RmgrId and the four bits of the xl_info field that
+ * are the rmgr's domain (resulting in sixteen possible entries per
+ * RmgrId).
+ */
+
+ recid = XLogRecGetInfo(record) >> 4;
+
+ /*
+ * XACT records need to be handled differently. Those records use the
+ * first bit of those four bits for an optional flag variable and the
+ * following three bits for the opcode. We mask the flag bit out of
+ * xl_info and use the remaining opcode as the identifier of the record.
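+ *
+ * For example, an XLOG_XACT_COMMIT record with the XLOG_XACT_HAS_INFO
+ * flag (0x80) set maps to the same identifier (0) as a plain commit
+ * record.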
+ */
+ if (rmid == RM_XACT_ID)
+ recid &= 0x07;
+
+ stats->record_stats[rmid][recid].count++;
+ stats->record_stats[rmid][recid].rec_len += rec_len;
+ stats->record_stats[rmid][recid].fpi_len += fpi_len;
+}
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
new file mode 100644
index 0000000..702c8c1
--- /dev/null
+++ b/src/backend/access/transam/xlogutils.c
@@ -0,0 +1,1064 @@
+/*-------------------------------------------------------------------------
+ *
+ * xlogutils.c
+ *
+ * PostgreSQL write-ahead log manager utility routines
+ *
+ * This file contains support routines that are used by XLOG replay functions.
+ * None of this code is used during normal system operation.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/xlogutils.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "access/timeline.h"
+#include "access/xlogrecovery.h"
+#include "access/xlog_internal.h"
+#include "access/xlogprefetcher.h"
+#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/fd.h"
+#include "storage/smgr.h"
+#include "utils/guc.h"
+#include "utils/hsearch.h"
+#include "utils/rel.h"
+
+
+/* GUC variable */
+bool ignore_invalid_pages = false;
+
+/*
+ * Are we doing recovery from XLOG?
+ *
+ * This is only ever true in the startup process; it should be read as meaning
+ * "this process is replaying WAL records", rather than "the system is in
+ * recovery mode". It should be examined primarily by functions that need
+ * to act differently when called from a WAL redo function (e.g., to skip WAL
+ * logging). To check whether the system is in recovery regardless of which
+ * process you're running in, use RecoveryInProgress() but only after shared
+ * memory startup and lock initialization.
+ *
+ * This is updated from xlog.c and xlogrecovery.c, but lives here because
+ * it's mostly read by WAL redo functions.
+ */
+bool InRecovery = false;
+
+/* Are we in Hot Standby mode? Only valid in startup process, see xlogutils.h */
+HotStandbyState standbyState = STANDBY_DISABLED;
+
+/*
+ * During XLOG replay, we may see XLOG records for incremental updates of
+ * pages that no longer exist, because their relation was later dropped or
+ * truncated. (Note: this is only possible when full_page_writes = OFF,
+ * since when it's ON, the first reference we see to a page should always
+ * be a full-page rewrite not an incremental update.) Rather than simply
+ * ignoring such records, we make a note of the referenced page, and then
+ * complain if we don't actually see a drop or truncate covering the page
+ * later in replay.
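+ *
+ * XLogCheckInvalidPages() performs that final check: entries still in the
+ * table at the end of recovery are reported, and cause a PANIC (or only a
+ * WARNING, when ignore_invalid_pages is set).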
+ */
+typedef struct xl_invalid_page_key
+{
+ RelFileNode node; /* the relation */
+ ForkNumber forkno; /* the fork number */
+ BlockNumber blkno; /* the page */
+} xl_invalid_page_key;
+
+typedef struct xl_invalid_page
+{
+ xl_invalid_page_key key; /* hash key ... must be first */
+ bool present; /* page existed but contained zeroes */
+} xl_invalid_page;
+
+static HTAB *invalid_page_tab = NULL;
+
+static int read_local_xlog_page_guts(XLogReaderState *state, XLogRecPtr targetPagePtr,
+ int reqLen, XLogRecPtr targetRecPtr,
+ char *cur_page, bool wait_for_wal);
+
+/* Report a reference to an invalid page */
+static void
+report_invalid_page(int elevel, RelFileNode node, ForkNumber forkno,
+ BlockNumber blkno, bool present)
+{
+ char *path = relpathperm(node, forkno);
+
+ if (present)
+ elog(elevel, "page %u of relation %s is uninitialized",
+ blkno, path);
+ else
+ elog(elevel, "page %u of relation %s does not exist",
+ blkno, path);
+ pfree(path);
+}
+
+/* Log a reference to an invalid page */
+static void
+log_invalid_page(RelFileNode node, ForkNumber forkno, BlockNumber blkno,
+ bool present)
+{
+ xl_invalid_page_key key;
+ xl_invalid_page *hentry;
+ bool found;
+
+ /*
+ * Once recovery has reached a consistent state, the invalid-page table
+ * should be empty and remain so. If a reference to an invalid page is
+ * found after consistency is reached, PANIC immediately. This might seem
+ * aggressive, but it's better than letting the invalid reference linger
+ * in the hash table until the end of recovery and PANIC there, which
+ * might come only much later if this is a standby server.
+ */
+ if (reachedConsistency)
+ {
+ report_invalid_page(WARNING, node, forkno, blkno, present);
+ elog(ignore_invalid_pages ? WARNING : PANIC,
+ "WAL contains references to invalid pages");
+ }
+
+ /*
+ * Log references to invalid pages at DEBUG1 level. This allows some
+ * tracing of the cause (note the elog context mechanism will tell us
+ * something about the XLOG record that generated the reference).
+ */
+ if (message_level_is_interesting(DEBUG1))
+ report_invalid_page(DEBUG1, node, forkno, blkno, present);
+
+ if (invalid_page_tab == NULL)
+ {
+ /* create hash table when first needed */
+ HASHCTL ctl;
+
+ ctl.keysize = sizeof(xl_invalid_page_key);
+ ctl.entrysize = sizeof(xl_invalid_page);
+
+ invalid_page_tab = hash_create("XLOG invalid-page table",
+ 100,
+ &ctl,
+ HASH_ELEM | HASH_BLOBS);
+ }
+
+ /* we currently assume xl_invalid_page_key contains no padding */
+ key.node = node;
+ key.forkno = forkno;
+ key.blkno = blkno;
+ hentry = (xl_invalid_page *)
+ hash_search(invalid_page_tab, (void *) &key, HASH_ENTER, &found);
+
+ if (!found)
+ {
+ /* hash_search already filled in the key */
+ hentry->present = present;
+ }
+ else
+ {
+ /* repeat reference ... leave "present" as it was */
+ }
+}
+
+/* Forget any invalid pages >= minblkno, because they've been dropped */
+static void
+forget_invalid_pages(RelFileNode node, ForkNumber forkno, BlockNumber minblkno)
+{
+ HASH_SEQ_STATUS status;
+ xl_invalid_page *hentry;
+
+ if (invalid_page_tab == NULL)
+ return; /* nothing to do */
+
+ hash_seq_init(&status, invalid_page_tab);
+
+ while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
+ {
+ if (RelFileNodeEquals(hentry->key.node, node) &&
+ hentry->key.forkno == forkno &&
+ hentry->key.blkno >= minblkno)
+ {
+ if (message_level_is_interesting(DEBUG2))
+ {
+ char *path = relpathperm(hentry->key.node, forkno);
+
+ elog(DEBUG2, "page %u of relation %s has been dropped",
+ hentry->key.blkno, path);
+ pfree(path);
+ }
+
+ if (hash_search(invalid_page_tab,
+ (void *) &hentry->key,
+ HASH_REMOVE, NULL) == NULL)
+ elog(ERROR, "hash table corrupted");
+ }
+ }
+}
+
+/* Forget any invalid pages in a whole database */
+static void
+forget_invalid_pages_db(Oid dbid)
+{
+ HASH_SEQ_STATUS status;
+ xl_invalid_page *hentry;
+
+ if (invalid_page_tab == NULL)
+ return; /* nothing to do */
+
+ hash_seq_init(&status, invalid_page_tab);
+
+ while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
+ {
+ if (hentry->key.node.dbNode == dbid)
+ {
+ if (message_level_is_interesting(DEBUG2))
+ {
+ char *path = relpathperm(hentry->key.node, hentry->key.forkno);
+
+ elog(DEBUG2, "page %u of relation %s has been dropped",
+ hentry->key.blkno, path);
+ pfree(path);
+ }
+
+ if (hash_search(invalid_page_tab,
+ (void *) &hentry->key,
+ HASH_REMOVE, NULL) == NULL)
+ elog(ERROR, "hash table corrupted");
+ }
+ }
+}
+
+/* Are there any unresolved references to invalid pages? */
+bool
+XLogHaveInvalidPages(void)
+{
+ if (invalid_page_tab != NULL &&
+ hash_get_num_entries(invalid_page_tab) > 0)
+ return true;
+ return false;
+}
+
+/* Complain about any remaining invalid-page entries */
+void
+XLogCheckInvalidPages(void)
+{
+ HASH_SEQ_STATUS status;
+ xl_invalid_page *hentry;
+ bool foundone = false;
+
+ if (invalid_page_tab == NULL)
+ return; /* nothing to do */
+
+ hash_seq_init(&status, invalid_page_tab);
+
+ /*
+ * Our strategy is to emit WARNING messages for all remaining entries and
+ * only PANIC after we've dumped all the available info.
+ */
+ while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
+ {
+ report_invalid_page(WARNING, hentry->key.node, hentry->key.forkno,
+ hentry->key.blkno, hentry->present);
+ foundone = true;
+ }
+
+ if (foundone)
+ elog(ignore_invalid_pages ? WARNING : PANIC,
+ "WAL contains references to invalid pages");
+
+ hash_destroy(invalid_page_tab);
+ invalid_page_tab = NULL;
+}
+
+
+/*
+ * XLogReadBufferForRedo
+ * Read a page during XLOG replay
+ *
+ * Reads a block referenced by a WAL record into shared buffer cache, and
+ * determines what needs to be done to redo the changes to it. If the WAL
+ * record includes a full-page image of the page, it is restored.
+ *
+ * 'record.EndRecPtr' is compared to the page's LSN to determine if the record
+ * has already been replayed. 'block_id' is the ID number the block was
+ * registered with, when the WAL record was created.
+ *
+ * Returns one of the following:
+ *
+ * BLK_NEEDS_REDO - changes from the WAL record need to be applied
+ * BLK_DONE - block doesn't need replaying
+ * BLK_RESTORED - block was restored from a full-page image included in
+ * the record
+ * BLK_NOTFOUND - block was not found (because it was truncated away by
+ * an operation later in the WAL stream)
+ *
+ * On return, the buffer is locked in exclusive-mode, and returned in *buf.
+ * Note that the buffer is locked and returned even if it doesn't need
+ * replaying. (Getting the buffer lock is not really necessary during
+ * single-process crash recovery, but some subroutines such as MarkBufferDirty
+ * will complain if we don't have the lock. In hot standby mode it's
+ * definitely necessary.)
+ *
+ * Note: when a backup block is available in XLOG with the BKPIMAGE_APPLY flag
+ * set, we restore it, even if the page in the database appears newer. This
+ * is to protect ourselves against database pages that were partially or
+ * incorrectly written during a crash. We assume that the XLOG data must be
+ * good because it has passed a CRC check, while the database page might not
+ * be. This will force us to replay all subsequent modifications of the page
+ * that appear in XLOG, rather than possibly ignoring them as already
+ * applied, but that's not a huge drawback.
+ */
+XLogRedoAction
+XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id,
+ Buffer *buf)
+{
+ return XLogReadBufferForRedoExtended(record, block_id, RBM_NORMAL,
+ false, buf);
+}
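+
+/*
+ * Illustrative sketch, not part of this file: a minimal redo routine built
+ * on XLogReadBufferForRedo(), following the usual resource-manager pattern.
+ * The routine name and the page change applied in the BLK_NEEDS_REDO branch
+ * are hypothetical placeholders; the surrounding calls are the real APIs.
+ */
+#ifdef XLOGUTILS_USAGE_EXAMPLES
+static void
+example_redo(XLogReaderState *record)
+{
+ Buffer buffer;
+
+ /* Restores a full-page image automatically if the record carries one */
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ Page page = BufferGetPage(buffer);
+
+ /* ... apply the record's changes to 'page' here ... */
+
+ /* Bump the page LSN so the record is not replayed a second time */
+ PageSetLSN(page, record->EndRecPtr);
+ MarkBufferDirty(buffer);
+ }
+ /* BLK_NOTFOUND leaves InvalidBuffer in 'buffer'; release only if valid */
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+#endif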
+
+/*
+ * Pin and lock a buffer referenced by a WAL record, for the purpose of
+ * re-initializing it.
+ */
+Buffer
+XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id)
+{
+ Buffer buf;
+
+ XLogReadBufferForRedoExtended(record, block_id, RBM_ZERO_AND_LOCK, false,
+ &buf);
+ return buf;
+}
+
+/*
+ * XLogReadBufferForRedoExtended
+ * Like XLogReadBufferForRedo, but with extra options.
+ *
+ * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
+ * with all-zeroes pages up to the referenced block number. In
+ * RBM_ZERO_AND_LOCK and RBM_ZERO_AND_CLEANUP_LOCK modes, the return value
+ * is always BLK_NEEDS_REDO.
+ *
+ * (The RBM_ZERO_AND_CLEANUP_LOCK mode is redundant with the get_cleanup_lock
+ * parameter. Do not use an inconsistent combination!)
+ *
+ * If 'get_cleanup_lock' is true, a "cleanup lock" is acquired on the buffer
+ * using LockBufferForCleanup(), instead of a regular exclusive lock.
+ */
+XLogRedoAction
+XLogReadBufferForRedoExtended(XLogReaderState *record,
+ uint8 block_id,
+ ReadBufferMode mode, bool get_cleanup_lock,
+ Buffer *buf)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ RelFileNode rnode;
+ ForkNumber forknum;
+ BlockNumber blkno;
+ Buffer prefetch_buffer;
+ Page page;
+ bool zeromode;
+ bool willinit;
+
+ if (!XLogRecGetBlockTagExtended(record, block_id, &rnode, &forknum, &blkno,
+ &prefetch_buffer))
+ {
+ /* Caller specified a bogus block_id */
+ elog(PANIC, "failed to locate backup block with ID %d in WAL record",
+ block_id);
+ }
+
+ /*
+ * Make sure that if the block is marked with WILL_INIT, the caller is
+ * going to initialize it. And vice versa.
+ */
+ zeromode = (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
+ willinit = (XLogRecGetBlock(record, block_id)->flags & BKPBLOCK_WILL_INIT) != 0;
+ if (willinit && !zeromode)
+ elog(PANIC, "block with WILL_INIT flag in WAL record must be zeroed by redo routine");
+ if (!willinit && zeromode)
+ elog(PANIC, "block to be initialized in redo routine must be marked with WILL_INIT flag in the WAL record");
+
+ /* If it has a full-page image and it should be restored, do it. */
+ if (XLogRecBlockImageApply(record, block_id))
+ {
+ Assert(XLogRecHasBlockImage(record, block_id));
+ *buf = XLogReadBufferExtended(rnode, forknum, blkno,
+ get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK,
+ prefetch_buffer);
+ page = BufferGetPage(*buf);
+ if (!RestoreBlockImage(record, block_id, page))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg_internal("%s", record->errormsg_buf)));
+
+ /*
+ * The page may be uninitialized. If so, we can't set the LSN because
+ * that would corrupt the page.
+ */
+ if (!PageIsNew(page))
+ {
+ PageSetLSN(page, lsn);
+ }
+
+ MarkBufferDirty(*buf);
+
+ /*
+ * At the end of crash recovery the init forks of unlogged relations
+ * are copied, without going through shared buffers. So we need to
+ * force the on-disk state of init forks to always be in sync with the
+ * state in shared buffers.
+ */
+ if (forknum == INIT_FORKNUM)
+ FlushOneBuffer(*buf);
+
+ return BLK_RESTORED;
+ }
+ else
+ {
+ *buf = XLogReadBufferExtended(rnode, forknum, blkno, mode, prefetch_buffer);
+ if (BufferIsValid(*buf))
+ {
+ if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK)
+ {
+ if (get_cleanup_lock)
+ LockBufferForCleanup(*buf);
+ else
+ LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE);
+ }
+ if (lsn <= PageGetLSN(BufferGetPage(*buf)))
+ return BLK_DONE;
+ else
+ return BLK_NEEDS_REDO;
+ }
+ else
+ return BLK_NOTFOUND;
+ }
+}
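+
+/*
+ * Illustrative sketch, not part of this file: redo for a hypothetical
+ * record whose block was registered with WILL_INIT. Such a block must be
+ * rebuilt from scratch via XLogInitBufferForRedo(), which satisfies the
+ * zeromode/willinit cross-check above; PageInit() stands in for whatever
+ * page initialization the real redo routine would perform.
+ */
+#ifdef XLOGUTILS_USAGE_EXAMPLES
+static void
+example_init_redo(XLogReaderState *record)
+{
+ Buffer buffer;
+ Page page;
+
+ /* XLogInitBufferForRedo() zeroes the page and returns it locked */
+ buffer = XLogInitBufferForRedo(record, 0);
+ page = BufferGetPage(buffer);
+ PageInit(page, BufferGetPageSize(buffer), 0);
+
+ /* ... rebuild the page contents from the WAL record here ... */
+
+ PageSetLSN(page, record->EndRecPtr);
+ MarkBufferDirty(buffer);
+ UnlockReleaseBuffer(buffer);
+}
+#endif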
+
+/*
+ * XLogReadBufferExtended
+ * Read a page during XLOG replay
+ *
+ * This is functionally comparable to ReadBufferExtended. There are some
+ * differences in behavior with respect to the "mode" argument:
+ *
+ * In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we
+ * return InvalidBuffer. In this case the caller should silently skip the
+ * update on this page. (In this situation, we expect that the page was later
+ * dropped or truncated. If we don't see evidence of that later in the WAL
+ * sequence, we'll complain at the end of WAL replay.)
+ *
+ * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
+ * with all-zeroes pages up to the given block number.
+ *
+ * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
+ * exist, and we don't check for all-zeroes. Thus, no log entry is made
+ * to imply that the page should be dropped or truncated later.
+ *
+ * Optionally, recent_buffer can be used to provide a hint about the location
+ * of the page in the buffer pool; it does not have to be correct, but avoids
+ * a buffer mapping table probe if it is.
+ *
+ * NB: A redo function should normally not call this directly. To get a page
+ * to modify, use XLogReadBufferForRedoExtended instead. It is important that
+ * all pages modified by a WAL record are registered in the WAL records, or
+ * they will be invisible to tools that need to know which pages are modified.
+ */
+Buffer
+XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
+ BlockNumber blkno, ReadBufferMode mode,
+ Buffer recent_buffer)
+{
+ BlockNumber lastblock;
+ Buffer buffer;
+ SMgrRelation smgr;
+
+ Assert(blkno != P_NEW);
+
+ /* Do we have a clue where the buffer might be already? */
+ if (BufferIsValid(recent_buffer) &&
+ mode == RBM_NORMAL &&
+ ReadRecentBuffer(rnode, forknum, blkno, recent_buffer))
+ {
+ buffer = recent_buffer;
+ goto recent_buffer_fast_path;
+ }
+
+ /* Open the relation at smgr level */
+ smgr = smgropen(rnode, InvalidBackendId);
+
+ /*
+ * Create the target file if it doesn't already exist. This lets us cope
+ * if the replay sequence contains writes to a relation that is later
+ * deleted. (The original coding of this routine would instead suppress
+ * the writes, but that seems like it risks losing valuable data if the
+ * filesystem loses an inode during a crash. Better to write the data
+ * until we are actually told to delete the file.)
+ */
+ smgrcreate(smgr, forknum, true);
+
+ lastblock = smgrnblocks(smgr, forknum);
+
+ if (blkno < lastblock)
+ {
+ /* page exists in file */
+ buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
+ mode, NULL, true);
+ }
+ else
+ {
+ /* hm, page doesn't exist in file */
+ if (mode == RBM_NORMAL)
+ {
+ log_invalid_page(rnode, forknum, blkno, false);
+ return InvalidBuffer;
+ }
+ if (mode == RBM_NORMAL_NO_LOG)
+ return InvalidBuffer;
+ /* OK to extend the file */
+ /* we do this in recovery only - no rel-extension lock needed */
+ Assert(InRecovery);
+ buffer = InvalidBuffer;
+ do
+ {
+ if (buffer != InvalidBuffer)
+ {
+ if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ ReleaseBuffer(buffer);
+ }
+ buffer = ReadBufferWithoutRelcache(rnode, forknum,
+ P_NEW, mode, NULL, true);
+ }
+ while (BufferGetBlockNumber(buffer) < blkno);
+ /* Handle the corner case that P_NEW returns non-consecutive pages */
+ if (BufferGetBlockNumber(buffer) != blkno)
+ {
+ if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ ReleaseBuffer(buffer);
+ buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
+ mode, NULL, true);
+ }
+ }
+
+recent_buffer_fast_path:
+ if (mode == RBM_NORMAL)
+ {
+ /* check that page has been initialized */
+ Page page = (Page) BufferGetPage(buffer);
+
+ /*
+ * We assume that PageIsNew is safe without a lock. During recovery,
+ * there should be no other backends that could modify the buffer at
+ * the same time.
+ */
+ if (PageIsNew(page))
+ {
+ ReleaseBuffer(buffer);
+ log_invalid_page(rnode, forknum, blkno, true);
+ return InvalidBuffer;
+ }
+ }
+
+ return buffer;
+}
+
+/*
+ * Struct actually returned by CreateFakeRelcacheEntry, though the declared
+ * return type is Relation.
+ */
+typedef struct
+{
+ RelationData reldata; /* Note: this must be first */
+ FormData_pg_class pgc;
+} FakeRelCacheEntryData;
+
+typedef FakeRelCacheEntryData *FakeRelCacheEntry;
+
+/*
+ * Create a fake relation cache entry for a physical relation
+ *
+ * It's often convenient to use the same functions in XLOG replay as in the
+ * main codepath, but those functions typically work with a relcache entry.
+ * We don't have a working relation cache during XLOG replay, but this
+ * function can be used to create a fake relcache entry instead. Only the
+ * fields related to physical storage, like rd_rel, are initialized, so the
+ * fake entry is only usable in low-level operations like ReadBuffer().
+ *
+ * This is also used for syncing WAL-skipped files.
+ *
+ * Caller must free the returned entry with FreeFakeRelcacheEntry().
+ */
+Relation
+CreateFakeRelcacheEntry(RelFileNode rnode)
+{
+ FakeRelCacheEntry fakeentry;
+ Relation rel;
+
+ /* Allocate the Relation struct and all related space in one block. */
+ fakeentry = palloc0(sizeof(FakeRelCacheEntryData));
+ rel = (Relation) fakeentry;
+
+ rel->rd_rel = &fakeentry->pgc;
+ rel->rd_node = rnode;
+
+ /*
+ * We will never be working with temp rels during recovery or while
+ * syncing WAL-skipped files.
+ */
+ rel->rd_backend = InvalidBackendId;
+
+ /* It must be a permanent table here */
+ rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT;
+
+ /* We don't know the name of the relation; use relfilenode instead */
+ sprintf(RelationGetRelationName(rel), "%u", rnode.relNode);
+
+ /*
+ * We set up the lockRelId in case anything tries to lock the dummy
+ * relation. Note that this is fairly bogus since relNode may be
+ * different from the relation's OID. It shouldn't really matter though.
+ * In recovery, we are running by ourselves and can't have any lock
+ * conflicts. While syncing, we already hold AccessExclusiveLock.
+ */
+ rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode;
+ rel->rd_lockInfo.lockRelId.relId = rnode.relNode;
+
+ rel->rd_smgr = NULL;
+
+ return rel;
+}
+
+/*
+ * Free a fake relation cache entry.
+ */
+void
+FreeFakeRelcacheEntry(Relation fakerel)
+{
+ /* make sure the fakerel is not referenced by the SmgrRelation anymore */
+ if (fakerel->rd_smgr != NULL)
+ smgrclearowner(&fakerel->rd_smgr, fakerel->rd_smgr);
+ pfree(fakerel);
+}
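+
+/*
+ * Illustrative sketch, not part of this file: a replay-time helper that
+ * flushes a relation's buffers through a fake relcache entry. The helper
+ * name and choice of operation are hypothetical; the point is that
+ * FlushRelationBuffers() touches only storage-level state, which is
+ * exactly what the fake entry provides, while anything needing a fully
+ * built relcache entry would not work.
+ */
+#ifdef XLOGUTILS_USAGE_EXAMPLES
+static void
+example_flush_relation(RelFileNode rnode)
+{
+ Relation rel = CreateFakeRelcacheEntry(rnode);
+
+ FlushRelationBuffers(rel);
+ FreeFakeRelcacheEntry(rel);
+}
+#endif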
+
+/*
+ * Drop a relation during XLOG replay
+ *
+ * This is called when the relation is about to be deleted; we need to remove
+ * any open "invalid-page" records for the relation.
+ */
+void
+XLogDropRelation(RelFileNode rnode, ForkNumber forknum)
+{
+ forget_invalid_pages(rnode, forknum, 0);
+}
+
+/*
+ * Drop a whole database during XLOG replay
+ *
+ * As above, but for DROP DATABASE instead of dropping a single rel
+ */
+void
+XLogDropDatabase(Oid dbid)
+{
+ /*
+ * This is unnecessarily heavy-handed, as it will close SMgrRelation
+ * objects for other databases as well. DROP DATABASE occurs seldom enough
+ * that it's not worth introducing a variant of smgrclose for just this
+ * purpose. XXX: Or should we rather leave the smgr entries dangling?
+ */
+ smgrcloseall();
+
+ forget_invalid_pages_db(dbid);
+}
+
+/*
+ * Truncate a relation during XLOG replay
+ *
+ * We need to clean up any open "invalid-page" records for the dropped pages.
+ */
+void
+XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
+ BlockNumber nblocks)
+{
+ forget_invalid_pages(rnode, forkNum, nblocks);
+}
+
+/*
+ * Determine which timeline to read an xlog page from and set the
+ * XLogReaderState's currTLI to that timeline ID.
+ *
+ * We care about timelines in xlogreader when we might be reading xlog
+ * generated prior to a promotion, either if we're currently a standby in
+ * recovery or if we're a promoted primary reading xlogs generated by the old
+ * primary before our promotion.
+ *
+ * wantPage must be set to the start address of the page to read and
+ * wantLength to the amount of the page that will be read, up to
+ * XLOG_BLCKSZ. If the amount to be read isn't known, pass XLOG_BLCKSZ.
+ *
+ * The currTLI argument should be the system-wide current timeline.
+ * Note that this may be different from state->currTLI, which is the timeline
+ * from which the caller is currently reading previous xlog records.
+ *
+ * We switch to an xlog segment from the new timeline eagerly when on a
+ * historical timeline, as soon as we reach the start of the xlog segment
+ * containing the timeline switch. The server copied the segment to the new
+ * timeline so all the data up to the switch point is the same, but there's no
+ * guarantee the old segment will still exist. It may have been deleted or
+ * renamed with a .partial suffix so we can't necessarily keep reading from
+ * the old TLI even though tliSwitchPoint says it's OK.
+ *
+ * We can't just check the timeline when we read a page on a different segment
+ * from the last page. We could've received a timeline switch from a cascading
+ * upstream, so the current segment ends abruptly (possibly getting renamed to
+ * .partial) and we have to switch to a new one. Even in the middle of reading
+ * a page we could have to dump the cached page and switch to a new TLI.
+ *
+ * Because of this, callers MAY NOT assume that currTLI is the timeline that
+ * will be in a page's xlp_tli; the page may begin on an older timeline or we
+ * might be reading from historical timeline data on a segment that's been
+ * copied to a new timeline.
+ *
+ * The caller must also make sure it doesn't read past the current replay
+ * position (using GetXLogReplayRecPtr) if executing in recovery, so it
+ * doesn't fail to notice that the current timeline became historical.
+ */
+void
+XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage,
+ uint32 wantLength, TimeLineID currTLI)
+{
+ const XLogRecPtr lastReadPage = (state->seg.ws_segno *
+ state->segcxt.ws_segsize + state->segoff);
+
+ Assert(wantPage != InvalidXLogRecPtr && wantPage % XLOG_BLCKSZ == 0);
+ Assert(wantLength <= XLOG_BLCKSZ);
+ Assert(state->readLen == 0 || state->readLen <= XLOG_BLCKSZ);
+ Assert(currTLI != 0);
+
+ /*
+ * If the desired page is currently read in and valid, we have nothing to
+ * do.
+ *
+ * The caller should've ensured that it didn't previously advance readOff
+ * past the valid limit of this timeline, so it doesn't matter if the
+ * current TLI has since become historical.
+ */
+ if (lastReadPage == wantPage &&
+ state->readLen != 0 &&
+ lastReadPage + state->readLen >= wantPage + Min(wantLength, XLOG_BLCKSZ - 1))
+ return;
+
+ /*
+ * If we're reading from the current timeline, it hasn't become historical
+ * and the page we're reading is after the last page read, we can again
+ * just carry on. (Seeking backwards requires a check to make sure the
+ * older page isn't on a prior timeline).
+ *
+ * currTLI might've become historical since the caller obtained the value,
+ * but the caller is required not to read past the flush limit it saw at
+ * the time it looked up the timeline. There's nothing we can do about it
+ * if StartupXLOG() renames it to .partial concurrently.
+ */
+ if (state->currTLI == currTLI && wantPage >= lastReadPage)
+ {
+ Assert(state->currTLIValidUntil == InvalidXLogRecPtr);
+ return;
+ }
+
+ /*
+ * If we're just reading pages from a previously validated historical
+ * timeline and the timeline we're reading from is valid until the end of
+ * the current segment we can just keep reading.
+ */
+ if (state->currTLIValidUntil != InvalidXLogRecPtr &&
+ state->currTLI != currTLI &&
+ state->currTLI != 0 &&
+ ((wantPage + wantLength) / state->segcxt.ws_segsize) <
+ (state->currTLIValidUntil / state->segcxt.ws_segsize))
+ return;
+
+ /*
+ * If we reach this point we're either looking up a page for random
+ * access, the current timeline just became historical, or we're reading
+ * from a new segment containing a timeline switch. In all cases we need
+ * to determine the newest timeline on the segment.
+ *
+ * If it's the current timeline we can just keep reading from here unless
+ * we detect a timeline switch that makes the current timeline historical.
+ * If it's a historical timeline we can read the whole segment on the newest
+ * timeline, because it contains all the old timelines' data too. So only one
+ * switch check is required.
+ */
+ {
+ /*
+ * We need to re-read the timeline history in case it's been changed
+ * by a promotion or replay from a cascaded replica.
+ */
+ List *timelineHistory = readTimeLineHistory(currTLI);
+ XLogRecPtr endOfSegment;
+
+ endOfSegment = ((wantPage / state->segcxt.ws_segsize) + 1) *
+ state->segcxt.ws_segsize - 1;
+ Assert(wantPage / state->segcxt.ws_segsize ==
+ endOfSegment / state->segcxt.ws_segsize);
+
+ /*
+ * Find the timeline of the last LSN on the segment containing
+ * wantPage.
+ */
+ state->currTLI = tliOfPointInHistory(endOfSegment, timelineHistory);
+ state->currTLIValidUntil = tliSwitchPoint(state->currTLI, timelineHistory,
+ &state->nextTLI);
+
+ Assert(state->currTLIValidUntil == InvalidXLogRecPtr ||
+ wantPage + wantLength < state->currTLIValidUntil);
+
+ list_free_deep(timelineHistory);
+
+ elog(DEBUG3, "switched to timeline %u valid until %X/%X",
+ state->currTLI,
+ LSN_FORMAT_ARGS(state->currTLIValidUntil));
+ }
+}
+
+/* XLogReaderRoutine->segment_open callback for local pg_wal files */
+void
+wal_segment_open(XLogReaderState *state, XLogSegNo nextSegNo,
+ TimeLineID *tli_p)
+{
+ TimeLineID tli = *tli_p;
+ char path[MAXPGPATH];
+
+ XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize);
+ state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
+ if (state->seg.ws_file >= 0)
+ return;
+
+ if (errno == ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("requested WAL segment %s has already been removed",
+ path)));
+ else
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m",
+ path)));
+}
+
+/* stock XLogReaderRoutine->segment_close callback */
+void
+wal_segment_close(XLogReaderState *state)
+{
+ close(state->seg.ws_file);
+ /* need to check errno? */
+ state->seg.ws_file = -1;
+}
+
+/*
+ * XLogReaderRoutine->page_read callback for reading local xlog files
+ *
+ * Public because it would likely be very helpful for someone writing another
+ * output method outside walsender, e.g. in a bgworker.
+ *
+ * TODO: The walsender has its own version of this, but it relies on the
+ * walsender's latch being set whenever WAL is flushed. No such infrastructure
+ * exists for normal backends, so we have to do a check/sleep/repeat style of
+ * loop for now.
+ */
+int
+read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr,
+ int reqLen, XLogRecPtr targetRecPtr, char *cur_page)
+{
+ return read_local_xlog_page_guts(state, targetPagePtr, reqLen,
+ targetRecPtr, cur_page, true);
+}
+
+/*
+ * Same as read_local_xlog_page except that it doesn't wait for future WAL
+ * to be available.
+ */
+int
+read_local_xlog_page_no_wait(XLogReaderState *state, XLogRecPtr targetPagePtr,
+ int reqLen, XLogRecPtr targetRecPtr,
+ char *cur_page)
+{
+ return read_local_xlog_page_guts(state, targetPagePtr, reqLen,
+ targetRecPtr, cur_page, false);
+}
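+
+/*
+ * Illustrative sketch, not part of this file: wiring the callbacks above
+ * into an xlogreader to decode local WAL from a regular backend or a
+ * bgworker. 'start_lsn' is assumed to be the start of a valid record;
+ * the record-decoding step is left as a placeholder.
+ */
+#ifdef XLOGUTILS_USAGE_EXAMPLES
+static void
+example_read_local_wal(XLogRecPtr start_lsn)
+{
+ XLogReaderState *reader;
+ XLogRecord *record;
+ char *errormsg;
+
+ reader = XLogReaderAllocate(wal_segment_size, NULL,
+ XL_ROUTINE(.page_read = &read_local_xlog_page,
+ .segment_open = &wal_segment_open,
+ .segment_close = &wal_segment_close),
+ NULL);
+ if (reader == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+
+ XLogBeginRead(reader, start_lsn);
+ while ((record = XLogReadRecord(reader, &errormsg)) != NULL)
+ {
+ /* ... inspect or decode 'record' here ... */
+ }
+ if (errormsg)
+ ereport(ERROR,
+ (errmsg_internal("%s", errormsg)));
+
+ XLogReaderFree(reader);
+}
+#endif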
+
+/*
+ * Implementation of read_local_xlog_page and its no wait version.
+ */
+static int
+read_local_xlog_page_guts(XLogReaderState *state, XLogRecPtr targetPagePtr,
+ int reqLen, XLogRecPtr targetRecPtr,
+ char *cur_page, bool wait_for_wal)
+{
+ XLogRecPtr read_upto,
+ loc;
+ TimeLineID tli;
+ int count;
+ WALReadError errinfo;
+ TimeLineID currTLI;
+
+ loc = targetPagePtr + reqLen;
+
+ /* Loop waiting for xlog to be available if necessary */
+ while (1)
+ {
+ /*
+ * Determine the limit of xlog we can currently read to, and what the
+ * most recent timeline is.
+ */
+ if (!RecoveryInProgress())
+ read_upto = GetFlushRecPtr(&currTLI);
+ else
+ read_upto = GetXLogReplayRecPtr(&currTLI);
+ tli = currTLI;
+
+ /*
+ * Check which timeline to get the record from.
+ *
+ * We have to do it each time through the loop because if we're in
+ * recovery as a cascading standby, the current timeline might've
+ * become historical. We can't rely on RecoveryInProgress() because in
+ * a standby configuration like
+ *
+ * A => B => C
+ *
+ * if we're a logical decoding session on C, and B gets promoted, our
+ * timeline will change while we remain in recovery.
+ *
+ * We can't just keep reading from the old timeline as the last WAL
+ * archive in the timeline will get renamed to .partial by
+ * StartupXLOG().
+ *
+ * If that happens after our caller determined the TLI but before we
+ * actually read the xlog page, we might still try to read from the
+ * old (now renamed) segment and fail. There's not much we can do
+ * about this, but it can only happen when we're a leaf of a cascading
+ * standby whose primary gets promoted while we're decoding, so a
+ * one-off ERROR isn't too bad.
+ */
+ XLogReadDetermineTimeline(state, targetPagePtr, reqLen, tli);
+
+ if (state->currTLI == currTLI)
+ {
+ if (loc <= read_upto)
+ break;
+
+ /* If asked, let's not wait for future WAL. */
+ if (!wait_for_wal)
+ {
+ ReadLocalXLogPageNoWaitPrivate *private_data;
+
+ /*
+ * Inform the caller of read_local_xlog_page_no_wait that the
+ * end of WAL has been reached.
+ */
+ private_data = (ReadLocalXLogPageNoWaitPrivate *)
+ state->private_data;
+ private_data->end_of_wal = true;
+ break;
+ }
+
+ CHECK_FOR_INTERRUPTS();
+ pg_usleep(1000L);
+ }
+ else
+ {
+ /*
+ * We're on a historical timeline, so limit reading to the switch
+ * point where we moved to the next timeline.
+ *
+ * We don't need GetFlushRecPtr() or GetXLogReplayRecPtr(); since we
+ * know about the new timeline, we must already have received WAL past
+ * the end of the old one.
+ */
+ read_upto = state->currTLIValidUntil;
+
+ /*
+ * Setting tli to our wanted record's TLI is slightly wrong; the
+ * page might begin on an older timeline if it contains a timeline
+ * switch, since its xlog segment will have been copied from the
+ * prior timeline. This is pretty harmless though, as nothing
+ * cares so long as the timeline doesn't go backwards. We should
+ * read the page header instead; FIXME someday.
+ */
+ tli = state->currTLI;
+
+ /* No need to wait on a historical timeline */
+ break;
+ }
+ }
+
+ if (targetPagePtr + XLOG_BLCKSZ <= read_upto)
+ {
+ /*
+ * At least one full page is available; read just this page, and have
+ * the caller come back if it needs more.
+ */
+ count = XLOG_BLCKSZ;
+ }
+ else if (targetPagePtr + reqLen > read_upto)
+ {
+ /* not enough data there */
+ return -1;
+ }
+ else
+ {
+ /* enough bytes available to satisfy the request */
+ count = read_upto - targetPagePtr;
+ }
+
+ /*
+ * Even though we just determined how much of the page can be validly read
+ * as 'count', read the whole page anyway. It's guaranteed to be
+ * zero-padded up to the page boundary if it's incomplete.
+ */
+ if (!WALRead(state, cur_page, targetPagePtr, XLOG_BLCKSZ, tli,
+ &errinfo))
+ WALReadRaiseError(&errinfo);
+
+ /* number of valid bytes in the buffer */
+ return count;
+}
+
+/*
+ * Backend-specific convenience code to handle read errors encountered by
+ * WALRead().
+ */
+void
+WALReadRaiseError(WALReadError *errinfo)
+{
+ WALOpenSegment *seg = &errinfo->wre_seg;
+ char fname[MAXFNAMELEN];
+
+ XLogFileName(fname, seg->ws_tli, seg->ws_segno, wal_segment_size);
+
+ if (errinfo->wre_read < 0)
+ {
+ errno = errinfo->wre_errno;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read from log segment %s, offset %d: %m",
+ fname, errinfo->wre_off)));
+ }
+ else if (errinfo->wre_read == 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not read from log segment %s, offset %d: read %d of %d",
+ fname, errinfo->wre_off, errinfo->wre_read,
+ errinfo->wre_req)));
+ }
+}