author     Daniel Baumann <daniel.baumann@progress-linux.org>    2024-05-04 12:17:33 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>    2024-05-04 12:17:33 +0000
commit     5e45211a64149b3c659b90ff2de6fa982a5a93ed
tree       739caf8c461053357daa9f162bef34516c7bf452  /src/backend/access/transam
parent     Initial commit.
Adding upstream version 15.5.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/access/transam')
26 files changed, 42364 insertions, 0 deletions
diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile new file mode 100644 index 0000000..3e5444a --- /dev/null +++ b/src/backend/access/transam/Makefile @@ -0,0 +1,43 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/transam +# +# IDENTIFICATION +# src/backend/access/transam/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/transam +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + clog.o \ + commit_ts.o \ + generic_xlog.o \ + multixact.o \ + parallel.o \ + rmgr.o \ + slru.o \ + subtrans.o \ + timeline.o \ + transam.o \ + twophase.o \ + twophase_rmgr.o \ + varsup.o \ + xact.o \ + xlog.o \ + xlogarchive.o \ + xlogfuncs.o \ + xloginsert.o \ + xlogprefetcher.o \ + xlogreader.o \ + xlogrecovery.o \ + xlogstats.o \ + xlogutils.o + +include $(top_srcdir)/src/backend/common.mk + +# ensure that version checks in xlog.c get recompiled when catversion.h changes +xlog.o: xlog.c $(top_srcdir)/src/include/catalog/catversion.h diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README new file mode 100644 index 0000000..26fd77b --- /dev/null +++ b/src/backend/access/transam/README @@ -0,0 +1,896 @@ +src/backend/access/transam/README + +The Transaction System +====================== + +PostgreSQL's transaction system is a three-layer system. The bottom layer +implements low-level transactions and subtransactions, on top of which rests +the mainloop's control code, which in turn implements user-visible +transactions and savepoints. + +The middle layer of code is called by postgres.c before and after the +processing of each query, or after detecting an error: + + StartTransactionCommand + CommitTransactionCommand + AbortCurrentTransaction + +Meanwhile, the user can alter the system's state by issuing the SQL commands +BEGIN, COMMIT, ROLLBACK, SAVEPOINT, ROLLBACK TO or RELEASE. The traffic cop +redirects these calls to the toplevel routines + + BeginTransactionBlock + EndTransactionBlock + UserAbortTransactionBlock + DefineSavepoint + RollbackToSavepoint + ReleaseSavepoint + +respectively. Depending on the current state of the system, these functions +call low level functions to activate the real transaction system: + + StartTransaction + CommitTransaction + AbortTransaction + CleanupTransaction + StartSubTransaction + CommitSubTransaction + AbortSubTransaction + CleanupSubTransaction + +Additionally, within a transaction, CommandCounterIncrement is called to +increment the command counter, which allows future commands to "see" the +effects of previous commands within the same transaction. Note that this is +done automatically by CommitTransactionCommand after each query inside a +transaction block, but some utility functions also do it internally to allow +some operations (usually in the system catalogs) to be seen by future +operations in the same utility command. (For example, in DefineRelation it is +done after creating the heap so the pg_class row is visible, to be able to +lock it.) + + +For example, consider the following sequence of user commands: + +1) BEGIN +2) SELECT * FROM foo +3) INSERT INTO foo VALUES (...) 
+4) COMMIT + +In the main processing loop, this results in the following function call +sequence: + + / StartTransactionCommand; + / StartTransaction; +1) < ProcessUtility; << BEGIN + \ BeginTransactionBlock; + \ CommitTransactionCommand; + + / StartTransactionCommand; +2) / PortalRunSelect; << SELECT ... + \ CommitTransactionCommand; + \ CommandCounterIncrement; + + / StartTransactionCommand; +3) / ProcessQuery; << INSERT ... + \ CommitTransactionCommand; + \ CommandCounterIncrement; + + / StartTransactionCommand; + / ProcessUtility; << COMMIT +4) < EndTransactionBlock; + \ CommitTransactionCommand; + \ CommitTransaction; + +The point of this example is to demonstrate the need for +StartTransactionCommand and CommitTransactionCommand to be state smart -- they +should call CommandCounterIncrement between the calls to BeginTransactionBlock +and EndTransactionBlock and outside these calls they need to do normal start, +commit or abort processing. + +Furthermore, suppose the "SELECT * FROM foo" caused an abort condition. In +this case AbortCurrentTransaction is called, and the transaction is put in +aborted state. In this state, any user input is ignored except for +transaction-termination statements, or ROLLBACK TO <savepoint> commands. + +Transaction aborts can occur in two ways: + +1) system dies from some internal cause (syntax error, etc) +2) user types ROLLBACK + +The reason we have to distinguish them is illustrated by the following two +situations: + + case 1 case 2 + ------ ------ +1) user types BEGIN 1) user types BEGIN +2) user does something 2) user does something +3) user does not like what 3) system aborts for some reason + she sees and types ABORT (syntax error, etc) + +In case 1, we want to abort the transaction and return to the default state. +In case 2, there may be more commands coming our way which are part of the +same transaction block; we have to ignore these commands until we see a COMMIT +or ROLLBACK. + +Internal aborts are handled by AbortCurrentTransaction, while user aborts are +handled by UserAbortTransactionBlock. Both of them rely on AbortTransaction +to do all the real work. The only difference is what state we enter after +AbortTransaction does its work: + +* AbortCurrentTransaction leaves us in TBLOCK_ABORT, +* UserAbortTransactionBlock leaves us in TBLOCK_ABORT_END + +Low-level transaction abort handling is divided in two phases: +* AbortTransaction executes as soon as we realize the transaction has + failed. It should release all shared resources (locks etc) so that we do + not delay other backends unnecessarily. +* CleanupTransaction executes when we finally see a user COMMIT + or ROLLBACK command; it cleans things up and gets us out of the transaction + completely. In particular, we mustn't destroy TopTransactionContext until + this point. + +Also, note that when a transaction is committed, we don't close it right away. +Rather it's put in TBLOCK_END state, which means that when +CommitTransactionCommand is called after the query has finished processing, +the transaction has to be closed. The distinction is subtle but important, +because it means that control will leave the xact.c code with the transaction +open, and the main loop will be able to keep processing inside the same +transaction. So, in a sense, transaction commit is also handled in two +phases, the first at EndTransactionBlock and the second at +CommitTransactionCommand (which is where CommitTransaction is actually +called). 
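
To make that "state smart" requirement concrete, the following is a much-simplified,
standalone sketch -- not the real xact.c code. The TBLOCK_* names are the actual block
states used by xact.c, but the stub routines and the dispatch shown here are illustrative
only, assuming the behaviour described above:

	/*
	 * Illustrative sketch only; compiles standalone because the real
	 * state enum and worker routines are stubbed out.
	 */
	#include <stdio.h>

	typedef enum
	{
		TBLOCK_DEFAULT,			/* idle, no transaction in progress */
		TBLOCK_STARTED,			/* single-query transaction, no BEGIN issued */
		TBLOCK_BEGIN,			/* BEGIN just received */
		TBLOCK_INPROGRESS,		/* inside a transaction block */
		TBLOCK_END,				/* COMMIT received; commit at command end */
		TBLOCK_ABORT,			/* failed block, awaiting COMMIT/ROLLBACK */
		TBLOCK_ABORT_END		/* failed block, ROLLBACK received */
	} TBlockState;

	static TBlockState blockState = TBLOCK_DEFAULT;

	/* Stubs standing in for the real xact.c routines. */
	static void CommitTransaction(void)       { puts("CommitTransaction"); }
	static void CleanupTransaction(void)      { puts("CleanupTransaction"); }
	static void CommandCounterIncrement(void) { puts("CommandCounterIncrement"); }

	static void
	SketchCommitTransactionCommand(void)
	{
		switch (blockState)
		{
			case TBLOCK_STARTED:
				/* single statement outside any block: commit immediately */
				CommitTransaction();
				blockState = TBLOCK_DEFAULT;
				break;
			case TBLOCK_BEGIN:
				/* BEGIN processed: leave the transaction open */
				blockState = TBLOCK_INPROGRESS;
				break;
			case TBLOCK_INPROGRESS:
				/* query inside a block: just make its effects visible */
				CommandCounterIncrement();
				break;
			case TBLOCK_END:
				/* EndTransactionBlock marked us commit-pending: commit now */
				CommitTransaction();
				blockState = TBLOCK_DEFAULT;
				break;
			case TBLOCK_ABORT:
				/* failed block: keep ignoring commands until termination */
				break;
			case TBLOCK_ABORT_END:
				/* user issued ROLLBACK: finish cleanup, leave the block */
				CleanupTransaction();
				blockState = TBLOCK_DEFAULT;
				break;
			default:
				break;
		}
	}

	int
	main(void)
	{
		blockState = TBLOCK_END;
		SketchCommitTransactionCommand();	/* prints "CommitTransaction" */
		return 0;
	}

The real function handles many more states (subtransactions, prepared transactions,
implicit blocks), but the shape is the same: the traffic-cop routines only record intent,
and CommitTransactionCommand performs the deferred start, commit or abort work.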
+ +The rest of the code in xact.c are routines to support the creation and +finishing of transactions and subtransactions. For example, AtStart_Memory +takes care of initializing the memory subsystem at main transaction start. + + +Subtransaction Handling +----------------------- + +Subtransactions are implemented using a stack of TransactionState structures, +each of which has a pointer to its parent transaction's struct. When a new +subtransaction is to be opened, PushTransaction is called, which creates a new +TransactionState, with its parent link pointing to the current transaction. +StartSubTransaction is in charge of initializing the new TransactionState to +sane values, and properly initializing other subsystems (AtSubStart routines). + +When closing a subtransaction, either CommitSubTransaction has to be called +(if the subtransaction is committing), or AbortSubTransaction and +CleanupSubTransaction (if it's aborting). In either case, PopTransaction is +called so the system returns to the parent transaction. + +One important point regarding subtransaction handling is that several may need +to be closed in response to a single user command. That's because savepoints +have names, and we allow to commit or rollback a savepoint by name, which is +not necessarily the one that was last opened. Also a COMMIT or ROLLBACK +command must be able to close out the entire stack. We handle this by having +the utility command subroutine mark all the state stack entries as commit- +pending or abort-pending, and then when the main loop reaches +CommitTransactionCommand, the real work is done. The main point of doing +things this way is that if we get an error while popping state stack entries, +the remaining stack entries still show what we need to do to finish up. + +In the case of ROLLBACK TO <savepoint>, we abort all the subtransactions up +through the one identified by the savepoint name, and then re-create that +subtransaction level with the same name. So it's a completely new +subtransaction as far as the internals are concerned. + +Other subsystems are allowed to start "internal" subtransactions, which are +handled by BeginInternalSubTransaction. This is to allow implementing +exception handling, e.g. in PL/pgSQL. ReleaseCurrentSubTransaction and +RollbackAndReleaseCurrentSubTransaction allows the subsystem to close said +subtransactions. The main difference between this and the savepoint/release +path is that we execute the complete state transition immediately in each +subroutine, rather than deferring some work until CommitTransactionCommand. +Another difference is that BeginInternalSubTransaction is allowed when no +explicit transaction block has been established, while DefineSavepoint is not. + + +Transaction and Subtransaction Numbering +---------------------------------------- + +Transactions and subtransactions are assigned permanent XIDs only when/if +they first do something that requires one --- typically, insert/update/delete +a tuple, though there are a few other places that need an XID assigned. +If a subtransaction requires an XID, we always first assign one to its +parent. This maintains the invariant that child transactions have XIDs later +than their parents, which is assumed in a number of places. + +The subsidiary actions of obtaining a lock on the XID and entering it into +pg_subtrans and PG_PROC are done at the time it is assigned. + +A transaction that has no XID still needs to be identified for various +purposes, notably holding locks. 
For this purpose we assign a "virtual +transaction ID" or VXID to each top-level transaction. VXIDs are formed from +two fields, the backendID and a backend-local counter; this arrangement allows +assignment of a new VXID at transaction start without any contention for +shared memory. To ensure that a VXID isn't re-used too soon after backend +exit, we store the last local counter value into shared memory at backend +exit, and initialize it from the previous value for the same backendID slot +at backend start. All these counters go back to zero at shared memory +re-initialization, but that's OK because VXIDs never appear anywhere on-disk. + +Internally, a backend needs a way to identify subtransactions whether or not +they have XIDs; but this need only lasts as long as the parent top transaction +endures. Therefore, we have SubTransactionId, which is somewhat like +CommandId in that it's generated from a counter that we reset at the start of +each top transaction. The top-level transaction itself has SubTransactionId 1, +and subtransactions have IDs 2 and up. (Zero is reserved for +InvalidSubTransactionId.) Note that subtransactions do not have their +own VXIDs; they use the parent top transaction's VXID. + + +Interlocking Transaction Begin, Transaction End, and Snapshots +-------------------------------------------------------------- + +We try hard to minimize the amount of overhead and lock contention involved +in the frequent activities of beginning/ending a transaction and taking a +snapshot. Unfortunately, we must have some interlocking for this, because +we must ensure consistency about the commit order of transactions. +For example, suppose an UPDATE in xact A is blocked by xact B's prior +update of the same row, and xact B is doing commit while xact C gets a +snapshot. Xact A can complete and commit as soon as B releases its locks. +If xact C's GetSnapshotData sees xact B as still running, then it had +better see xact A as still running as well, or it will be able to see two +tuple versions - one deleted by xact B and one inserted by xact A. Another +reason why this would be bad is that C would see (in the row inserted by A) +earlier changes by B, and it would be inconsistent for C not to see any +of B's changes elsewhere in the database. + +Formally, the correctness requirement is "if a snapshot A considers +transaction X as committed, and any of transaction X's snapshots considered +transaction Y as committed, then snapshot A must consider transaction Y as +committed". + +What we actually enforce is strict serialization of commits and rollbacks +with snapshot-taking: we do not allow any transaction to exit the set of +running transactions while a snapshot is being taken. (This rule is +stronger than necessary for consistency, but is relatively simple to +enforce, and it assists with some other issues as explained below.) The +implementation of this is that GetSnapshotData takes the ProcArrayLock in +shared mode (so that multiple backends can take snapshots in parallel), +but ProcArrayEndTransaction must take the ProcArrayLock in exclusive mode +while clearing the ProcGlobal->xids[] entry at transaction end (either +commit or abort). (To reduce context switching, when multiple transactions +commit nearly simultaneously, we have one backend take ProcArrayLock and +clear the XIDs of multiple processes at once.) + +ProcArrayEndTransaction also holds the lock while advancing the shared +latestCompletedXid variable. 
This allows GetSnapshotData to use +latestCompletedXid + 1 as xmax for its snapshot: there can be no +transaction >= this xid value that the snapshot needs to consider as +completed. + +In short, then, the rule is that no transaction may exit the set of +currently-running transactions between the time we fetch latestCompletedXid +and the time we finish building our snapshot. However, this restriction +only applies to transactions that have an XID --- read-only transactions +can end without acquiring ProcArrayLock, since they don't affect anyone +else's snapshot nor latestCompletedXid. + +Transaction start, per se, doesn't have any interlocking with these +considerations, since we no longer assign an XID immediately at transaction +start. But when we do decide to allocate an XID, GetNewTransactionId must +store the new XID into the shared ProcArray before releasing XidGenLock. +This ensures that all top-level XIDs <= latestCompletedXid are either +present in the ProcArray, or not running anymore. (This guarantee doesn't +apply to subtransaction XIDs, because of the possibility that there's not +room for them in the subxid array; instead we guarantee that they are +present or the overflow flag is set.) If a backend released XidGenLock +before storing its XID into ProcGlobal->xids[], then it would be possible for +another backend to allocate and commit a later XID, causing latestCompletedXid +to pass the first backend's XID, before that value became visible in the +ProcArray. That would break ComputeXidHorizons, as discussed below. + +We allow GetNewTransactionId to store the XID into ProcGlobal->xids[] (or the +subxid array) without taking ProcArrayLock. This was once necessary to +avoid deadlock; while that is no longer the case, it's still beneficial for +performance. We are thereby relying on fetch/store of an XID to be atomic, +else other backends might see a partially-set XID. This also means that +readers of the ProcArray xid fields must be careful to fetch a value only +once, rather than assume they can read it multiple times and get the same +answer each time. (Use volatile-qualified pointers when doing this, to +ensure that the C compiler does exactly what you tell it to.) + +Another important activity that uses the shared ProcArray is +ComputeXidHorizons, which must determine a lower bound for the oldest xmin +of any active MVCC snapshot, system-wide. Each individual backend +advertises the smallest xmin of its own snapshots in MyProc->xmin, or zero +if it currently has no live snapshots (eg, if it's between transactions or +hasn't yet set a snapshot for a new transaction). ComputeXidHorizons takes +the MIN() of the valid xmin fields. It does this with only shared lock on +ProcArrayLock, which means there is a potential race condition against other +backends doing GetSnapshotData concurrently: we must be certain that a +concurrent backend that is about to set its xmin does not compute an xmin +less than what ComputeXidHorizons determines. We ensure that by including +all the active XIDs into the MIN() calculation, along with the valid xmins. +The rule that transactions can't exit without taking exclusive ProcArrayLock +ensures that concurrent holders of shared ProcArrayLock will compute the +same minimum of currently-active XIDs: no xact, in particular not the +oldest, can exit while we hold shared ProcArrayLock. So +ComputeXidHorizons's view of the minimum active XID will be the same as that +of any concurrent GetSnapshotData, and so it can't produce an overestimate. 
+If there is no active transaction at all, ComputeXidHorizons uses +latestCompletedXid + 1, which is a lower bound for the xmin that might +be computed by concurrent or later GetSnapshotData calls. (We know that no +XID less than this could be about to appear in the ProcArray, because of the +XidGenLock interlock discussed above.) + +As GetSnapshotData is performance critical, it does not perform an accurate +oldest-xmin calculation (it used to, until v14). The contents of a snapshot +only depend on the xids of other backends, not their xmin. As backend's xmin +changes much more often than its xid, having GetSnapshotData look at xmins +can lead to a lot of unnecessary cacheline ping-pong. Instead +GetSnapshotData updates approximate thresholds (one that guarantees that all +deleted rows older than it can be removed, another determining that deleted +rows newer than it can not be removed). GlobalVisTest* uses those thresholds +to make invisibility decision, falling back to ComputeXidHorizons if +necessary. + +Note that while it is certain that two concurrent executions of +GetSnapshotData will compute the same xmin for their own snapshots, there is +no such guarantee for the horizons computed by ComputeXidHorizons. This is +because we allow XID-less transactions to clear their MyProc->xmin +asynchronously (without taking ProcArrayLock), so one execution might see +what had been the oldest xmin, and another not. This is OK since the +thresholds need only be a valid lower bound. As noted above, we are already +assuming that fetch/store of the xid fields is atomic, so assuming it for +xmin as well is no extra risk. + + +pg_xact and pg_subtrans +----------------------- + +pg_xact and pg_subtrans are permanent (on-disk) storage of transaction related +information. There is a limited number of pages of each kept in memory, so +in many cases there is no need to actually read from disk. However, if +there's a long running transaction or a backend sitting idle with an open +transaction, it may be necessary to be able to read and write this information +from disk. They also allow information to be permanent across server restarts. + +pg_xact records the commit status for each transaction that has been assigned +an XID. A transaction can be in progress, committed, aborted, or +"sub-committed". This last state means that it's a subtransaction that's no +longer running, but its parent has not updated its state yet. It is not +necessary to update a subtransaction's transaction status to subcommit, so we +can just defer it until main transaction commit. The main role of marking +transactions as sub-committed is to provide an atomic commit protocol when +transaction status is spread across multiple clog pages. As a result, whenever +transaction status spreads across multiple pages we must use a two-phase commit +protocol: the first phase is to mark the subtransactions as sub-committed, then +we mark the top level transaction and all its subtransactions committed (in +that order). Thus, subtransactions that have not aborted appear as in-progress +even when they have already finished, and the subcommit status appears as a +very short transitory state during main transaction commit. Subtransaction +abort is always marked in clog as soon as it occurs. When the transaction +status all fit in a single CLOG page, we atomically mark them all as committed +without bothering with the intermediate sub-commit state. + +Savepoints are implemented using subtransactions. 
A subtransaction is a +transaction inside a transaction; its commit or abort status is not only +dependent on whether it committed itself, but also whether its parent +transaction committed. To implement multiple savepoints in a transaction we +allow unlimited transaction nesting depth, so any particular subtransaction's +commit state is dependent on the commit status of each and every ancestor +transaction. + +The "subtransaction parent" (pg_subtrans) mechanism records, for each +transaction with an XID, the TransactionId of its parent transaction. This +information is stored as soon as the subtransaction is assigned an XID. +Top-level transactions do not have a parent, so they leave their pg_subtrans +entries set to the default value of zero (InvalidTransactionId). + +pg_subtrans is used to check whether the transaction in question is still +running --- the main Xid of a transaction is recorded in ProcGlobal->xids[], +with a copy in PGPROC->xid, but since we allow arbitrary nesting of +subtransactions, we can't fit all Xids in shared memory, so we have to store +them on disk. Note, however, that for each transaction we keep a "cache" of +Xids that are known to be part of the transaction tree, so we can skip looking +at pg_subtrans unless we know the cache has been overflowed. See +storage/ipc/procarray.c for the gory details. + +slru.c is the supporting mechanism for both pg_xact and pg_subtrans. It +implements the LRU policy for in-memory buffer pages. The high-level routines +for pg_xact are implemented in transam.c, while the low-level functions are in +clog.c. pg_subtrans is contained completely in subtrans.c. + + +Write-Ahead Log Coding +---------------------- + +The WAL subsystem (also called XLOG in the code) exists to guarantee crash +recovery. It can also be used to provide point-in-time recovery, as well as +hot-standby replication via log shipping. Here are some notes about +non-obvious aspects of its design. + +A basic assumption of a write AHEAD log is that log entries must reach stable +storage before the data-page changes they describe. This ensures that +replaying the log to its end will bring us to a consistent state where there +are no partially-performed transactions. To guarantee this, each data page +(either heap or index) is marked with the LSN (log sequence number --- in +practice, a WAL file location) of the latest XLOG record affecting the page. +Before the bufmgr can write out a dirty page, it must ensure that xlog has +been flushed to disk at least up to the page's LSN. This low-level +interaction improves performance by not waiting for XLOG I/O until necessary. +The LSN check exists only in the shared-buffer manager, not in the local +buffer manager used for temp tables; hence operations on temp tables must not +be WAL-logged. + +During WAL replay, we can check the LSN of a page to detect whether the change +recorded by the current log entry is already applied (it has been, if the page +LSN is >= the log entry's WAL location). + +Usually, log entries contain just enough information to redo a single +incremental update on a page (or small group of pages). This will work only +if the filesystem and hardware implement data page writes as atomic actions, +so that a page is never left in a corrupt partly-written state. Since that's +often an untenable assumption in practice, we log additional information to +allow complete reconstruction of modified pages. 
The first WAL record +affecting a given page after a checkpoint is made to contain a copy of the +entire page, and we implement replay by restoring that page copy instead of +redoing the update. (This is more reliable than the data storage itself would +be because we can check the validity of the WAL record's CRC.) We can detect +the "first change after checkpoint" by noting whether the page's old LSN +precedes the end of WAL as of the last checkpoint (the RedoRecPtr). + +The general schema for executing a WAL-logged action is + +1. Pin and exclusive-lock the shared buffer(s) containing the data page(s) +to be modified. + +2. START_CRIT_SECTION() (Any error during the next three steps must cause a +PANIC because the shared buffers will contain unlogged changes, which we +have to ensure don't get to disk. Obviously, you should check conditions +such as whether there's enough free space on the page before you start the +critical section.) + +3. Apply the required changes to the shared buffer(s). + +4. Mark the shared buffer(s) as dirty with MarkBufferDirty(). (This must +happen before the WAL record is inserted; see notes in SyncOneBuffer().) +Note that marking a buffer dirty with MarkBufferDirty() should only +happen iff you write a WAL record; see Writing Hints below. + +5. If the relation requires WAL-logging, build a WAL record using +XLogBeginInsert and XLogRegister* functions, and insert it. (See +"Constructing a WAL record" below). Then update the page's LSN using the +returned XLOG location. For instance, + + XLogBeginInsert(); + XLogRegisterBuffer(...) + XLogRegisterData(...) + recptr = XLogInsert(rmgr_id, info); + + PageSetLSN(dp, recptr); + +6. END_CRIT_SECTION() + +7. Unlock and unpin the buffer(s). + +Complex changes (such as a multilevel index insertion) normally need to be +described by a series of atomic-action WAL records. The intermediate states +must be self-consistent, so that if the replay is interrupted between any +two actions, the system is fully functional. In btree indexes, for example, +a page split requires a new page to be allocated, and an insertion of a new +key in the parent btree level, but for locking reasons this has to be +reflected by two separate WAL records. Replaying the first record, to +allocate the new page and move tuples to it, sets a flag on the page to +indicate that the key has not been inserted to the parent yet. Replaying the +second record clears the flag. This intermediate state is never seen by +other backends during normal operation, because the lock on the child page +is held across the two actions, but will be seen if the operation is +interrupted before writing the second WAL record. The search algorithm works +with the intermediate state as normal, but if an insertion encounters a page +with the incomplete-split flag set, it will finish the interrupted split by +inserting the key to the parent, before proceeding. + + +Constructing a WAL record +------------------------- + +A WAL record consists of a header common to all WAL record types, +record-specific data, and information about the data blocks modified. Each +modified data block is identified by an ID number, and can optionally have +more record-specific data associated with the block. If XLogInsert decides +that a full-page image of a block needs to be taken, the data associated +with that block is not included. + +The API for constructing a WAL record consists of five functions: +XLogBeginInsert, XLogRegisterBuffer, XLogRegisterData, XLogRegisterBufData, +and XLogInsert. 
First, call XLogBeginInsert(). Then register all the buffers +modified, and data needed to replay the changes, using XLogRegister* +functions. Finally, insert the constructed record to the WAL by calling +XLogInsert(). + + XLogBeginInsert(); + + /* register buffers modified as part of this WAL-logged action */ + XLogRegisterBuffer(0, lbuffer, REGBUF_STANDARD); + XLogRegisterBuffer(1, rbuffer, REGBUF_STANDARD); + + /* register data that is always included in the WAL record */ + XLogRegisterData(&xlrec, SizeOfFictionalAction); + + /* + * register data associated with a buffer. This will not be included + * in the record if a full-page image is taken. + */ + XLogRegisterBufData(0, tuple->data, tuple->len); + + /* more data associated with the buffer */ + XLogRegisterBufData(0, data2, len2); + + /* + * Ok, all the data and buffers to include in the WAL record have + * been registered. Insert the record. + */ + recptr = XLogInsert(RM_FOO_ID, XLOG_FOOBAR_DO_STUFF); + +Details of the API functions: + +void XLogBeginInsert(void) + + Must be called before XLogRegisterBuffer and XLogRegisterData. + +void XLogResetInsertion(void) + + Clear any currently registered data and buffers from the WAL record + construction workspace. This is only needed if you have already called + XLogBeginInsert(), but decide to not insert the record after all. + +void XLogEnsureRecordSpace(int max_block_id, int ndatas) + + Normally, the WAL record construction buffers have the following limits: + + * highest block ID that can be used is 4 (allowing five block references) + * Max 20 chunks of registered data + + These default limits are enough for most record types that change some + on-disk structures. For the odd case that requires more data, or needs to + modify more buffers, these limits can be raised by calling + XLogEnsureRecordSpace(). XLogEnsureRecordSpace() must be called before + XLogBeginInsert(), and outside a critical section. + +void XLogRegisterBuffer(uint8 block_id, Buffer buf, uint8 flags); + + XLogRegisterBuffer adds information about a data block to the WAL record. + block_id is an arbitrary number used to identify this page reference in + the redo routine. The information needed to re-find the page at redo - + relfilenode, fork, and block number - are included in the WAL record. + + XLogInsert will automatically include a full copy of the page contents, if + this is the first modification of the buffer since the last checkpoint. + It is important to register every buffer modified by the action with + XLogRegisterBuffer, to avoid torn-page hazards. + + The flags control when and how the buffer contents are included in the + WAL record. Normally, a full-page image is taken only if the page has not + been modified since the last checkpoint, and only if full_page_writes=on + or an online backup is in progress. The REGBUF_FORCE_IMAGE flag can be + used to force a full-page image to always be included; that is useful + e.g. for an operation that rewrites most of the page, so that tracking the + details is not worth it. For the rare case where it is not necessary to + protect from torn pages, REGBUF_NO_IMAGE flag can be used to suppress + full page image from being taken. REGBUF_WILL_INIT also suppresses a full + page image, but the redo routine must re-generate the page from scratch, + without looking at the old page contents. Re-initializing the page + protects from torn page hazards like a full page image does. 
+ + The REGBUF_STANDARD flag can be specified together with the other flags to + indicate that the page follows the standard page layout. It causes the + area between pd_lower and pd_upper to be left out from the image, reducing + WAL volume. + + If the REGBUF_KEEP_DATA flag is given, any per-buffer data registered with + XLogRegisterBufData() is included in the WAL record even if a full-page + image is taken. + +void XLogRegisterData(char *data, int len); + + XLogRegisterData is used to include arbitrary data in the WAL record. If + XLogRegisterData() is called multiple times, the data are appended, and + will be made available to the redo routine as one contiguous chunk. + +void XLogRegisterBufData(uint8 block_id, char *data, int len); + + XLogRegisterBufData is used to include data associated with a particular + buffer that was registered earlier with XLogRegisterBuffer(). If + XLogRegisterBufData() is called multiple times with the same block ID, the + data are appended, and will be made available to the redo routine as one + contiguous chunk. + + If a full-page image of the buffer is taken at insertion, the data is not + included in the WAL record, unless the REGBUF_KEEP_DATA flag is used. + + +Writing a REDO routine +---------------------- + +A REDO routine uses the data and page references included in the WAL record +to reconstruct the new state of the page. The record decoding functions +and macros in xlogreader.c/h can be used to extract the data from the record. + +When replaying a WAL record that describes changes on multiple pages, you +must be careful to lock the pages properly to prevent concurrent Hot Standby +queries from seeing an inconsistent state. If this requires that two +or more buffer locks be held concurrently, you must lock the pages in +appropriate order, and not release the locks until all the changes are done. + +Note that we must only use PageSetLSN/PageGetLSN() when we know the action +is serialised. Only Startup process may modify data blocks during recovery, +so Startup process may execute PageGetLSN() without fear of serialisation +problems. All other processes must only call PageSet/GetLSN when holding +either an exclusive buffer lock or a shared lock plus buffer header lock, +or be writing the data block directly rather than through shared buffers +while holding AccessExclusiveLock on the relation. + + +Writing Hints +------------- + +In some cases, we write additional information to data blocks without +writing a preceding WAL record. This should only happen iff the data can +be reconstructed later following a crash and the action is simply a way +of optimising for performance. When a hint is written we use +MarkBufferDirtyHint() to mark the block dirty. + +If the buffer is clean and checksums are in use then MarkBufferDirtyHint() +inserts an XLOG_FPI_FOR_HINT record to ensure that we take a full page image +that includes the hint. We do this to avoid a partial page write, when we +write the dirtied page. WAL is not written during recovery, so we simply skip +dirtying blocks because of hints when in recovery. + +If you do decide to optimise away a WAL record, then any calls to +MarkBufferDirty() must be replaced by MarkBufferDirtyHint(), +otherwise you will expose the risk of partial page writes. + + +Write-Ahead Logging for Filesystem Actions +------------------------------------------ + +The previous section described how to WAL-log actions that only change page +contents within shared buffers. 
For that type of action it is generally +possible to check all likely error cases (such as insufficient space on the +page) before beginning to make the actual change. Therefore we can make +the change and the creation of the associated WAL log record "atomic" by +wrapping them into a critical section --- the odds of failure partway +through are low enough that PANIC is acceptable if it does happen. + +Clearly, that approach doesn't work for cases where there's a significant +probability of failure within the action to be logged, such as creation +of a new file or database. We don't want to PANIC, and we especially don't +want to PANIC after having already written a WAL record that says we did +the action --- if we did, replay of the record would probably fail again +and PANIC again, making the failure unrecoverable. This means that the +ordinary WAL rule of "write WAL before the changes it describes" doesn't +work, and we need a different design for such cases. + +There are several basic types of filesystem actions that have this +issue. Here is how we deal with each: + +1. Adding a disk page to an existing table. + +This action isn't WAL-logged at all. We extend a table by writing a page +of zeroes at its end. We must actually do this write so that we are sure +the filesystem has allocated the space. If the write fails we can just +error out normally. Once the space is known allocated, we can initialize +and fill the page via one or more normal WAL-logged actions. Because it's +possible that we crash between extending the file and writing out the WAL +entries, we have to treat discovery of an all-zeroes page in a table or +index as being a non-error condition. In such cases we can just reclaim +the space for re-use. + +2. Creating a new table, which requires a new file in the filesystem. + +We try to create the file, and if successful we make a WAL record saying +we did it. If not successful, we can just throw an error. Notice that +there is a window where we have created the file but not yet written any +WAL about it to disk. If we crash during this window, the file remains +on disk as an "orphan". It would be possible to clean up such orphans +by having database restart search for files that don't have any committed +entry in pg_class, but that currently isn't done because of the possibility +of deleting data that is useful for forensic analysis of the crash. +Orphan files are harmless --- at worst they waste a bit of disk space --- +because we check for on-disk collisions when allocating new relfilenode +OIDs. So cleaning up isn't really necessary. + +3. Deleting a table, which requires an unlink() that could fail. + +Our approach here is to WAL-log the operation first, but to treat failure +of the actual unlink() call as a warning rather than error condition. +Again, this can leave an orphan file behind, but that's cheap compared to +the alternatives. Since we can't actually do the unlink() until after +we've committed the DROP TABLE transaction, throwing an error would be out +of the question anyway. (It may be worth noting that the WAL entry about +the file deletion is actually part of the commit record for the dropping +transaction.) + +4. Creating and deleting databases and tablespaces, which requires creating +and deleting directories and entire directory trees. + +These cases are handled similarly to creating individual files, ie, we +try to do the action first and then write a WAL entry if it succeeded. +The potential amount of wasted disk space is rather larger, of course. 
+In the creation case we try to delete the directory tree again if creation +fails, so as to reduce the risk of wasted space. Failure partway through +a deletion operation results in a corrupt database: the DROP failed, but +some of the data is gone anyway. There is little we can do about that, +though, and in any case it was presumably data the user no longer wants. + +In all of these cases, if WAL replay fails to redo the original action +we must panic and abort recovery. The DBA will have to manually clean up +(for instance, free up some disk space or fix directory permissions) and +then restart recovery. This is part of the reason for not writing a WAL +entry until we've successfully done the original action. + + +Skipping WAL for New RelFileNode +-------------------------------- + +Under wal_level=minimal, if a change modifies a relfilenode that ROLLBACK +would unlink, in-tree access methods write no WAL for that change. Code that +writes WAL without calling RelationNeedsWAL() must check for this case. This +skipping is mandatory. If a WAL-writing change preceded a WAL-skipping change +for the same block, REDO could overwrite the WAL-skipping change. If a +WAL-writing change followed a WAL-skipping change for the same block, a +related problem would arise. When a WAL record contains no full-page image, +REDO expects the page to match its contents from just before record insertion. +A WAL-skipping change may not reach disk at all, violating REDO's expectation +under full_page_writes=off. For any access method, CommitTransaction() writes +and fsyncs affected blocks before recording the commit. + +Prefer to do the same in future access methods. However, two other approaches +can work. First, an access method can irreversibly transition a given fork +from WAL-skipping to WAL-writing by calling FlushRelationBuffers() and +smgrimmedsync(). Second, an access method can opt to write WAL +unconditionally for permanent relations. Under these approaches, the access +method callbacks must not call functions that react to RelationNeedsWAL(). + +This applies only to WAL records whose replay would modify bytes stored in the +new relfilenode. It does not apply to other records about the relfilenode, +such as XLOG_SMGR_CREATE. Because it operates at the level of individual +relfilenodes, RelationNeedsWAL() can differ for tightly-coupled relations. +Consider "CREATE TABLE t (); BEGIN; ALTER TABLE t ADD c text; ..." in which +ALTER TABLE adds a TOAST relation. The TOAST relation will skip WAL, while +the table owning it will not. ALTER TABLE SET TABLESPACE will cause a table +to skip WAL, but that won't affect its indexes. + + +Asynchronous Commit +------------------- + +As of PostgreSQL 8.3 it is possible to perform asynchronous commits - i.e., +we don't wait while the WAL record for the commit is fsync'ed. +We perform an asynchronous commit when synchronous_commit = off. Instead +of performing an XLogFlush() up to the LSN of the commit, we merely note +the LSN in shared memory. The backend then continues with other work. +We record the LSN only for an asynchronous commit, not an abort; there's +never any need to flush an abort record, since the presumption after a +crash would be that the transaction aborted anyway. + +We always force synchronous commit when the transaction is deleting +relations, to ensure the commit record is down to disk before the relations +are removed from the filesystem. 
Also, certain utility commands that have +non-roll-backable side effects (such as filesystem changes) force sync +commit to minimize the window in which the filesystem change has been made +but the transaction isn't guaranteed committed. + +The walwriter regularly wakes up (via wal_writer_delay) or is woken up +(via its latch, which is set by backends committing asynchronously) and +performs an XLogBackgroundFlush(). This checks the location of the last +completely filled WAL page. If that has moved forwards, then we write all +the changed buffers up to that point, so that under full load we write +only whole buffers. If there has been a break in activity and the current +WAL page is the same as before, then we find out the LSN of the most +recent asynchronous commit, and write up to that point, if required (i.e. +if it's in the current WAL page). If more than wal_writer_delay has +passed, or more than wal_writer_flush_after blocks have been written, since +the last flush, WAL is also flushed up to the current location. This +arrangement in itself would guarantee that an async commit record reaches +disk after at most two times wal_writer_delay after the transaction +completes. However, we also allow XLogFlush to write/flush full buffers +"flexibly" (ie, not wrapping around at the end of the circular WAL buffer +area), so as to minimize the number of writes issued under high load when +multiple WAL pages are filled per walwriter cycle. This makes the worst-case +delay three wal_writer_delay cycles. + +There are some other subtle points to consider with asynchronous commits. +First, for each page of CLOG we must remember the LSN of the latest commit +affecting the page, so that we can enforce the same flush-WAL-before-write +rule that we do for ordinary relation pages. Otherwise the record of the +commit might reach disk before the WAL record does. Again, abort records +need not factor into this consideration. + +In fact, we store more than one LSN for each clog page. This relates to +the way we set transaction status hint bits during visibility tests. +We must not set a transaction-committed hint bit on a relation page and +have that record make it to disk prior to the WAL record of the commit. +Since visibility tests are normally made while holding buffer share locks, +we do not have the option of changing the page's LSN to guarantee WAL +synchronization. Instead, we defer the setting of the hint bit if we have +not yet flushed WAL as far as the LSN associated with the transaction. +This requires tracking the LSN of each unflushed async commit. It is +convenient to associate this data with clog buffers: because we will flush +WAL before writing a clog page, we know that we do not need to remember a +transaction's LSN longer than the clog page holding its commit status +remains in memory. However, the naive approach of storing an LSN for each +clog position is unattractive: the LSNs are 32x bigger than the two-bit +commit status fields, and so we'd need 256K of additional shared memory for +each 8K clog buffer page. We choose instead to store a smaller number of +LSNs per page, where each LSN is the highest LSN associated with any +transaction commit in a contiguous range of transaction IDs on that page. +This saves storage at the price of some possibly-unnecessary delay in +setting transaction hint bits. + +How many transactions should share the same cached LSN (N)? 
If the +system's workload consists only of small async-commit transactions, then +it's reasonable to have N similar to the number of transactions per +walwriter cycle, since that is the granularity with which transactions will +become truly committed (and thus hintable) anyway. The worst case is where +a sync-commit xact shares a cached LSN with an async-commit xact that +commits a bit later; even though we paid to sync the first xact to disk, +we won't be able to hint its outputs until the second xact is sync'd, up to +three walwriter cycles later. This argues for keeping N (the group size) +as small as possible. For the moment we are setting the group size to 32, +which makes the LSN cache space the same size as the actual clog buffer +space (independently of BLCKSZ). + +It is useful that we can run both synchronous and asynchronous commit +transactions concurrently, but the safety of this is perhaps not +immediately obvious. Assume we have two transactions, T1 and T2. The Log +Sequence Number (LSN) is the point in the WAL sequence where a transaction +commit is recorded, so LSN1 and LSN2 are the commit records of those +transactions. If T2 can see changes made by T1 then when T2 commits it +must be true that LSN2 follows LSN1. Thus when T2 commits it is certain +that all of the changes made by T1 are also now recorded in the WAL. This +is true whether T1 was asynchronous or synchronous. As a result, it is +safe for asynchronous commits and synchronous commits to work concurrently +without endangering data written by synchronous commits. Sub-transactions +are not important here since the final write to disk only occurs at the +commit of the top level transaction. + +Changes to data blocks cannot reach disk unless WAL is flushed up to the +point of the LSN of the data blocks. Any attempt to write unsafe data to +disk will trigger a write which ensures the safety of all data written by +that and prior transactions. Data blocks and clog pages are both protected +by LSNs. + +Changes to a temp table are not WAL-logged, hence could reach disk in +advance of T1's commit, but we don't care since temp table contents don't +survive crashes anyway. + +Database writes that skip WAL for new relfilenodes are also safe. In these +cases it's entirely possible for the data to reach disk before T1's commit, +because T1 will fsync it down to disk without any sort of interlock. However, +all these paths are designed to write data that no other transaction can see +until after T1 commits. The situation is thus not different from ordinary +WAL-logged updates. + +Transaction Emulation during Recovery +------------------------------------- + +During Recovery we replay transaction changes in the order they occurred. +As part of this replay we emulate some transactional behaviour, so that +read only backends can take MVCC snapshots. We do this by maintaining a +list of XIDs belonging to transactions that are being replayed, so that +each transaction that has recorded WAL records for database writes exist +in the array until it commits. Further details are given in comments in +procarray.c. + +Many actions write no WAL records at all, for example read only transactions. +These have no effect on MVCC in recovery and we can pretend they never +occurred at all. Subtransaction commit does not write a WAL record either +and has very little effect, since lock waiters need to wait for the +parent transaction to complete. 
+ +Not all transactional behaviour is emulated, for example we do not insert +a transaction entry into the lock table, nor do we maintain the transaction +stack in memory. Clog, multixact and commit_ts entries are made normally. +Subtrans is maintained during recovery but the details of the transaction +tree are ignored and all subtransactions reference the top-level TransactionId +directly. Since commit is atomic this provides correct lock wait behaviour +yet simplifies emulation of subtransactions considerably. + +Further details on locking mechanics in recovery are given in comments +with the Lock rmgr code. diff --git a/src/backend/access/transam/README.parallel b/src/backend/access/transam/README.parallel new file mode 100644 index 0000000..99c588d --- /dev/null +++ b/src/backend/access/transam/README.parallel @@ -0,0 +1,237 @@ +Overview +======== + +PostgreSQL provides some simple facilities to make writing parallel algorithms +easier. Using a data structure called a ParallelContext, you can arrange to +launch background worker processes, initialize their state to match that of +the backend which initiated parallelism, communicate with them via dynamic +shared memory, and write reasonably complex code that can run either in the +user backend or in one of the parallel workers without needing to be aware of +where it's running. + +The backend which starts a parallel operation (hereafter, the initiating +backend) starts by creating a dynamic shared memory segment which will last +for the lifetime of the parallel operation. This dynamic shared memory segment +will contain (1) a shm_mq that can be used to transport errors (and other +messages reported via elog/ereport) from the worker back to the initiating +backend; (2) serialized representations of the initiating backend's private +state, so that the worker can synchronize its state with of the initiating +backend; and (3) any other data structures which a particular user of the +ParallelContext data structure may wish to add for its own purposes. Once +the initiating backend has initialized the dynamic shared memory segment, it +asks the postmaster to launch the appropriate number of parallel workers. +These workers then connect to the dynamic shared memory segment, initiate +their state, and then invoke the appropriate entrypoint, as further detailed +below. + +Error Reporting +=============== + +When started, each parallel worker begins by attaching the dynamic shared +memory segment and locating the shm_mq to be used for error reporting; it +redirects all of its protocol messages to this shm_mq. Prior to this point, +any failure of the background worker will not be reported to the initiating +backend; from the point of view of the initiating backend, the worker simply +failed to start. The initiating backend must anyway be prepared to cope +with fewer parallel workers than it originally requested, so catering to +this case imposes no additional burden. + +Whenever a new message (or partial message; very large messages may wrap) is +sent to the error-reporting queue, PROCSIG_PARALLEL_MESSAGE is sent to the +initiating backend. This causes the next CHECK_FOR_INTERRUPTS() in the +initiating backend to read and rethrow the message. For the most part, this +makes error reporting in parallel mode "just work". 
Of course, to work +properly, it is important that the code the initiating backend is executing +CHECK_FOR_INTERRUPTS() regularly and avoid blocking interrupt processing for +long periods of time, but those are good things to do anyway. + +(A currently-unsolved problem is that some messages may get written to the +system log twice, once in the backend where the report was originally +generated, and again when the initiating backend rethrows the message. If +we decide to suppress one of these reports, it should probably be second one; +otherwise, if the worker is for some reason unable to propagate the message +back to the initiating backend, the message will be lost altogether.) + +State Sharing +============= + +It's possible to write C code which works correctly without parallelism, but +which fails when parallelism is used. No parallel infrastructure can +completely eliminate this problem, because any global variable is a risk. +There's no general mechanism for ensuring that every global variable in the +worker will have the same value that it does in the initiating backend; even +if we could ensure that, some function we're calling could update the variable +after each call, and only the backend where that update is performed will see +the new value. Similar problems can arise with any more-complex data +structure we might choose to use. For example, a pseudo-random number +generator should, given a particular seed value, produce the same predictable +series of values every time. But it does this by relying on some private +state which won't automatically be shared between cooperating backends. A +parallel-safe PRNG would need to store its state in dynamic shared memory, and +would require locking. The parallelism infrastructure has no way of knowing +whether the user intends to call code that has this sort of problem, and can't +do anything about it anyway. + +Instead, we take a more pragmatic approach. First, we try to make as many of +the operations that are safe outside of parallel mode work correctly in +parallel mode as well. Second, we try to prohibit common unsafe operations +via suitable error checks. These checks are intended to catch 100% of +unsafe things that a user might do from the SQL interface, but code written +in C can do unsafe things that won't trigger these checks. The error checks +are engaged via EnterParallelMode(), which should be called before creating +a parallel context, and disarmed via ExitParallelMode(), which should be +called after all parallel contexts have been destroyed. The most +significant restriction imposed by parallel mode is that all operations must +be strictly read-only; we allow no writes to the database and no DDL. We +might try to relax these restrictions in the future. + +To make as many operations as possible safe in parallel mode, we try to copy +the most important pieces of state from the initiating backend to each parallel +worker. This includes: + + - The set of libraries dynamically loaded by dfmgr.c. + + - The authenticated user ID and current database. Each parallel worker + will connect to the same database as the initiating backend, using the + same user ID. + + - The values of all GUCs. Accordingly, permanent changes to the value of + any GUC are forbidden while in parallel mode; but temporary changes, + such as entering a function with non-NULL proconfig, are OK. 
+ + - The current subtransaction's XID, the top-level transaction's XID, and + the list of XIDs considered current (that is, they are in-progress or + subcommitted). This information is needed to ensure that tuple visibility + checks return the same results in the worker as they do in the + initiating backend. See also the section Transaction Integration, below. + + - The combo CID mappings. This is needed to ensure consistent answers to + tuple visibility checks. The need to synchronize this data structure is + a major reason why we can't support writes in parallel mode: such writes + might create new combo CIDs, and we have no way to let other workers + (or the initiating backend) know about them. + + - The transaction snapshot. + + - The active snapshot, which might be different from the transaction + snapshot. + + - The currently active user ID and security context. Note that this is + the fourth user ID we restore: the initial step of binding to the correct + database also involves restoring the authenticated user ID. When GUC + values are restored, this incidentally sets SessionUserId and OuterUserId + to the correct values. This final step restores CurrentUserId. + + - State related to pending REINDEX operations, which prevents access to + an index that is currently being rebuilt. + + - Active relmapper.c mapping state. This is needed to allow consistent + answers when fetching the current relfilenode for relation oids of + mapped relations. + +To prevent unprincipled deadlocks when running in parallel mode, this code +also arranges for the leader and all workers to participate in group +locking. See src/backend/storage/lmgr/README for more details. + +Transaction Integration +======================= + +Regardless of what the TransactionState stack looks like in the parallel +leader, each parallel worker ends up with a stack of depth 1. This stack +entry is marked with the special transaction block state +TBLOCK_PARALLEL_INPROGRESS so that it's not confused with an ordinary +toplevel transaction. The XID of this TransactionState is set to the XID of +the innermost currently-active subtransaction in the initiating backend. The +initiating backend's toplevel XID, and the XIDs of all current (in-progress +or subcommitted) XIDs are stored separately from the TransactionState stack, +but in such a way that GetTopTransactionId(), GetTopTransactionIdIfAny(), and +TransactionIdIsCurrentTransactionId() return the same values that they would +in the initiating backend. We could copy the entire transaction state stack, +but most of it would be useless: for example, you can't roll back to a +savepoint from within a parallel worker, and there are no resources to +associated with the memory contexts or resource owners of intermediate +subtransactions. + +No meaningful change to the transaction state can be made while in parallel +mode. No XIDs can be assigned, and no subtransactions can start or end, +because we have no way of communicating these state changes to cooperating +backends, or of synchronizing them. It's clearly unworkable for the initiating +backend to exit any transaction or subtransaction that was in progress when +parallelism was started before all parallel workers have exited; and it's even +more clearly crazy for a parallel worker to try to subcommit or subabort the +current subtransaction and execute in some other transaction context than was +present in the initiating backend. It might be practical to allow internal +sub-transactions (e.g. 
to implement a PL/pgSQL EXCEPTION block) to be used in +parallel mode, provided that they are XID-less, because other backends +wouldn't really need to know about those transactions or do anything +differently because of them. Right now, we don't even allow that. + +At the end of a parallel operation, which can happen either because it +completed successfully or because it was interrupted by an error, parallel +workers associated with that operation exit. In the error case, transaction +abort processing in the parallel leader kills off any remaining workers, and +the parallel leader then waits for them to die. In the case of a successful +parallel operation, the parallel leader does not send any signals, but must +wait for workers to complete and exit of their own volition. In either +case, it is very important that all workers actually exit before the +parallel leader cleans up the (sub)transaction in which they were created; +otherwise, chaos can ensue. For example, if the leader is rolling back the +transaction that created the relation being scanned by a worker, the +relation could disappear while the worker is still busy scanning it. That's +not safe. + +Generally, the cleanup performed by each worker at this point is similar to +top-level commit or abort. Each backend has its own resource owners: buffer +pins, catcache or relcache reference counts, tuple descriptors, and so on +are managed separately by each backend, and must free them before exiting. +There are, however, some important differences between parallel worker +commit or abort and a real top-level transaction commit or abort. Most +importantly: + + - No commit or abort record is written; the initiating backend is + responsible for this. + + - Cleanup of pg_temp namespaces is not done. Parallel workers cannot + safely access the initiating backend's pg_temp namespace, and should + not create one of their own. + +Coding Conventions +=================== + +Before beginning any parallel operation, call EnterParallelMode(); after all +parallel operations are completed, call ExitParallelMode(). To actually +parallelize a particular operation, use a ParallelContext. The basic coding +pattern looks like this: + + EnterParallelMode(); /* prohibit unsafe state changes */ + + pcxt = CreateParallelContext("library_name", "function_name", nworkers); + + /* Allow space for application-specific data here. */ + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, keys); + + InitializeParallelDSM(pcxt); /* create DSM and copy state to it */ + + /* Store the data for which we reserved space. */ + space = shm_toc_allocate(pcxt->toc, size); + shm_toc_insert(pcxt->toc, key, space); + + LaunchParallelWorkers(pcxt); + + /* do parallel stuff */ + + WaitForParallelWorkersToFinish(pcxt); + + /* read any final results from dynamic shared memory */ + + DestroyParallelContext(pcxt); + + ExitParallelMode(); + +If desired, after WaitForParallelWorkersToFinish() has been called, the +context can be reset so that workers can be launched anew using the same +parallel context. To do this, first call ReinitializeParallelDSM() to +reinitialize state managed by the parallel context machinery itself; then, +perform any other necessary resetting of state; after that, you can again +call LaunchParallelWorkers. 
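As an illustrative extension of the pattern above (an editor's sketch rather
than code from the tree), relaunching workers with the same context might look
like this; the pass count and the application-specific reset step are
hypothetical:

    for (i = 0; i < npasses; i++)
    {
        if (i > 0)
        {
            /* reset state managed by the parallel context machinery */
            ReinitializeParallelDSM(pcxt);
            /* then reset any application-specific state in the DSM */
        }

        LaunchParallelWorkers(pcxt);

        /* do parallel stuff for this pass */

        WaitForParallelWorkersToFinish(pcxt);

        /* read this pass's results from dynamic shared memory */
    }

    DestroyParallelContext(pcxt);

    ExitParallelMode();

Reusing the context this way means the estimator and DSM setup steps shown
earlier run only once, before the first pass.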
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c new file mode 100644 index 0000000..3d9088a --- /dev/null +++ b/src/backend/access/transam/clog.c @@ -0,0 +1,1030 @@ +/*------------------------------------------------------------------------- + * + * clog.c + * PostgreSQL transaction-commit-log manager + * + * This module replaces the old "pg_log" access code, which treated pg_log + * essentially like a relation, in that it went through the regular buffer + * manager. The problem with that was that there wasn't any good way to + * recycle storage space for transactions so old that they'll never be + * looked up again. Now we use specialized access code so that the commit + * log can be broken into relatively small, independent segments. + * + * XLOG interactions: this module generates an XLOG record whenever a new + * CLOG page is initialized to zeroes. Other writes of CLOG come from + * recording of transaction commit or abort in xact.c, which generates its + * own XLOG records for these events and will re-perform the status update + * on redo; so we need make no additional XLOG entry here. For synchronous + * transaction commits, the XLOG is guaranteed flushed through the XLOG commit + * record before we are called to log a commit, so the WAL rule "write xlog + * before data" is satisfied automatically. However, for async commits we + * must track the latest LSN affecting each CLOG page, so that we can flush + * XLOG that far and satisfy the WAL rule. We don't have to worry about this + * for aborts (whether sync or async), since the post-crash assumption would + * be that such transactions failed anyway. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/clog.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/clog.h" +#include "access/slru.h" +#include "access/transam.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "storage/proc.h" +#include "storage/sync.h" + +/* + * Defines for CLOG page sizes. A page is the same BLCKSZ as is used + * everywhere else in Postgres. + * + * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, + * CLOG page numbering also wraps around at 0xFFFFFFFF/CLOG_XACTS_PER_PAGE, + * and CLOG segment numbering at + * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no + * explicit notice of that fact in this module, except when comparing segment + * and page numbers in TruncateCLOG (see CLOGPagePrecedes). 
+ */ + +/* We need two bits per xact, so four xacts fit in a byte */ +#define CLOG_BITS_PER_XACT 2 +#define CLOG_XACTS_PER_BYTE 4 +#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE) +#define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1) + +#define TransactionIdToPage(xid) ((xid) / (TransactionId) CLOG_XACTS_PER_PAGE) +#define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) +#define TransactionIdToByte(xid) (TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE) +#define TransactionIdToBIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_BYTE) + +/* We store the latest async LSN for each group of transactions */ +#define CLOG_XACTS_PER_LSN_GROUP 32 /* keep this a power of 2 */ +#define CLOG_LSNS_PER_PAGE (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP) + +#define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \ + ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP) + +/* + * The number of subtransactions below which we consider to apply clog group + * update optimization. Testing reveals that the number higher than this can + * hurt performance. + */ +#define THRESHOLD_SUBTRANS_CLOG_OPT 5 + +/* + * Link to shared-memory data structures for CLOG control + */ +static SlruCtlData XactCtlData; + +#define XactCtl (&XactCtlData) + + +static int ZeroCLOGPage(int pageno, bool writeXlog); +static bool CLOGPagePrecedes(int page1, int page2); +static void WriteZeroPageXlogRec(int pageno); +static void WriteTruncateXlogRec(int pageno, TransactionId oldestXact, + Oid oldestXactDb); +static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, XidStatus status, + XLogRecPtr lsn, int pageno, + bool all_xact_same_page); +static void TransactionIdSetStatusBit(TransactionId xid, XidStatus status, + XLogRecPtr lsn, int slotno); +static void set_status_by_pages(int nsubxids, TransactionId *subxids, + XidStatus status, XLogRecPtr lsn); +static bool TransactionGroupUpdateXidStatus(TransactionId xid, + XidStatus status, XLogRecPtr lsn, int pageno); +static void TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids, + TransactionId *subxids, XidStatus status, + XLogRecPtr lsn, int pageno); + + +/* + * TransactionIdSetTreeStatus + * + * Record the final state of transaction entries in the commit log for + * a transaction and its subtransaction tree. Take care to ensure this is + * efficient, and as atomic as possible. + * + * xid is a single xid to set status for. This will typically be + * the top level transactionid for a top level commit or abort. It can + * also be a subtransaction when we record transaction aborts. + * + * subxids is an array of xids of length nsubxids, representing subtransactions + * in the tree of xid. In various cases nsubxids may be zero. + * + * lsn must be the WAL location of the commit record when recording an async + * commit. For a synchronous commit it can be InvalidXLogRecPtr, since the + * caller guarantees the commit record is already flushed in that case. It + * should be InvalidXLogRecPtr for abort cases, too. + * + * In the commit case, atomicity is limited by whether all the subxids are in + * the same CLOG page as xid. If they all are, then the lock will be grabbed + * only once, and the status will be set to committed directly. Otherwise + * we must + * 1. set sub-committed all subxids that are not on the same page as the + * main xid + * 2. atomically set committed the main xid and the subxids on the same page + * 3. 
go over the first bunch again and set them committed + * Note that as far as concurrent checkers are concerned, main transaction + * commit as a whole is still atomic. + * + * Example: + * TransactionId t commits and has subxids t1, t2, t3, t4 + * t is on page p1, t1 is also on p1, t2 and t3 are on p2, t4 is on p3 + * 1. update pages2-3: + * page2: set t2,t3 as sub-committed + * page3: set t4 as sub-committed + * 2. update page1: + * set t1 as sub-committed, + * then set t as committed, + then set t1 as committed + * 3. update pages2-3: + * page2: set t2,t3 as committed + * page3: set t4 as committed + * + * NB: this is a low-level routine and is NOT the preferred entry point + * for most uses; functions in transam.c are the intended callers. + * + * XXX Think about issuing POSIX_FADV_WILLNEED on pages that we will need, + * but aren't yet in cache, as well as hinting pages not to fall out of + * cache yet. + */ +void +TransactionIdSetTreeStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, XidStatus status, XLogRecPtr lsn) +{ + int pageno = TransactionIdToPage(xid); /* get page of parent */ + int i; + + Assert(status == TRANSACTION_STATUS_COMMITTED || + status == TRANSACTION_STATUS_ABORTED); + + /* + * See how many subxids, if any, are on the same page as the parent, if + * any. + */ + for (i = 0; i < nsubxids; i++) + { + if (TransactionIdToPage(subxids[i]) != pageno) + break; + } + + /* + * Do all items fit on a single page? + */ + if (i == nsubxids) + { + /* + * Set the parent and all subtransactions in a single call + */ + TransactionIdSetPageStatus(xid, nsubxids, subxids, status, lsn, + pageno, true); + } + else + { + int nsubxids_on_first_page = i; + + /* + * If this is a commit then we care about doing this correctly (i.e. + * using the subcommitted intermediate status). By here, we know + * we're updating more than one page of clog, so we must mark entries + * that are *not* on the first page so that they show as subcommitted + * before we then return to update the status to fully committed. + * + * To avoid touching the first page twice, skip marking subcommitted + * for the subxids on that first page. + */ + if (status == TRANSACTION_STATUS_COMMITTED) + set_status_by_pages(nsubxids - nsubxids_on_first_page, + subxids + nsubxids_on_first_page, + TRANSACTION_STATUS_SUB_COMMITTED, lsn); + + /* + * Now set the parent and subtransactions on same page as the parent, + * if any + */ + pageno = TransactionIdToPage(xid); + TransactionIdSetPageStatus(xid, nsubxids_on_first_page, subxids, status, + lsn, pageno, false); + + /* + * Now work through the rest of the subxids one clog page at a time, + * starting from the second page onwards, like we did above. + */ + set_status_by_pages(nsubxids - nsubxids_on_first_page, + subxids + nsubxids_on_first_page, + status, lsn); + } +} + +/* + * Helper for TransactionIdSetTreeStatus: set the status for a bunch of + * transactions, chunking in the separate CLOG pages involved. We never + * pass the whole transaction tree to this function, only subtransactions + * that are on different pages to the top level transaction id. 
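A standalone illustration of the page chunking that the function below performs
(a minimal sketch assuming the default BLCKSZ of 8192, so CLOG_XACTS_PER_PAGE
is 32768; the XID values are made up):

    #include <stdio.h>
    #include <stdint.h>

    #define CLOG_XACTS_PER_PAGE 32768  /* BLCKSZ (8192) * CLOG_XACTS_PER_BYTE (4) */

    static uint32_t xid_to_page(uint32_t xid)
    {
        return xid / CLOG_XACTS_PER_PAGE;
    }

    int main(void)
    {
        /* hypothetical, page-ordered subtransaction XIDs */
        uint32_t subxids[] = {32760, 32765, 32770, 32790, 98400};
        int      nsubxids = 5;
        int      i = 0;

        while (i < nsubxids)
        {
            uint32_t pageno = xid_to_page(subxids[i]);
            int      start = i;

            /* advance over every subxid that falls on the same CLOG page */
            while (i < nsubxids && xid_to_page(subxids[i]) == pageno)
                i++;

            printf("page %u: %d xid(s) starting at subxids[%d]\n",
                   pageno, i - start, start);
        }
        return 0;
    }

Each group printed above corresponds to one TransactionIdSetPageStatus() call
in the loop below.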
+ */ +static void +set_status_by_pages(int nsubxids, TransactionId *subxids, + XidStatus status, XLogRecPtr lsn) +{ + int pageno = TransactionIdToPage(subxids[0]); + int offset = 0; + int i = 0; + + Assert(nsubxids > 0); /* else the pageno fetch above is unsafe */ + + while (i < nsubxids) + { + int num_on_page = 0; + int nextpageno; + + do + { + nextpageno = TransactionIdToPage(subxids[i]); + if (nextpageno != pageno) + break; + num_on_page++; + i++; + } while (i < nsubxids); + + TransactionIdSetPageStatus(InvalidTransactionId, + num_on_page, subxids + offset, + status, lsn, pageno, false); + offset = i; + pageno = nextpageno; + } +} + +/* + * Record the final state of transaction entries in the commit log for all + * entries on a single page. Atomic only on this page. + */ +static void +TransactionIdSetPageStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, XidStatus status, + XLogRecPtr lsn, int pageno, + bool all_xact_same_page) +{ + /* Can't use group update when PGPROC overflows. */ + StaticAssertStmt(THRESHOLD_SUBTRANS_CLOG_OPT <= PGPROC_MAX_CACHED_SUBXIDS, + "group clog threshold less than PGPROC cached subxids"); + + /* + * When there is contention on XactSLRULock, we try to group multiple + * updates; a single leader process will perform transaction status + * updates for multiple backends so that the number of times XactSLRULock + * needs to be acquired is reduced. + * + * For this optimization to be safe, the XID and subxids in MyProc must be + * the same as the ones for which we're setting the status. Check that + * this is the case. + * + * For this optimization to be efficient, we shouldn't have too many + * sub-XIDs and all of the XIDs for which we're adjusting clog should be + * on the same page. Check those conditions, too. + */ + if (all_xact_same_page && xid == MyProc->xid && + nsubxids <= THRESHOLD_SUBTRANS_CLOG_OPT && + nsubxids == MyProc->subxidStatus.count && + (nsubxids == 0 || + memcmp(subxids, MyProc->subxids.xids, + nsubxids * sizeof(TransactionId)) == 0)) + { + /* + * If we can immediately acquire XactSLRULock, we update the status of + * our own XID and release the lock. If not, try use group XID + * update. If that doesn't work out, fall back to waiting for the + * lock to perform an update for this transaction only. + */ + if (LWLockConditionalAcquire(XactSLRULock, LW_EXCLUSIVE)) + { + /* Got the lock without waiting! Do the update. */ + TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status, + lsn, pageno); + LWLockRelease(XactSLRULock); + return; + } + else if (TransactionGroupUpdateXidStatus(xid, status, lsn, pageno)) + { + /* Group update mechanism has done the work. */ + return; + } + + /* Fall through only if update isn't done yet. */ + } + + /* Group update not applicable, or couldn't accept this page number. */ + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status, + lsn, pageno); + LWLockRelease(XactSLRULock); +} + +/* + * Record the final state of transaction entry in the commit log + * + * We don't do any locking here; caller must handle that. 
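When the group-update preconditions hold, TransactionIdSetPageStatus above
first tries to take XactSLRULock without waiting, then falls back to the
group-update path, and finally blocks on the lock. A plain-pthreads analogue of
just that decision structure (an editor's sketch: try_group_update() is a
made-up stand-in for TransactionGroupUpdateXidStatus(), and none of the
server's locking primitives are involved):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t slru_lock = PTHREAD_MUTEX_INITIALIZER;

    /* stand-in for the group-update path; always declines in this sketch */
    static bool try_group_update(void)
    {
        return false;
    }

    static void set_status(unsigned xid)
    {
        if (pthread_mutex_trylock(&slru_lock) == 0)
        {
            /* got the lock without waiting: do the update ourselves */
            printf("xid %u: direct update\n", xid);
            pthread_mutex_unlock(&slru_lock);
            return;
        }

        if (try_group_update())
            return;             /* a leader did the work for us */

        /* fall back to waiting for the lock and updating alone */
        pthread_mutex_lock(&slru_lock);
        printf("xid %u: update after waiting\n", xid);
        pthread_mutex_unlock(&slru_lock);
    }

    int main(void)
    {
        set_status(1000);
        return 0;
    }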
+ */ +static void +TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids, + TransactionId *subxids, XidStatus status, + XLogRecPtr lsn, int pageno) +{ + int slotno; + int i; + + Assert(status == TRANSACTION_STATUS_COMMITTED || + status == TRANSACTION_STATUS_ABORTED || + (status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid(xid))); + Assert(LWLockHeldByMeInMode(XactSLRULock, LW_EXCLUSIVE)); + + /* + * If we're doing an async commit (ie, lsn is valid), then we must wait + * for any active write on the page slot to complete. Otherwise our + * update could reach disk in that write, which will not do since we + * mustn't let it reach disk until we've done the appropriate WAL flush. + * But when lsn is invalid, it's OK to scribble on a page while it is + * write-busy, since we don't care if the update reaches disk sooner than + * we think. + */ + slotno = SimpleLruReadPage(XactCtl, pageno, XLogRecPtrIsInvalid(lsn), xid); + + /* + * Set the main transaction id, if any. + * + * If we update more than one xid on this page while it is being written + * out, we might find that some of the bits go to disk and others don't. + * If we are updating commits on the page with the top-level xid that + * could break atomicity, so we subcommit the subxids first before we mark + * the top-level commit. + */ + if (TransactionIdIsValid(xid)) + { + /* Subtransactions first, if needed ... */ + if (status == TRANSACTION_STATUS_COMMITTED) + { + for (i = 0; i < nsubxids; i++) + { + Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i])); + TransactionIdSetStatusBit(subxids[i], + TRANSACTION_STATUS_SUB_COMMITTED, + lsn, slotno); + } + } + + /* ... then the main transaction */ + TransactionIdSetStatusBit(xid, status, lsn, slotno); + } + + /* Set the subtransactions */ + for (i = 0; i < nsubxids; i++) + { + Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i])); + TransactionIdSetStatusBit(subxids[i], status, lsn, slotno); + } + + XactCtl->shared->page_dirty[slotno] = true; +} + +/* + * When we cannot immediately acquire XactSLRULock in exclusive mode at + * commit time, add ourselves to a list of processes that need their XIDs + * status update. The first process to add itself to the list will acquire + * XactSLRULock in exclusive mode and set transaction status as required + * on behalf of all group members. This avoids a great deal of contention + * around XactSLRULock when many processes are trying to commit at once, + * since the lock need not be repeatedly handed off from one committing + * process to the next. + * + * Returns true when transaction status has been updated in clog; returns + * false if we decided against applying the optimization because the page + * number we need to update differs from those processes already waiting. + */ +static bool +TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, + XLogRecPtr lsn, int pageno) +{ + volatile PROC_HDR *procglobal = ProcGlobal; + PGPROC *proc = MyProc; + uint32 nextidx; + uint32 wakeidx; + + /* We should definitely have an XID whose status needs to be updated. */ + Assert(TransactionIdIsValid(xid)); + + /* + * Add ourselves to the list of processes needing a group XID status + * update. 
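The list push in the loop that follows is a compare-and-swap retry over an
index-based linked list. A self-contained C11 illustration of that idiom (an
editor's sketch with made-up slot numbers; it is not the server's PGPROC
machinery):

    #include <stdatomic.h>
    #include <stdio.h>

    #define INVALID_SLOT (-1)

    /* tiny stand-in for the proc array: each slot carries a "next" link */
    static int          next_link[8];
    static _Atomic int  list_head = INVALID_SLOT;

    /* push a slot onto the front of the list without taking any lock */
    static void push_slot(int slot)
    {
        int head = atomic_load(&list_head);

        for (;;)
        {
            next_link[slot] = head;     /* publish our view of the old head */
            if (atomic_compare_exchange_weak(&list_head, &head, slot))
                break;                  /* success: we are the new head */
            /* on failure, head now holds the current value; retry */
        }
    }

    int main(void)
    {
        push_slot(3);
        push_slot(5);

        for (int s = atomic_load(&list_head); s != INVALID_SLOT; s = next_link[s])
            printf("slot %d\n", s);     /* prints 5, then 3 */
        return 0;
    }

The real code additionally declines (returning false) when the page it needs to
update differs from the page the current group is updating, as described below.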
+ */ + proc->clogGroupMember = true; + proc->clogGroupMemberXid = xid; + proc->clogGroupMemberXidStatus = status; + proc->clogGroupMemberPage = pageno; + proc->clogGroupMemberLsn = lsn; + + nextidx = pg_atomic_read_u32(&procglobal->clogGroupFirst); + + while (true) + { + /* + * Add the proc to list, if the clog page where we need to update the + * current transaction status is same as group leader's clog page. + * + * There is a race condition here, which is that after doing the below + * check and before adding this proc's clog update to a group, the + * group leader might have already finished the group update for this + * page and becomes group leader of another group. This will lead to a + * situation where a single group can have different clog page + * updates. This isn't likely and will still work, just maybe a bit + * less efficiently. + */ + if (nextidx != INVALID_PGPROCNO && + ProcGlobal->allProcs[nextidx].clogGroupMemberPage != proc->clogGroupMemberPage) + { + /* + * Ensure that this proc is not a member of any clog group that + * needs an XID status update. + */ + proc->clogGroupMember = false; + pg_atomic_write_u32(&proc->clogGroupNext, INVALID_PGPROCNO); + return false; + } + + pg_atomic_write_u32(&proc->clogGroupNext, nextidx); + + if (pg_atomic_compare_exchange_u32(&procglobal->clogGroupFirst, + &nextidx, + (uint32) proc->pgprocno)) + break; + } + + /* + * If the list was not empty, the leader will update the status of our + * XID. It is impossible to have followers without a leader because the + * first process that has added itself to the list will always have + * nextidx as INVALID_PGPROCNO. + */ + if (nextidx != INVALID_PGPROCNO) + { + int extraWaits = 0; + + /* Sleep until the leader updates our XID status. */ + pgstat_report_wait_start(WAIT_EVENT_XACT_GROUP_UPDATE); + for (;;) + { + /* acts as a read barrier */ + PGSemaphoreLock(proc->sem); + if (!proc->clogGroupMember) + break; + extraWaits++; + } + pgstat_report_wait_end(); + + Assert(pg_atomic_read_u32(&proc->clogGroupNext) == INVALID_PGPROCNO); + + /* Fix semaphore count for any absorbed wakeups */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(proc->sem); + return true; + } + + /* We are the leader. Acquire the lock on behalf of everyone. */ + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + + /* + * Now that we've got the lock, clear the list of processes waiting for + * group XID status update, saving a pointer to the head of the list. + * Trying to pop elements one at a time could lead to an ABA problem. + */ + nextidx = pg_atomic_exchange_u32(&procglobal->clogGroupFirst, + INVALID_PGPROCNO); + + /* Remember head of list so we can perform wakeups after dropping lock. */ + wakeidx = nextidx; + + /* Walk the list and update the status of all XIDs. */ + while (nextidx != INVALID_PGPROCNO) + { + PGPROC *proc = &ProcGlobal->allProcs[nextidx]; + + /* + * Transactions with more than THRESHOLD_SUBTRANS_CLOG_OPT sub-XIDs + * should not use group XID status update mechanism. + */ + Assert(proc->subxidStatus.count <= THRESHOLD_SUBTRANS_CLOG_OPT); + + TransactionIdSetPageStatusInternal(proc->clogGroupMemberXid, + proc->subxidStatus.count, + proc->subxids.xids, + proc->clogGroupMemberXidStatus, + proc->clogGroupMemberLsn, + proc->clogGroupMemberPage); + + /* Move to next proc in list. */ + nextidx = pg_atomic_read_u32(&proc->clogGroupNext); + } + + /* We're done with the lock now. */ + LWLockRelease(XactSLRULock); + + /* + * Now that we've released the lock, go back and wake everybody up. 
We + * don't do this under the lock so as to keep lock hold times to a + * minimum. + */ + while (wakeidx != INVALID_PGPROCNO) + { + PGPROC *proc = &ProcGlobal->allProcs[wakeidx]; + + wakeidx = pg_atomic_read_u32(&proc->clogGroupNext); + pg_atomic_write_u32(&proc->clogGroupNext, INVALID_PGPROCNO); + + /* ensure all previous writes are visible before follower continues. */ + pg_write_barrier(); + + proc->clogGroupMember = false; + + if (proc != MyProc) + PGSemaphoreUnlock(proc->sem); + } + + return true; +} + +/* + * Sets the commit status of a single transaction. + * + * Must be called with XactSLRULock held + */ +static void +TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, int slotno) +{ + int byteno = TransactionIdToByte(xid); + int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; + char *byteptr; + char byteval; + char curval; + + byteptr = XactCtl->shared->page_buffer[slotno] + byteno; + curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK; + + /* + * When replaying transactions during recovery we still need to perform + * the two phases of subcommit and then commit. However, some transactions + * are already correctly marked, so we just treat those as a no-op which + * allows us to keep the following Assert as restrictive as possible. + */ + if (InRecovery && status == TRANSACTION_STATUS_SUB_COMMITTED && + curval == TRANSACTION_STATUS_COMMITTED) + return; + + /* + * Current state change should be from 0 or subcommitted to target state + * or we should already be there when replaying changes during recovery. + */ + Assert(curval == 0 || + (curval == TRANSACTION_STATUS_SUB_COMMITTED && + status != TRANSACTION_STATUS_IN_PROGRESS) || + curval == status); + + /* note this assumes exclusive access to the clog page */ + byteval = *byteptr; + byteval &= ~(((1 << CLOG_BITS_PER_XACT) - 1) << bshift); + byteval |= (status << bshift); + *byteptr = byteval; + + /* + * Update the group LSN if the transaction completion LSN is higher. + * + * Note: lsn will be invalid when supplied during InRecovery processing, + * so we don't need to do anything special to avoid LSN updates during + * recovery. After recovery completes the next clog change will set the + * LSN correctly. + */ + if (!XLogRecPtrIsInvalid(lsn)) + { + int lsnindex = GetLSNIndex(slotno, xid); + + if (XactCtl->shared->group_lsn[lsnindex] < lsn) + XactCtl->shared->group_lsn[lsnindex] = lsn; + } +} + +/* + * Interrogate the state of a transaction in the commit log. + * + * Aside from the actual commit status, this function returns (into *lsn) + * an LSN that is late enough to be able to guarantee that if we flush up to + * that LSN then we will have flushed the transaction's commit record to disk. + * The result is not necessarily the exact LSN of the transaction's commit + * record! For example, for long-past transactions (those whose clog pages + * already migrated to disk), we'll return InvalidXLogRecPtr. Also, because + * we group transactions on the same clog page to conserve storage, we might + * return the LSN of a later transaction that falls into the same group. + * + * NB: this is a low-level routine and is NOT the preferred entry point + * for most uses; TransactionLogFetch() in transam.c is the intended caller. 
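The byte and bit-shift arithmetic used by TransactionIdSetStatusBit above and
TransactionIdGetStatus below can be exercised outside the server. A minimal
standalone sketch, assuming the default BLCKSZ of 8192 and using 0x01 as a
stand-in status value (the real status codes are defined in clog.h and are not
part of this excerpt):

    #include <stdio.h>
    #include <stdint.h>

    #define CLOG_BITS_PER_XACT   2
    #define CLOG_XACTS_PER_BYTE  4
    #define CLOG_XACTS_PER_PAGE  32768              /* BLCKSZ (8192) * 4 */
    #define CLOG_XACT_BITMASK    ((1 << CLOG_BITS_PER_XACT) - 1)

    int main(void)
    {
        uint32_t xid = 1000003;                     /* arbitrary example XID */
        uint32_t pageno = xid / CLOG_XACTS_PER_PAGE;
        uint32_t byteno = (xid % CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_BYTE;
        uint32_t bshift = (xid % CLOG_XACTS_PER_BYTE) * CLOG_BITS_PER_XACT;

        unsigned char page[8192] = {0};             /* one zeroed CLOG page */
        unsigned char status = 0x01;                /* stand-in for "committed" */
        unsigned char readback;

        /* set: clear the xact's two bits, then OR in the new status */
        page[byteno] &= ~(CLOG_XACT_BITMASK << bshift);
        page[byteno] |= status << bshift;

        /* get: shift the byte down and mask off the two bits */
        readback = (page[byteno] >> bshift) & CLOG_XACT_BITMASK;

        printf("xid %u -> page %u, byte %u, shift %u, status read back %u\n",
               xid, pageno, byteno, bshift, readback);
        return 0;
    }

With these numbers the XID lands on page 30, byte 4240, bit shift 6, so four
transactions share each byte exactly as the defines earlier in this file lay
out.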
+ */ +XidStatus +TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn) +{ + int pageno = TransactionIdToPage(xid); + int byteno = TransactionIdToByte(xid); + int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; + int slotno; + int lsnindex; + char *byteptr; + XidStatus status; + + /* lock is acquired by SimpleLruReadPage_ReadOnly */ + + slotno = SimpleLruReadPage_ReadOnly(XactCtl, pageno, xid); + byteptr = XactCtl->shared->page_buffer[slotno] + byteno; + + status = (*byteptr >> bshift) & CLOG_XACT_BITMASK; + + lsnindex = GetLSNIndex(slotno, xid); + *lsn = XactCtl->shared->group_lsn[lsnindex]; + + LWLockRelease(XactSLRULock); + + return status; +} + +/* + * Number of shared CLOG buffers. + * + * On larger multi-processor systems, it is possible to have many CLOG page + * requests in flight at one time which could lead to disk access for CLOG + * page if the required page is not found in memory. Testing revealed that we + * can get the best performance by having 128 CLOG buffers, more than that it + * doesn't improve performance. + * + * Unconditionally keeping the number of CLOG buffers to 128 did not seem like + * a good idea, because it would increase the minimum amount of shared memory + * required to start, which could be a problem for people running very small + * configurations. The following formula seems to represent a reasonable + * compromise: people with very low values for shared_buffers will get fewer + * CLOG buffers as well, and everyone else will get 128. + */ +Size +CLOGShmemBuffers(void) +{ + return Min(128, Max(4, NBuffers / 512)); +} + +/* + * Initialization of shared memory for CLOG + */ +Size +CLOGShmemSize(void) +{ + return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE); +} + +void +CLOGShmemInit(void) +{ + XactCtl->PagePrecedes = CLOGPagePrecedes; + SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE, + XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER, + SYNC_HANDLER_CLOG); + SlruPagePrecedesUnitTests(XactCtl, CLOG_XACTS_PER_PAGE); +} + +/* + * This func must be called ONCE on system install. It creates + * the initial CLOG segment. (The CLOG directory is assumed to + * have been created by initdb, and CLOGShmemInit must have been + * called already.) + */ +void +BootStrapCLOG(void) +{ + int slotno; + + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + + /* Create and zero the first page of the commit log */ + slotno = ZeroCLOGPage(0, false); + + /* Make sure it's written out */ + SimpleLruWritePage(XactCtl, slotno); + Assert(!XactCtl->shared->page_dirty[slotno]); + + LWLockRelease(XactSLRULock); +} + +/* + * Initialize (or reinitialize) a page of CLOG to zeroes. + * If writeXlog is true, also emit an XLOG record saying we did this. + * + * The page is not actually written, just set up in shared memory. + * The slot number of the new page is returned. + * + * Control lock must be held at entry, and will be held at exit. + */ +static int +ZeroCLOGPage(int pageno, bool writeXlog) +{ + int slotno; + + slotno = SimpleLruZeroPage(XactCtl, pageno); + + if (writeXlog) + WriteZeroPageXlogRec(pageno); + + return slotno; +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup, + * after StartupXLOG has initialized ShmemVariableCache->nextXid. + */ +void +StartupCLOG(void) +{ + TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + int pageno = TransactionIdToPage(xid); + + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + + /* + * Initialize our idea of the latest page number. 
+ */ + XactCtl->shared->latest_page_number = pageno; + + LWLockRelease(XactSLRULock); +} + +/* + * This must be called ONCE at the end of startup/recovery. + */ +void +TrimCLOG(void) +{ + TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + int pageno = TransactionIdToPage(xid); + + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + + /* + * Zero out the remainder of the current clog page. Under normal + * circumstances it should be zeroes already, but it seems at least + * theoretically possible that XLOG replay will have settled on a nextXID + * value that is less than the last XID actually used and marked by the + * previous database lifecycle (since subtransaction commit writes clog + * but makes no WAL entry). Let's just be safe. (We need not worry about + * pages beyond the current one, since those will be zeroed when first + * used. For the same reason, there is no need to do anything when + * nextXid is exactly at a page boundary; and it's likely that the + * "current" page doesn't exist yet in that case.) + */ + if (TransactionIdToPgIndex(xid) != 0) + { + int byteno = TransactionIdToByte(xid); + int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; + int slotno; + char *byteptr; + + slotno = SimpleLruReadPage(XactCtl, pageno, false, xid); + byteptr = XactCtl->shared->page_buffer[slotno] + byteno; + + /* Zero so-far-unused positions in the current byte */ + *byteptr &= (1 << bshift) - 1; + /* Zero the rest of the page */ + MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1); + + XactCtl->shared->page_dirty[slotno] = true; + } + + LWLockRelease(XactSLRULock); +} + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + */ +void +CheckPointCLOG(void) +{ + /* + * Write dirty CLOG pages to disk. This may result in sync requests + * queued for later handling by ProcessSyncRequests(), as part of the + * checkpoint. + */ + TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true); + SimpleLruWriteAll(XactCtl, true); + TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true); +} + + +/* + * Make sure that CLOG has room for a newly-allocated XID. + * + * NB: this is called while holding XidGenLock. We want it to be very fast + * most of the time; even when it's not so fast, no actual I/O need happen + * unless we're forced to write out a dirty clog or xlog page to make room + * in shared memory. + */ +void +ExtendCLOG(TransactionId newestXact) +{ + int pageno; + + /* + * No work except at first XID of a page. But beware: just after + * wraparound, the first XID of page zero is FirstNormalTransactionId. + */ + if (TransactionIdToPgIndex(newestXact) != 0 && + !TransactionIdEquals(newestXact, FirstNormalTransactionId)) + return; + + pageno = TransactionIdToPage(newestXact); + + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + + /* Zero the page and make an XLOG entry about it */ + ZeroCLOGPage(pageno, true); + + LWLockRelease(XactSLRULock); +} + + +/* + * Remove all CLOG segments before the one holding the passed transaction ID + * + * Before removing any CLOG data, we must flush XLOG to disk, to ensure + * that any recently-emitted FREEZE_PAGE records have reached disk; otherwise + * a crash and restart might leave us with some unfrozen tuples referencing + * removed CLOG data. We choose to emit a special TRUNCATE XLOG record too. + * Replaying the deletion from XLOG is not critical, since the files could + * just as well be removed later, but doing so prevents a long-running hot + * standby server from acquiring an unreasonably bloated CLOG directory. 
+ * + * Since CLOG segments hold a large number of transactions, the opportunity to + * actually remove a segment is fairly rare, and so it seems best not to do + * the XLOG flush unless we have confirmed that there is a removable segment. + */ +void +TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid) +{ + int cutoffPage; + + /* + * The cutoff point is the start of the segment containing oldestXact. We + * pass the *page* containing oldestXact to SimpleLruTruncate. + */ + cutoffPage = TransactionIdToPage(oldestXact); + + /* Check to see if there's any files that could be removed */ + if (!SlruScanDirectory(XactCtl, SlruScanDirCbReportPresence, &cutoffPage)) + return; /* nothing to remove */ + + /* + * Advance oldestClogXid before truncating clog, so concurrent xact status + * lookups can ensure they don't attempt to access truncated-away clog. + * + * It's only necessary to do this if we will actually truncate away clog + * pages. + */ + AdvanceOldestClogXid(oldestXact); + + /* + * Write XLOG record and flush XLOG to disk. We record the oldest xid + * we're keeping information about here so we can ensure that it's always + * ahead of clog truncation in case we crash, and so a standby finds out + * the new valid xid before the next checkpoint. + */ + WriteTruncateXlogRec(cutoffPage, oldestXact, oldestxid_datoid); + + /* Now we can remove the old CLOG segment(s) */ + SimpleLruTruncate(XactCtl, cutoffPage); +} + + +/* + * Decide whether a CLOG page number is "older" for truncation purposes. + * + * We need to use comparison of TransactionIds here in order to do the right + * thing with wraparound XID arithmetic. However, TransactionIdPrecedes() + * would get weird about permanent xact IDs. So, offset both such that xid1, + * xid2, and xid2 + CLOG_XACTS_PER_PAGE - 1 are all normal XIDs; this offset + * is relevant to page 0 and to the page preceding page 0. + * + * The page containing oldestXact-2^31 is the important edge case. The + * portion of that page equaling or following oldestXact-2^31 is expendable, + * but the portion preceding oldestXact-2^31 is not. When oldestXact-2^31 is + * the first XID of a page and segment, the entire page and segment is + * expendable, and we could truncate the segment. Recognizing that case would + * require making oldestXact, not just the page containing oldestXact, + * available to this callback. The benefit would be rare and small, so we + * don't optimize that edge case. + */ +static bool +CLOGPagePrecedes(int page1, int page2) +{ + TransactionId xid1; + TransactionId xid2; + + xid1 = ((TransactionId) page1) * CLOG_XACTS_PER_PAGE; + xid1 += FirstNormalTransactionId + 1; + xid2 = ((TransactionId) page2) * CLOG_XACTS_PER_PAGE; + xid2 += FirstNormalTransactionId + 1; + + return (TransactionIdPrecedes(xid1, xid2) && + TransactionIdPrecedes(xid1, xid2 + CLOG_XACTS_PER_PAGE - 1)); +} + + +/* + * Write a ZEROPAGE xlog record + */ +static void +WriteZeroPageXlogRec(int pageno) +{ + XLogBeginInsert(); + XLogRegisterData((char *) (&pageno), sizeof(int)); + (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE); +} + +/* + * Write a TRUNCATE xlog record + * + * We must flush the xlog record to disk before returning --- see notes + * in TruncateCLOG(). 
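CLOGPagePrecedes above leans on TransactionIdPrecedes(), whose comparison of
normal XIDs is modulo 2^31. A self-contained sketch of the page test, ignoring
the special cases for permanent XIDs and assuming the default BLCKSZ of 8192
(so page numbers wrap at 0xFFFFFFFF / 32768 = 131071):

    #include <stdio.h>
    #include <stdint.h>

    #define CLOG_XACTS_PER_PAGE         32768   /* BLCKSZ (8192) * 4 */
    #define FirstNormalTransactionId    3

    /* modulo-2^31 ordering of normal XIDs (permanent-XID cases omitted) */
    static int xid_precedes(uint32_t xid1, uint32_t xid2)
    {
        return (int32_t) (xid1 - xid2) < 0;
    }

    /* mirrors CLOGPagePrecedes(): offset so all compared XIDs are normal */
    static int clog_page_precedes(uint32_t page1, uint32_t page2)
    {
        uint32_t xid1 = page1 * CLOG_XACTS_PER_PAGE + FirstNormalTransactionId + 1;
        uint32_t xid2 = page2 * CLOG_XACTS_PER_PAGE + FirstNormalTransactionId + 1;

        return xid_precedes(xid1, xid2) &&
               xid_precedes(xid1, xid2 + CLOG_XACTS_PER_PAGE - 1);
    }

    int main(void)
    {
        /* near wraparound, "older" is not the same as "numerically smaller" */
        printf("%d\n", clog_page_precedes(131070, 2));  /* 1: precedes */
        printf("%d\n", clog_page_precedes(2, 131070));  /* 0: does not */
        return 0;
    }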
+ */ +static void +WriteTruncateXlogRec(int pageno, TransactionId oldestXact, Oid oldestXactDb) +{ + XLogRecPtr recptr; + xl_clog_truncate xlrec; + + xlrec.pageno = pageno; + xlrec.oldestXact = oldestXact; + xlrec.oldestXactDb = oldestXactDb; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), sizeof(xl_clog_truncate)); + recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE); + XLogFlush(recptr); +} + +/* + * CLOG resource manager's routines + */ +void +clog_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + /* Backup blocks are not used in clog records */ + Assert(!XLogRecHasAnyBlockRefs(record)); + + if (info == CLOG_ZEROPAGE) + { + int pageno; + int slotno; + + memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + + slotno = ZeroCLOGPage(pageno, false); + SimpleLruWritePage(XactCtl, slotno); + Assert(!XactCtl->shared->page_dirty[slotno]); + + LWLockRelease(XactSLRULock); + } + else if (info == CLOG_TRUNCATE) + { + xl_clog_truncate xlrec; + + memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_clog_truncate)); + + AdvanceOldestClogXid(xlrec.oldestXact); + + SimpleLruTruncate(XactCtl, xlrec.pageno); + } + else + elog(PANIC, "clog_redo: unknown op code %u", info); +} + +/* + * Entrypoint for sync.c to sync clog files. + */ +int +clogsyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(XactCtl, ftag, path); +} diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c new file mode 100644 index 0000000..4dc8d40 --- /dev/null +++ b/src/backend/access/transam/commit_ts.c @@ -0,0 +1,1035 @@ +/*------------------------------------------------------------------------- + * + * commit_ts.c + * PostgreSQL commit timestamp manager + * + * This module is a pg_xact-like system that stores the commit timestamp + * for each transaction. + * + * XLOG interactions: this module generates an XLOG record whenever a new + * CommitTs page is initialized to zeroes. Also, one XLOG record is + * generated for setting of values when the caller requests it; this allows + * us to support values coming from places other than transaction commit. + * Other writes of CommitTS come from recording of transaction commit in + * xact.c, which generates its own XLOG records for these events and will + * re-perform the status update on redo; so we need make no additional XLOG + * entry here. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/commit_ts.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/commit_ts.h" +#include "access/htup_details.h" +#include "access/slru.h" +#include "access/transam.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" +#include "catalog/pg_type.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "storage/shmem.h" +#include "utils/builtins.h" +#include "utils/snapmgr.h" +#include "utils/timestamp.h" + +/* + * Defines for CommitTs page sizes. A page is the same BLCKSZ as is used + * everywhere else in Postgres. + * + * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, + * CommitTs page numbering also wraps around at + * 0xFFFFFFFF/COMMIT_TS_XACTS_PER_PAGE, and CommitTs segment numbering at + * 0xFFFFFFFF/COMMIT_TS_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. 
We need take no + * explicit notice of that fact in this module, except when comparing segment + * and page numbers in TruncateCommitTs (see CommitTsPagePrecedes). + */ + +/* + * We need 8+2 bytes per xact. Note that enlarging this struct might mean + * the largest possible file name is more than 5 chars long; see + * SlruScanDirectory. + */ +typedef struct CommitTimestampEntry +{ + TimestampTz time; + RepOriginId nodeid; +} CommitTimestampEntry; + +#define SizeOfCommitTimestampEntry (offsetof(CommitTimestampEntry, nodeid) + \ + sizeof(RepOriginId)) + +#define COMMIT_TS_XACTS_PER_PAGE \ + (BLCKSZ / SizeOfCommitTimestampEntry) + +#define TransactionIdToCTsPage(xid) \ + ((xid) / (TransactionId) COMMIT_TS_XACTS_PER_PAGE) +#define TransactionIdToCTsEntry(xid) \ + ((xid) % (TransactionId) COMMIT_TS_XACTS_PER_PAGE) + +/* + * Link to shared-memory data structures for CommitTs control + */ +static SlruCtlData CommitTsCtlData; + +#define CommitTsCtl (&CommitTsCtlData) + +/* + * We keep a cache of the last value set in shared memory. + * + * This is also good place to keep the activation status. We keep this + * separate from the GUC so that the standby can activate the module if the + * primary has it active independently of the value of the GUC. + * + * This is protected by CommitTsLock. In some places, we use commitTsActive + * without acquiring the lock; where this happens, a comment explains the + * rationale for it. + */ +typedef struct CommitTimestampShared +{ + TransactionId xidLastCommit; + CommitTimestampEntry dataLastCommit; + bool commitTsActive; +} CommitTimestampShared; + +static CommitTimestampShared *commitTsShared; + + +/* GUC variable */ +bool track_commit_timestamp; + +static void SetXidCommitTsInPage(TransactionId xid, int nsubxids, + TransactionId *subxids, TimestampTz ts, + RepOriginId nodeid, int pageno); +static void TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts, + RepOriginId nodeid, int slotno); +static void error_commit_ts_disabled(void); +static int ZeroCommitTsPage(int pageno, bool writeXlog); +static bool CommitTsPagePrecedes(int page1, int page2); +static void ActivateCommitTs(void); +static void DeactivateCommitTs(void); +static void WriteZeroPageXlogRec(int pageno); +static void WriteTruncateXlogRec(int pageno, TransactionId oldestXid); + +/* + * TransactionTreeSetCommitTsData + * + * Record the final commit timestamp of transaction entries in the commit log + * for a transaction and its subtransaction tree, as efficiently as possible. + * + * xid is the top level transaction id. + * + * subxids is an array of xids of length nsubxids, representing subtransactions + * in the tree of xid. In various cases nsubxids may be zero. + * The reason why tracking just the parent xid commit timestamp is not enough + * is that the subtrans SLRU does not stay valid across crashes (it's not + * permanent) so we need to keep the information about them here. If the + * subtrans implementation changes in the future, we might want to revisit the + * decision of storing timestamp info for each subxid. + */ +void +TransactionTreeSetCommitTsData(TransactionId xid, int nsubxids, + TransactionId *subxids, TimestampTz timestamp, + RepOriginId nodeid) +{ + int i; + TransactionId headxid; + TransactionId newestXact; + + /* + * No-op if the module is not active. 
+ * + * An unlocked read here is fine, because in a standby (the only place + * where the flag can change in flight) this routine is only called by the + * recovery process, which is also the only process which can change the + * flag. + */ + if (!commitTsShared->commitTsActive) + return; + + /* + * Figure out the latest Xid in this batch: either the last subxid if + * there's any, otherwise the parent xid. + */ + if (nsubxids > 0) + newestXact = subxids[nsubxids - 1]; + else + newestXact = xid; + + /* + * We split the xids to set the timestamp to in groups belonging to the + * same SLRU page; the first element in each such set is its head. The + * first group has the main XID as the head; subsequent sets use the first + * subxid not on the previous page as head. This way, we only have to + * lock/modify each SLRU page once. + */ + headxid = xid; + i = 0; + for (;;) + { + int pageno = TransactionIdToCTsPage(headxid); + int j; + + for (j = i; j < nsubxids; j++) + { + if (TransactionIdToCTsPage(subxids[j]) != pageno) + break; + } + /* subxids[i..j] are on the same page as the head */ + + SetXidCommitTsInPage(headxid, j - i, subxids + i, timestamp, nodeid, + pageno); + + /* if we wrote out all subxids, we're done. */ + if (j >= nsubxids) + break; + + /* + * Set the new head and skip over it, as well as over the subxids we + * just wrote. + */ + headxid = subxids[j]; + i = j + 1; + } + + /* update the cached value in shared memory */ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + commitTsShared->xidLastCommit = xid; + commitTsShared->dataLastCommit.time = timestamp; + commitTsShared->dataLastCommit.nodeid = nodeid; + + /* and move forwards our endpoint, if needed */ + if (TransactionIdPrecedes(ShmemVariableCache->newestCommitTsXid, newestXact)) + ShmemVariableCache->newestCommitTsXid = newestXact; + LWLockRelease(CommitTsLock); +} + +/* + * Record the commit timestamp of transaction entries in the commit log for all + * entries on a single page. Atomic only on this page. + */ +static void +SetXidCommitTsInPage(TransactionId xid, int nsubxids, + TransactionId *subxids, TimestampTz ts, + RepOriginId nodeid, int pageno) +{ + int slotno; + int i; + + LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + + slotno = SimpleLruReadPage(CommitTsCtl, pageno, true, xid); + + TransactionIdSetCommitTs(xid, ts, nodeid, slotno); + for (i = 0; i < nsubxids; i++) + TransactionIdSetCommitTs(subxids[i], ts, nodeid, slotno); + + CommitTsCtl->shared->page_dirty[slotno] = true; + + LWLockRelease(CommitTsSLRULock); +} + +/* + * Sets the commit timestamp of a single transaction. + * + * Must be called with CommitTsSLRULock held + */ +static void +TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts, + RepOriginId nodeid, int slotno) +{ + int entryno = TransactionIdToCTsEntry(xid); + CommitTimestampEntry entry; + + Assert(TransactionIdIsNormal(xid)); + + entry.time = ts; + entry.nodeid = nodeid; + + memcpy(CommitTsCtl->shared->page_buffer[slotno] + + SizeOfCommitTimestampEntry * entryno, + &entry, SizeOfCommitTimestampEntry); +} + +/* + * Interrogate the commit timestamp of a transaction. + * + * The return value indicates whether a commit timestamp record was found for + * the given xid. The timestamp value is returned in *ts (which may not be + * null), and the origin node for the Xid is returned in *nodeid, if it's not + * null. 
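A worked example of the page and entry arithmetic the function below performs,
assuming the default BLCKSZ of 8192 so that the 10-byte CommitTimestampEntry
yields 819 entries per page (the XID is arbitrary):

    #include <stdio.h>
    #include <stdint.h>

    #define BLCKSZ                      8192
    #define SizeOfCommitTimestampEntry  10      /* 8-byte timestamp + 2-byte origin */
    #define COMMIT_TS_XACTS_PER_PAGE    (BLCKSZ / SizeOfCommitTimestampEntry)

    int main(void)
    {
        uint32_t xid = 1000003;                 /* arbitrary example XID */
        uint32_t pageno = xid / COMMIT_TS_XACTS_PER_PAGE;
        uint32_t entryno = xid % COMMIT_TS_XACTS_PER_PAGE;
        uint32_t offset = entryno * SizeOfCommitTimestampEntry;

        printf("entries per page: %d\n", COMMIT_TS_XACTS_PER_PAGE);
        printf("xid %u -> page %u, entry %u, byte offset %u\n",
               xid, pageno, entryno, offset);
        return 0;
    }

The byte offset is exactly where the memcpy() in TransactionIdGetCommitTsData()
reads the entry from the page buffer.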
+ */ +bool +TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts, + RepOriginId *nodeid) +{ + int pageno = TransactionIdToCTsPage(xid); + int entryno = TransactionIdToCTsEntry(xid); + int slotno; + CommitTimestampEntry entry; + TransactionId oldestCommitTsXid; + TransactionId newestCommitTsXid; + + if (!TransactionIdIsValid(xid)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("cannot retrieve commit timestamp for transaction %u", xid))); + else if (!TransactionIdIsNormal(xid)) + { + /* frozen and bootstrap xids are always committed far in the past */ + *ts = 0; + if (nodeid) + *nodeid = 0; + return false; + } + + LWLockAcquire(CommitTsLock, LW_SHARED); + + /* Error if module not enabled */ + if (!commitTsShared->commitTsActive) + error_commit_ts_disabled(); + + /* + * If we're asked for the cached value, return that. Otherwise, fall + * through to read from SLRU. + */ + if (commitTsShared->xidLastCommit == xid) + { + *ts = commitTsShared->dataLastCommit.time; + if (nodeid) + *nodeid = commitTsShared->dataLastCommit.nodeid; + + LWLockRelease(CommitTsLock); + return *ts != 0; + } + + oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid; + newestCommitTsXid = ShmemVariableCache->newestCommitTsXid; + /* neither is invalid, or both are */ + Assert(TransactionIdIsValid(oldestCommitTsXid) == TransactionIdIsValid(newestCommitTsXid)); + LWLockRelease(CommitTsLock); + + /* + * Return empty if the requested value is outside our valid range. + */ + if (!TransactionIdIsValid(oldestCommitTsXid) || + TransactionIdPrecedes(xid, oldestCommitTsXid) || + TransactionIdPrecedes(newestCommitTsXid, xid)) + { + *ts = 0; + if (nodeid) + *nodeid = InvalidRepOriginId; + return false; + } + + /* lock is acquired by SimpleLruReadPage_ReadOnly */ + slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid); + memcpy(&entry, + CommitTsCtl->shared->page_buffer[slotno] + + SizeOfCommitTimestampEntry * entryno, + SizeOfCommitTimestampEntry); + + *ts = entry.time; + if (nodeid) + *nodeid = entry.nodeid; + + LWLockRelease(CommitTsSLRULock); + return *ts != 0; +} + +/* + * Return the Xid of the latest committed transaction. (As far as this module + * is concerned, anyway; it's up to the caller to ensure the value is useful + * for its purposes.) + * + * ts and nodeid are filled with the corresponding data; they can be passed + * as NULL if not wanted. + */ +TransactionId +GetLatestCommitTsData(TimestampTz *ts, RepOriginId *nodeid) +{ + TransactionId xid; + + LWLockAcquire(CommitTsLock, LW_SHARED); + + /* Error if module not enabled */ + if (!commitTsShared->commitTsActive) + error_commit_ts_disabled(); + + xid = commitTsShared->xidLastCommit; + if (ts) + *ts = commitTsShared->dataLastCommit.time; + if (nodeid) + *nodeid = commitTsShared->dataLastCommit.nodeid; + LWLockRelease(CommitTsLock); + + return xid; +} + +static void +error_commit_ts_disabled(void) +{ + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not get commit timestamp data"), + RecoveryInProgress() ? 
+ errhint("Make sure the configuration parameter \"%s\" is set on the primary server.", + "track_commit_timestamp") : + errhint("Make sure the configuration parameter \"%s\" is set.", + "track_commit_timestamp"))); +} + +/* + * SQL-callable wrapper to obtain commit time of a transaction + */ +Datum +pg_xact_commit_timestamp(PG_FUNCTION_ARGS) +{ + TransactionId xid = PG_GETARG_TRANSACTIONID(0); + TimestampTz ts; + bool found; + + found = TransactionIdGetCommitTsData(xid, &ts, NULL); + + if (!found) + PG_RETURN_NULL(); + + PG_RETURN_TIMESTAMPTZ(ts); +} + + +/* + * pg_last_committed_xact + * + * SQL-callable wrapper to obtain some information about the latest + * committed transaction: transaction ID, timestamp and replication + * origin. + */ +Datum +pg_last_committed_xact(PG_FUNCTION_ARGS) +{ + TransactionId xid; + RepOriginId nodeid; + TimestampTz ts; + Datum values[3]; + bool nulls[3]; + TupleDesc tupdesc; + HeapTuple htup; + + /* and construct a tuple with our data */ + xid = GetLatestCommitTsData(&ts, &nodeid); + + /* + * Construct a tuple descriptor for the result row. This must match this + * function's pg_proc entry! + */ + tupdesc = CreateTemplateTupleDesc(3); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "xid", + XIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "timestamp", + TIMESTAMPTZOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "roident", + OIDOID, -1, 0); + tupdesc = BlessTupleDesc(tupdesc); + + if (!TransactionIdIsNormal(xid)) + { + memset(nulls, true, sizeof(nulls)); + } + else + { + values[0] = TransactionIdGetDatum(xid); + nulls[0] = false; + + values[1] = TimestampTzGetDatum(ts); + nulls[1] = false; + + values[2] = ObjectIdGetDatum((Oid) nodeid); + nulls[2] = false; + } + + htup = heap_form_tuple(tupdesc, values, nulls); + + PG_RETURN_DATUM(HeapTupleGetDatum(htup)); +} + +/* + * pg_xact_commit_timestamp_origin + * + * SQL-callable wrapper to obtain commit timestamp and replication origin + * of a given transaction. + */ +Datum +pg_xact_commit_timestamp_origin(PG_FUNCTION_ARGS) +{ + TransactionId xid = PG_GETARG_TRANSACTIONID(0); + RepOriginId nodeid; + TimestampTz ts; + Datum values[2]; + bool nulls[2]; + TupleDesc tupdesc; + HeapTuple htup; + bool found; + + found = TransactionIdGetCommitTsData(xid, &ts, &nodeid); + + /* + * Construct a tuple descriptor for the result row. This must match this + * function's pg_proc entry! + */ + tupdesc = CreateTemplateTupleDesc(2); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "timestamp", + TIMESTAMPTZOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "roident", + OIDOID, -1, 0); + tupdesc = BlessTupleDesc(tupdesc); + + if (!found) + { + memset(nulls, true, sizeof(nulls)); + } + else + { + values[0] = TimestampTzGetDatum(ts); + nulls[0] = false; + + values[1] = ObjectIdGetDatum((Oid) nodeid); + nulls[1] = false; + } + + htup = heap_form_tuple(tupdesc, values, nulls); + + PG_RETURN_DATUM(HeapTupleGetDatum(htup)); +} + +/* + * Number of shared CommitTS buffers. + * + * We use a very similar logic as for the number of CLOG buffers (except we + * scale up twice as fast with shared buffers, and the maximum is twice as + * high); see comments in CLOGShmemBuffers. 
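To make that comparison concrete, a small standalone calculation of both buffer
formulas for a few example NBuffers values (NBuffers counts 8 kB shared
buffers; the sample sizes are only illustrative):

    #include <stdio.h>

    #define Min(a, b)   ((a) < (b) ? (a) : (b))
    #define Max(a, b)   ((a) > (b) ? (a) : (b))

    int main(void)
    {
        /* example shared_buffers settings, as a number of 8 kB buffers */
        int nbuffers[] = {1024, 16384, 131072};     /* 8 MB, 128 MB, 1 GB */

        for (int i = 0; i < 3; i++)
        {
            int n = nbuffers[i];
            int clog_bufs = Min(128, Max(4, n / 512));
            int committs_bufs = Min(256, Max(4, n / 256));

            printf("NBuffers=%6d  clog=%3d  commit_ts=%3d\n",
                   n, clog_bufs, committs_bufs);
        }
        return 0;
    }

At 1 GB of shared buffers both formulas hit their caps (128 and 256 buffers
respectively); at 8 MB both bottom out at the minimum of 4.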
+ */ +Size +CommitTsShmemBuffers(void) +{ + return Min(256, Max(4, NBuffers / 256)); +} + +/* + * Shared memory sizing for CommitTs + */ +Size +CommitTsShmemSize(void) +{ + return SimpleLruShmemSize(CommitTsShmemBuffers(), 0) + + sizeof(CommitTimestampShared); +} + +/* + * Initialize CommitTs at system startup (postmaster start or standalone + * backend) + */ +void +CommitTsShmemInit(void) +{ + bool found; + + CommitTsCtl->PagePrecedes = CommitTsPagePrecedes; + SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0, + CommitTsSLRULock, "pg_commit_ts", + LWTRANCHE_COMMITTS_BUFFER, + SYNC_HANDLER_COMMIT_TS); + SlruPagePrecedesUnitTests(CommitTsCtl, COMMIT_TS_XACTS_PER_PAGE); + + commitTsShared = ShmemInitStruct("CommitTs shared", + sizeof(CommitTimestampShared), + &found); + + if (!IsUnderPostmaster) + { + Assert(!found); + + commitTsShared->xidLastCommit = InvalidTransactionId; + TIMESTAMP_NOBEGIN(commitTsShared->dataLastCommit.time); + commitTsShared->dataLastCommit.nodeid = InvalidRepOriginId; + commitTsShared->commitTsActive = false; + } + else + Assert(found); +} + +/* + * This function must be called ONCE on system install. + * + * (The CommitTs directory is assumed to have been created by initdb, and + * CommitTsShmemInit must have been called already.) + */ +void +BootStrapCommitTs(void) +{ + /* + * Nothing to do here at present, unlike most other SLRU modules; segments + * are created when the server is started with this module enabled. See + * ActivateCommitTs. + */ +} + +/* + * Initialize (or reinitialize) a page of CommitTs to zeroes. + * If writeXlog is true, also emit an XLOG record saying we did this. + * + * The page is not actually written, just set up in shared memory. + * The slot number of the new page is returned. + * + * Control lock must be held at entry, and will be held at exit. + */ +static int +ZeroCommitTsPage(int pageno, bool writeXlog) +{ + int slotno; + + slotno = SimpleLruZeroPage(CommitTsCtl, pageno); + + if (writeXlog) + WriteZeroPageXlogRec(pageno); + + return slotno; +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup, + * after StartupXLOG has initialized ShmemVariableCache->nextXid. + */ +void +StartupCommitTs(void) +{ + ActivateCommitTs(); +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup, + * after recovery has finished. + */ +void +CompleteCommitTsInitialization(void) +{ + /* + * If the feature is not enabled, turn it off for good. This also removes + * any leftover data. + * + * Conversely, we activate the module if the feature is enabled. This is + * necessary for primary and standby as the activation depends on the + * control file contents at the beginning of recovery or when a + * XLOG_PARAMETER_CHANGE is replayed. + */ + if (!track_commit_timestamp) + DeactivateCommitTs(); + else + ActivateCommitTs(); +} + +/* + * Activate or deactivate CommitTs' upon reception of a XLOG_PARAMETER_CHANGE + * XLog record during recovery. + */ +void +CommitTsParameterChange(bool newvalue, bool oldvalue) +{ + /* + * If the commit_ts module is disabled in this server and we get word from + * the primary server that it is enabled there, activate it so that we can + * replay future WAL records involving it; also mark it as active on + * pg_control. If the old value was already set, we already did this, so + * don't do anything. + * + * If the module is disabled in the primary, disable it here too, unless + * the module is enabled locally. 
+ * + * Note this only runs in the recovery process, so an unlocked read is + * fine. + */ + if (newvalue) + { + if (!commitTsShared->commitTsActive) + ActivateCommitTs(); + } + else if (commitTsShared->commitTsActive) + DeactivateCommitTs(); +} + +/* + * Activate this module whenever necessary. + * This must happen during postmaster or standalone-backend startup, + * or during WAL replay anytime the track_commit_timestamp setting is + * changed in the primary. + * + * The reason why this SLRU needs separate activation/deactivation functions is + * that it can be enabled/disabled during start and the activation/deactivation + * on the primary is propagated to the standby via replay. Other SLRUs don't + * have this property and they can be just initialized during normal startup. + * + * This is in charge of creating the currently active segment, if it's not + * already there. The reason for this is that the server might have been + * running with this module disabled for a while and thus might have skipped + * the normal creation point. + */ +static void +ActivateCommitTs(void) +{ + TransactionId xid; + int pageno; + + /* If we've done this already, there's nothing to do */ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + if (commitTsShared->commitTsActive) + { + LWLockRelease(CommitTsLock); + return; + } + LWLockRelease(CommitTsLock); + + xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + pageno = TransactionIdToCTsPage(xid); + + /* + * Re-Initialize our idea of the latest page number. + */ + LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + CommitTsCtl->shared->latest_page_number = pageno; + LWLockRelease(CommitTsSLRULock); + + /* + * If CommitTs is enabled, but it wasn't in the previous server run, we + * need to set the oldest and newest values to the next Xid; that way, we + * will not try to read data that might not have been set. + * + * XXX does this have a problem if a server is started with commitTs + * enabled, then started with commitTs disabled, then restarted with it + * enabled again? It doesn't look like it does, because there should be a + * checkpoint that sets the value to InvalidTransactionId at end of + * recovery; and so any chance of injecting new transactions without + * CommitTs values would occur after the oldestCommitTsXid has been set to + * Invalid temporarily. + */ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + if (ShmemVariableCache->oldestCommitTsXid == InvalidTransactionId) + { + ShmemVariableCache->oldestCommitTsXid = + ShmemVariableCache->newestCommitTsXid = ReadNextTransactionId(); + } + LWLockRelease(CommitTsLock); + + /* Create the current segment file, if necessary */ + if (!SimpleLruDoesPhysicalPageExist(CommitTsCtl, pageno)) + { + int slotno; + + LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + slotno = ZeroCommitTsPage(pageno, false); + SimpleLruWritePage(CommitTsCtl, slotno); + Assert(!CommitTsCtl->shared->page_dirty[slotno]); + LWLockRelease(CommitTsSLRULock); + } + + /* Change the activation status in shared memory. */ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + commitTsShared->commitTsActive = true; + LWLockRelease(CommitTsLock); +} + +/* + * Deactivate this module. + * + * This must be called when the track_commit_timestamp parameter is turned off. + * This happens during postmaster or standalone-backend startup, or during WAL + * replay. + * + * Resets CommitTs into invalid state to make sure we don't hand back + * possibly-invalid data; also removes segments of old data. 
+ */ +static void +DeactivateCommitTs(void) +{ + /* + * Cleanup the status in the shared memory. + * + * We reset everything in the commitTsShared record to prevent user from + * getting confusing data about last committed transaction on the standby + * when the module was activated repeatedly on the primary. + */ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + + commitTsShared->commitTsActive = false; + commitTsShared->xidLastCommit = InvalidTransactionId; + TIMESTAMP_NOBEGIN(commitTsShared->dataLastCommit.time); + commitTsShared->dataLastCommit.nodeid = InvalidRepOriginId; + + ShmemVariableCache->oldestCommitTsXid = InvalidTransactionId; + ShmemVariableCache->newestCommitTsXid = InvalidTransactionId; + + LWLockRelease(CommitTsLock); + + /* + * Remove *all* files. This is necessary so that there are no leftover + * files; in the case where this feature is later enabled after running + * with it disabled for some time there may be a gap in the file sequence. + * (We can probably tolerate out-of-sequence files, as they are going to + * be overwritten anyway when we wrap around, but it seems better to be + * tidy.) + */ + LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + (void) SlruScanDirectory(CommitTsCtl, SlruScanDirCbDeleteAll, NULL); + LWLockRelease(CommitTsSLRULock); +} + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + */ +void +CheckPointCommitTs(void) +{ + /* + * Write dirty CommitTs pages to disk. This may result in sync requests + * queued for later handling by ProcessSyncRequests(), as part of the + * checkpoint. + */ + SimpleLruWriteAll(CommitTsCtl, true); +} + +/* + * Make sure that CommitTs has room for a newly-allocated XID. + * + * NB: this is called while holding XidGenLock. We want it to be very fast + * most of the time; even when it's not so fast, no actual I/O need happen + * unless we're forced to write out a dirty CommitTs or xlog page to make room + * in shared memory. + * + * NB: the current implementation relies on track_commit_timestamp being + * PGC_POSTMASTER. + */ +void +ExtendCommitTs(TransactionId newestXact) +{ + int pageno; + + /* + * Nothing to do if module not enabled. Note we do an unlocked read of + * the flag here, which is okay because this routine is only called from + * GetNewTransactionId, which is never called in a standby. + */ + Assert(!InRecovery); + if (!commitTsShared->commitTsActive) + return; + + /* + * No work except at first XID of a page. But beware: just after + * wraparound, the first XID of page zero is FirstNormalTransactionId. + */ + if (TransactionIdToCTsEntry(newestXact) != 0 && + !TransactionIdEquals(newestXact, FirstNormalTransactionId)) + return; + + pageno = TransactionIdToCTsPage(newestXact); + + LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + + /* Zero the page and make an XLOG entry about it */ + ZeroCommitTsPage(pageno, !InRecovery); + + LWLockRelease(CommitTsSLRULock); +} + +/* + * Remove all CommitTs segments before the one holding the passed + * transaction ID. + * + * Note that we don't need to flush XLOG here. + */ +void +TruncateCommitTs(TransactionId oldestXact) +{ + int cutoffPage; + + /* + * The cutoff point is the start of the segment containing oldestXact. We + * pass the *page* containing oldestXact to SimpleLruTruncate. 
+ */ + cutoffPage = TransactionIdToCTsPage(oldestXact); + + /* Check to see if there's any files that could be removed */ + if (!SlruScanDirectory(CommitTsCtl, SlruScanDirCbReportPresence, + &cutoffPage)) + return; /* nothing to remove */ + + /* Write XLOG record */ + WriteTruncateXlogRec(cutoffPage, oldestXact); + + /* Now we can remove the old CommitTs segment(s) */ + SimpleLruTruncate(CommitTsCtl, cutoffPage); +} + +/* + * Set the limit values between which commit TS can be consulted. + */ +void +SetCommitTsLimit(TransactionId oldestXact, TransactionId newestXact) +{ + /* + * Be careful not to overwrite values that are either further into the + * "future" or signal a disabled committs. + */ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + if (ShmemVariableCache->oldestCommitTsXid != InvalidTransactionId) + { + if (TransactionIdPrecedes(ShmemVariableCache->oldestCommitTsXid, oldestXact)) + ShmemVariableCache->oldestCommitTsXid = oldestXact; + if (TransactionIdPrecedes(newestXact, ShmemVariableCache->newestCommitTsXid)) + ShmemVariableCache->newestCommitTsXid = newestXact; + } + else + { + Assert(ShmemVariableCache->newestCommitTsXid == InvalidTransactionId); + ShmemVariableCache->oldestCommitTsXid = oldestXact; + ShmemVariableCache->newestCommitTsXid = newestXact; + } + LWLockRelease(CommitTsLock); +} + +/* + * Move forwards the oldest commitTS value that can be consulted + */ +void +AdvanceOldestCommitTsXid(TransactionId oldestXact) +{ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + if (ShmemVariableCache->oldestCommitTsXid != InvalidTransactionId && + TransactionIdPrecedes(ShmemVariableCache->oldestCommitTsXid, oldestXact)) + ShmemVariableCache->oldestCommitTsXid = oldestXact; + LWLockRelease(CommitTsLock); +} + + +/* + * Decide whether a commitTS page number is "older" for truncation purposes. + * Analogous to CLOGPagePrecedes(). + * + * At default BLCKSZ, (1 << 31) % COMMIT_TS_XACTS_PER_PAGE == 128. This + * introduces differences compared to CLOG and the other SLRUs having (1 << + * 31) % per_page == 0. This function never tests exactly + * TransactionIdPrecedes(x-2^31, x). When the system reaches xidStopLimit, + * there are two possible counts of page boundaries between oldestXact and the + * latest XID assigned, depending on whether oldestXact is within the first + * 128 entries of its page. Since this function doesn't know the location of + * oldestXact within page2, it returns false for one page that actually is + * expendable. This is a wider (yet still negligible) version of the + * truncation opportunity that CLOGPagePrecedes() cannot recognize. + * + * For the sake of a worked example, number entries with decimal values such + * that page1==1 entries range from 1.0 to 1.999. Let N+0.15 be the number of + * pages that 2^31 entries will span (N is an integer). If oldestXact=N+2.1, + * then the final safe XID assignment leaves newestXact=1.95. We keep page 2, + * because entry=2.85 is the border that toggles whether entries precede the + * last entry of the oldestXact page. While page 2 is expendable at + * oldestXact=N+2.1, it would be precious at oldestXact=N+2.9. 
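+ *
+ * Concretely, assuming the default 8 kB BLCKSZ and the 10-byte commit
+ * timestamp entry (an 8-byte TimestampTz plus a 2-byte RepOriginId),
+ * COMMIT_TS_XACTS_PER_PAGE is 819 and 2147483648 % 819 == 128, which is
+ * where the "128" mentioned above comes from.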
+ */ +static bool +CommitTsPagePrecedes(int page1, int page2) +{ + TransactionId xid1; + TransactionId xid2; + + xid1 = ((TransactionId) page1) * COMMIT_TS_XACTS_PER_PAGE; + xid1 += FirstNormalTransactionId + 1; + xid2 = ((TransactionId) page2) * COMMIT_TS_XACTS_PER_PAGE; + xid2 += FirstNormalTransactionId + 1; + + return (TransactionIdPrecedes(xid1, xid2) && + TransactionIdPrecedes(xid1, xid2 + COMMIT_TS_XACTS_PER_PAGE - 1)); +} + + +/* + * Write a ZEROPAGE xlog record + */ +static void +WriteZeroPageXlogRec(int pageno) +{ + XLogBeginInsert(); + XLogRegisterData((char *) (&pageno), sizeof(int)); + (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE); +} + +/* + * Write a TRUNCATE xlog record + */ +static void +WriteTruncateXlogRec(int pageno, TransactionId oldestXid) +{ + xl_commit_ts_truncate xlrec; + + xlrec.pageno = pageno; + xlrec.oldestXid = oldestXid; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), SizeOfCommitTsTruncate); + (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_TRUNCATE); +} + +/* + * CommitTS resource manager's routines + */ +void +commit_ts_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + /* Backup blocks are not used in commit_ts records */ + Assert(!XLogRecHasAnyBlockRefs(record)); + + if (info == COMMIT_TS_ZEROPAGE) + { + int pageno; + int slotno; + + memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + + LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + + slotno = ZeroCommitTsPage(pageno, false); + SimpleLruWritePage(CommitTsCtl, slotno); + Assert(!CommitTsCtl->shared->page_dirty[slotno]); + + LWLockRelease(CommitTsSLRULock); + } + else if (info == COMMIT_TS_TRUNCATE) + { + xl_commit_ts_truncate *trunc = (xl_commit_ts_truncate *) XLogRecGetData(record); + + AdvanceOldestCommitTsXid(trunc->oldestXid); + + /* + * During XLOG replay, latest_page_number isn't set up yet; insert a + * suitable value to bypass the sanity test in SimpleLruTruncate. + */ + CommitTsCtl->shared->latest_page_number = trunc->pageno; + + SimpleLruTruncate(CommitTsCtl, trunc->pageno); + } + else + elog(PANIC, "commit_ts_redo: unknown op code %u", info); +} + +/* + * Entrypoint for sync.c to sync commit_ts files. + */ +int +committssyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(CommitTsCtl, ftag, path); +} diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c new file mode 100644 index 0000000..0136ca7 --- /dev/null +++ b/src/backend/access/transam/generic_xlog.c @@ -0,0 +1,540 @@ +/*------------------------------------------------------------------------- + * + * generic_xlog.c + * Implementation of generic xlog records. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/generic_xlog.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/bufmask.h" +#include "access/generic_xlog.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "utils/memutils.h" + +/*------------------------------------------------------------------------- + * Internally, a delta between pages consists of a set of fragments. Each + * fragment represents changes made in a given region of a page. 
A fragment + * is made up as follows: + * + * - offset of page region (OffsetNumber) + * - length of page region (OffsetNumber) + * - data - the data to place into the region ('length' number of bytes) + * + * Unchanged regions of a page are not represented in its delta. As a result, + * a delta can be more compact than the full page image. But having an + * unchanged region between two fragments that is smaller than the fragment + * header (offset+length) does not pay off in terms of the overall size of + * the delta. For this reason, we merge adjacent fragments if the unchanged + * region between them is <= MATCH_THRESHOLD bytes. + * + * We do not bother to merge fragments across the "lower" and "upper" parts + * of a page; it's very seldom the case that pd_lower and pd_upper are within + * MATCH_THRESHOLD bytes of each other, and handling that infrequent case + * would complicate and slow down the delta-computation code unduly. + * Therefore, the worst-case delta size includes two fragment headers plus + * a full page's worth of data. + *------------------------------------------------------------------------- + */ +#define FRAGMENT_HEADER_SIZE (2 * sizeof(OffsetNumber)) +#define MATCH_THRESHOLD FRAGMENT_HEADER_SIZE +#define MAX_DELTA_SIZE (BLCKSZ + 2 * FRAGMENT_HEADER_SIZE) + +/* Struct of generic xlog data for single page */ +typedef struct +{ + Buffer buffer; /* registered buffer */ + int flags; /* flags for this buffer */ + int deltaLen; /* space consumed in delta field */ + char *image; /* copy of page image for modification, do not + * do it in-place to have aligned memory chunk */ + char delta[MAX_DELTA_SIZE]; /* delta between page images */ +} PageData; + +/* State of generic xlog record construction */ +struct GenericXLogState +{ + /* Info about each page, see above */ + PageData pages[MAX_GENERIC_XLOG_PAGES]; + bool isLogged; + /* Page images (properly aligned) */ + PGAlignedBlock images[MAX_GENERIC_XLOG_PAGES]; +}; + +static void writeFragment(PageData *pageData, OffsetNumber offset, + OffsetNumber len, const char *data); +static void computeRegionDelta(PageData *pageData, + const char *curpage, const char *targetpage, + int targetStart, int targetEnd, + int validStart, int validEnd); +static void computeDelta(PageData *pageData, Page curpage, Page targetpage); +static void applyPageRedo(Page page, const char *delta, Size deltaSize); + + +/* + * Write next fragment into pageData's delta. + * + * The fragment has the given offset and length, and data points to the + * actual data (of length length). + */ +static void +writeFragment(PageData *pageData, OffsetNumber offset, OffsetNumber length, + const char *data) +{ + char *ptr = pageData->delta + pageData->deltaLen; + + /* Verify we have enough space */ + Assert(pageData->deltaLen + sizeof(offset) + + sizeof(length) + length <= sizeof(pageData->delta)); + + /* Write fragment data */ + memcpy(ptr, &offset, sizeof(offset)); + ptr += sizeof(offset); + memcpy(ptr, &length, sizeof(length)); + ptr += sizeof(length); + memcpy(ptr, data, length); + ptr += length; + + pageData->deltaLen = ptr - pageData->delta; +} + +/* + * Compute the XLOG fragments needed to transform a region of curpage into the + * corresponding region of targetpage, and append them to pageData's delta + * field. The region to transform runs from targetStart to targetEnd-1. + * Bytes in curpage outside the range validStart to validEnd-1 should be + * considered invalid, and always overwritten with target data. 
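+ *
+ * Hypothetical example (with 2-byte OffsetNumbers, MATCH_THRESHOLD is 4):
+ * if bytes 10..13 and 16..19 differ while the two bytes between them
+ * match, the matching run does not exceed MATCH_THRESHOLD, so a single
+ * fragment with offset 10 and length 10 is emitted instead of two
+ * fragments that would pay for a second 4-byte header.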
+ * + * This function is a hot spot, so it's worth being as tense as possible + * about the data-matching loops. + */ +static void +computeRegionDelta(PageData *pageData, + const char *curpage, const char *targetpage, + int targetStart, int targetEnd, + int validStart, int validEnd) +{ + int i, + loopEnd, + fragmentBegin = -1, + fragmentEnd = -1; + + /* Deal with any invalid start region by including it in first fragment */ + if (validStart > targetStart) + { + fragmentBegin = targetStart; + targetStart = validStart; + } + + /* We'll deal with any invalid end region after the main loop */ + loopEnd = Min(targetEnd, validEnd); + + /* Examine all the potentially matchable bytes */ + i = targetStart; + while (i < loopEnd) + { + if (curpage[i] != targetpage[i]) + { + /* On unmatched byte, start new fragment if not already in one */ + if (fragmentBegin < 0) + fragmentBegin = i; + /* Mark unmatched-data endpoint as uncertain */ + fragmentEnd = -1; + /* Extend the fragment as far as possible in a tight loop */ + i++; + while (i < loopEnd && curpage[i] != targetpage[i]) + i++; + if (i >= loopEnd) + break; + } + + /* Found a matched byte, so remember end of unmatched fragment */ + fragmentEnd = i; + + /* + * Extend the match as far as possible in a tight loop. (On typical + * workloads, this inner loop is the bulk of this function's runtime.) + */ + i++; + while (i < loopEnd && curpage[i] == targetpage[i]) + i++; + + /* + * There are several possible cases at this point: + * + * 1. We have no unwritten fragment (fragmentBegin < 0). There's + * nothing to write; and it doesn't matter what fragmentEnd is. + * + * 2. We found more than MATCH_THRESHOLD consecutive matching bytes. + * Dump out the unwritten fragment, stopping at fragmentEnd. + * + * 3. The match extends to loopEnd. We'll do nothing here, exit the + * loop, and then dump the unwritten fragment, after merging it with + * the invalid end region if any. If we don't so merge, fragmentEnd + * establishes how much the final writeFragment call needs to write. + * + * 4. We found an unmatched byte before loopEnd. The loop will repeat + * and will enter the unmatched-byte stanza above. So in this case + * also, it doesn't matter what fragmentEnd is. The matched bytes + * will get merged into the continuing unmatched fragment. + * + * Only in case 3 do we reach the bottom of the loop with a meaningful + * fragmentEnd value, which is why it's OK that we unconditionally + * assign "fragmentEnd = i" above. + */ + if (fragmentBegin >= 0 && i - fragmentEnd > MATCH_THRESHOLD) + { + writeFragment(pageData, fragmentBegin, + fragmentEnd - fragmentBegin, + targetpage + fragmentBegin); + fragmentBegin = -1; + fragmentEnd = -1; /* not really necessary */ + } + } + + /* Deal with any invalid end region by including it in final fragment */ + if (loopEnd < targetEnd) + { + if (fragmentBegin < 0) + fragmentBegin = loopEnd; + fragmentEnd = targetEnd; + } + + /* Write final fragment if any */ + if (fragmentBegin >= 0) + { + if (fragmentEnd < 0) + fragmentEnd = targetEnd; + writeFragment(pageData, fragmentBegin, + fragmentEnd - fragmentBegin, + targetpage + fragmentBegin); + } +} + +/* + * Compute the XLOG delta record needed to transform curpage into targetpage, + * and store it in pageData's delta field. 
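+ *
+ * Hypothetical example, assuming BLCKSZ is 8192: if the target page has
+ * pd_lower = 120 and pd_upper = 7900 while curpage has pd_lower = 100 and
+ * pd_upper = 8000, the first computeRegionDelta() call below compares
+ * bytes 0..119 (treating curpage bytes 100..119 as invalid) and the second
+ * compares bytes 7900..8191 (treating curpage bytes 7900..7999 as
+ * invalid); the hole in between is never examined.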
+ */ +static void +computeDelta(PageData *pageData, Page curpage, Page targetpage) +{ + int targetLower = ((PageHeader) targetpage)->pd_lower, + targetUpper = ((PageHeader) targetpage)->pd_upper, + curLower = ((PageHeader) curpage)->pd_lower, + curUpper = ((PageHeader) curpage)->pd_upper; + + pageData->deltaLen = 0; + + /* Compute delta records for lower part of page ... */ + computeRegionDelta(pageData, curpage, targetpage, + 0, targetLower, + 0, curLower); + /* ... and for upper part, ignoring what's between */ + computeRegionDelta(pageData, curpage, targetpage, + targetUpper, BLCKSZ, + curUpper, BLCKSZ); + + /* + * If xlog debug is enabled, then check produced delta. Result of delta + * application to curpage should be equivalent to targetpage. + */ +#ifdef WAL_DEBUG + if (XLOG_DEBUG) + { + PGAlignedBlock tmp; + + memcpy(tmp.data, curpage, BLCKSZ); + applyPageRedo(tmp.data, pageData->delta, pageData->deltaLen); + if (memcmp(tmp.data, targetpage, targetLower) != 0 || + memcmp(tmp.data + targetUpper, targetpage + targetUpper, + BLCKSZ - targetUpper) != 0) + elog(ERROR, "result of generic xlog apply does not match"); + } +#endif +} + +/* + * Start new generic xlog record for modifications to specified relation. + */ +GenericXLogState * +GenericXLogStart(Relation relation) +{ + GenericXLogState *state; + int i; + + state = (GenericXLogState *) palloc(sizeof(GenericXLogState)); + state->isLogged = RelationNeedsWAL(relation); + + for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) + { + state->pages[i].image = state->images[i].data; + state->pages[i].buffer = InvalidBuffer; + } + + return state; +} + +/* + * Register new buffer for generic xlog record. + * + * Returns pointer to the page's image in the GenericXLogState, which + * is what the caller should modify. + * + * If the buffer is already registered, just return its existing entry. + * (It's not very clear what to do with the flags in such a case, but + * for now we stay with the original flags.) + */ +Page +GenericXLogRegisterBuffer(GenericXLogState *state, Buffer buffer, int flags) +{ + int block_id; + + /* Search array for existing entry or first unused slot */ + for (block_id = 0; block_id < MAX_GENERIC_XLOG_PAGES; block_id++) + { + PageData *page = &state->pages[block_id]; + + if (BufferIsInvalid(page->buffer)) + { + /* Empty slot, so use it (there cannot be a match later) */ + page->buffer = buffer; + page->flags = flags; + memcpy(page->image, BufferGetPage(buffer), BLCKSZ); + return (Page) page->image; + } + else if (page->buffer == buffer) + { + /* + * Buffer is already registered. Just return the image, which is + * already prepared. + */ + return (Page) page->image; + } + } + + elog(ERROR, "maximum number %d of generic xlog buffers is exceeded", + MAX_GENERIC_XLOG_PAGES); + /* keep compiler quiet */ + return NULL; +} + +/* + * Apply changes represented by GenericXLogState to the actual buffers, + * and emit a generic xlog record. + */ +XLogRecPtr +GenericXLogFinish(GenericXLogState *state) +{ + XLogRecPtr lsn; + int i; + + if (state->isLogged) + { + /* Logged relation: make xlog record in critical section. */ + XLogBeginInsert(); + + START_CRIT_SECTION(); + + /* + * Compute deltas if necessary, write changes to buffers, mark + * buffers dirty, and register changes. 
+ */ + for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) + { + PageData *pageData = &state->pages[i]; + Page page; + PageHeader pageHeader; + + if (BufferIsInvalid(pageData->buffer)) + continue; + + page = BufferGetPage(pageData->buffer); + pageHeader = (PageHeader) pageData->image; + + /* + * Compute delta while we still have both the unmodified page and + * the new image. Not needed if we are logging the full image. + */ + if (!(pageData->flags & GENERIC_XLOG_FULL_IMAGE)) + computeDelta(pageData, page, (Page) pageData->image); + + /* + * Apply the image, being careful to zero the "hole" between + * pd_lower and pd_upper in order to avoid divergence between + * actual page state and what replay would produce. + */ + memcpy(page, pageData->image, pageHeader->pd_lower); + memset(page + pageHeader->pd_lower, 0, + pageHeader->pd_upper - pageHeader->pd_lower); + memcpy(page + pageHeader->pd_upper, + pageData->image + pageHeader->pd_upper, + BLCKSZ - pageHeader->pd_upper); + + MarkBufferDirty(pageData->buffer); + + if (pageData->flags & GENERIC_XLOG_FULL_IMAGE) + { + XLogRegisterBuffer(i, pageData->buffer, + REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + } + else + { + XLogRegisterBuffer(i, pageData->buffer, REGBUF_STANDARD); + XLogRegisterBufData(i, pageData->delta, pageData->deltaLen); + } + } + + /* Insert xlog record */ + lsn = XLogInsert(RM_GENERIC_ID, 0); + + /* Set LSN */ + for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) + { + PageData *pageData = &state->pages[i]; + + if (BufferIsInvalid(pageData->buffer)) + continue; + PageSetLSN(BufferGetPage(pageData->buffer), lsn); + } + END_CRIT_SECTION(); + } + else + { + /* Unlogged relation: skip xlog-related stuff */ + START_CRIT_SECTION(); + for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) + { + PageData *pageData = &state->pages[i]; + + if (BufferIsInvalid(pageData->buffer)) + continue; + memcpy(BufferGetPage(pageData->buffer), + pageData->image, + BLCKSZ); + /* We don't worry about zeroing the "hole" in this case */ + MarkBufferDirty(pageData->buffer); + } + END_CRIT_SECTION(); + /* We don't have a LSN to return, in this case */ + lsn = InvalidXLogRecPtr; + } + + pfree(state); + + return lsn; +} + +/* + * Abort generic xlog record construction. No changes are applied to buffers. + * + * Note: caller is responsible for releasing locks/pins on buffers, if needed. + */ +void +GenericXLogAbort(GenericXLogState *state) +{ + pfree(state); +} + +/* + * Apply delta to given page image. + */ +static void +applyPageRedo(Page page, const char *delta, Size deltaSize) +{ + const char *ptr = delta; + const char *end = delta + deltaSize; + + while (ptr < end) + { + OffsetNumber offset, + length; + + memcpy(&offset, ptr, sizeof(offset)); + ptr += sizeof(offset); + memcpy(&length, ptr, sizeof(length)); + ptr += sizeof(length); + + memcpy(page + offset, ptr, length); + + ptr += length; + } +} + +/* + * Redo function for generic xlog record. 
+ */ +void +generic_redo(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer buffers[MAX_GENERIC_XLOG_PAGES]; + uint8 block_id; + + /* Protect limited size of buffers[] array */ + Assert(XLogRecMaxBlockId(record) < MAX_GENERIC_XLOG_PAGES); + + /* Iterate over blocks */ + for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + { + XLogRedoAction action; + + if (!XLogRecHasBlockRef(record, block_id)) + { + buffers[block_id] = InvalidBuffer; + continue; + } + + action = XLogReadBufferForRedo(record, block_id, &buffers[block_id]); + + /* Apply redo to given block if needed */ + if (action == BLK_NEEDS_REDO) + { + Page page; + PageHeader pageHeader; + char *blockDelta; + Size blockDeltaSize; + + page = BufferGetPage(buffers[block_id]); + blockDelta = XLogRecGetBlockData(record, block_id, &blockDeltaSize); + applyPageRedo(page, blockDelta, blockDeltaSize); + + /* + * Since the delta contains no information about what's in the + * "hole" between pd_lower and pd_upper, set that to zero to + * ensure we produce the same page state that application of the + * logged action by GenericXLogFinish did. + */ + pageHeader = (PageHeader) page; + memset(page + pageHeader->pd_lower, 0, + pageHeader->pd_upper - pageHeader->pd_lower); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffers[block_id]); + } + } + + /* Changes are done: unlock and release all buffers */ + for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + { + if (BufferIsValid(buffers[block_id])) + UnlockReleaseBuffer(buffers[block_id]); + } +} + +/* + * Mask a generic page before performing consistency checks on it. + */ +void +generic_mask(char *page, BlockNumber blkno) +{ + mask_page_lsn_and_checksum(page); + + mask_unused_space(page); +} diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c new file mode 100644 index 0000000..b8b1773 --- /dev/null +++ b/src/backend/access/transam/multixact.c @@ -0,0 +1,3428 @@ +/*------------------------------------------------------------------------- + * + * multixact.c + * PostgreSQL multi-transaction-log manager + * + * The pg_multixact manager is a pg_xact-like manager that stores an array of + * MultiXactMember for each MultiXactId. It is a fundamental part of the + * shared-row-lock implementation. Each MultiXactMember is comprised of a + * TransactionId and a set of flag bits. The name is a bit historical: + * originally, a MultiXactId consisted of more than one TransactionId (except + * in rare corner cases), hence "multi". Nowadays, however, it's perfectly + * legitimate to have MultiXactIds that only include a single Xid. + * + * The meaning of the flag bits is opaque to this module, but they are mostly + * used in heapam.c to identify lock modes that each of the member transactions + * is holding on any given tuple. This module just contains support to store + * and retrieve the arrays. + * + * We use two SLRU areas, one for storing the offsets at which the data + * starts for each MultiXactId in the other one. This trick allows us to + * store variable length arrays of TransactionIds. (We could alternatively + * use one area containing counts and TransactionIds, with valid MultiXactId + * values pointing at slots containing counts; but that way seems less robust + * since it would get completely confused if someone inquired about a bogus + * MultiXactId that pointed to an intermediate slot containing an XID.) 
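+ *
+ * To illustrate the two-area scheme with made-up numbers: if the offsets
+ * SLRU records offset 5000 for multi 100 and offset 5003 for multi 101,
+ * then multi 100 has exactly three members, stored at member offsets 5000,
+ * 5001 and 5002 in the members SLRU.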
+ * + * XLOG interactions: this module generates a record whenever a new OFFSETs or + * MEMBERs page is initialized to zeroes, as well as an + * XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined. + * This module ignores the WAL rule "write xlog before data," because it + * suffices that actions recording a MultiXactId in a heap xmax do follow that + * rule. The only way for the MXID to be referenced from any data page is for + * heap_lock_tuple() or heap_update() to have put it there, and each generates + * an XLOG record that must follow ours. The normal LSN interlock between the + * data page and that XLOG record will ensure that our XLOG record reaches + * disk first. If the SLRU members/offsets data reaches disk sooner than the + * XLOG records, we do not care; after recovery, no xmax will refer to it. On + * the flip side, to ensure that all referenced entries _do_ reach disk, this + * module's XLOG records completely rebuild the data entered since the last + * checkpoint. We flush and sync all dirty OFFSETs and MEMBERs pages to disk + * before each checkpoint is considered complete. + * + * Like clog.c, and unlike subtrans.c, we have to preserve state across + * crashes and ensure that MXID and offset numbering increases monotonically + * across a crash. We do this in the same way as it's done for transaction + * IDs: the WAL record is guaranteed to contain evidence of every MXID we + * could need to worry about, and we just make sure that at the end of + * replay, the next-MXID and next-offset counters are at least as large as + * anything we saw during replay. + * + * We are able to remove segments no longer necessary by carefully tracking + * each table's used values: during vacuum, any multixact older than a certain + * value is removed; the cutoff value is stored in pg_class. The minimum value + * across all tables in each database is stored in pg_database, and the global + * minimum across all databases is part of pg_control and is kept in shared + * memory. Whenever that minimum is advanced, the SLRUs are truncated. + * + * When new multixactid values are to be created, care is taken that the + * counter does not fall within the wraparound horizon considering the global + * minimum value. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/multixact.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/multixact.h" +#include "access/slru.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/twophase_rmgr.h" +#include "access/xact.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" +#include "catalog/pg_type.h" +#include "commands/dbcommands.h" +#include "funcapi.h" +#include "lib/ilist.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "postmaster/autovacuum.h" +#include "storage/lmgr.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/snapmgr.h" + + +/* + * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is + * used everywhere else in Postgres. 
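+ *
+ * Worked example, assuming the default 8 kB BLCKSZ: with 4-byte offsets,
+ * MULTIXACT_OFFSETS_PER_PAGE (defined below) is 2048, so MultiXactId 10000
+ * lives on offsets page 4, at entry 1808 within that page.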
+ * + * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF, + * MultiXact page numbering also wraps around at + * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at + * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need + * take no explicit notice of that fact in this module, except when comparing + * segment and page numbers in TruncateMultiXact (see + * MultiXactOffsetPagePrecedes). + */ + +/* We need four bytes per offset */ +#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) + +#define MultiXactIdToOffsetPage(xid) \ + ((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE) +#define MultiXactIdToOffsetEntry(xid) \ + ((xid) % (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE) +#define MultiXactIdToOffsetSegment(xid) (MultiXactIdToOffsetPage(xid) / SLRU_PAGES_PER_SEGMENT) + +/* + * The situation for members is a bit more complex: we store one byte of + * additional flag bits for each TransactionId. To do this without getting + * into alignment issues, we store four bytes of flags, and then the + * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and + * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups + * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and + * performance) trumps space efficiency here. + * + * Note that the "offset" macros work with byte offset, not array indexes, so + * arithmetic must be done using "char *" pointers. + */ +/* We need eight bits per xact, so one xact fits in a byte */ +#define MXACT_MEMBER_BITS_PER_XACT 8 +#define MXACT_MEMBER_FLAGS_PER_BYTE 1 +#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) + +/* how many full bytes of flags are there in a group? */ +#define MULTIXACT_FLAGBYTES_PER_GROUP 4 +#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ + (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) +/* size in bytes of a complete group */ +#define MULTIXACT_MEMBERGROUP_SIZE \ + (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) +#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +#define MULTIXACT_MEMBERS_PER_PAGE \ + (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) + +/* + * Because the number of items per page is not a divisor of the last item + * number (member 0xFFFFFFFF), the last segment does not use the maximum number + * of pages, and moreover the last used page therein does not use the same + * number of items as previous pages. (Another way to say it is that the + * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page + * has some empty space after that item.) + * + * This constant is the number of members in the last page of the last segment. 
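+ *
+ * With the default 8 kB BLCKSZ that works out as follows: each 20-byte
+ * group holds 4 members and 409 groups fit per page (8180 bytes, 12
+ * wasted), so MULTIXACT_MEMBERS_PER_PAGE is 1636 and this constant
+ * evaluates to (0xFFFFFFFF % 1636) + 1 = 1036.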
+ */ +#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \ + ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1)) + +/* page in which a member is to be found */ +#define MXOffsetToMemberPage(xid) ((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE) +#define MXOffsetToMemberSegment(xid) (MXOffsetToMemberPage(xid) / SLRU_PAGES_PER_SEGMENT) + +/* Location (byte offset within page) of flag word for a given member */ +#define MXOffsetToFlagsOffset(xid) \ + ((((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) % \ + (TransactionId) MULTIXACT_MEMBERGROUPS_PER_PAGE) * \ + (TransactionId) MULTIXACT_MEMBERGROUP_SIZE) +#define MXOffsetToFlagsBitShift(xid) \ + (((xid) % (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) * \ + MXACT_MEMBER_BITS_PER_XACT) + +/* Location (byte offset within page) of TransactionId of given member */ +#define MXOffsetToMemberOffset(xid) \ + (MXOffsetToFlagsOffset(xid) + MULTIXACT_FLAGBYTES_PER_GROUP + \ + ((xid) % MULTIXACT_MEMBERS_PER_MEMBERGROUP) * sizeof(TransactionId)) + +/* Multixact members wraparound thresholds. */ +#define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2) +#define MULTIXACT_MEMBER_DANGER_THRESHOLD \ + (MaxMultiXactOffset - MaxMultiXactOffset / 4) + +#define PreviousMultiXactId(xid) \ + ((xid) == FirstMultiXactId ? MaxMultiXactId : (xid) - 1) + +/* + * Links to shared-memory data structures for MultiXact control + */ +static SlruCtlData MultiXactOffsetCtlData; +static SlruCtlData MultiXactMemberCtlData; + +#define MultiXactOffsetCtl (&MultiXactOffsetCtlData) +#define MultiXactMemberCtl (&MultiXactMemberCtlData) + +/* + * MultiXact state shared across all backends. All this state is protected + * by MultiXactGenLock. (We also use MultiXactOffsetSLRULock and + * MultiXactMemberSLRULock to guard accesses to the two sets of SLRU + * buffers. For concurrency's sake, we avoid holding more than one of these + * locks at a time.) + */ +typedef struct MultiXactStateData +{ + /* next-to-be-assigned MultiXactId */ + MultiXactId nextMXact; + + /* next-to-be-assigned offset */ + MultiXactOffset nextOffset; + + /* Have we completed multixact startup? */ + bool finishedStartup; + + /* + * Oldest multixact that is still potentially referenced by a relation. + * Anything older than this should not be consulted. These values are + * updated by vacuum. + */ + MultiXactId oldestMultiXactId; + Oid oldestMultiXactDB; + + /* + * Oldest multixact offset that is potentially referenced by a multixact + * referenced by a relation. We don't always know this value, so there's + * a flag here to indicate whether or not we currently do. + */ + MultiXactOffset oldestOffset; + bool oldestOffsetKnown; + + /* support for anti-wraparound measures */ + MultiXactId multiVacLimit; + MultiXactId multiWarnLimit; + MultiXactId multiStopLimit; + MultiXactId multiWrapLimit; + + /* support for members anti-wraparound measures */ + MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */ + + /* + * Per-backend data starts here. We have two arrays stored in the area + * immediately following the MultiXactStateData struct. Each is indexed by + * BackendId. + * + * In both arrays, there's a slot for all normal backends (1..MaxBackends) + * followed by a slot for max_prepared_xacts prepared transactions. Valid + * BackendIds start from 1; element zero of each array is never used. 
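+ *
+ * For instance, with MaxBackends = 100 and max_prepared_xacts = 5
+ * (arbitrary values), slots 1..100 belong to regular backends, slots
+ * 101..105 to prepared transactions, slot 0 is unused, and MaxOldestSlot
+ * (defined below) is 105.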
+ * + * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current + * transaction(s) could possibly be a member of, or InvalidMultiXactId + * when the backend has no live transaction that could possibly be a + * member of a MultiXact. Each backend sets its entry to the current + * nextMXact counter just before first acquiring a shared lock in a given + * transaction, and clears it at transaction end. (This works because only + * during or after acquiring a shared lock could an XID possibly become a + * member of a MultiXact, and that MultiXact would have to be created + * during or after the lock acquisition.) + * + * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's + * current transaction(s) think is potentially live, or InvalidMultiXactId + * when not in a transaction or not in a transaction that's paid any + * attention to MultiXacts yet. This is computed when first needed in a + * given transaction, and cleared at transaction end. We can compute it + * as the minimum of the valid OldestMemberMXactId[] entries at the time + * we compute it (using nextMXact if none are valid). Each backend is + * required not to attempt to access any SLRU data for MultiXactIds older + * than its own OldestVisibleMXactId[] setting; this is necessary because + * the checkpointer could truncate away such data at any instant. + * + * The oldest valid value among all of the OldestMemberMXactId[] and + * OldestVisibleMXactId[] entries is considered by vacuum as the earliest + * possible value still having any live member transaction. Subtracting + * vacuum_multixact_freeze_min_age from that value we obtain the freezing + * point for multixacts for that table. Any value older than that is + * removed from tuple headers (or "frozen"; see FreezeMultiXactId. Note + * that multis that have member xids that are older than the cutoff point + * for xids must also be frozen, even if the multis themselves are newer + * than the multixid cutoff point). Whenever a full table vacuum happens, + * the freezing point so computed is used as the new pg_class.relminmxid + * value. The minimum of all those values in a database is stored as + * pg_database.datminmxid. In turn, the minimum of all of those values is + * stored in pg_control and used as truncation point for pg_multixact. At + * checkpoint or restartpoint, unneeded segments are removed. + */ + MultiXactId perBackendXactIds[FLEXIBLE_ARRAY_MEMBER]; +} MultiXactStateData; + +/* + * Last element of OldestMemberMXactId and OldestVisibleMXactId arrays. + * Valid elements are (1..MaxOldestSlot); element 0 is never used. + */ +#define MaxOldestSlot (MaxBackends + max_prepared_xacts) + +/* Pointers to the state data in shared memory */ +static MultiXactStateData *MultiXactState; +static MultiXactId *OldestMemberMXactId; +static MultiXactId *OldestVisibleMXactId; + + +/* + * Definitions for the backend-local MultiXactId cache. + * + * We use this cache to store known MultiXacts, so we don't need to go to + * SLRU areas every time. + * + * The cache lasts for the duration of a single transaction, the rationale + * for this being that most entries will contain our own TransactionId and + * so they will be uninteresting by the time our next transaction starts. + * (XXX not clear that this is correct --- other members of the MultiXact + * could hang around longer than we did. However, it's not clear what a + * better policy for flushing old cache entries would be.) FIXME actually + * this is plain wrong now that multixact's may contain update Xids. 
+ * + * We allocate the cache entries in a memory context that is deleted at + * transaction end, so we don't need to do retail freeing of entries. + */ +typedef struct mXactCacheEnt +{ + MultiXactId multi; + int nmembers; + dlist_node node; + MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]; +} mXactCacheEnt; + +#define MAX_CACHE_ENTRIES 256 +static dlist_head MXactCache = DLIST_STATIC_INIT(MXactCache); +static int MXactCacheMembers = 0; +static MemoryContext MXactContext = NULL; + +#ifdef MULTIXACT_DEBUG +#define debug_elog2(a,b) elog(a,b) +#define debug_elog3(a,b,c) elog(a,b,c) +#define debug_elog4(a,b,c,d) elog(a,b,c,d) +#define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e) +#define debug_elog6(a,b,c,d,e,f) elog(a,b,c,d,e,f) +#else +#define debug_elog2(a,b) +#define debug_elog3(a,b,c) +#define debug_elog4(a,b,c,d) +#define debug_elog5(a,b,c,d,e) +#define debug_elog6(a,b,c,d,e,f) +#endif + +/* internal MultiXactId management */ +static void MultiXactIdSetOldestVisible(void); +static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, + int nmembers, MultiXactMember *members); +static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset); + +/* MultiXact cache management */ +static int mxactMemberComparator(const void *arg1, const void *arg2); +static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members); +static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members); +static void mXactCachePut(MultiXactId multi, int nmembers, + MultiXactMember *members); + +static char *mxstatus_to_string(MultiXactStatus status); + +/* management of SLRU infrastructure */ +static int ZeroMultiXactOffsetPage(int pageno, bool writeXlog); +static int ZeroMultiXactMemberPage(int pageno, bool writeXlog); +static bool MultiXactOffsetPagePrecedes(int page1, int page2); +static bool MultiXactMemberPagePrecedes(int page1, int page2); +static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, + MultiXactOffset offset2); +static void ExtendMultiXactOffset(MultiXactId multi); +static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers); +static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, + MultiXactOffset start, uint32 distance); +static bool SetOffsetVacuumLimit(bool is_startup); +static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result); +static void WriteMZeroPageXlogRec(int pageno, uint8 info); +static void WriteMTruncateXlogRec(Oid oldestMultiDB, + MultiXactId startTruncOff, + MultiXactId endTruncOff, + MultiXactOffset startTruncMemb, + MultiXactOffset endTruncMemb); + + +/* + * MultiXactIdCreate + * Construct a MultiXactId representing two TransactionIds. + * + * The two XIDs must be different, or be requesting different statuses. + * + * NB - we don't worry about our local MultiXactId cache here, because that + * is handled by the lower-level routines. + */ +MultiXactId +MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, + TransactionId xid2, MultiXactStatus status2) +{ + MultiXactId newMulti; + MultiXactMember members[2]; + + AssertArg(TransactionIdIsValid(xid1)); + AssertArg(TransactionIdIsValid(xid2)); + + Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2)); + + /* MultiXactIdSetOldestMember() must have been called already. */ + Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])); + + /* + * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs + * are still running. 
In typical usage, xid2 will be our own XID and the + * caller just did a check on xid1, so it'd be wasted effort. + */ + + members[0].xid = xid1; + members[0].status = status1; + members[1].xid = xid2; + members[1].status = status2; + + newMulti = MultiXactIdCreateFromMembers(2, members); + + debug_elog3(DEBUG2, "Create: %s", + mxid_to_string(newMulti, 2, members)); + + return newMulti; +} + +/* + * MultiXactIdExpand + * Add a TransactionId to a pre-existing MultiXactId. + * + * If the TransactionId is already a member of the passed MultiXactId with the + * same status, just return it as-is. + * + * Note that we do NOT actually modify the membership of a pre-existing + * MultiXactId; instead we create a new one. This is necessary to avoid + * a race condition against code trying to wait for one MultiXactId to finish; + * see notes in heapam.c. + * + * NB - we don't worry about our local MultiXactId cache here, because that + * is handled by the lower-level routines. + * + * Note: It is critical that MultiXactIds that come from an old cluster (i.e. + * one upgraded by pg_upgrade from a cluster older than this feature) are not + * passed in. + */ +MultiXactId +MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status) +{ + MultiXactId newMulti; + MultiXactMember *members; + MultiXactMember *newMembers; + int nmembers; + int i; + int j; + + AssertArg(MultiXactIdIsValid(multi)); + AssertArg(TransactionIdIsValid(xid)); + + /* MultiXactIdSetOldestMember() must have been called already. */ + Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])); + + debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s", + multi, xid, mxstatus_to_string(status)); + + /* + * Note: we don't allow for old multis here. The reason is that the only + * caller of this function does a check that the multixact is no longer + * running. + */ + nmembers = GetMultiXactIdMembers(multi, &members, false, false); + + if (nmembers < 0) + { + MultiXactMember member; + + /* + * The MultiXactId is obsolete. This can only happen if all the + * MultiXactId members stop running between the caller checking and + * passing it to us. It would be better to return that fact to the + * caller, but it would complicate the API and it's unlikely to happen + * too often, so just deal with it by creating a singleton MultiXact. + */ + member.xid = xid; + member.status = status; + newMulti = MultiXactIdCreateFromMembers(1, &member); + + debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u", + multi, newMulti); + return newMulti; + } + + /* + * If the TransactionId is already a member of the MultiXactId with the + * same status, just return the existing MultiXactId. + */ + for (i = 0; i < nmembers; i++) + { + if (TransactionIdEquals(members[i].xid, xid) && + (members[i].status == status)) + { + debug_elog4(DEBUG2, "Expand: %u is already a member of %u", + xid, multi); + pfree(members); + return multi; + } + } + + /* + * Determine which of the members of the MultiXactId are still of + * interest. This is any running transaction, and also any transaction + * that grabbed something stronger than just a lock and was committed. (An + * update that aborted is of no interest here; and having more than one + * update Xid in a multixact would cause errors elsewhere.) + * + * Removing dead members is not just an optimization: freezing of tuples + * whose Xmax are multis depends on this behavior. + * + * Note we have the same race condition here as above: j could be 0 at the + * end of the loop. 
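+ *
+ * Hypothetical example of the filtering below: given members
+ * {10 key-share (committed), 11 update (aborted), 12 key-share (still
+ * running)} plus new xid 13, members 10 and 11 are dropped (a finished
+ * locker and an aborted updater are both irrelevant) and the new multi
+ * contains just {12, 13}.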
+ */ + newMembers = (MultiXactMember *) + palloc(sizeof(MultiXactMember) * (nmembers + 1)); + + for (i = 0, j = 0; i < nmembers; i++) + { + if (TransactionIdIsInProgress(members[i].xid) || + (ISUPDATE_from_mxstatus(members[i].status) && + TransactionIdDidCommit(members[i].xid))) + { + newMembers[j].xid = members[i].xid; + newMembers[j++].status = members[i].status; + } + } + + newMembers[j].xid = xid; + newMembers[j++].status = status; + newMulti = MultiXactIdCreateFromMembers(j, newMembers); + + pfree(members); + pfree(newMembers); + + debug_elog3(DEBUG2, "Expand: returning new multi %u", newMulti); + + return newMulti; +} + +/* + * MultiXactIdIsRunning + * Returns whether a MultiXactId is "running". + * + * We return true if at least one member of the given MultiXactId is still + * running. Note that a "false" result is certain not to change, + * because it is not legal to add members to an existing MultiXactId. + * + * Caller is expected to have verified that the multixact does not come from + * a pg_upgraded share-locked tuple. + */ +bool +MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly) +{ + MultiXactMember *members; + int nmembers; + int i; + + debug_elog3(DEBUG2, "IsRunning %u?", multi); + + /* + * "false" here means we assume our callers have checked that the given + * multi cannot possibly come from a pg_upgraded database. + */ + nmembers = GetMultiXactIdMembers(multi, &members, false, isLockOnly); + + if (nmembers <= 0) + { + debug_elog2(DEBUG2, "IsRunning: no members"); + return false; + } + + /* + * Checking for myself is cheap compared to looking in shared memory; + * return true if any live subtransaction of the current top-level + * transaction is a member. + * + * This is not needed for correctness, it's just a fast path. + */ + for (i = 0; i < nmembers; i++) + { + if (TransactionIdIsCurrentTransactionId(members[i].xid)) + { + debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i); + pfree(members); + return true; + } + } + + /* + * This could be made faster by having another entry point in procarray.c, + * walking the PGPROC array only once for all the members. But in most + * cases nmembers should be small enough that it doesn't much matter. + */ + for (i = 0; i < nmembers; i++) + { + if (TransactionIdIsInProgress(members[i].xid)) + { + debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running", + i, members[i].xid); + pfree(members); + return true; + } + } + + pfree(members); + + debug_elog3(DEBUG2, "IsRunning: %u is not running", multi); + + return false; +} + +/* + * MultiXactIdSetOldestMember + * Save the oldest MultiXactId this transaction could be a member of. + * + * We set the OldestMemberMXactId for a given transaction the first time it's + * going to do some operation that might require a MultiXactId (tuple lock, + * update or delete). We need to do this even if we end up using a + * TransactionId instead of a MultiXactId, because there is a chance that + * another transaction would add our XID to a MultiXactId. + * + * The value to set is the next-to-be-assigned MultiXactId, so this is meant to + * be called just before doing any such possibly-MultiXactId-able operation. + */ +void +MultiXactIdSetOldestMember(void) +{ + if (!MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])) + { + MultiXactId nextMXact; + + /* + * You might think we don't need to acquire a lock here, since + * fetching and storing of TransactionIds is probably atomic, but in + * fact we do: suppose we pick up nextMXact and then lose the CPU for + * a long time. 
Someone else could advance nextMXact, and then + * another someone else could compute an OldestVisibleMXactId that + * would be after the value we are going to store when we get control + * back. Which would be wrong. + * + * Note that a shared lock is sufficient, because it's enough to stop + * someone from advancing nextMXact; and nobody else could be trying + * to write to our OldestMember entry, only reading (and we assume + * storing it is atomic.) + */ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + + /* + * We have to beware of the possibility that nextMXact is in the + * wrapped-around state. We don't fix the counter itself here, but we + * must be sure to store a valid value in our array entry. + */ + nextMXact = MultiXactState->nextMXact; + if (nextMXact < FirstMultiXactId) + nextMXact = FirstMultiXactId; + + OldestMemberMXactId[MyBackendId] = nextMXact; + + LWLockRelease(MultiXactGenLock); + + debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u", + MyBackendId, nextMXact); + } +} + +/* + * MultiXactIdSetOldestVisible + * Save the oldest MultiXactId this transaction considers possibly live. + * + * We set the OldestVisibleMXactId for a given transaction the first time + * it's going to inspect any MultiXactId. Once we have set this, we are + * guaranteed that the checkpointer won't truncate off SLRU data for + * MultiXactIds at or after our OldestVisibleMXactId. + * + * The value to set is the oldest of nextMXact and all the valid per-backend + * OldestMemberMXactId[] entries. Because of the locking we do, we can be + * certain that no subsequent call to MultiXactIdSetOldestMember can set + * an OldestMemberMXactId[] entry older than what we compute here. Therefore + * there is no live transaction, now or later, that can be a member of any + * MultiXactId older than the OldestVisibleMXactId we compute here. + */ +static void +MultiXactIdSetOldestVisible(void) +{ + if (!MultiXactIdIsValid(OldestVisibleMXactId[MyBackendId])) + { + MultiXactId oldestMXact; + int i; + + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + + /* + * We have to beware of the possibility that nextMXact is in the + * wrapped-around state. We don't fix the counter itself here, but we + * must be sure to store a valid value in our array entry. + */ + oldestMXact = MultiXactState->nextMXact; + if (oldestMXact < FirstMultiXactId) + oldestMXact = FirstMultiXactId; + + for (i = 1; i <= MaxOldestSlot; i++) + { + MultiXactId thisoldest = OldestMemberMXactId[i]; + + if (MultiXactIdIsValid(thisoldest) && + MultiXactIdPrecedes(thisoldest, oldestMXact)) + oldestMXact = thisoldest; + } + + OldestVisibleMXactId[MyBackendId] = oldestMXact; + + LWLockRelease(MultiXactGenLock); + + debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u", + MyBackendId, oldestMXact); + } +} + +/* + * ReadNextMultiXactId + * Return the next MultiXactId to be assigned, but don't allocate it + */ +MultiXactId +ReadNextMultiXactId(void) +{ + MultiXactId mxid; + + /* XXX we could presumably do this without a lock. */ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + mxid = MultiXactState->nextMXact; + LWLockRelease(MultiXactGenLock); + + if (mxid < FirstMultiXactId) + mxid = FirstMultiXactId; + + return mxid; +} + +/* + * ReadMultiXactIdRange + * Get the range of IDs that may still be referenced by a relation. 
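+ *
+ * A hypothetical caller could use it to decide whether a multi is still
+ * safe to look up:
+ *
+ *		MultiXactId oldest, next;
+ *
+ *		ReadMultiXactIdRange(&oldest, &next);
+ *		if (MultiXactIdPrecedes(multi, oldest) ||
+ *			!MultiXactIdPrecedes(multi, next))
+ *			elog(ERROR, "multi is outside the [oldest, next) range");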
+ */ +void +ReadMultiXactIdRange(MultiXactId *oldest, MultiXactId *next) +{ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + *oldest = MultiXactState->oldestMultiXactId; + *next = MultiXactState->nextMXact; + LWLockRelease(MultiXactGenLock); + + if (*oldest < FirstMultiXactId) + *oldest = FirstMultiXactId; + if (*next < FirstMultiXactId) + *next = FirstMultiXactId; +} + + +/* + * MultiXactIdCreateFromMembers + * Make a new MultiXactId from the specified set of members + * + * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the + * given TransactionIds as members. Returns the newly created MultiXactId. + * + * NB: the passed members[] array will be sorted in-place. + */ +MultiXactId +MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members) +{ + MultiXactId multi; + MultiXactOffset offset; + xl_multixact_create xlrec; + + debug_elog3(DEBUG2, "Create: %s", + mxid_to_string(InvalidMultiXactId, nmembers, members)); + + /* + * See if the same set of members already exists in our cache; if so, just + * re-use that MultiXactId. (Note: it might seem that looking in our + * cache is insufficient, and we ought to search disk to see if a + * duplicate definition already exists. But since we only ever create + * MultiXacts containing our own XID, in most cases any such MultiXacts + * were in fact created by us, and so will be in our cache. There are + * corner cases where someone else added us to a MultiXact without our + * knowledge, but it's not worth checking for.) + */ + multi = mXactCacheGetBySet(nmembers, members); + if (MultiXactIdIsValid(multi)) + { + debug_elog2(DEBUG2, "Create: in cache!"); + return multi; + } + + /* Verify that there is a single update Xid among the given members. */ + { + int i; + bool has_update = false; + + for (i = 0; i < nmembers; i++) + { + if (ISUPDATE_from_mxstatus(members[i].status)) + { + if (has_update) + elog(ERROR, "new multixact has more than one updating member: %s", + mxid_to_string(InvalidMultiXactId, nmembers, members)); + has_update = true; + } + } + } + + /* + * Assign the MXID and offsets range to use, and make sure there is space + * in the OFFSETs and MEMBERs files. NB: this routine does + * START_CRIT_SECTION(). + * + * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check + * that we've called MultiXactIdSetOldestMember here. This is because + * this routine is used in some places to create new MultiXactIds of which + * the current backend is not a member, notably during freezing of multis + * in vacuum. During vacuum, in particular, it would be unacceptable to + * keep OldestMulti set, in case it runs for long. + */ + multi = GetNewMultiXactId(nmembers, &offset); + + /* Make an XLOG entry describing the new MXID. */ + xlrec.mid = multi; + xlrec.moff = offset; + xlrec.nmembers = nmembers; + + /* + * XXX Note: there's a lot of padding space in MultiXactMember. We could + * find a more compact representation of this Xlog record -- perhaps all + * the status flags in one XLogRecData, then all the xids in another one? + * Not clear that it's worth the trouble though. 
+ */ + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), SizeOfMultiXactCreate); + XLogRegisterData((char *) members, nmembers * sizeof(MultiXactMember)); + + (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID); + + /* Now enter the information into the OFFSETs and MEMBERs logs */ + RecordNewMultiXact(multi, offset, nmembers, members); + + /* Done with critical section */ + END_CRIT_SECTION(); + + /* Store the new MultiXactId in the local cache, too */ + mXactCachePut(multi, nmembers, members); + + debug_elog2(DEBUG2, "Create: all done"); + + return multi; +} + +/* + * RecordNewMultiXact + * Write info about a new multixact into the offsets and members files + * + * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can + * use it. + */ +static void +RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, + int nmembers, MultiXactMember *members) +{ + int pageno; + int prev_pageno; + int entryno; + int slotno; + MultiXactOffset *offptr; + int i; + + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + pageno = MultiXactIdToOffsetPage(multi); + entryno = MultiXactIdToOffsetEntry(multi); + + /* + * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction" + * to complain about if there's any I/O error. This is kinda bogus, but + * since the errors will always give the full pathname, it should be clear + * enough that a MultiXactId is really involved. Perhaps someday we'll + * take the trouble to generalize the slru.c error reporting code. + */ + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi); + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr += entryno; + + *offptr = offset; + + MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + + /* Exchange our lock */ + LWLockRelease(MultiXactOffsetSLRULock); + + LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + + prev_pageno = -1; + + for (i = 0; i < nmembers; i++, offset++) + { + TransactionId *memberptr; + uint32 *flagsptr; + uint32 flagsval; + int bshift; + int flagsoff; + int memberoff; + + Assert(members[i].status <= MultiXactStatusUpdate); + + pageno = MXOffsetToMemberPage(offset); + memberoff = MXOffsetToMemberOffset(offset); + flagsoff = MXOffsetToFlagsOffset(offset); + bshift = MXOffsetToFlagsBitShift(offset); + + if (pageno != prev_pageno) + { + slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi); + prev_pageno = pageno; + } + + memberptr = (TransactionId *) + (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); + + *memberptr = members[i].xid; + + flagsptr = (uint32 *) + (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); + + flagsval = *flagsptr; + flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); + flagsval |= (members[i].status << bshift); + *flagsptr = flagsval; + + MultiXactMemberCtl->shared->page_dirty[slotno] = true; + } + + LWLockRelease(MultiXactMemberSLRULock); +} + +/* + * GetNewMultiXactId + * Get the next MultiXactId. + * + * Also, reserve the needed amount of space in the "members" area. The + * starting offset of the reserved space is returned in *offset. + * + * This may generate XLOG records for expansion of the offsets and/or members + * files. Unfortunately, we have to do that while holding MultiXactGenLock + * to avoid race conditions --- the XLOG record for zeroing a page must appear + * before any backend can possibly try to store data in that page! + * + * We start a critical section before advancing the shared counters. 
The + * caller must end the critical section after writing SLRU data. + */ +static MultiXactId +GetNewMultiXactId(int nmembers, MultiXactOffset *offset) +{ + MultiXactId result; + MultiXactOffset nextOffset; + + debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers); + + /* safety check, we should never get this far in a HS standby */ + if (RecoveryInProgress()) + elog(ERROR, "cannot assign MultiXactIds during recovery"); + + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + + /* Handle wraparound of the nextMXact counter */ + if (MultiXactState->nextMXact < FirstMultiXactId) + MultiXactState->nextMXact = FirstMultiXactId; + + /* Assign the MXID */ + result = MultiXactState->nextMXact; + + /*---------- + * Check to see if it's safe to assign another MultiXactId. This protects + * against catastrophic data loss due to multixact wraparound. The basic + * rules are: + * + * If we're past multiVacLimit or the safe threshold for member storage + * space, or we don't know what the safe threshold for member storage is, + * start trying to force autovacuum cycles. + * If we're past multiWarnLimit, start issuing warnings. + * If we're past multiStopLimit, refuse to create new MultiXactIds. + * + * Note these are pretty much the same protections in GetNewTransactionId. + *---------- + */ + if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit)) + { + /* + * For safety's sake, we release MultiXactGenLock while sending + * signals, warnings, etc. This is not so much because we care about + * preserving concurrency in this situation, as to avoid any + * possibility of deadlock while doing get_database_name(). First, + * copy all the shared values we'll need in this path. + */ + MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit; + MultiXactId multiStopLimit = MultiXactState->multiStopLimit; + MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit; + Oid oldest_datoid = MultiXactState->oldestMultiXactDB; + + LWLockRelease(MultiXactGenLock); + + if (IsUnderPostmaster && + !MultiXactIdPrecedes(result, multiStopLimit)) + { + char *oldest_datname = get_database_name(oldest_datoid); + + /* + * Immediately kick autovacuum into action as we're already in + * ERROR territory. + */ + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + /* complain even if that DB has disappeared */ + if (oldest_datname) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database \"%s\"", + oldest_datname), + errhint("Execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + else + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database with OID %u", + oldest_datoid), + errhint("Execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + } + + /* + * To avoid swamping the postmaster with signals, we issue the autovac + * request only once per 64K multis generated. This still gives + * plenty of chances before we get into real trouble. 
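+ * + * Concretely, the modulo test below fires only when the multi being + * assigned is an exact multiple of 65536, e.g. at 65536, 131072, and so + * on; all other allocations in between skip the signal entirely.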
+ */ + if (IsUnderPostmaster && (result % 65536) == 0) + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + if (!MultiXactIdPrecedes(result, multiWarnLimit)) + { + char *oldest_datname = get_database_name(oldest_datoid); + + /* complain even if that DB has disappeared */ + if (oldest_datname) + ereport(WARNING, + (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used", + "database \"%s\" must be vacuumed before %u more MultiXactIds are used", + multiWrapLimit - result, + oldest_datname, + multiWrapLimit - result), + errhint("Execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + else + ereport(WARNING, + (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used", + "database with OID %u must be vacuumed before %u more MultiXactIds are used", + multiWrapLimit - result, + oldest_datoid, + multiWrapLimit - result), + errhint("Execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + } + + /* Re-acquire lock and start over */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + result = MultiXactState->nextMXact; + if (result < FirstMultiXactId) + result = FirstMultiXactId; + } + + /* Make sure there is room for the MXID in the file. */ + ExtendMultiXactOffset(result); + + /* + * Reserve the members space, similarly to above. Also, be careful not to + * return zero as the starting offset for any multixact. See + * GetMultiXactIdMembers() for motivation. + */ + nextOffset = MultiXactState->nextOffset; + if (nextOffset == 0) + { + *offset = 1; + nmembers++; /* allocate member slot 0 too */ + } + else + *offset = nextOffset; + + /*---------- + * Protect against overrun of the members space as well, with the + * following rules: + * + * If we're past offsetStopLimit, refuse to generate more multis. + * If we're close to offsetStopLimit, emit a warning. + * + * Arbitrarily, we start emitting warnings when we're 20 segments or less + * from offsetStopLimit. + * + * Note we haven't updated the shared state yet, so if we fail at this + * point, the multixact ID we grabbed can still be used by the next guy. + * + * Note that there is no point in forcing autovacuum runs here: the + * multixact freeze settings would have to be reduced for that to have any + * effect. 
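+ * + * As a rough worked figure (leaning on the roughly-50000-members-per-segment + * estimate quoted for the autovacuum check further down), the 20-segment + * warning window corresponds to on the order of a million member slots of + * headroom between the first warning and the hard stop.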
+ *---------- + */ +#define OFFSET_WARN_SEGMENTS 20 + if (MultiXactState->oldestOffsetKnown && + MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset, + nmembers)) + { + /* see comment in the corresponding offsets wraparound case */ + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("multixact \"members\" limit exceeded"), + errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.", + "This command would create a multixact with %u members, but the remaining space is only enough for %u members.", + MultiXactState->offsetStopLimit - nextOffset - 1, + nmembers, + MultiXactState->offsetStopLimit - nextOffset - 1), + errhint("Execute a database-wide VACUUM in database with OID %u with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings.", + MultiXactState->oldestMultiXactDB))); + } + + /* + * Check whether we should kick autovacuum into action, to prevent members + * wraparound. NB we use a much larger window to trigger autovacuum than + * just the warning limit. The warning is just a measure of last resort - + * this is in line with GetNewTransactionId's behaviour. + */ + if (!MultiXactState->oldestOffsetKnown || + (MultiXactState->nextOffset - MultiXactState->oldestOffset + > MULTIXACT_MEMBER_SAFE_THRESHOLD)) + { + /* + * To avoid swamping the postmaster with signals, we issue the autovac + * request only when crossing a segment boundary. With default + * compilation settings that's roughly after 50k members. This still + * gives plenty of chances before we get into real trouble. + */ + if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) != + (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT)) + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + } + + if (MultiXactState->oldestOffsetKnown && + MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, + nextOffset, + nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS)) + ereport(WARNING, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used", + "database with OID %u must be vacuumed before %d more multixact members are used", + MultiXactState->offsetStopLimit - nextOffset + nmembers, + MultiXactState->oldestMultiXactDB, + MultiXactState->offsetStopLimit - nextOffset + nmembers), + errhint("Execute a database-wide VACUUM in that database with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings."))); + + ExtendMultiXactMember(nextOffset, nmembers); + + /* + * Critical section from here until caller has written the data into the + * just-reserved SLRU space; we don't want to error out with a partly + * written MultiXact structure. (In particular, failing to write our + * start offset after advancing nextMXact would effectively corrupt the + * previous MultiXact.) + */ + START_CRIT_SECTION(); + + /* + * Advance counters. As in GetNewTransactionId(), this must not happen + * until after file extension has succeeded! + * + * We don't care about MultiXactId wraparound here; it will be handled by + * the next iteration. But note that nextMXact may be InvalidMultiXactId + * or the first value on a segment-beginning page after this routine + * exits, so anyone else looking at the variable must be prepared to deal + * with either case. 
Similarly, nextOffset may be zero, but we won't use + * that as the actual start offset of the next multixact. + */ + (MultiXactState->nextMXact)++; + + MultiXactState->nextOffset += nmembers; + + LWLockRelease(MultiXactGenLock); + + debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset); + return result; +} + +/* + * GetMultiXactIdMembers + * Return the set of MultiXactMembers that make up a MultiXactId + * + * Return value is the number of members found, or -1 if there are none, + * and *members is set to a newly palloc'ed array of members. It's the + * caller's responsibility to free it when done with it. + * + * from_pgupgrade must be passed as true if and only if only the multixact + * corresponds to a value from a tuple that was locked in a 9.2-or-older + * installation and later pg_upgrade'd (that is, the infomask is + * HEAP_LOCKED_UPGRADED). In this case, we know for certain that no members + * can still be running, so we return -1 just like for an empty multixact + * without any further checking. It would be wrong to try to resolve such a + * multixact: either the multixact is within the current valid multixact + * range, in which case the returned result would be bogus, or outside that + * range, in which case an error would be raised. + * + * In all other cases, the passed multixact must be within the known valid + * range, that is, greater to or equal than oldestMultiXactId, and less than + * nextMXact. Otherwise, an error is raised. + * + * onlyLock must be set to true if caller is certain that the given multi + * is used only to lock tuples; can be false without loss of correctness, + * but passing a true means we can return quickly without checking for + * old updates. + */ +int +GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, + bool from_pgupgrade, bool onlyLock) +{ + int pageno; + int prev_pageno; + int entryno; + int slotno; + MultiXactOffset *offptr; + MultiXactOffset offset; + int length; + int truelength; + int i; + MultiXactId oldestMXact; + MultiXactId nextMXact; + MultiXactId tmpMXact; + MultiXactOffset nextOffset; + MultiXactMember *ptr; + + debug_elog3(DEBUG2, "GetMembers: asked for %u", multi); + + if (!MultiXactIdIsValid(multi) || from_pgupgrade) + { + *members = NULL; + return -1; + } + + /* See if the MultiXactId is in the local cache */ + length = mXactCacheGetById(multi, members); + if (length >= 0) + { + debug_elog3(DEBUG2, "GetMembers: found %s in the cache", + mxid_to_string(multi, length, *members)); + return length; + } + + /* Set our OldestVisibleMXactId[] entry if we didn't already */ + MultiXactIdSetOldestVisible(); + + /* + * If we know the multi is used only for locking and not for updates, then + * we can skip checking if the value is older than our oldest visible + * multi. It cannot possibly still be running. + */ + if (onlyLock && + MultiXactIdPrecedes(multi, OldestVisibleMXactId[MyBackendId])) + { + debug_elog2(DEBUG2, "GetMembers: a locker-only multi is too old"); + *members = NULL; + return -1; + } + + /* + * We check known limits on MultiXact before resorting to the SLRU area. + * + * An ID older than MultiXactState->oldestMultiXactId cannot possibly be + * useful; it has already been removed, or will be removed shortly, by + * truncation. If one is passed, an error is raised. + * + * Also, an ID >= nextMXact shouldn't ever be seen here; if it is seen, it + * implies undetected ID wraparound has occurred. This raises a hard + * error. 
+ * + * Shared lock is enough here since we aren't modifying any global state. + * Acquire it just long enough to grab the current counter values. We may + * need both nextMXact and nextOffset; see below. + */ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + + oldestMXact = MultiXactState->oldestMultiXactId; + nextMXact = MultiXactState->nextMXact; + nextOffset = MultiXactState->nextOffset; + + LWLockRelease(MultiXactGenLock); + + if (MultiXactIdPrecedes(multi, oldestMXact)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("MultiXactId %u no longer exists -- apparent wraparound", + multi))); + + if (!MultiXactIdPrecedes(multi, nextMXact)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("MultiXactId %u has not been created yet -- apparent wraparound", + multi))); + + /* + * Find out the offset at which we need to start reading MultiXactMembers + * and the number of members in the multixact. We determine the latter as + * the difference between this multixact's starting offset and the next + * one's. However, there are some corner cases to worry about: + * + * 1. This multixact may be the latest one created, in which case there is + * no next one to look at. In this case the nextOffset value we just + * saved is the correct endpoint. + * + * 2. The next multixact may still be in process of being filled in: that + * is, another process may have done GetNewMultiXactId but not yet written + * the offset entry for that ID. In that scenario, it is guaranteed that + * the offset entry for that multixact exists (because GetNewMultiXactId + * won't release MultiXactGenLock until it does) but contains zero + * (because we are careful to pre-zero offset pages). Because + * GetNewMultiXactId will never return zero as the starting offset for a + * multixact, when we read zero as the next multixact's offset, we know we + * have this case. We sleep for a bit and try again. + * + * 3. Because GetNewMultiXactId increments offset zero to offset one to + * handle case #2, there is an ambiguity near the point of offset + * wraparound. If we see next multixact's offset is one, is that our + * multixact's actual endpoint, or did it end at zero with a subsequent + * increment? We handle this using the knowledge that if the zero'th + * member slot wasn't filled, it'll contain zero, and zero isn't a valid + * transaction ID so it can't be a multixact member. Therefore, if we + * read a zero from the members array, just ignore it. + * + * This is all pretty messy, but the mess occurs only in infrequent corner + * cases, so it seems better than holding the MultiXactGenLock for a long + * time on every multixact creation. + */ +retry: + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + pageno = MultiXactIdToOffsetPage(multi); + entryno = MultiXactIdToOffsetEntry(multi); + + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi); + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr += entryno; + offset = *offptr; + + Assert(offset != 0); + + /* + * Use the same increment rule as GetNewMultiXactId(), that is, don't + * handle wraparound explicitly until needed.
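+ * + * Illustrative numbers only: if this multixact's offset entry reads 500 + * and the next multixact's entry reads 503, there are 503 - 500 = 3 + * members to fetch; if the next entry still reads zero, we are in corner + * case 2 above and simply retry.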
+ */ + tmpMXact = multi + 1; + + if (nextMXact == tmpMXact) + { + /* Corner case 1: there is no next multixact */ + length = nextOffset - offset; + } + else + { + MultiXactOffset nextMXOffset; + + /* handle wraparound if needed */ + if (tmpMXact < FirstMultiXactId) + tmpMXact = FirstMultiXactId; + + prev_pageno = pageno; + + pageno = MultiXactIdToOffsetPage(tmpMXact); + entryno = MultiXactIdToOffsetEntry(tmpMXact); + + if (pageno != prev_pageno) + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact); + + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr += entryno; + nextMXOffset = *offptr; + + if (nextMXOffset == 0) + { + /* Corner case 2: next multixact is still being filled in */ + LWLockRelease(MultiXactOffsetSLRULock); + CHECK_FOR_INTERRUPTS(); + pg_usleep(1000L); + goto retry; + } + + length = nextMXOffset - offset; + } + + LWLockRelease(MultiXactOffsetSLRULock); + + ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); + + /* Now get the members themselves. */ + LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + + truelength = 0; + prev_pageno = -1; + for (i = 0; i < length; i++, offset++) + { + TransactionId *xactptr; + uint32 *flagsptr; + int flagsoff; + int bshift; + int memberoff; + + pageno = MXOffsetToMemberPage(offset); + memberoff = MXOffsetToMemberOffset(offset); + + if (pageno != prev_pageno) + { + slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi); + prev_pageno = pageno; + } + + xactptr = (TransactionId *) + (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); + + if (!TransactionIdIsValid(*xactptr)) + { + /* Corner case 3: we must be looking at unused slot zero */ + Assert(offset == 0); + continue; + } + + flagsoff = MXOffsetToFlagsOffset(offset); + bshift = MXOffsetToFlagsBitShift(offset); + flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); + + ptr[truelength].xid = *xactptr; + ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; + truelength++; + } + + LWLockRelease(MultiXactMemberSLRULock); + + /* A multixid with zero members should not happen */ + Assert(truelength > 0); + + /* + * Copy the result into the local cache. + */ + mXactCachePut(multi, truelength, ptr); + + debug_elog3(DEBUG2, "GetMembers: no cache for %s", + mxid_to_string(multi, truelength, ptr)); + *members = ptr; + return truelength; +} + +/* + * mxactMemberComparator + * qsort comparison function for MultiXactMember + * + * We can't use wraparound comparison for XIDs because that does not respect + * the triangle inequality! Any old sort order will do. + */ +static int +mxactMemberComparator(const void *arg1, const void *arg2) +{ + MultiXactMember member1 = *(const MultiXactMember *) arg1; + MultiXactMember member2 = *(const MultiXactMember *) arg2; + + if (member1.xid > member2.xid) + return 1; + if (member1.xid < member2.xid) + return -1; + if (member1.status > member2.status) + return 1; + if (member1.status < member2.status) + return -1; + return 0; +} + +/* + * mXactCacheGetBySet + * returns a MultiXactId from the cache based on the set of + * TransactionIds that compose it, or InvalidMultiXactId if + * none matches. + * + * This is helpful, for example, if two transactions want to lock a huge + * table. By using the cache, the second will use the same MultiXactId + * for the majority of tuples, thus keeping MultiXactId usage low (saving + * both I/O and wraparound issues). + * + * NB: the passed members array will be sorted in-place. 
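+ * + * Illustrative example (transaction ids invented): if transactions 1001 + * and 1002 both share-lock many rows of one table, the first row they both + * touch creates a multi with member set {1001, 1002}; for every further row + * the same sorted set is found here and the existing MultiXactId is reused + * instead of allocating a new one.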
+ */ +static MultiXactId +mXactCacheGetBySet(int nmembers, MultiXactMember *members) +{ + dlist_iter iter; + + debug_elog3(DEBUG2, "CacheGet: looking for %s", + mxid_to_string(InvalidMultiXactId, nmembers, members)); + + /* sort the array so comparison is easy */ + qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator); + + dlist_foreach(iter, &MXactCache) + { + mXactCacheEnt *entry = dlist_container(mXactCacheEnt, node, iter.cur); + + if (entry->nmembers != nmembers) + continue; + + /* + * We assume the cache entries are sorted, and that the unused bits in + * "status" are zeroed. + */ + if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0) + { + debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi); + dlist_move_head(&MXactCache, iter.cur); + return entry->multi; + } + } + + debug_elog2(DEBUG2, "CacheGet: not found :-("); + return InvalidMultiXactId; +} + +/* + * mXactCacheGetById + * returns the composing MultiXactMember set from the cache for a + * given MultiXactId, if present. + * + * If successful, *xids is set to the address of a palloc'd copy of the + * MultiXactMember set. Return value is number of members, or -1 on failure. + */ +static int +mXactCacheGetById(MultiXactId multi, MultiXactMember **members) +{ + dlist_iter iter; + + debug_elog3(DEBUG2, "CacheGet: looking for %u", multi); + + dlist_foreach(iter, &MXactCache) + { + mXactCacheEnt *entry = dlist_container(mXactCacheEnt, node, iter.cur); + + if (entry->multi == multi) + { + MultiXactMember *ptr; + Size size; + + size = sizeof(MultiXactMember) * entry->nmembers; + ptr = (MultiXactMember *) palloc(size); + + memcpy(ptr, entry->members, size); + + debug_elog3(DEBUG2, "CacheGet: found %s", + mxid_to_string(multi, + entry->nmembers, + entry->members)); + + /* + * Note we modify the list while not using a modifiable iterator. + * This is acceptable only because we exit the iteration + * immediately afterwards. + */ + dlist_move_head(&MXactCache, iter.cur); + + *members = ptr; + return entry->nmembers; + } + } + + debug_elog2(DEBUG2, "CacheGet: not found"); + return -1; +} + +/* + * mXactCachePut + * Add a new MultiXactId and its composing set into the local cache. 
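+ * + * The cache is kept in most-recently-used order: the new entry is pushed at + * the head of MXactCache, and once more than MAX_CACHE_ENTRIES entries exist + * the tail (least recently used) entry is pruned, as done at the end of this + * function.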
+ */ +static void +mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members) +{ + mXactCacheEnt *entry; + + debug_elog3(DEBUG2, "CachePut: storing %s", + mxid_to_string(multi, nmembers, members)); + + if (MXactContext == NULL) + { + /* The cache only lives as long as the current transaction */ + debug_elog2(DEBUG2, "CachePut: initializing memory context"); + MXactContext = AllocSetContextCreate(TopTransactionContext, + "MultiXact cache context", + ALLOCSET_SMALL_SIZES); + } + + entry = (mXactCacheEnt *) + MemoryContextAlloc(MXactContext, + offsetof(mXactCacheEnt, members) + + nmembers * sizeof(MultiXactMember)); + + entry->multi = multi; + entry->nmembers = nmembers; + memcpy(entry->members, members, nmembers * sizeof(MultiXactMember)); + + /* mXactCacheGetBySet assumes the entries are sorted, so sort them */ + qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator); + + dlist_push_head(&MXactCache, &entry->node); + if (MXactCacheMembers++ >= MAX_CACHE_ENTRIES) + { + dlist_node *node; + mXactCacheEnt *entry; + + node = dlist_tail_node(&MXactCache); + dlist_delete(node); + MXactCacheMembers--; + + entry = dlist_container(mXactCacheEnt, node, node); + debug_elog3(DEBUG2, "CachePut: pruning cached multi %u", + entry->multi); + + pfree(entry); + } +} + +static char * +mxstatus_to_string(MultiXactStatus status) +{ + switch (status) + { + case MultiXactStatusForKeyShare: + return "keysh"; + case MultiXactStatusForShare: + return "sh"; + case MultiXactStatusForNoKeyUpdate: + return "fornokeyupd"; + case MultiXactStatusForUpdate: + return "forupd"; + case MultiXactStatusNoKeyUpdate: + return "nokeyupd"; + case MultiXactStatusUpdate: + return "upd"; + default: + elog(ERROR, "unrecognized multixact status %d", status); + return ""; + } +} + +char * +mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members) +{ + static char *str = NULL; + StringInfoData buf; + int i; + + if (str != NULL) + pfree(str); + + initStringInfo(&buf); + + appendStringInfo(&buf, "%u %d[%u (%s)", multi, nmembers, members[0].xid, + mxstatus_to_string(members[0].status)); + + for (i = 1; i < nmembers; i++) + appendStringInfo(&buf, ", %u (%s)", members[i].xid, + mxstatus_to_string(members[i].status)); + + appendStringInfoChar(&buf, ']'); + str = MemoryContextStrdup(TopMemoryContext, buf.data); + pfree(buf.data); + return str; +} + +/* + * AtEOXact_MultiXact + * Handle transaction end for MultiXact + * + * This is called at top transaction commit or abort (we don't care which). + */ +void +AtEOXact_MultiXact(void) +{ + /* + * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of + * which should only be valid while within a transaction. + * + * We assume that storing a MultiXactId is atomic and so we need not take + * MultiXactGenLock to do this. + */ + OldestMemberMXactId[MyBackendId] = InvalidMultiXactId; + OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId; + + /* + * Discard the local MultiXactId cache. Since MXactContext was created as + * a child of TopTransactionContext, we needn't delete it explicitly. + */ + MXactContext = NULL; + dlist_init(&MXactCache); + MXactCacheMembers = 0; +} + +/* + * AtPrepare_MultiXact + * Save multixact state at 2PC transaction prepare + * + * In this phase, we only store our OldestMemberMXactId value in the two-phase + * state file. 
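+ * + * Sketch of what is written below: a TWOPHASE_RM_MULTIXACT_ID record with + * info 0 whose payload is the single MultiXactId, sizeof(MultiXactId) bytes; + * nothing is written at all if this backend never set an oldest member + * value.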
+ */ +void +AtPrepare_MultiXact(void) +{ + MultiXactId myOldestMember = OldestMemberMXactId[MyBackendId]; + + if (MultiXactIdIsValid(myOldestMember)) + RegisterTwoPhaseRecord(TWOPHASE_RM_MULTIXACT_ID, 0, + &myOldestMember, sizeof(MultiXactId)); +} + +/* + * PostPrepare_MultiXact + * Clean up after successful PREPARE TRANSACTION + */ +void +PostPrepare_MultiXact(TransactionId xid) +{ + MultiXactId myOldestMember; + + /* + * Transfer our OldestMemberMXactId value to the slot reserved for the + * prepared transaction. + */ + myOldestMember = OldestMemberMXactId[MyBackendId]; + if (MultiXactIdIsValid(myOldestMember)) + { + BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, false); + + /* + * Even though storing MultiXactId is atomic, acquire lock to make + * sure others see both changes, not just the reset of the slot of the + * current backend. Using a volatile pointer might suffice, but this + * isn't a hot spot. + */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + + OldestMemberMXactId[dummyBackendId] = myOldestMember; + OldestMemberMXactId[MyBackendId] = InvalidMultiXactId; + + LWLockRelease(MultiXactGenLock); + } + + /* + * We don't need to transfer OldestVisibleMXactId value, because the + * transaction is not going to be looking at any more multixacts once it's + * prepared. + * + * We assume that storing a MultiXactId is atomic and so we need not take + * MultiXactGenLock to do this. + */ + OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId; + + /* + * Discard the local MultiXactId cache like in AtEOXact_MultiXact. + */ + MXactContext = NULL; + dlist_init(&MXactCache); + MXactCacheMembers = 0; +} + +/* + * multixact_twophase_recover + * Recover the state of a prepared transaction at startup + */ +void +multixact_twophase_recover(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, false); + MultiXactId oldestMember; + + /* + * Get the oldest member XID from the state file record, and set it in the + * OldestMemberMXactId slot reserved for this prepared transaction. + */ + Assert(len == sizeof(MultiXactId)); + oldestMember = *((MultiXactId *) recdata); + + OldestMemberMXactId[dummyBackendId] = oldestMember; +} + +/* + * multixact_twophase_postcommit + * Similar to AtEOXact_MultiXact but for COMMIT PREPARED + */ +void +multixact_twophase_postcommit(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, true); + + Assert(len == sizeof(MultiXactId)); + + OldestMemberMXactId[dummyBackendId] = InvalidMultiXactId; +} + +/* + * multixact_twophase_postabort + * This is actually just the same as the COMMIT case. + */ +void +multixact_twophase_postabort(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + multixact_twophase_postcommit(xid, info, recdata, len); +} + +/* + * Initialization of shared memory for MultiXact. We use two SLRU areas, + * thus double memory. Also, reserve space for the shared MultiXactState + * struct and the per-backend MultiXactId arrays (two of those, too). 
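+ * + * A rough sketch of the total, matching SHARED_MULTIXACT_STATE_SIZE and the + * calls below: + * + * offsetof(MultiXactStateData, perBackendXactIds) + sizeof(MultiXactId) + * + 2 * MaxOldestSlot * sizeof(MultiXactId) + * + SimpleLruShmemSize(NUM_MULTIXACTOFFSET_BUFFERS, 0) + * + SimpleLruShmemSize(NUM_MULTIXACTMEMBER_BUFFERS, 0)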
+ */ +Size +MultiXactShmemSize(void) +{ + Size size; + + /* We need 2*MaxOldestSlot + 1 perBackendXactIds[] entries */ +#define SHARED_MULTIXACT_STATE_SIZE \ + add_size(offsetof(MultiXactStateData, perBackendXactIds) + sizeof(MultiXactId), \ + mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot)) + + size = SHARED_MULTIXACT_STATE_SIZE; + size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTOFFSET_BUFFERS, 0)); + size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTMEMBER_BUFFERS, 0)); + + return size; +} + +void +MultiXactShmemInit(void) +{ + bool found; + + debug_elog2(DEBUG2, "Shared Memory Init for MultiXact"); + + MultiXactOffsetCtl->PagePrecedes = MultiXactOffsetPagePrecedes; + MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes; + + SimpleLruInit(MultiXactOffsetCtl, + "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0, + MultiXactOffsetSLRULock, "pg_multixact/offsets", + LWTRANCHE_MULTIXACTOFFSET_BUFFER, + SYNC_HANDLER_MULTIXACT_OFFSET); + SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE); + SimpleLruInit(MultiXactMemberCtl, + "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0, + MultiXactMemberSLRULock, "pg_multixact/members", + LWTRANCHE_MULTIXACTMEMBER_BUFFER, + SYNC_HANDLER_MULTIXACT_MEMBER); + /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */ + + /* Initialize our shared state struct */ + MultiXactState = ShmemInitStruct("Shared MultiXact State", + SHARED_MULTIXACT_STATE_SIZE, + &found); + if (!IsUnderPostmaster) + { + Assert(!found); + + /* Make sure we zero out the per-backend state */ + MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE); + } + else + Assert(found); + + /* + * Set up array pointers. Note that perBackendXactIds[0] is wasted space + * since we only use indexes 1..MaxOldestSlot in each array. + */ + OldestMemberMXactId = MultiXactState->perBackendXactIds; + OldestVisibleMXactId = OldestMemberMXactId + MaxOldestSlot; +} + +/* + * This func must be called ONCE on system install. It creates the initial + * MultiXact segments. (The MultiXacts directories are assumed to have been + * created by initdb, and MultiXactShmemInit must have been called already.) + */ +void +BootStrapMultiXact(void) +{ + int slotno; + + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + /* Create and zero the first page of the offsets log */ + slotno = ZeroMultiXactOffsetPage(0, false); + + /* Make sure it's written out */ + SimpleLruWritePage(MultiXactOffsetCtl, slotno); + Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); + + LWLockRelease(MultiXactOffsetSLRULock); + + LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + + /* Create and zero the first page of the members log */ + slotno = ZeroMultiXactMemberPage(0, false); + + /* Make sure it's written out */ + SimpleLruWritePage(MultiXactMemberCtl, slotno); + Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); + + LWLockRelease(MultiXactMemberSLRULock); +} + +/* + * Initialize (or reinitialize) a page of MultiXactOffset to zeroes. + * If writeXlog is true, also emit an XLOG record saying we did this. + * + * The page is not actually written, just set up in shared memory. + * The slot number of the new page is returned. + * + * Control lock must be held at entry, and will be held at exit. 
+ */ +static int +ZeroMultiXactOffsetPage(int pageno, bool writeXlog) +{ + int slotno; + + slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); + + if (writeXlog) + WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE); + + return slotno; +} + +/* + * Ditto, for MultiXactMember + */ +static int +ZeroMultiXactMemberPage(int pageno, bool writeXlog) +{ + int slotno; + + slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno); + + if (writeXlog) + WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE); + + return slotno; +} + +/* + * MaybeExtendOffsetSlru + * Extend the offsets SLRU area, if necessary + * + * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might + * contain files that are shorter than necessary; this would occur if the old + * installation had used multixacts beyond the first page (files cannot be + * copied, because the on-disk representation is different). pg_upgrade would + * update pg_control to set the next offset value to be at that position, so + * that tuples marked as locked by such MultiXacts would be seen as visible + * without having to consult multixact. However, trying to create and use a + * new MultiXactId would result in an error because the page on which the new + * value would reside does not exist. This routine is in charge of creating + * such pages. + */ +static void +MaybeExtendOffsetSlru(void) +{ + int pageno; + + pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact); + + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) + { + int slotno; + + /* + * Fortunately for us, SimpleLruWritePage is already prepared to deal + * with creating a new segment file even if the page we're writing is + * not the first in it, so this is enough. + */ + slotno = ZeroMultiXactOffsetPage(pageno, false); + SimpleLruWritePage(MultiXactOffsetCtl, slotno); + } + + LWLockRelease(MultiXactOffsetSLRULock); +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup. + * + * StartupXLOG has already established nextMXact/nextOffset by calling + * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti + * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet + * replayed WAL. + */ +void +StartupMultiXact(void) +{ + MultiXactId multi = MultiXactState->nextMXact; + MultiXactOffset offset = MultiXactState->nextOffset; + int pageno; + + /* + * Initialize offset's idea of the latest page number. + */ + pageno = MultiXactIdToOffsetPage(multi); + MultiXactOffsetCtl->shared->latest_page_number = pageno; + + /* + * Initialize member's idea of the latest page number. + */ + pageno = MXOffsetToMemberPage(offset); + MultiXactMemberCtl->shared->latest_page_number = pageno; +} + +/* + * This must be called ONCE at the end of startup/recovery. + */ +void +TrimMultiXact(void) +{ + MultiXactId nextMXact; + MultiXactOffset offset; + MultiXactId oldestMXact; + Oid oldestMXactDB; + int pageno; + int entryno; + int flagsoff; + + LWLockAcquire(MultiXactGenLock, LW_SHARED); + nextMXact = MultiXactState->nextMXact; + offset = MultiXactState->nextOffset; + oldestMXact = MultiXactState->oldestMultiXactId; + oldestMXactDB = MultiXactState->oldestMultiXactDB; + LWLockRelease(MultiXactGenLock); + + /* Clean up offsets state */ + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + /* + * (Re-)Initialize our idea of the latest page number for offsets. 
+ */ + pageno = MultiXactIdToOffsetPage(nextMXact); + MultiXactOffsetCtl->shared->latest_page_number = pageno; + + /* + * Zero out the remainder of the current offsets page. See notes in + * TrimCLOG() for background. Unlike CLOG, some WAL record covers every + * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL + * rule "write xlog before data," nextMXact successors may carry obsolete, + * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers() + * operates normally. + */ + entryno = MultiXactIdToOffsetEntry(nextMXact); + if (entryno != 0) + { + int slotno; + MultiXactOffset *offptr; + + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact); + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr += entryno; + + MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset))); + + MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + } + + LWLockRelease(MultiXactOffsetSLRULock); + + /* And the same for members */ + LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + + /* + * (Re-)Initialize our idea of the latest page number for members. + */ + pageno = MXOffsetToMemberPage(offset); + MultiXactMemberCtl->shared->latest_page_number = pageno; + + /* + * Zero out the remainder of the current members page. See notes in + * TrimCLOG() for motivation. + */ + flagsoff = MXOffsetToFlagsOffset(offset); + if (flagsoff != 0) + { + int slotno; + TransactionId *xidptr; + int memberoff; + + memberoff = MXOffsetToMemberOffset(offset); + slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset); + xidptr = (TransactionId *) + (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); + + MemSet(xidptr, 0, BLCKSZ - memberoff); + + /* + * Note: we don't need to zero out the flag bits in the remaining + * members of the current group, because they are always reset before + * writing. + */ + + MultiXactMemberCtl->shared->page_dirty[slotno] = true; + } + + LWLockRelease(MultiXactMemberSLRULock); + + /* signal that we're officially up */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->finishedStartup = true; + LWLockRelease(MultiXactGenLock); + + /* Now compute how far away the next members wraparound is. */ + SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true); +} + +/* + * Get the MultiXact data to save in a checkpoint record + */ +void +MultiXactGetCheckptMulti(bool is_shutdown, + MultiXactId *nextMulti, + MultiXactOffset *nextMultiOffset, + MultiXactId *oldestMulti, + Oid *oldestMultiDB) +{ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + *nextMulti = MultiXactState->nextMXact; + *nextMultiOffset = MultiXactState->nextOffset; + *oldestMulti = MultiXactState->oldestMultiXactId; + *oldestMultiDB = MultiXactState->oldestMultiXactDB; + LWLockRelease(MultiXactGenLock); + + debug_elog6(DEBUG2, + "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u", + *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB); +} + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + */ +void +CheckPointMultiXact(void) +{ + TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true); + + /* + * Write dirty MultiXact pages to disk. This may result in sync requests + * queued for later handling by ProcessSyncRequests(), as part of the + * checkpoint. 
+ */ + SimpleLruWriteAll(MultiXactOffsetCtl, true); + SimpleLruWriteAll(MultiXactMemberCtl, true); + + TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true); +} + +/* + * Set the next-to-be-assigned MultiXactId and offset + * + * This is used when we can determine the correct next ID/offset exactly + * from a checkpoint record. Although this is only called during bootstrap + * and XLog replay, we take the lock in case any hot-standby backends are + * examining the values. + */ +void +MultiXactSetNextMXact(MultiXactId nextMulti, + MultiXactOffset nextMultiOffset) +{ + debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u", + nextMulti, nextMultiOffset); + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->nextMXact = nextMulti; + MultiXactState->nextOffset = nextMultiOffset; + LWLockRelease(MultiXactGenLock); + + /* + * During a binary upgrade, make sure that the offsets SLRU is large + * enough to contain the next value that would be created. + * + * We need to do this pretty early during the first startup in binary + * upgrade mode: before StartupMultiXact() in fact, because this routine + * is called even before that by StartupXLOG(). And we can't do it + * earlier than at this point, because during that first call of this + * routine we determine the MultiXactState->nextMXact value that + * MaybeExtendOffsetSlru needs. + */ + if (IsBinaryUpgrade) + MaybeExtendOffsetSlru(); +} + +/* + * Determine the last safe MultiXactId to allocate given the currently oldest + * datminmxid (ie, the oldest MultiXactId that might exist in any database + * of our cluster), and the OID of the (or a) database with that value. + * + * is_startup is true when we are just starting the cluster, false when we + * are updating state in a running cluster. This only affects log messages. + */ +void +SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, + bool is_startup) +{ + MultiXactId multiVacLimit; + MultiXactId multiWarnLimit; + MultiXactId multiStopLimit; + MultiXactId multiWrapLimit; + MultiXactId curMulti; + bool needs_offset_vacuum; + + Assert(MultiXactIdIsValid(oldest_datminmxid)); + + /* + * We pretend that a wrap will happen halfway through the multixact ID + * space, but that's not really true, because multixacts wrap differently + * from transaction IDs. Note that, separately from any concern about + * multixact IDs wrapping, we must ensure that multixact members do not + * wrap. Limits for that are set in SetOffsetVacuumLimit, not here. + */ + multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1); + if (multiWrapLimit < FirstMultiXactId) + multiWrapLimit += FirstMultiXactId; + + /* + * We'll refuse to continue assigning MultiXactIds once we get within 3M + * multi of data loss. See SetTransactionIdLimit. + */ + multiStopLimit = multiWrapLimit - 3000000; + if (multiStopLimit < FirstMultiXactId) + multiStopLimit -= FirstMultiXactId; + + /* + * We'll start complaining loudly when we get within 40M multis of data + * loss. This is kind of arbitrary, but if you let your gas gauge get + * down to 2% of full, would you be looking for the next gas station? We + * need to be fairly liberal about this number because there are lots of + * scenarios where most transactions are done by automatic clients that + * won't pay attention to warnings. (No, we're not gonna make this + * configurable. If you know enough to configure it, you know enough to + * not get in this kind of trouble in the first place.) 
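+ * + * Illustrative arithmetic, assuming oldest_datminmxid = 1000 and the default + * autovacuum_multixact_freeze_max_age of 400 million: multiWrapLimit lands + * near 2.15 billion, multiStopLimit 3 million below that, multiWarnLimit 40 + * million below that, while multiVacLimit (set below) is only about 400 + * million, so autovacuum pressure begins long before any warnings or errors + * appear.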
+ */ + multiWarnLimit = multiWrapLimit - 40000000; + if (multiWarnLimit < FirstMultiXactId) + multiWarnLimit -= FirstMultiXactId; + + /* + * We'll start trying to force autovacuums when oldest_datminmxid gets to + * be more than autovacuum_multixact_freeze_max_age mxids old. + * + * Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter + * so that we don't have to worry about dealing with on-the-fly changes in + * its value. See SetTransactionIdLimit. + */ + multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age; + if (multiVacLimit < FirstMultiXactId) + multiVacLimit += FirstMultiXactId; + + /* Grab lock for just long enough to set the new limit values */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->oldestMultiXactId = oldest_datminmxid; + MultiXactState->oldestMultiXactDB = oldest_datoid; + MultiXactState->multiVacLimit = multiVacLimit; + MultiXactState->multiWarnLimit = multiWarnLimit; + MultiXactState->multiStopLimit = multiStopLimit; + MultiXactState->multiWrapLimit = multiWrapLimit; + curMulti = MultiXactState->nextMXact; + LWLockRelease(MultiXactGenLock); + + /* Log the info */ + ereport(DEBUG1, + (errmsg_internal("MultiXactId wrap limit is %u, limited by database with OID %u", + multiWrapLimit, oldest_datoid))); + + /* + * Computing the actual limits is only possible once the data directory is + * in a consistent state. There's no need to compute the limits while + * still replaying WAL - no decisions about new multis are made even + * though multixact creations might be replayed. So we'll only do further + * checks after TrimMultiXact() has been called. + */ + if (!MultiXactState->finishedStartup) + return; + + Assert(!InRecovery); + + /* Set limits for offset vacuum. */ + needs_offset_vacuum = SetOffsetVacuumLimit(is_startup); + + /* + * If past the autovacuum force point, immediately signal an autovac + * request. The reason for this is that autovac only processes one + * database per invocation. Once it's finished cleaning up the oldest + * database, it'll call here, and we'll signal the postmaster to start + * another iteration immediately if there are still any old databases. + */ + if ((MultiXactIdPrecedes(multiVacLimit, curMulti) || + needs_offset_vacuum) && IsUnderPostmaster) + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + /* Give an immediate warning if past the wrap warn point */ + if (MultiXactIdPrecedes(multiWarnLimit, curMulti)) + { + char *oldest_datname; + + /* + * We can be called when not inside a transaction, for example during + * StartupXLOG(). In such a case we cannot do database access, so we + * must just report the oldest DB's OID. + * + * Note: it's also possible that get_database_name fails and returns + * NULL, for example because the database just got dropped. We'll + * still warn, even though the warning might now be unnecessary. 
+ */ + if (IsTransactionState()) + oldest_datname = get_database_name(oldest_datoid); + else + oldest_datname = NULL; + + if (oldest_datname) + ereport(WARNING, + (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used", + "database \"%s\" must be vacuumed before %u more MultiXactIds are used", + multiWrapLimit - curMulti, + oldest_datname, + multiWrapLimit - curMulti), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + else + ereport(WARNING, + (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used", + "database with OID %u must be vacuumed before %u more MultiXactIds are used", + multiWrapLimit - curMulti, + oldest_datoid, + multiWrapLimit - curMulti), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + } +} + +/* + * Ensure the next-to-be-assigned MultiXactId is at least minMulti, + * and similarly nextOffset is at least minMultiOffset. + * + * This is used when we can determine minimum safe values from an XLog + * record (either an on-line checkpoint or an mxact creation log entry). + * Although this is only called during XLog replay, we take the lock in case + * any hot-standby backends are examining the values. + */ +void +MultiXactAdvanceNextMXact(MultiXactId minMulti, + MultiXactOffset minMultiOffset) +{ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + if (MultiXactIdPrecedes(MultiXactState->nextMXact, minMulti)) + { + debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti); + MultiXactState->nextMXact = minMulti; + } + if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset)) + { + debug_elog3(DEBUG2, "MultiXact: setting next offset to %u", + minMultiOffset); + MultiXactState->nextOffset = minMultiOffset; + } + LWLockRelease(MultiXactGenLock); +} + +/* + * Update our oldestMultiXactId value, but only if it's more recent than what + * we had. + * + * This may only be called during WAL replay. + */ +void +MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB) +{ + Assert(InRecovery); + + if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti)) + SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false); +} + +/* + * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId. + * + * NB: this is called while holding MultiXactGenLock. We want it to be very + * fast most of the time; even when it's not so fast, no actual I/O need + * happen unless we're forced to write out a dirty log or xlog page to make + * room in shared memory. + */ +static void +ExtendMultiXactOffset(MultiXactId multi) +{ + int pageno; + + /* + * No work except at first MultiXactId of a page. But beware: just after + * wraparound, the first MultiXactId of page zero is FirstMultiXactId. + */ + if (MultiXactIdToOffsetEntry(multi) != 0 && + multi != FirstMultiXactId) + return; + + pageno = MultiXactIdToOffsetPage(multi); + + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + /* Zero the page and make an XLOG entry about it */ + ZeroMultiXactOffsetPage(pageno, true); + + LWLockRelease(MultiXactOffsetSLRULock); +} + +/* + * Make sure that MultiXactMember has room for the members of a newly- + * allocated MultiXactId. 
+ * + * Like the above routine, this is called while holding MultiXactGenLock; + * same comments apply. + */ +static void +ExtendMultiXactMember(MultiXactOffset offset, int nmembers) +{ + /* + * It's possible that the members span more than one page of the members + * file, so we loop to ensure we consider each page. The coding is not + * optimal if the members span several pages, but that seems unusual + * enough to not worry much about. + */ + while (nmembers > 0) + { + int flagsoff; + int flagsbit; + uint32 difference; + + /* + * Only zero when at first entry of a page. + */ + flagsoff = MXOffsetToFlagsOffset(offset); + flagsbit = MXOffsetToFlagsBitShift(offset); + if (flagsoff == 0 && flagsbit == 0) + { + int pageno; + + pageno = MXOffsetToMemberPage(offset); + + LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + + /* Zero the page and make an XLOG entry about it */ + ZeroMultiXactMemberPage(pageno, true); + + LWLockRelease(MultiXactMemberSLRULock); + } + + /* + * Compute the number of items till end of current page. Careful: if + * addition of unsigned ints wraps around, we're at the last page of + * the last segment; since that page holds a different number of items + * than other pages, we need to do it differently. + */ + if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset) + { + /* + * This is the last page of the last segment; we can compute the + * number of items left to allocate in it without modulo + * arithmetic. + */ + difference = MaxMultiXactOffset - offset + 1; + } + else + difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; + + /* + * Advance to next page, taking care to properly handle the wraparound + * case. OK if nmembers goes negative. + */ + nmembers -= difference; + offset += difference; + } +} + +/* + * GetOldestMultiXactId + * + * Return the oldest MultiXactId that's still possibly still seen as live by + * any running transaction. Older ones might still exist on disk, but they no + * longer have any running member transaction. + * + * It's not safe to truncate MultiXact SLRU segments on the value returned by + * this function; however, it can be used by a full-table vacuum to set the + * point at which it will be possible to truncate SLRU for that table. + */ +MultiXactId +GetOldestMultiXactId(void) +{ + MultiXactId oldestMXact; + MultiXactId nextMXact; + int i; + + /* + * This is the oldest valid value among all the OldestMemberMXactId[] and + * OldestVisibleMXactId[] entries, or nextMXact if none are valid. + */ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + + /* + * We have to beware of the possibility that nextMXact is in the + * wrapped-around state. We don't fix the counter itself here, but we + * must be sure to use a valid value in our calculation. + */ + nextMXact = MultiXactState->nextMXact; + if (nextMXact < FirstMultiXactId) + nextMXact = FirstMultiXactId; + + oldestMXact = nextMXact; + for (i = 1; i <= MaxOldestSlot; i++) + { + MultiXactId thisoldest; + + thisoldest = OldestMemberMXactId[i]; + if (MultiXactIdIsValid(thisoldest) && + MultiXactIdPrecedes(thisoldest, oldestMXact)) + oldestMXact = thisoldest; + thisoldest = OldestVisibleMXactId[i]; + if (MultiXactIdIsValid(thisoldest) && + MultiXactIdPrecedes(thisoldest, oldestMXact)) + oldestMXact = thisoldest; + } + + LWLockRelease(MultiXactGenLock); + + return oldestMXact; +} + +/* + * Determine how aggressively we need to vacuum in order to prevent member + * wraparound. 
+ * + * To do so determine what's the oldest member offset and install the limit + * info in MultiXactState, where it can be used to prevent overrun of old data + * in the members SLRU area. + * + * The return value is true if emergency autovacuum is required and false + * otherwise. + */ +static bool +SetOffsetVacuumLimit(bool is_startup) +{ + MultiXactId oldestMultiXactId; + MultiXactId nextMXact; + MultiXactOffset oldestOffset = 0; /* placate compiler */ + MultiXactOffset prevOldestOffset; + MultiXactOffset nextOffset; + bool oldestOffsetKnown = false; + bool prevOldestOffsetKnown; + MultiXactOffset offsetStopLimit = 0; + MultiXactOffset prevOffsetStopLimit; + + /* + * NB: Have to prevent concurrent truncation, we might otherwise try to + * lookup an oldestMulti that's concurrently getting truncated away. + */ + LWLockAcquire(MultiXactTruncationLock, LW_SHARED); + + /* Read relevant fields from shared memory. */ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + oldestMultiXactId = MultiXactState->oldestMultiXactId; + nextMXact = MultiXactState->nextMXact; + nextOffset = MultiXactState->nextOffset; + prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown; + prevOldestOffset = MultiXactState->oldestOffset; + prevOffsetStopLimit = MultiXactState->offsetStopLimit; + Assert(MultiXactState->finishedStartup); + LWLockRelease(MultiXactGenLock); + + /* + * Determine the offset of the oldest multixact. Normally, we can read + * the offset from the multixact itself, but there's an important special + * case: if there are no multixacts in existence at all, oldestMXact + * obviously can't point to one. It will instead point to the multixact + * ID that will be assigned the next time one is needed. + */ + if (oldestMultiXactId == nextMXact) + { + /* + * When the next multixact gets created, it will be stored at the next + * offset. + */ + oldestOffset = nextOffset; + oldestOffsetKnown = true; + } + else + { + /* + * Figure out where the oldest existing multixact's offsets are + * stored. Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X, + * the supposedly-earliest multixact might not really exist. We are + * careful not to fail in that case. + */ + oldestOffsetKnown = + find_multixact_start(oldestMultiXactId, &oldestOffset); + + if (oldestOffsetKnown) + ereport(DEBUG1, + (errmsg_internal("oldest MultiXactId member is at offset %u", + oldestOffset))); + else + ereport(LOG, + (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk", + oldestMultiXactId))); + } + + LWLockRelease(MultiXactTruncationLock); + + /* + * If we can, compute limits (and install them MultiXactState) to prevent + * overrun of old data in the members SLRU area. We can only do so if the + * oldest offset is known though. 
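+ * + * In rough terms, matching the code below, when the oldest offset is known: + * + * offsetStopLimit = (oldestOffset rounded down to a whole SLRU segment) + * - one full segment of members + * + * so new member space may be consumed up to, but never into, the segment + * immediately preceding the one that holds the oldest live member data.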
+ */ + if (oldestOffsetKnown) + { + /* move back to start of the corresponding segment */ + offsetStopLimit = oldestOffset - (oldestOffset % + (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT)); + + /* always leave one segment before the wraparound point */ + offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT); + + if (!prevOldestOffsetKnown && !is_startup) + ereport(LOG, + (errmsg("MultiXact member wraparound protections are now enabled"))); + + ereport(DEBUG1, + (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u", + offsetStopLimit, oldestMultiXactId))); + } + else if (prevOldestOffsetKnown) + { + /* + * If we failed to get the oldest offset this time, but we have a + * value from a previous pass through this function, use the old + * values rather than automatically forcing an emergency autovacuum + * cycle again. + */ + oldestOffset = prevOldestOffset; + oldestOffsetKnown = true; + offsetStopLimit = prevOffsetStopLimit; + } + + /* Install the computed values */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->oldestOffset = oldestOffset; + MultiXactState->oldestOffsetKnown = oldestOffsetKnown; + MultiXactState->offsetStopLimit = offsetStopLimit; + LWLockRelease(MultiXactGenLock); + + /* + * Do we need an emergency autovacuum? If we're not sure, assume yes. + */ + return !oldestOffsetKnown || + (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD); +} + +/* + * Return whether adding "distance" to "start" would move past "boundary". + * + * We use this to determine whether the addition is "wrapping around" the + * boundary point, hence the name. The reason we don't want to use the regular + * 2^31-modulo arithmetic here is that we want to be able to use the whole of + * the 2^32-1 space here, allowing for more multixacts than would fit + * otherwise. + */ +static bool +MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, + uint32 distance) +{ + MultiXactOffset finish; + + /* + * Note that offset number 0 is not used (see GetMultiXactIdMembers), so + * if the addition wraps around the UINT_MAX boundary, skip that value. + */ + finish = start + distance; + if (finish < start) + finish++; + + /*----------------------------------------------------------------------- + * When the boundary is numerically greater than the starting point, any + * value numerically between the two is not wrapped: + * + * <----S----B----> + * [---) = F wrapped past B (and UINT_MAX) + * [---) = F not wrapped + * [----] = F wrapped past B + * + * When the boundary is numerically less than the starting point (i.e. the + * UINT_MAX wraparound occurs somewhere in between) then all values in + * between are wrapped: + * + * <----B----S----> + * [---) = F not wrapped past B (but wrapped past UINT_MAX) + * [---) = F wrapped past B (and UINT_MAX) + * [----] = F not wrapped + *----------------------------------------------------------------------- + */ + if (start < boundary) + return finish >= boundary || finish < start; + else + return finish >= boundary && finish < start; +} + +/* + * Find the starting offset of the given MultiXactId. + * + * Returns false if the file containing the multi does not exist on disk. + * Otherwise, returns true and sets *result to the starting member offset. + * + * This function does not prevent concurrent truncation, so if that's + * required, the caller has to protect against that. 
+ */ +static bool +find_multixact_start(MultiXactId multi, MultiXactOffset *result) +{ + MultiXactOffset offset; + int pageno; + int entryno; + int slotno; + MultiXactOffset *offptr; + + Assert(MultiXactState->finishedStartup); + + pageno = MultiXactIdToOffsetPage(multi); + entryno = MultiXactIdToOffsetEntry(multi); + + /* + * Write out dirty data, so PhysicalPageExists can work correctly. + */ + SimpleLruWriteAll(MultiXactOffsetCtl, true); + SimpleLruWriteAll(MultiXactMemberCtl, true); + + if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) + return false; + + /* lock is acquired by SimpleLruReadPage_ReadOnly */ + slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi); + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr += entryno; + offset = *offptr; + LWLockRelease(MultiXactOffsetSLRULock); + + *result = offset; + return true; +} + +/* + * Determine how many multixacts, and how many multixact members, currently + * exist. Return false if unable to determine. + */ +static bool +ReadMultiXactCounts(uint32 *multixacts, MultiXactOffset *members) +{ + MultiXactOffset nextOffset; + MultiXactOffset oldestOffset; + MultiXactId oldestMultiXactId; + MultiXactId nextMultiXactId; + bool oldestOffsetKnown; + + LWLockAcquire(MultiXactGenLock, LW_SHARED); + nextOffset = MultiXactState->nextOffset; + oldestMultiXactId = MultiXactState->oldestMultiXactId; + nextMultiXactId = MultiXactState->nextMXact; + oldestOffset = MultiXactState->oldestOffset; + oldestOffsetKnown = MultiXactState->oldestOffsetKnown; + LWLockRelease(MultiXactGenLock); + + if (!oldestOffsetKnown) + return false; + + *members = nextOffset - oldestOffset; + *multixacts = nextMultiXactId - oldestMultiXactId; + return true; +} + +/* + * Multixact members can be removed once the multixacts that refer to them + * are older than every datminmxid. autovacuum_multixact_freeze_max_age and + * vacuum_multixact_freeze_table_age work together to make sure we never have + * too many multixacts; we hope that, at least under normal circumstances, + * this will also be sufficient to keep us from using too many offsets. + * However, if the average multixact has many members, we might exhaust the + * members space while still using few enough members that these limits fail + * to trigger full table scans for relminmxid advancement. At that point, + * we'd have no choice but to start failing multixact-creating operations + * with an error. + * + * To prevent that, if more than a threshold portion of the members space is + * used, we effectively reduce autovacuum_multixact_freeze_max_age and + * to a value just less than the number of multixacts in use. We hope that + * this will quickly trigger autovacuuming on the table or tables with the + * oldest relminmxid, thus allowing datminmxid values to advance and removing + * some members. + * + * As the fraction of the member space currently in use grows, we become + * more aggressive in clamping this value. That not only causes autovacuum + * to ramp up, but also makes any manual vacuums the user issues more + * aggressive. This happens because vacuum_set_xid_limits() clamps the + * freeze table and the minimum freeze age based on the effective + * autovacuum_multixact_freeze_max_age this function returns. In the worst + * case, we'll claim the freeze_max_age to zero, and every vacuum of any + * table will try to freeze every multixact. + * + * It's possible that these thresholds should be user-tunable, but for now + * we keep it simple. 
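+ *
+ * Worked example (illustrative numbers, not upstream text): if member usage
+ * sits exactly halfway between MULTIXACT_MEMBER_SAFE_THRESHOLD and
+ * MULTIXACT_MEMBER_DANGER_THRESHOLD and 1,000,000 multixacts exist, the
+ * fraction computed below is 0.5, so roughly 500,000 multixacts become the
+ * removal target and the effective freeze max age returned is about
+ * 500,000. At or beyond the danger threshold the result reaches zero and
+ * every vacuum tries to freeze every multixact.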
+ */ +int +MultiXactMemberFreezeThreshold(void) +{ + MultiXactOffset members; + uint32 multixacts; + uint32 victim_multixacts; + double fraction; + + /* If we can't determine member space utilization, assume the worst. */ + if (!ReadMultiXactCounts(&multixacts, &members)) + return 0; + + /* If member space utilization is low, no special action is required. */ + if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD) + return autovacuum_multixact_freeze_max_age; + + /* + * Compute a target for relminmxid advancement. The number of multixacts + * we try to eliminate from the system is based on how far we are past + * MULTIXACT_MEMBER_SAFE_THRESHOLD. + */ + fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) / + (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD); + victim_multixacts = multixacts * fraction; + + /* fraction could be > 1.0, but lowest possible freeze age is zero */ + if (victim_multixacts > multixacts) + return 0; + return multixacts - victim_multixacts; +} + +typedef struct mxtruncinfo +{ + int earliestExistingPage; +} mxtruncinfo; + +/* + * SlruScanDirectory callback + * This callback determines the earliest existing page number. + */ +static bool +SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int segpage, void *data) +{ + mxtruncinfo *trunc = (mxtruncinfo *) data; + + if (trunc->earliestExistingPage == -1 || + ctl->PagePrecedes(segpage, trunc->earliestExistingPage)) + { + trunc->earliestExistingPage = segpage; + } + + return false; /* keep going */ +} + + +/* + * Delete members segments [oldest, newOldest) + * + * The members SLRU can, in contrast to the offsets one, be filled to almost + * the full range at once. This means SimpleLruTruncate() can't trivially be + * used - instead the to-be-deleted range is computed using the offsets + * SLRU. C.f. TruncateMultiXact(). + */ +static void +PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset) +{ + const int maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset); + int startsegment = MXOffsetToMemberSegment(oldestOffset); + int endsegment = MXOffsetToMemberSegment(newOldestOffset); + int segment = startsegment; + + /* + * Delete all the segments but the last one. The last segment can still + * contain, possibly partially, valid data. + */ + while (segment != endsegment) + { + elog(DEBUG2, "truncating multixact members segment %x", segment); + SlruDeleteSegment(MultiXactMemberCtl, segment); + + /* move to next segment, handling wraparound correctly */ + if (segment == maxsegment) + segment = 0; + else + segment += 1; + } +} + +/* + * Delete offsets segments [oldest, newOldest) + */ +static void +PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti) +{ + /* + * We step back one multixact to avoid passing a cutoff page that hasn't + * been created yet in the rare case that oldestMulti would be the first + * item on a page and oldestMulti == nextMulti. In that case, if we + * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound + * detection. + */ + SimpleLruTruncate(MultiXactOffsetCtl, + MultiXactIdToOffsetPage(PreviousMultiXactId(newOldestMulti))); +} + +/* + * Remove all MultiXactOffset and MultiXactMember segments before the oldest + * ones still of interest. + * + * This is only called on a primary as part of vacuum (via + * vac_truncate_clog()). During recovery truncation is done by replaying + * truncation WAL records logged here. 
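+ *
+ * (Cross-reference: the WAL record mentioned here is the
+ * XLOG_MULTIXACT_TRUNCATE_ID record emitted by WriteMTruncateXlogRec() and
+ * replayed by the corresponding branch of multixact_redo() later in this
+ * file.)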
+ * + * newOldestMulti is the oldest currently required multixact, newOldestMultiDB + * is one of the databases preventing newOldestMulti from increasing. + */ +void +TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) +{ + MultiXactId oldestMulti; + MultiXactId nextMulti; + MultiXactOffset newOldestOffset; + MultiXactOffset oldestOffset; + MultiXactOffset nextOffset; + mxtruncinfo trunc; + MultiXactId earliest; + + Assert(!RecoveryInProgress()); + Assert(MultiXactState->finishedStartup); + + /* + * We can only allow one truncation to happen at once. Otherwise parts of + * members might vanish while we're doing lookups or similar. There's no + * need to have an interlock with creating new multis or such, since those + * are constrained by the limits (which only grow, never shrink). + */ + LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE); + + LWLockAcquire(MultiXactGenLock, LW_SHARED); + nextMulti = MultiXactState->nextMXact; + nextOffset = MultiXactState->nextOffset; + oldestMulti = MultiXactState->oldestMultiXactId; + LWLockRelease(MultiXactGenLock); + Assert(MultiXactIdIsValid(oldestMulti)); + + /* + * Make sure to only attempt truncation if there's values to truncate + * away. In normal processing values shouldn't go backwards, but there's + * some corner cases (due to bugs) where that's possible. + */ + if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti)) + { + LWLockRelease(MultiXactTruncationLock); + return; + } + + /* + * Note we can't just plow ahead with the truncation; it's possible that + * there are no segments to truncate, which is a problem because we are + * going to attempt to read the offsets page to determine where to + * truncate the members SLRU. So we first scan the directory to determine + * the earliest offsets page number that we can read without error. + * + * When nextMXact is less than one segment away from multiWrapLimit, + * SlruScanDirCbFindEarliest can find some early segment other than the + * actual earliest. (MultiXactOffsetPagePrecedes(EARLIEST, LATEST) + * returns false, because not all pairs of entries have the same answer.) + * That can also arise when an earlier truncation attempt failed unlink() + * or returned early from this function. The only consequence is + * returning early, which wastes space that we could have liberated. + * + * NB: It's also possible that the page that oldestMulti is on has already + * been truncated away, and we crashed before updating oldestMulti. + */ + trunc.earliestExistingPage = -1; + SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc); + earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE; + if (earliest < FirstMultiXactId) + earliest = FirstMultiXactId; + + /* If there's nothing to remove, we can bail out early. */ + if (MultiXactIdPrecedes(oldestMulti, earliest)) + { + LWLockRelease(MultiXactTruncationLock); + return; + } + + /* + * First, compute the safe truncation point for MultiXactMember. This is + * the starting offset of the oldest multixact. + * + * Hopefully, find_multixact_start will always work here, because we've + * already checked that it doesn't precede the earliest MultiXact on disk. + * But if it fails, don't truncate anything, and log a message. 
+ */ + if (oldestMulti == nextMulti) + { + /* there are NO MultiXacts */ + oldestOffset = nextOffset; + } + else if (!find_multixact_start(oldestMulti, &oldestOffset)) + { + ereport(LOG, + (errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation", + oldestMulti, earliest))); + LWLockRelease(MultiXactTruncationLock); + return; + } + + /* + * Secondly compute up to where to truncate. Lookup the corresponding + * member offset for newOldestMulti for that. + */ + if (newOldestMulti == nextMulti) + { + /* there are NO MultiXacts */ + newOldestOffset = nextOffset; + } + else if (!find_multixact_start(newOldestMulti, &newOldestOffset)) + { + ereport(LOG, + (errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation", + newOldestMulti))); + LWLockRelease(MultiXactTruncationLock); + return; + } + + elog(DEBUG1, "performing multixact truncation: " + "offsets [%u, %u), offsets segments [%x, %x), " + "members [%u, %u), members segments [%x, %x)", + oldestMulti, newOldestMulti, + MultiXactIdToOffsetSegment(oldestMulti), + MultiXactIdToOffsetSegment(newOldestMulti), + oldestOffset, newOldestOffset, + MXOffsetToMemberSegment(oldestOffset), + MXOffsetToMemberSegment(newOldestOffset)); + + /* + * Do truncation, and the WAL logging of the truncation, in a critical + * section. That way offsets/members cannot get out of sync anymore, i.e. + * once consistent the newOldestMulti will always exist in members, even + * if we crashed in the wrong moment. + */ + START_CRIT_SECTION(); + + /* + * Prevent checkpoints from being scheduled concurrently. This is critical + * because otherwise a truncation record might not be replayed after a + * crash/basebackup, even though the state of the data directory would + * require it. + */ + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + + /* WAL log truncation */ + WriteMTruncateXlogRec(newOldestMultiDB, + oldestMulti, newOldestMulti, + oldestOffset, newOldestOffset); + + /* + * Update in-memory limits before performing the truncation, while inside + * the critical section: Have to do it before truncation, to prevent + * concurrent lookups of those values. Has to be inside the critical + * section as otherwise a future call to this function would error out, + * while looking up the oldest member in offsets, if our caller crashes + * before updating the limits. + */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->oldestMultiXactId = newOldestMulti; + MultiXactState->oldestMultiXactDB = newOldestMultiDB; + LWLockRelease(MultiXactGenLock); + + /* First truncate members */ + PerformMembersTruncation(oldestOffset, newOldestOffset); + + /* Then offsets */ + PerformOffsetsTruncation(oldestMulti, newOldestMulti); + + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + + END_CRIT_SECTION(); + LWLockRelease(MultiXactTruncationLock); +} + +/* + * Decide whether a MultiXactOffset page number is "older" for truncation + * purposes. Analogous to CLOGPagePrecedes(). + * + * Offsetting the values is optional, because MultiXactIdPrecedes() has + * translational symmetry. 
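+ *
+ * (Clarifying note: the double comparison below requires a representative
+ * multixact from page1 to precede both ends of page2's range, so "true" is
+ * returned only when every pairing of entries agrees. As the
+ * TruncateMultiXact comments above observe, pages nearly 2^31 multixacts
+ * apart can otherwise give different answers for different entry pairs.)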
+ */ +static bool +MultiXactOffsetPagePrecedes(int page1, int page2) +{ + MultiXactId multi1; + MultiXactId multi2; + + multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE; + multi1 += FirstMultiXactId + 1; + multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE; + multi2 += FirstMultiXactId + 1; + + return (MultiXactIdPrecedes(multi1, multi2) && + MultiXactIdPrecedes(multi1, + multi2 + MULTIXACT_OFFSETS_PER_PAGE - 1)); +} + +/* + * Decide whether a MultiXactMember page number is "older" for truncation + * purposes. There is no "invalid offset number" so use the numbers verbatim. + */ +static bool +MultiXactMemberPagePrecedes(int page1, int page2) +{ + MultiXactOffset offset1; + MultiXactOffset offset2; + + offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE; + offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE; + + return (MultiXactOffsetPrecedes(offset1, offset2) && + MultiXactOffsetPrecedes(offset1, + offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1)); +} + +/* + * Decide which of two MultiXactIds is earlier. + * + * XXX do we need to do something special for InvalidMultiXactId? + * (Doesn't look like it.) + */ +bool +MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2) +{ + int32 diff = (int32) (multi1 - multi2); + + return (diff < 0); +} + +/* + * MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2? + * + * XXX do we need to do something special for InvalidMultiXactId? + * (Doesn't look like it.) + */ +bool +MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2) +{ + int32 diff = (int32) (multi1 - multi2); + + return (diff <= 0); +} + + +/* + * Decide which of two offsets is earlier. + */ +static bool +MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) +{ + int32 diff = (int32) (offset1 - offset2); + + return (diff < 0); +} + +/* + * Write an xlog record reflecting the zeroing of either a MEMBERs or + * OFFSETs page (info shows which) + */ +static void +WriteMZeroPageXlogRec(int pageno, uint8 info) +{ + XLogBeginInsert(); + XLogRegisterData((char *) (&pageno), sizeof(int)); + (void) XLogInsert(RM_MULTIXACT_ID, info); +} + +/* + * Write a TRUNCATE xlog record + * + * We must flush the xlog record to disk before returning --- see notes in + * TruncateCLOG(). 
+ */ +static void +WriteMTruncateXlogRec(Oid oldestMultiDB, + MultiXactId startTruncOff, MultiXactId endTruncOff, + MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb) +{ + XLogRecPtr recptr; + xl_multixact_truncate xlrec; + + xlrec.oldestMultiDB = oldestMultiDB; + + xlrec.startTruncOff = startTruncOff; + xlrec.endTruncOff = endTruncOff; + + xlrec.startTruncMemb = startTruncMemb; + xlrec.endTruncMemb = endTruncMemb; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), SizeOfMultiXactTruncate); + recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID); + XLogFlush(recptr); +} + +/* + * MULTIXACT resource manager's routines + */ +void +multixact_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + /* Backup blocks are not used in multixact records */ + Assert(!XLogRecHasAnyBlockRefs(record)); + + if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) + { + int pageno; + int slotno; + + memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + slotno = ZeroMultiXactOffsetPage(pageno, false); + SimpleLruWritePage(MultiXactOffsetCtl, slotno); + Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); + + LWLockRelease(MultiXactOffsetSLRULock); + } + else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) + { + int pageno; + int slotno; + + memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + + LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + + slotno = ZeroMultiXactMemberPage(pageno, false); + SimpleLruWritePage(MultiXactMemberCtl, slotno); + Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); + + LWLockRelease(MultiXactMemberSLRULock); + } + else if (info == XLOG_MULTIXACT_CREATE_ID) + { + xl_multixact_create *xlrec = + (xl_multixact_create *) XLogRecGetData(record); + TransactionId max_xid; + int i; + + /* Store the data back into the SLRU files */ + RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers, + xlrec->members); + + /* Make sure nextMXact/nextOffset are beyond what this record has */ + MultiXactAdvanceNextMXact(xlrec->mid + 1, + xlrec->moff + xlrec->nmembers); + + /* + * Make sure nextXid is beyond any XID mentioned in the record. This + * should be unnecessary, since any XID found here ought to have other + * evidence in the XLOG, but let's be safe. + */ + max_xid = XLogRecGetXid(record); + for (i = 0; i < xlrec->nmembers; i++) + { + if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid)) + max_xid = xlrec->members[i].xid; + } + + AdvanceNextFullTransactionIdPastXid(max_xid); + } + else if (info == XLOG_MULTIXACT_TRUNCATE_ID) + { + xl_multixact_truncate xlrec; + int pageno; + + memcpy(&xlrec, XLogRecGetData(record), + SizeOfMultiXactTruncate); + + elog(DEBUG1, "replaying multixact truncation: " + "offsets [%u, %u), offsets segments [%x, %x), " + "members [%u, %u), members segments [%x, %x)", + xlrec.startTruncOff, xlrec.endTruncOff, + MultiXactIdToOffsetSegment(xlrec.startTruncOff), + MultiXactIdToOffsetSegment(xlrec.endTruncOff), + xlrec.startTruncMemb, xlrec.endTruncMemb, + MXOffsetToMemberSegment(xlrec.startTruncMemb), + MXOffsetToMemberSegment(xlrec.endTruncMemb)); + + /* should not be required, but more than cheap enough */ + LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE); + + /* + * Advance the horizon values, so they're current at the end of + * recovery. 
+ */ + SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false); + + PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb); + + /* + * During XLOG replay, latest_page_number isn't necessarily set up + * yet; insert a suitable value to bypass the sanity test in + * SimpleLruTruncate. + */ + pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff); + MultiXactOffsetCtl->shared->latest_page_number = pageno; + PerformOffsetsTruncation(xlrec.startTruncOff, xlrec.endTruncOff); + + LWLockRelease(MultiXactTruncationLock); + } + else + elog(PANIC, "multixact_redo: unknown op code %u", info); +} + +Datum +pg_get_multixact_members(PG_FUNCTION_ARGS) +{ + typedef struct + { + MultiXactMember *members; + int nmembers; + int iter; + } mxact; + MultiXactId mxid = PG_GETARG_TRANSACTIONID(0); + mxact *multi; + FuncCallContext *funccxt; + + if (mxid < FirstMultiXactId) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid MultiXactId: %u", mxid))); + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcxt; + TupleDesc tupdesc; + + funccxt = SRF_FIRSTCALL_INIT(); + oldcxt = MemoryContextSwitchTo(funccxt->multi_call_memory_ctx); + + multi = palloc(sizeof(mxact)); + /* no need to allow for old values here */ + multi->nmembers = GetMultiXactIdMembers(mxid, &multi->members, false, + false); + multi->iter = 0; + + tupdesc = CreateTemplateTupleDesc(2); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "xid", + XIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "mode", + TEXTOID, -1, 0); + + funccxt->attinmeta = TupleDescGetAttInMetadata(tupdesc); + funccxt->user_fctx = multi; + + MemoryContextSwitchTo(oldcxt); + } + + funccxt = SRF_PERCALL_SETUP(); + multi = (mxact *) funccxt->user_fctx; + + while (multi->iter < multi->nmembers) + { + HeapTuple tuple; + char *values[2]; + + values[0] = psprintf("%u", multi->members[multi->iter].xid); + values[1] = mxstatus_to_string(multi->members[multi->iter].status); + + tuple = BuildTupleFromCStrings(funccxt->attinmeta, values); + + multi->iter++; + pfree(values[0]); + SRF_RETURN_NEXT(funccxt, HeapTupleGetDatum(tuple)); + } + + SRF_RETURN_DONE(funccxt); +} + +/* + * Entrypoint for sync.c to sync offsets files. + */ +int +multixactoffsetssyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path); +} + +/* + * Entrypoint for sync.c to sync members files. 
+ */ +int +multixactmemberssyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(MultiXactMemberCtl, ftag, path); +} diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c new file mode 100644 index 0000000..df0cd77 --- /dev/null +++ b/src/backend/access/transam/parallel.c @@ -0,0 +1,1597 @@ +/*------------------------------------------------------------------------- + * + * parallel.c + * Infrastructure for launching parallel workers + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/transam/parallel.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/parallel.h" +#include "access/session.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "catalog/index.h" +#include "catalog/namespace.h" +#include "catalog/pg_enum.h" +#include "catalog/storage.h" +#include "commands/async.h" +#include "commands/vacuum.h" +#include "executor/execParallel.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "libpq/pqmq.h" +#include "miscadmin.h" +#include "optimizer/optimizer.h" +#include "pgstat.h" +#include "storage/ipc.h" +#include "storage/predicate.h" +#include "storage/sinval.h" +#include "storage/spin.h" +#include "tcop/tcopprot.h" +#include "utils/combocid.h" +#include "utils/guc.h" +#include "utils/inval.h" +#include "utils/memutils.h" +#include "utils/relmapper.h" +#include "utils/snapmgr.h" +#include "utils/typcache.h" + +/* + * We don't want to waste a lot of memory on an error queue which, most of + * the time, will process only a handful of small messages. However, it is + * desirable to make it large enough that a typical ErrorResponse can be sent + * without blocking. That way, a worker that errors out can write the whole + * message into the queue and terminate without waiting for the user backend. + */ +#define PARALLEL_ERROR_QUEUE_SIZE 16384 + +/* Magic number for parallel context TOC. */ +#define PARALLEL_MAGIC 0x50477c7c + +/* + * Magic numbers for per-context parallel state sharing. Higher-level code + * should use smaller values, leaving these very large ones for use by this + * module. + */ +#define PARALLEL_KEY_FIXED UINT64CONST(0xFFFFFFFFFFFF0001) +#define PARALLEL_KEY_ERROR_QUEUE UINT64CONST(0xFFFFFFFFFFFF0002) +#define PARALLEL_KEY_LIBRARY UINT64CONST(0xFFFFFFFFFFFF0003) +#define PARALLEL_KEY_GUC UINT64CONST(0xFFFFFFFFFFFF0004) +#define PARALLEL_KEY_COMBO_CID UINT64CONST(0xFFFFFFFFFFFF0005) +#define PARALLEL_KEY_TRANSACTION_SNAPSHOT UINT64CONST(0xFFFFFFFFFFFF0006) +#define PARALLEL_KEY_ACTIVE_SNAPSHOT UINT64CONST(0xFFFFFFFFFFFF0007) +#define PARALLEL_KEY_TRANSACTION_STATE UINT64CONST(0xFFFFFFFFFFFF0008) +#define PARALLEL_KEY_ENTRYPOINT UINT64CONST(0xFFFFFFFFFFFF0009) +#define PARALLEL_KEY_SESSION_DSM UINT64CONST(0xFFFFFFFFFFFF000A) +#define PARALLEL_KEY_PENDING_SYNCS UINT64CONST(0xFFFFFFFFFFFF000B) +#define PARALLEL_KEY_REINDEX_STATE UINT64CONST(0xFFFFFFFFFFFF000C) +#define PARALLEL_KEY_RELMAPPER_STATE UINT64CONST(0xFFFFFFFFFFFF000D) +#define PARALLEL_KEY_UNCOMMITTEDENUMS UINT64CONST(0xFFFFFFFFFFFF000E) + +/* Fixed-size parallel state. */ +typedef struct FixedParallelState +{ + /* Fixed-size state that workers must restore. 
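+ * (Cross-reference: the leader fills in one copy of this struct in
+ * InitializeParallelDSM() and publishes it under the PARALLEL_KEY_FIXED
+ * TOC key; each worker reads it back in ParallelWorkerMain() via
+ * shm_toc_lookup().)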
*/ + Oid database_id; + Oid authenticated_user_id; + Oid current_user_id; + Oid outer_user_id; + Oid temp_namespace_id; + Oid temp_toast_namespace_id; + int sec_context; + bool is_superuser; + PGPROC *parallel_leader_pgproc; + pid_t parallel_leader_pid; + BackendId parallel_leader_backend_id; + TimestampTz xact_ts; + TimestampTz stmt_ts; + SerializableXactHandle serializable_xact_handle; + + /* Mutex protects remaining fields. */ + slock_t mutex; + + /* Maximum XactLastRecEnd of any worker. */ + XLogRecPtr last_xlog_end; +} FixedParallelState; + +/* + * Our parallel worker number. We initialize this to -1, meaning that we are + * not a parallel worker. In parallel workers, it will be set to a value >= 0 + * and < the number of workers before any user code is invoked; each parallel + * worker will get a different parallel worker number. + */ +int ParallelWorkerNumber = -1; + +/* Is there a parallel message pending which we need to receive? */ +volatile bool ParallelMessagePending = false; + +/* Are we initializing a parallel worker? */ +bool InitializingParallelWorker = false; + +/* Pointer to our fixed parallel state. */ +static FixedParallelState *MyFixedParallelState; + +/* List of active parallel contexts. */ +static dlist_head pcxt_list = DLIST_STATIC_INIT(pcxt_list); + +/* Backend-local copy of data from FixedParallelState. */ +static pid_t ParallelLeaderPid; + +/* + * List of internal parallel worker entry points. We need this for + * reasons explained in LookupParallelWorkerFunction(), below. + */ +static const struct +{ + const char *fn_name; + parallel_worker_main_type fn_addr; +} InternalParallelWorkers[] = + +{ + { + "ParallelQueryMain", ParallelQueryMain + }, + { + "_bt_parallel_build_main", _bt_parallel_build_main + }, + { + "parallel_vacuum_main", parallel_vacuum_main + } +}; + +/* Private functions. */ +static void HandleParallelMessage(ParallelContext *pcxt, int i, StringInfo msg); +static void WaitForParallelWorkersToExit(ParallelContext *pcxt); +static parallel_worker_main_type LookupParallelWorkerFunction(const char *libraryname, const char *funcname); +static void ParallelWorkerShutdown(int code, Datum arg); + + +/* + * Establish a new parallel context. This should be done after entering + * parallel mode, and (unless there is an error) the context should be + * destroyed before exiting the current subtransaction. + */ +ParallelContext * +CreateParallelContext(const char *library_name, const char *function_name, + int nworkers) +{ + MemoryContext oldcontext; + ParallelContext *pcxt; + + /* It is unsafe to create a parallel context if not in parallel mode. */ + Assert(IsInParallelMode()); + + /* Number of workers should be non-negative. */ + Assert(nworkers >= 0); + + /* We might be running in a short-lived memory context. */ + oldcontext = MemoryContextSwitchTo(TopTransactionContext); + + /* Initialize a new ParallelContext. */ + pcxt = palloc0(sizeof(ParallelContext)); + pcxt->subid = GetCurrentSubTransactionId(); + pcxt->nworkers = nworkers; + pcxt->nworkers_to_launch = nworkers; + pcxt->library_name = pstrdup(library_name); + pcxt->function_name = pstrdup(function_name); + pcxt->error_context_stack = error_context_stack; + shm_toc_initialize_estimator(&pcxt->estimator); + dlist_push_head(&pcxt_list, &pcxt->node); + + /* Restore previous memory context. 
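+ *
+ * (Illustrative sketch, not upstream code: a typical caller drives the
+ * whole lifecycle roughly as
+ *
+ *     EnterParallelMode();
+ *     pcxt = CreateParallelContext("postgres", "ParallelQueryMain", 2);
+ *     ... shm_toc_estimate_chunk()/shm_toc_estimate_keys() for caller data ...
+ *     InitializeParallelDSM(pcxt);
+ *     LaunchParallelWorkers(pcxt);
+ *     WaitForParallelWorkersToFinish(pcxt);
+ *     DestroyParallelContext(pcxt);
+ *     ExitParallelMode();
+ *
+ * "ParallelQueryMain" is one of the InternalParallelWorkers entry points
+ * listed above; EnterParallelMode()/ExitParallelMode() live in xact.c.)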
*/ + MemoryContextSwitchTo(oldcontext); + + return pcxt; +} + +/* + * Establish the dynamic shared memory segment for a parallel context and + * copy state and other bookkeeping information that will be needed by + * parallel workers into it. + */ +void +InitializeParallelDSM(ParallelContext *pcxt) +{ + MemoryContext oldcontext; + Size library_len = 0; + Size guc_len = 0; + Size combocidlen = 0; + Size tsnaplen = 0; + Size asnaplen = 0; + Size tstatelen = 0; + Size pendingsyncslen = 0; + Size reindexlen = 0; + Size relmapperlen = 0; + Size uncommittedenumslen = 0; + Size segsize = 0; + int i; + FixedParallelState *fps; + dsm_handle session_dsm_handle = DSM_HANDLE_INVALID; + Snapshot transaction_snapshot = GetTransactionSnapshot(); + Snapshot active_snapshot = GetActiveSnapshot(); + + /* We might be running in a very short-lived memory context. */ + oldcontext = MemoryContextSwitchTo(TopTransactionContext); + + /* Allow space to store the fixed-size parallel state. */ + shm_toc_estimate_chunk(&pcxt->estimator, sizeof(FixedParallelState)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* + * Normally, the user will have requested at least one worker process, but + * if by chance they have not, we can skip a bunch of things here. + */ + if (pcxt->nworkers > 0) + { + /* Get (or create) the per-session DSM segment's handle. */ + session_dsm_handle = GetSessionDsmHandle(); + + /* + * If we weren't able to create a per-session DSM segment, then we can + * continue but we can't safely launch any workers because their + * record typmods would be incompatible so they couldn't exchange + * tuples. + */ + if (session_dsm_handle == DSM_HANDLE_INVALID) + pcxt->nworkers = 0; + } + + if (pcxt->nworkers > 0) + { + /* Estimate space for various kinds of state sharing. */ + library_len = EstimateLibraryStateSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, library_len); + guc_len = EstimateGUCStateSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, guc_len); + combocidlen = EstimateComboCIDStateSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, combocidlen); + if (IsolationUsesXactSnapshot()) + { + tsnaplen = EstimateSnapshotSpace(transaction_snapshot); + shm_toc_estimate_chunk(&pcxt->estimator, tsnaplen); + } + asnaplen = EstimateSnapshotSpace(active_snapshot); + shm_toc_estimate_chunk(&pcxt->estimator, asnaplen); + tstatelen = EstimateTransactionStateSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, tstatelen); + shm_toc_estimate_chunk(&pcxt->estimator, sizeof(dsm_handle)); + pendingsyncslen = EstimatePendingSyncsSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, pendingsyncslen); + reindexlen = EstimateReindexStateSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, reindexlen); + relmapperlen = EstimateRelationMapSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, relmapperlen); + uncommittedenumslen = EstimateUncommittedEnumsSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, uncommittedenumslen); + /* If you add more chunks here, you probably need to add keys. */ + shm_toc_estimate_keys(&pcxt->estimator, 11); + + /* Estimate space need for error queues. */ + StaticAssertStmt(BUFFERALIGN(PARALLEL_ERROR_QUEUE_SIZE) == + PARALLEL_ERROR_QUEUE_SIZE, + "parallel error queue size not buffer-aligned"); + shm_toc_estimate_chunk(&pcxt->estimator, + mul_size(PARALLEL_ERROR_QUEUE_SIZE, + pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Estimate how much we'll need for the entrypoint info. 
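+ *
+ * The "+ 2" below leaves room for the two terminating NUL bytes: the
+ * library name and the function name are stored back to back, matching the
+ * layout used when the entrypoint state is serialized later in this
+ * function and decoded again in ParallelWorkerMain().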
*/ + shm_toc_estimate_chunk(&pcxt->estimator, strlen(pcxt->library_name) + + strlen(pcxt->function_name) + 2); + shm_toc_estimate_keys(&pcxt->estimator, 1); + } + + /* + * Create DSM and initialize with new table of contents. But if the user + * didn't request any workers, then don't bother creating a dynamic shared + * memory segment; instead, just use backend-private memory. + * + * Also, if we can't create a dynamic shared memory segment because the + * maximum number of segments have already been created, then fall back to + * backend-private memory, and plan not to use any workers. We hope this + * won't happen very often, but it's better to abandon the use of + * parallelism than to fail outright. + */ + segsize = shm_toc_estimate(&pcxt->estimator); + if (pcxt->nworkers > 0) + pcxt->seg = dsm_create(segsize, DSM_CREATE_NULL_IF_MAXSEGMENTS); + if (pcxt->seg != NULL) + pcxt->toc = shm_toc_create(PARALLEL_MAGIC, + dsm_segment_address(pcxt->seg), + segsize); + else + { + pcxt->nworkers = 0; + pcxt->private_memory = MemoryContextAlloc(TopMemoryContext, segsize); + pcxt->toc = shm_toc_create(PARALLEL_MAGIC, pcxt->private_memory, + segsize); + } + + /* Initialize fixed-size state in shared memory. */ + fps = (FixedParallelState *) + shm_toc_allocate(pcxt->toc, sizeof(FixedParallelState)); + fps->database_id = MyDatabaseId; + fps->authenticated_user_id = GetAuthenticatedUserId(); + fps->outer_user_id = GetCurrentRoleId(); + fps->is_superuser = session_auth_is_superuser; + GetUserIdAndSecContext(&fps->current_user_id, &fps->sec_context); + GetTempNamespaceState(&fps->temp_namespace_id, + &fps->temp_toast_namespace_id); + fps->parallel_leader_pgproc = MyProc; + fps->parallel_leader_pid = MyProcPid; + fps->parallel_leader_backend_id = MyBackendId; + fps->xact_ts = GetCurrentTransactionStartTimestamp(); + fps->stmt_ts = GetCurrentStatementStartTimestamp(); + fps->serializable_xact_handle = ShareSerializableXact(); + SpinLockInit(&fps->mutex); + fps->last_xlog_end = 0; + shm_toc_insert(pcxt->toc, PARALLEL_KEY_FIXED, fps); + + /* We can skip the rest of this if we're not budgeting for any workers. */ + if (pcxt->nworkers > 0) + { + char *libraryspace; + char *gucspace; + char *combocidspace; + char *tsnapspace; + char *asnapspace; + char *tstatespace; + char *pendingsyncsspace; + char *reindexspace; + char *relmapperspace; + char *error_queue_space; + char *session_dsm_handle_space; + char *entrypointstate; + char *uncommittedenumsspace; + Size lnamelen; + + /* Serialize shared libraries we have loaded. */ + libraryspace = shm_toc_allocate(pcxt->toc, library_len); + SerializeLibraryState(library_len, libraryspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_LIBRARY, libraryspace); + + /* Serialize GUC settings. */ + gucspace = shm_toc_allocate(pcxt->toc, guc_len); + SerializeGUCState(guc_len, gucspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_GUC, gucspace); + + /* Serialize combo CID state. */ + combocidspace = shm_toc_allocate(pcxt->toc, combocidlen); + SerializeComboCIDState(combocidlen, combocidspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_COMBO_CID, combocidspace); + + /* + * Serialize the transaction snapshot if the transaction + * isolation-level uses a transaction snapshot. + */ + if (IsolationUsesXactSnapshot()) + { + tsnapspace = shm_toc_allocate(pcxt->toc, tsnaplen); + SerializeSnapshot(transaction_snapshot, tsnapspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_TRANSACTION_SNAPSHOT, + tsnapspace); + } + + /* Serialize the active snapshot. 
*/ + asnapspace = shm_toc_allocate(pcxt->toc, asnaplen); + SerializeSnapshot(active_snapshot, asnapspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_ACTIVE_SNAPSHOT, asnapspace); + + /* Provide the handle for per-session segment. */ + session_dsm_handle_space = shm_toc_allocate(pcxt->toc, + sizeof(dsm_handle)); + *(dsm_handle *) session_dsm_handle_space = session_dsm_handle; + shm_toc_insert(pcxt->toc, PARALLEL_KEY_SESSION_DSM, + session_dsm_handle_space); + + /* Serialize transaction state. */ + tstatespace = shm_toc_allocate(pcxt->toc, tstatelen); + SerializeTransactionState(tstatelen, tstatespace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_TRANSACTION_STATE, tstatespace); + + /* Serialize pending syncs. */ + pendingsyncsspace = shm_toc_allocate(pcxt->toc, pendingsyncslen); + SerializePendingSyncs(pendingsyncslen, pendingsyncsspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_PENDING_SYNCS, + pendingsyncsspace); + + /* Serialize reindex state. */ + reindexspace = shm_toc_allocate(pcxt->toc, reindexlen); + SerializeReindexState(reindexlen, reindexspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_REINDEX_STATE, reindexspace); + + /* Serialize relmapper state. */ + relmapperspace = shm_toc_allocate(pcxt->toc, relmapperlen); + SerializeRelationMap(relmapperlen, relmapperspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_RELMAPPER_STATE, + relmapperspace); + + /* Serialize uncommitted enum state. */ + uncommittedenumsspace = shm_toc_allocate(pcxt->toc, + uncommittedenumslen); + SerializeUncommittedEnums(uncommittedenumsspace, uncommittedenumslen); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_UNCOMMITTEDENUMS, + uncommittedenumsspace); + + /* Allocate space for worker information. */ + pcxt->worker = palloc0(sizeof(ParallelWorkerInfo) * pcxt->nworkers); + + /* + * Establish error queues in dynamic shared memory. + * + * These queues should be used only for transmitting ErrorResponse, + * NoticeResponse, and NotifyResponse protocol messages. Tuple data + * should be transmitted via separate (possibly larger?) queues. + */ + error_queue_space = + shm_toc_allocate(pcxt->toc, + mul_size(PARALLEL_ERROR_QUEUE_SIZE, + pcxt->nworkers)); + for (i = 0; i < pcxt->nworkers; ++i) + { + char *start; + shm_mq *mq; + + start = error_queue_space + i * PARALLEL_ERROR_QUEUE_SIZE; + mq = shm_mq_create(start, PARALLEL_ERROR_QUEUE_SIZE); + shm_mq_set_receiver(mq, MyProc); + pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL); + } + shm_toc_insert(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE, error_queue_space); + + /* + * Serialize entrypoint information. It's unsafe to pass function + * pointers across processes, as the function pointer may be different + * in each process in EXEC_BACKEND builds, so we always pass library + * and function name. (We use library name "postgres" for functions + * in the core backend.) + */ + lnamelen = strlen(pcxt->library_name); + entrypointstate = shm_toc_allocate(pcxt->toc, lnamelen + + strlen(pcxt->function_name) + 2); + strcpy(entrypointstate, pcxt->library_name); + strcpy(entrypointstate + lnamelen + 1, pcxt->function_name); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_ENTRYPOINT, entrypointstate); + } + + /* Restore previous memory context. */ + MemoryContextSwitchTo(oldcontext); +} + +/* + * Reinitialize the dynamic shared memory segment for a parallel context such + * that we could launch workers for it again. + */ +void +ReinitializeParallelDSM(ParallelContext *pcxt) +{ + FixedParallelState *fps; + + /* Wait for any old workers to exit. 
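+ *
+ * Both waits below matter: as the header comment of
+ * WaitForParallelWorkersToExit() explains, WaitForParallelWorkersToFinish()
+ * only ensures the last message from each worker has been received, while
+ * WaitForParallelWorkersToExit() ensures the worker processes have actually
+ * shut down before their error queues are recreated.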
*/ + if (pcxt->nworkers_launched > 0) + { + WaitForParallelWorkersToFinish(pcxt); + WaitForParallelWorkersToExit(pcxt); + pcxt->nworkers_launched = 0; + if (pcxt->known_attached_workers) + { + pfree(pcxt->known_attached_workers); + pcxt->known_attached_workers = NULL; + pcxt->nknown_attached_workers = 0; + } + } + + /* Reset a few bits of fixed parallel state to a clean state. */ + fps = shm_toc_lookup(pcxt->toc, PARALLEL_KEY_FIXED, false); + fps->last_xlog_end = 0; + + /* Recreate error queues (if they exist). */ + if (pcxt->nworkers > 0) + { + char *error_queue_space; + int i; + + error_queue_space = + shm_toc_lookup(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE, false); + for (i = 0; i < pcxt->nworkers; ++i) + { + char *start; + shm_mq *mq; + + start = error_queue_space + i * PARALLEL_ERROR_QUEUE_SIZE; + mq = shm_mq_create(start, PARALLEL_ERROR_QUEUE_SIZE); + shm_mq_set_receiver(mq, MyProc); + pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL); + } + } +} + +/* + * Reinitialize parallel workers for a parallel context such that we could + * launch a different number of workers. This is required for cases where + * we need to reuse the same DSM segment, but the number of workers can + * vary from run-to-run. + */ +void +ReinitializeParallelWorkers(ParallelContext *pcxt, int nworkers_to_launch) +{ + /* + * The number of workers that need to be launched must be less than the + * number of workers with which the parallel context is initialized. + */ + Assert(pcxt->nworkers >= nworkers_to_launch); + pcxt->nworkers_to_launch = nworkers_to_launch; +} + +/* + * Launch parallel workers. + */ +void +LaunchParallelWorkers(ParallelContext *pcxt) +{ + MemoryContext oldcontext; + BackgroundWorker worker; + int i; + bool any_registrations_failed = false; + + /* Skip this if we have no workers. */ + if (pcxt->nworkers == 0 || pcxt->nworkers_to_launch == 0) + return; + + /* We need to be a lock group leader. */ + BecomeLockGroupLeader(); + + /* If we do have workers, we'd better have a DSM segment. */ + Assert(pcxt->seg != NULL); + + /* We might be running in a short-lived memory context. */ + oldcontext = MemoryContextSwitchTo(TopTransactionContext); + + /* Configure a worker. */ + memset(&worker, 0, sizeof(worker)); + snprintf(worker.bgw_name, BGW_MAXLEN, "parallel worker for PID %d", + MyProcPid); + snprintf(worker.bgw_type, BGW_MAXLEN, "parallel worker"); + worker.bgw_flags = + BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION + | BGWORKER_CLASS_PARALLEL; + worker.bgw_start_time = BgWorkerStart_ConsistentState; + worker.bgw_restart_time = BGW_NEVER_RESTART; + sprintf(worker.bgw_library_name, "postgres"); + sprintf(worker.bgw_function_name, "ParallelWorkerMain"); + worker.bgw_main_arg = UInt32GetDatum(dsm_segment_handle(pcxt->seg)); + worker.bgw_notify_pid = MyProcPid; + + /* + * Start workers. + * + * The caller must be able to tolerate ending up with fewer workers than + * expected, so there is no need to throw an error here if registration + * fails. It wouldn't help much anyway, because registering the worker in + * no way guarantees that it will start up and initialize successfully. 
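+ *
+ * (Illustrative sketch, not upstream code: a caller typically inspects
+ * pcxt->nworkers_launched after this function returns and falls back to
+ * doing the work in the leader when it is zero, e.g.
+ *
+ *     LaunchParallelWorkers(pcxt);
+ *     if (pcxt->nworkers_launched == 0)
+ *         ... run the serial code path instead ...
+ * )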
+ */ + for (i = 0; i < pcxt->nworkers_to_launch; ++i) + { + memcpy(worker.bgw_extra, &i, sizeof(int)); + if (!any_registrations_failed && + RegisterDynamicBackgroundWorker(&worker, + &pcxt->worker[i].bgwhandle)) + { + shm_mq_set_handle(pcxt->worker[i].error_mqh, + pcxt->worker[i].bgwhandle); + pcxt->nworkers_launched++; + } + else + { + /* + * If we weren't able to register the worker, then we've bumped up + * against the max_worker_processes limit, and future + * registrations will probably fail too, so arrange to skip them. + * But we still have to execute this code for the remaining slots + * to make sure that we forget about the error queues we budgeted + * for those workers. Otherwise, we'll wait for them to start, + * but they never will. + */ + any_registrations_failed = true; + pcxt->worker[i].bgwhandle = NULL; + shm_mq_detach(pcxt->worker[i].error_mqh); + pcxt->worker[i].error_mqh = NULL; + } + } + + /* + * Now that nworkers_launched has taken its final value, we can initialize + * known_attached_workers. + */ + if (pcxt->nworkers_launched > 0) + { + pcxt->known_attached_workers = + palloc0(sizeof(bool) * pcxt->nworkers_launched); + pcxt->nknown_attached_workers = 0; + } + + /* Restore previous memory context. */ + MemoryContextSwitchTo(oldcontext); +} + +/* + * Wait for all workers to attach to their error queues, and throw an error if + * any worker fails to do this. + * + * Callers can assume that if this function returns successfully, then the + * number of workers given by pcxt->nworkers_launched have initialized and + * attached to their error queues. Whether or not these workers are guaranteed + * to still be running depends on what code the caller asked them to run; + * this function does not guarantee that they have not exited. However, it + * does guarantee that any workers which exited must have done so cleanly and + * after successfully performing the work with which they were tasked. + * + * If this function is not called, then some of the workers that were launched + * may not have been started due to a fork() failure, or may have exited during + * early startup prior to attaching to the error queue, so nworkers_launched + * cannot be viewed as completely reliable. It will never be less than the + * number of workers which actually started, but it might be more. Any workers + * that failed to start will still be discovered by + * WaitForParallelWorkersToFinish and an error will be thrown at that time, + * provided that function is eventually reached. + * + * In general, the leader process should do as much work as possible before + * calling this function. fork() failures and other early-startup failures + * are very uncommon, and having the leader sit idle when it could be doing + * useful work is undesirable. However, if the leader needs to wait for + * all of its workers or for a specific worker, it may want to call this + * function before doing so. If not, it must make some other provision for + * the failure-to-start case, lest it wait forever. On the other hand, a + * leader which never waits for a worker that might not be started yet, or + * at least never does so prior to WaitForParallelWorkersToFinish(), need not + * call this function at all. + */ +void +WaitForParallelWorkersToAttach(ParallelContext *pcxt) +{ + int i; + + /* Skip this if we have no launched workers. */ + if (pcxt->nworkers_launched == 0) + return; + + for (;;) + { + /* + * This will process any parallel messages that are pending and it may + * also throw an error propagated from a worker. 
+ */ + CHECK_FOR_INTERRUPTS(); + + for (i = 0; i < pcxt->nworkers_launched; ++i) + { + BgwHandleStatus status; + shm_mq *mq; + int rc; + pid_t pid; + + if (pcxt->known_attached_workers[i]) + continue; + + /* + * If error_mqh is NULL, then the worker has already exited + * cleanly. + */ + if (pcxt->worker[i].error_mqh == NULL) + { + pcxt->known_attached_workers[i] = true; + ++pcxt->nknown_attached_workers; + continue; + } + + status = GetBackgroundWorkerPid(pcxt->worker[i].bgwhandle, &pid); + if (status == BGWH_STARTED) + { + /* Has the worker attached to the error queue? */ + mq = shm_mq_get_queue(pcxt->worker[i].error_mqh); + if (shm_mq_get_sender(mq) != NULL) + { + /* Yes, so it is known to be attached. */ + pcxt->known_attached_workers[i] = true; + ++pcxt->nknown_attached_workers; + } + } + else if (status == BGWH_STOPPED) + { + /* + * If the worker stopped without attaching to the error queue, + * throw an error. + */ + mq = shm_mq_get_queue(pcxt->worker[i].error_mqh); + if (shm_mq_get_sender(mq) == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("parallel worker failed to initialize"), + errhint("More details may be available in the server log."))); + + pcxt->known_attached_workers[i] = true; + ++pcxt->nknown_attached_workers; + } + else + { + /* + * Worker not yet started, so we must wait. The postmaster + * will notify us if the worker's state changes. Our latch + * might also get set for some other reason, but if so we'll + * just end up waiting for the same worker again. + */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, + -1, WAIT_EVENT_BGWORKER_STARTUP); + + if (rc & WL_LATCH_SET) + ResetLatch(MyLatch); + } + } + + /* If all workers are known to have started, we're done. */ + if (pcxt->nknown_attached_workers >= pcxt->nworkers_launched) + { + Assert(pcxt->nknown_attached_workers == pcxt->nworkers_launched); + break; + } + } +} + +/* + * Wait for all workers to finish computing. + * + * Even if the parallel operation seems to have completed successfully, it's + * important to call this function afterwards. We must not miss any errors + * the workers may have thrown during the parallel operation, or any that they + * may yet throw while shutting down. + * + * Also, we want to update our notion of XactLastRecEnd based on worker + * feedback. + */ +void +WaitForParallelWorkersToFinish(ParallelContext *pcxt) +{ + for (;;) + { + bool anyone_alive = false; + int nfinished = 0; + int i; + + /* + * This will process any parallel messages that are pending, which may + * change the outcome of the loop that follows. It may also throw an + * error propagated from a worker. + */ + CHECK_FOR_INTERRUPTS(); + + for (i = 0; i < pcxt->nworkers_launched; ++i) + { + /* + * If error_mqh is NULL, then the worker has already exited + * cleanly. If we have received a message through error_mqh from + * the worker, we know it started up cleanly, and therefore we're + * certain to be notified when it exits. + */ + if (pcxt->worker[i].error_mqh == NULL) + ++nfinished; + else if (pcxt->known_attached_workers[i]) + { + anyone_alive = true; + break; + } + } + + if (!anyone_alive) + { + /* If all workers are known to have finished, we're done. */ + if (nfinished >= pcxt->nworkers_launched) + { + Assert(nfinished == pcxt->nworkers_launched); + break; + } + + /* + * We didn't detect any living workers, but not all workers are + * known to have exited cleanly. 
Either not all workers have + * launched yet, or maybe some of them failed to start or + * terminated abnormally. + */ + for (i = 0; i < pcxt->nworkers_launched; ++i) + { + pid_t pid; + shm_mq *mq; + + /* + * If the worker is BGWH_NOT_YET_STARTED or BGWH_STARTED, we + * should just keep waiting. If it is BGWH_STOPPED, then + * further investigation is needed. + */ + if (pcxt->worker[i].error_mqh == NULL || + pcxt->worker[i].bgwhandle == NULL || + GetBackgroundWorkerPid(pcxt->worker[i].bgwhandle, + &pid) != BGWH_STOPPED) + continue; + + /* + * Check whether the worker ended up stopped without ever + * attaching to the error queue. If so, the postmaster was + * unable to fork the worker or it exited without initializing + * properly. We must throw an error, since the caller may + * have been expecting the worker to do some work before + * exiting. + */ + mq = shm_mq_get_queue(pcxt->worker[i].error_mqh); + if (shm_mq_get_sender(mq) == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("parallel worker failed to initialize"), + errhint("More details may be available in the server log."))); + + /* + * The worker is stopped, but is attached to the error queue. + * Unless there's a bug somewhere, this will only happen when + * the worker writes messages and terminates after the + * CHECK_FOR_INTERRUPTS() near the top of this function and + * before the call to GetBackgroundWorkerPid(). In that case, + * or latch should have been set as well and the right things + * will happen on the next pass through the loop. + */ + } + } + + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, -1, + WAIT_EVENT_PARALLEL_FINISH); + ResetLatch(MyLatch); + } + + if (pcxt->toc != NULL) + { + FixedParallelState *fps; + + fps = shm_toc_lookup(pcxt->toc, PARALLEL_KEY_FIXED, false); + if (fps->last_xlog_end > XactLastRecEnd) + XactLastRecEnd = fps->last_xlog_end; + } +} + +/* + * Wait for all workers to exit. + * + * This function ensures that workers have been completely shutdown. The + * difference between WaitForParallelWorkersToFinish and this function is + * that the former just ensures that last message sent by a worker backend is + * received by the leader backend whereas this ensures the complete shutdown. + */ +static void +WaitForParallelWorkersToExit(ParallelContext *pcxt) +{ + int i; + + /* Wait until the workers actually die. */ + for (i = 0; i < pcxt->nworkers_launched; ++i) + { + BgwHandleStatus status; + + if (pcxt->worker == NULL || pcxt->worker[i].bgwhandle == NULL) + continue; + + status = WaitForBackgroundWorkerShutdown(pcxt->worker[i].bgwhandle); + + /* + * If the postmaster kicked the bucket, we have no chance of cleaning + * up safely -- we won't be able to tell when our workers are actually + * dead. This doesn't necessitate a PANIC since they will all abort + * eventually, but we can't safely continue this session. + */ + if (status == BGWH_POSTMASTER_DIED) + ereport(FATAL, + (errcode(ERRCODE_ADMIN_SHUTDOWN), + errmsg("postmaster exited during a parallel transaction"))); + + /* Release memory. */ + pfree(pcxt->worker[i].bgwhandle); + pcxt->worker[i].bgwhandle = NULL; + } +} + +/* + * Destroy a parallel context. + * + * If expecting a clean exit, you should use WaitForParallelWorkersToFinish() + * first, before calling this function. When this function is invoked, any + * remaining workers are forcibly killed; the dynamic shared memory segment + * is unmapped; and we then wait (uninterruptibly) for the workers to exit. 
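+ *
+ * (Cross-reference: this is also the cleanup path taken by
+ * AtEOXact_Parallel() and AtEOSubXact_Parallel() below when a transaction
+ * or subtransaction ends while parallel contexts remain in pcxt_list.)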
+ */ +void +DestroyParallelContext(ParallelContext *pcxt) +{ + int i; + + /* + * Be careful about order of operations here! We remove the parallel + * context from the list before we do anything else; otherwise, if an + * error occurs during a subsequent step, we might try to nuke it again + * from AtEOXact_Parallel or AtEOSubXact_Parallel. + */ + dlist_delete(&pcxt->node); + + /* Kill each worker in turn, and forget their error queues. */ + if (pcxt->worker != NULL) + { + for (i = 0; i < pcxt->nworkers_launched; ++i) + { + if (pcxt->worker[i].error_mqh != NULL) + { + TerminateBackgroundWorker(pcxt->worker[i].bgwhandle); + + shm_mq_detach(pcxt->worker[i].error_mqh); + pcxt->worker[i].error_mqh = NULL; + } + } + } + + /* + * If we have allocated a shared memory segment, detach it. This will + * implicitly detach the error queues, and any other shared memory queues, + * stored there. + */ + if (pcxt->seg != NULL) + { + dsm_detach(pcxt->seg); + pcxt->seg = NULL; + } + + /* + * If this parallel context is actually in backend-private memory rather + * than shared memory, free that memory instead. + */ + if (pcxt->private_memory != NULL) + { + pfree(pcxt->private_memory); + pcxt->private_memory = NULL; + } + + /* + * We can't finish transaction commit or abort until all of the workers + * have exited. This means, in particular, that we can't respond to + * interrupts at this stage. + */ + HOLD_INTERRUPTS(); + WaitForParallelWorkersToExit(pcxt); + RESUME_INTERRUPTS(); + + /* Free the worker array itself. */ + if (pcxt->worker != NULL) + { + pfree(pcxt->worker); + pcxt->worker = NULL; + } + + /* Free memory. */ + pfree(pcxt->library_name); + pfree(pcxt->function_name); + pfree(pcxt); +} + +/* + * Are there any parallel contexts currently active? + */ +bool +ParallelContextActive(void) +{ + return !dlist_is_empty(&pcxt_list); +} + +/* + * Handle receipt of an interrupt indicating a parallel worker message. + * + * Note: this is called within a signal handler! All we can do is set + * a flag that will cause the next CHECK_FOR_INTERRUPTS() to invoke + * HandleParallelMessages(). + */ +void +HandleParallelMessageInterrupt(void) +{ + InterruptPending = true; + ParallelMessagePending = true; + SetLatch(MyLatch); +} + +/* + * Handle any queued protocol messages received from parallel workers. + */ +void +HandleParallelMessages(void) +{ + dlist_iter iter; + MemoryContext oldcontext; + + static MemoryContext hpm_context = NULL; + + /* + * This is invoked from ProcessInterrupts(), and since some of the + * functions it calls contain CHECK_FOR_INTERRUPTS(), there is a potential + * for recursive calls if more signals are received while this runs. It's + * unclear that recursive entry would be safe, and it doesn't seem useful + * even if it is safe, so let's block interrupts until done. + */ + HOLD_INTERRUPTS(); + + /* + * Moreover, CurrentMemoryContext might be pointing almost anywhere. We + * don't want to risk leaking data into long-lived contexts, so let's do + * our work here in a private context that we can reset on each use. + */ + if (hpm_context == NULL) /* first time through? */ + hpm_context = AllocSetContextCreate(TopMemoryContext, + "HandleParallelMessages", + ALLOCSET_DEFAULT_SIZES); + else + MemoryContextReset(hpm_context); + + oldcontext = MemoryContextSwitchTo(hpm_context); + + /* OK to process messages. Reset the flag saying there are more to do. 
*/ + ParallelMessagePending = false; + + dlist_foreach(iter, &pcxt_list) + { + ParallelContext *pcxt; + int i; + + pcxt = dlist_container(ParallelContext, node, iter.cur); + if (pcxt->worker == NULL) + continue; + + for (i = 0; i < pcxt->nworkers_launched; ++i) + { + /* + * Read as many messages as we can from each worker, but stop when + * either (1) the worker's error queue goes away, which can happen + * if we receive a Terminate message from the worker; or (2) no + * more messages can be read from the worker without blocking. + */ + while (pcxt->worker[i].error_mqh != NULL) + { + shm_mq_result res; + Size nbytes; + void *data; + + res = shm_mq_receive(pcxt->worker[i].error_mqh, &nbytes, + &data, true); + if (res == SHM_MQ_WOULD_BLOCK) + break; + else if (res == SHM_MQ_SUCCESS) + { + StringInfoData msg; + + initStringInfo(&msg); + appendBinaryStringInfo(&msg, data, nbytes); + HandleParallelMessage(pcxt, i, &msg); + pfree(msg.data); + } + else + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("lost connection to parallel worker"))); + } + } + } + + MemoryContextSwitchTo(oldcontext); + + /* Might as well clear the context on our way out */ + MemoryContextReset(hpm_context); + + RESUME_INTERRUPTS(); +} + +/* + * Handle a single protocol message received from a single parallel worker. + */ +static void +HandleParallelMessage(ParallelContext *pcxt, int i, StringInfo msg) +{ + char msgtype; + + if (pcxt->known_attached_workers != NULL && + !pcxt->known_attached_workers[i]) + { + pcxt->known_attached_workers[i] = true; + pcxt->nknown_attached_workers++; + } + + msgtype = pq_getmsgbyte(msg); + + switch (msgtype) + { + case 'K': /* BackendKeyData */ + { + int32 pid = pq_getmsgint(msg, 4); + + (void) pq_getmsgint(msg, 4); /* discard cancel key */ + (void) pq_getmsgend(msg); + pcxt->worker[i].pid = pid; + break; + } + + case 'E': /* ErrorResponse */ + case 'N': /* NoticeResponse */ + { + ErrorData edata; + ErrorContextCallback *save_error_context_stack; + + /* Parse ErrorResponse or NoticeResponse. */ + pq_parse_errornotice(msg, &edata); + + /* Death of a worker isn't enough justification for suicide. */ + edata.elevel = Min(edata.elevel, ERROR); + + /* + * If desired, add a context line to show that this is a + * message propagated from a parallel worker. Otherwise, it + * can sometimes be confusing to understand what actually + * happened. (We don't do this in FORCE_PARALLEL_REGRESS mode + * because it causes test-result instability depending on + * whether a parallel worker is actually used or not.) + */ + if (force_parallel_mode != FORCE_PARALLEL_REGRESS) + { + if (edata.context) + edata.context = psprintf("%s\n%s", edata.context, + _("parallel worker")); + else + edata.context = pstrdup(_("parallel worker")); + } + + /* + * Context beyond that should use the error context callbacks + * that were in effect when the ParallelContext was created, + * not the current ones. + */ + save_error_context_stack = error_context_stack; + error_context_stack = pcxt->error_context_stack; + + /* Rethrow error or print notice. */ + ThrowErrorData(&edata); + + /* Not an error, so restore previous context stack. */ + error_context_stack = save_error_context_stack; + + break; + } + + case 'A': /* NotifyResponse */ + { + /* Propagate NotifyResponse. 
*/ + int32 pid; + const char *channel; + const char *payload; + + pid = pq_getmsgint(msg, 4); + channel = pq_getmsgrawstring(msg); + payload = pq_getmsgrawstring(msg); + pq_endmessage(msg); + + NotifyMyFrontEnd(channel, payload, pid); + + break; + } + + case 'X': /* Terminate, indicating clean exit */ + { + shm_mq_detach(pcxt->worker[i].error_mqh); + pcxt->worker[i].error_mqh = NULL; + break; + } + + default: + { + elog(ERROR, "unrecognized message type received from parallel worker: %c (message length %d bytes)", + msgtype, msg->len); + } + } +} + +/* + * End-of-subtransaction cleanup for parallel contexts. + * + * Currently, it's forbidden to enter or leave a subtransaction while + * parallel mode is in effect, so we could just blow away everything. But + * we may want to relax that restriction in the future, so this code + * contemplates that there may be multiple subtransaction IDs in pcxt_list. + */ +void +AtEOSubXact_Parallel(bool isCommit, SubTransactionId mySubId) +{ + while (!dlist_is_empty(&pcxt_list)) + { + ParallelContext *pcxt; + + pcxt = dlist_head_element(ParallelContext, node, &pcxt_list); + if (pcxt->subid != mySubId) + break; + if (isCommit) + elog(WARNING, "leaked parallel context"); + DestroyParallelContext(pcxt); + } +} + +/* + * End-of-transaction cleanup for parallel contexts. + */ +void +AtEOXact_Parallel(bool isCommit) +{ + while (!dlist_is_empty(&pcxt_list)) + { + ParallelContext *pcxt; + + pcxt = dlist_head_element(ParallelContext, node, &pcxt_list); + if (isCommit) + elog(WARNING, "leaked parallel context"); + DestroyParallelContext(pcxt); + } +} + +/* + * Main entrypoint for parallel workers. + */ +void +ParallelWorkerMain(Datum main_arg) +{ + dsm_segment *seg; + shm_toc *toc; + FixedParallelState *fps; + char *error_queue_space; + shm_mq *mq; + shm_mq_handle *mqh; + char *libraryspace; + char *entrypointstate; + char *library_name; + char *function_name; + parallel_worker_main_type entrypt; + char *gucspace; + char *combocidspace; + char *tsnapspace; + char *asnapspace; + char *tstatespace; + char *pendingsyncsspace; + char *reindexspace; + char *relmapperspace; + char *uncommittedenumsspace; + StringInfoData msgbuf; + char *session_dsm_handle_space; + Snapshot tsnapshot; + Snapshot asnapshot; + + /* Set flag to indicate that we're initializing a parallel worker. */ + InitializingParallelWorker = true; + + /* Establish signal handlers. */ + pqsignal(SIGTERM, die); + BackgroundWorkerUnblockSignals(); + + /* Determine and set our parallel worker number. */ + Assert(ParallelWorkerNumber == -1); + memcpy(&ParallelWorkerNumber, MyBgworkerEntry->bgw_extra, sizeof(int)); + + /* Set up a memory context to work in, just for cleanliness. */ + CurrentMemoryContext = AllocSetContextCreate(TopMemoryContext, + "Parallel worker", + ALLOCSET_DEFAULT_SIZES); + + /* + * Attach to the dynamic shared memory segment for the parallel query, and + * find its table of contents. + * + * Note: at this point, we have not created any ResourceOwner in this + * process. This will result in our DSM mapping surviving until process + * exit, which is fine. If there were a ResourceOwner, it would acquire + * ownership of the mapping, but we have no need for that. 
+ */ + seg = dsm_attach(DatumGetUInt32(main_arg)); + if (seg == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not map dynamic shared memory segment"))); + toc = shm_toc_attach(PARALLEL_MAGIC, dsm_segment_address(seg)); + if (toc == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid magic number in dynamic shared memory segment"))); + + /* Look up fixed parallel state. */ + fps = shm_toc_lookup(toc, PARALLEL_KEY_FIXED, false); + MyFixedParallelState = fps; + + /* Arrange to signal the leader if we exit. */ + ParallelLeaderPid = fps->parallel_leader_pid; + ParallelLeaderBackendId = fps->parallel_leader_backend_id; + before_shmem_exit(ParallelWorkerShutdown, PointerGetDatum(seg)); + + /* + * Now we can find and attach to the error queue provided for us. That's + * good, because until we do that, any errors that happen here will not be + * reported back to the process that requested that this worker be + * launched. + */ + error_queue_space = shm_toc_lookup(toc, PARALLEL_KEY_ERROR_QUEUE, false); + mq = (shm_mq *) (error_queue_space + + ParallelWorkerNumber * PARALLEL_ERROR_QUEUE_SIZE); + shm_mq_set_sender(mq, MyProc); + mqh = shm_mq_attach(mq, seg, NULL); + pq_redirect_to_shm_mq(seg, mqh); + pq_set_parallel_leader(fps->parallel_leader_pid, + fps->parallel_leader_backend_id); + + /* + * Send a BackendKeyData message to the process that initiated parallelism + * so that it has access to our PID before it receives any other messages + * from us. Our cancel key is sent, too, since that's the way the + * protocol message is defined, but it won't actually be used for anything + * in this case. + */ + pq_beginmessage(&msgbuf, 'K'); + pq_sendint32(&msgbuf, (int32) MyProcPid); + pq_sendint32(&msgbuf, (int32) MyCancelKey); + pq_endmessage(&msgbuf); + + /* + * Hooray! Primary initialization is complete. Now, we need to set up our + * backend-local state to match the original backend. + */ + + /* + * Join locking group. We must do this before anything that could try to + * acquire a heavyweight lock, because any heavyweight locks acquired to + * this point could block either directly against the parallel group + * leader or against some process which in turn waits for a lock that + * conflicts with the parallel group leader, causing an undetected + * deadlock. (If we can't join the lock group, the leader has gone away, + * so just exit quietly.) + */ + if (!BecomeLockGroupMember(fps->parallel_leader_pgproc, + fps->parallel_leader_pid)) + return; + + /* + * Restore transaction and statement start-time timestamps. This must + * happen before anything that would start a transaction, else asserts in + * xact.c will fire. + */ + SetParallelStartTimestamps(fps->xact_ts, fps->stmt_ts); + + /* + * Identify the entry point to be called. In theory this could result in + * loading an additional library, though most likely the entry point is in + * the core backend or in a library we just loaded. + */ + entrypointstate = shm_toc_lookup(toc, PARALLEL_KEY_ENTRYPOINT, false); + library_name = entrypointstate; + function_name = entrypointstate + strlen(library_name) + 1; + + entrypt = LookupParallelWorkerFunction(library_name, function_name); + + /* Restore database connection. */ + BackgroundWorkerInitializeConnectionByOid(fps->database_id, + fps->authenticated_user_id, + 0); + + /* + * Set the client encoding to the database encoding, since that is what + * the leader will expect. 
+ */ + SetClientEncoding(GetDatabaseEncoding()); + + /* + * Load libraries that were loaded by original backend. We want to do + * this before restoring GUCs, because the libraries might define custom + * variables. + */ + libraryspace = shm_toc_lookup(toc, PARALLEL_KEY_LIBRARY, false); + StartTransactionCommand(); + RestoreLibraryState(libraryspace); + + /* Restore GUC values from launching backend. */ + gucspace = shm_toc_lookup(toc, PARALLEL_KEY_GUC, false); + RestoreGUCState(gucspace); + CommitTransactionCommand(); + + /* Crank up a transaction state appropriate to a parallel worker. */ + tstatespace = shm_toc_lookup(toc, PARALLEL_KEY_TRANSACTION_STATE, false); + StartParallelWorkerTransaction(tstatespace); + + /* Restore combo CID state. */ + combocidspace = shm_toc_lookup(toc, PARALLEL_KEY_COMBO_CID, false); + RestoreComboCIDState(combocidspace); + + /* Attach to the per-session DSM segment and contained objects. */ + session_dsm_handle_space = + shm_toc_lookup(toc, PARALLEL_KEY_SESSION_DSM, false); + AttachSession(*(dsm_handle *) session_dsm_handle_space); + + /* + * If the transaction isolation level is REPEATABLE READ or SERIALIZABLE, + * the leader has serialized the transaction snapshot and we must restore + * it. At lower isolation levels, there is no transaction-lifetime + * snapshot, but we need TransactionXmin to get set to a value which is + * less than or equal to the xmin of every snapshot that will be used by + * this worker. The easiest way to accomplish that is to install the + * active snapshot as the transaction snapshot. Code running in this + * parallel worker might take new snapshots via GetTransactionSnapshot() + * or GetLatestSnapshot(), but it shouldn't have any way of acquiring a + * snapshot older than the active snapshot. + */ + asnapspace = shm_toc_lookup(toc, PARALLEL_KEY_ACTIVE_SNAPSHOT, false); + tsnapspace = shm_toc_lookup(toc, PARALLEL_KEY_TRANSACTION_SNAPSHOT, true); + asnapshot = RestoreSnapshot(asnapspace); + tsnapshot = tsnapspace ? RestoreSnapshot(tsnapspace) : asnapshot; + RestoreTransactionSnapshot(tsnapshot, + fps->parallel_leader_pgproc); + PushActiveSnapshot(asnapshot); + + /* + * We've changed which tuples we can see, and must therefore invalidate + * system caches. + */ + InvalidateSystemCaches(); + + /* + * Restore current role id. Skip verifying whether session user is + * allowed to become this role and blindly restore the leader's state for + * current role. + */ + SetCurrentRoleId(fps->outer_user_id, fps->is_superuser); + + /* Restore user ID and security context. */ + SetUserIdAndSecContext(fps->current_user_id, fps->sec_context); + + /* Restore temp-namespace state to ensure search path matches leader's. */ + SetTempNamespaceState(fps->temp_namespace_id, + fps->temp_toast_namespace_id); + + /* Restore pending syncs. */ + pendingsyncsspace = shm_toc_lookup(toc, PARALLEL_KEY_PENDING_SYNCS, + false); + RestorePendingSyncs(pendingsyncsspace); + + /* Restore reindex state. */ + reindexspace = shm_toc_lookup(toc, PARALLEL_KEY_REINDEX_STATE, false); + RestoreReindexState(reindexspace); + + /* Restore relmapper state. */ + relmapperspace = shm_toc_lookup(toc, PARALLEL_KEY_RELMAPPER_STATE, false); + RestoreRelationMap(relmapperspace); + + /* Restore uncommitted enums. */ + uncommittedenumsspace = shm_toc_lookup(toc, PARALLEL_KEY_UNCOMMITTEDENUMS, + false); + RestoreUncommittedEnums(uncommittedenumsspace); + + /* Attach to the leader's serializable transaction, if SERIALIZABLE. 
*/ + AttachSerializableXact(fps->serializable_xact_handle); + + /* + * We've initialized all of our state now; nothing should change + * hereafter. + */ + InitializingParallelWorker = false; + EnterParallelMode(); + + /* + * Time to do the real work: invoke the caller-supplied code. + */ + entrypt(seg, toc); + + /* Must exit parallel mode to pop active snapshot. */ + ExitParallelMode(); + + /* Must pop active snapshot so snapmgr.c doesn't complain. */ + PopActiveSnapshot(); + + /* Shut down the parallel-worker transaction. */ + EndParallelWorkerTransaction(); + + /* Detach from the per-session DSM segment. */ + DetachSession(); + + /* Report success. */ + pq_putmessage('X', NULL, 0); +} + +/* + * Update shared memory with the ending location of the last WAL record we + * wrote, if it's greater than the value already stored there. + */ +void +ParallelWorkerReportLastRecEnd(XLogRecPtr last_xlog_end) +{ + FixedParallelState *fps = MyFixedParallelState; + + Assert(fps != NULL); + SpinLockAcquire(&fps->mutex); + if (fps->last_xlog_end < last_xlog_end) + fps->last_xlog_end = last_xlog_end; + SpinLockRelease(&fps->mutex); +} + +/* + * Make sure the leader tries to read from our error queue one more time. + * This guards against the case where we exit uncleanly without sending an + * ErrorResponse to the leader, for example because some code calls proc_exit + * directly. + * + * Also explicitly detach from dsm segment so that subsystems using + * on_dsm_detach() have a chance to send stats before the stats subsystem is + * shut down as part of a before_shmem_exit() hook. + * + * One might think this could instead be solved by carefully ordering the + * attaching to dsm segments, so that the pgstats segments get detached from + * later than the parallel query one. That turns out to not work because the + * stats hash might need to grow which can cause new segments to be allocated, + * which then will be detached from earlier. + */ +static void +ParallelWorkerShutdown(int code, Datum arg) +{ + SendProcSignal(ParallelLeaderPid, + PROCSIG_PARALLEL_MESSAGE, + ParallelLeaderBackendId); + + dsm_detach((dsm_segment *) DatumGetPointer(arg)); +} + +/* + * Look up (and possibly load) a parallel worker entry point function. + * + * For functions contained in the core code, we use library name "postgres" + * and consult the InternalParallelWorkers array. External functions are + * looked up, and loaded if necessary, using load_external_function(). + * + * The point of this is to pass function names as strings across process + * boundaries. We can't pass actual function addresses because of the + * possibility that the function has been loaded at a different address + * in a different process. This is obviously a hazard for functions in + * loadable libraries, but it can happen even for functions in the core code + * on platforms using EXEC_BACKEND (e.g., Windows). + * + * At some point it might be worthwhile to get rid of InternalParallelWorkers[] + * in favor of applying load_external_function() for core functions too; + * but that raises portability issues that are not worth addressing now. + */ +static parallel_worker_main_type +LookupParallelWorkerFunction(const char *libraryname, const char *funcname) +{ + /* + * If the function is to be loaded from postgres itself, search the + * InternalParallelWorkers array. 
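As the comment above explains, the leader identifies the worker entrypoint by a (library name, function name) string pair, and the function itself must match parallel_worker_main_type. Below is a hedged sketch of such an entrypoint; the function name, the TOC key, and the shared-state layout are hypothetical placeholders, not part of parallel.c.

#include "postgres.h"

#include "access/parallel.h"
#include "storage/dsm.h"
#include "storage/shm_toc.h"

/* Hypothetical TOC key; real callers define their own 64-bit keys. */
#define MY_SKETCH_TOC_KEY	UINT64CONST(0xF000000000000001)

/*
 * Sketch of an extension entrypoint that LookupParallelWorkerFunction()
 * would resolve from CreateParallelContext("myextension", "my_worker_main", n).
 */
void
my_worker_main(dsm_segment *seg, shm_toc *toc)
{
	char	   *shared_state;

	/* Fetch whatever state the leader published in the table of contents. */
	shared_state = shm_toc_lookup(toc, MY_SKETCH_TOC_KEY, false);

	/* ... perform this worker's share of the job using shared_state ... */
	(void) shared_state;
}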
+ */ + if (strcmp(libraryname, "postgres") == 0) + { + int i; + + for (i = 0; i < lengthof(InternalParallelWorkers); i++) + { + if (strcmp(InternalParallelWorkers[i].fn_name, funcname) == 0) + return InternalParallelWorkers[i].fn_addr; + } + + /* We can only reach this by programming error. */ + elog(ERROR, "internal function \"%s\" not found", funcname); + } + + /* Otherwise load from external library. */ + return (parallel_worker_main_type) + load_external_function(libraryname, funcname, true, NULL); +} diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c new file mode 100644 index 0000000..6bb4de3 --- /dev/null +++ b/src/backend/access/transam/rmgr.c @@ -0,0 +1,161 @@ +/* + * rmgr.c + * + * Resource managers definition + * + * src/backend/access/transam/rmgr.c + */ +#include "postgres.h" + +#include "access/brin_xlog.h" +#include "access/clog.h" +#include "access/commit_ts.h" +#include "access/generic_xlog.h" +#include "access/ginxlog.h" +#include "access/gistxlog.h" +#include "access/hash_xlog.h" +#include "access/heapam_xlog.h" +#include "access/multixact.h" +#include "access/nbtxlog.h" +#include "access/spgxlog.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "catalog/storage_xlog.h" +#include "commands/dbcommands_xlog.h" +#include "commands/sequence.h" +#include "commands/tablespace.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "replication/decode.h" +#include "replication/message.h" +#include "replication/origin.h" +#include "storage/standby.h" +#include "utils/builtins.h" +#include "utils/relmapper.h" + +/* must be kept in sync with RmgrData definition in xlog_internal.h */ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode) \ + { name, redo, desc, identify, startup, cleanup, mask, decode }, + +RmgrData RmgrTable[RM_MAX_ID + 1] = { +#include "access/rmgrlist.h" +}; + +/* + * Start up all resource managers. + */ +void +RmgrStartup(void) +{ + for (int rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (!RmgrIdExists(rmid)) + continue; + + if (RmgrTable[rmid].rm_startup != NULL) + RmgrTable[rmid].rm_startup(); + } +} + +/* + * Clean up all resource managers. + */ +void +RmgrCleanup(void) +{ + for (int rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (!RmgrIdExists(rmid)) + continue; + + if (RmgrTable[rmid].rm_cleanup != NULL) + RmgrTable[rmid].rm_cleanup(); + } +} + +/* + * Emit ERROR when we encounter a record with an RmgrId we don't + * recognize. + */ +void +RmgrNotFound(RmgrId rmid) +{ + ereport(ERROR, (errmsg("resource manager with ID %d not registered", rmid), + errhint("Include the extension module that implements this resource manager in shared_preload_libraries."))); +} + +/* + * Register a new custom WAL resource manager. + * + * Resource manager IDs must be globally unique across all extensions. Refer + * to https://wiki.postgresql.org/wiki/CustomWALResourceManagers to reserve a + * unique RmgrId for your extension, to avoid conflicts with other extension + * developers. During development, use RM_EXPERIMENTAL_ID to avoid needlessly + * reserving a new ID. 
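A hedged sketch of the registration described above, as a module loaded via shared_preload_libraries might issue it from _PG_init(): the resource manager name is a placeholder and the callbacks are stubs, not a working resource manager.

#include "postgres.h"

#include "access/rmgr.h"
#include "access/xlog_internal.h"
#include "access/xlogreader.h"
#include "fmgr.h"
#include "lib/stringinfo.h"

PG_MODULE_MAGIC;

/* Stub callbacks; a real module would replay and describe its WAL records. */
static void
my_rmgr_redo(XLogReaderState *record)
{
}

static void
my_rmgr_desc(StringInfo buf, XLogReaderState *record)
{
}

static const char *
my_rmgr_identify(uint8 info)
{
	return NULL;
}

static RmgrData my_rmgr = {
	.rm_name = "my_custom_rmgr",
	.rm_redo = my_rmgr_redo,
	.rm_desc = my_rmgr_desc,
	.rm_identify = my_rmgr_identify,
};

void
_PG_init(void)
{
	/* Only legal while shared_preload_libraries are being processed. */
	RegisterCustomRmgr(RM_EXPERIMENTAL_ID, &my_rmgr);
}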
+ */ +void +RegisterCustomRmgr(RmgrId rmid, RmgrData *rmgr) +{ + if (rmgr->rm_name == NULL || strlen(rmgr->rm_name) == 0) + ereport(ERROR, (errmsg("custom resource manager name is invalid"), + errhint("Provide a non-empty name for the custom resource manager."))); + + if (!RmgrIdIsCustom(rmid)) + ereport(ERROR, (errmsg("custom resource manager ID %d is out of range", rmid), + errhint("Provide a custom resource manager ID between %d and %d.", + RM_MIN_CUSTOM_ID, RM_MAX_CUSTOM_ID))); + + if (!process_shared_preload_libraries_in_progress) + ereport(ERROR, + (errmsg("failed to register custom resource manager \"%s\" with ID %d", rmgr->rm_name, rmid), + errdetail("Custom resource manager must be registered while initializing modules in shared_preload_libraries."))); + + if (RmgrTable[rmid].rm_name != NULL) + ereport(ERROR, + (errmsg("failed to register custom resource manager \"%s\" with ID %d", rmgr->rm_name, rmid), + errdetail("Custom resource manager \"%s\" already registered with the same ID.", + RmgrTable[rmid].rm_name))); + + /* check for existing rmgr with the same name */ + for (int existing_rmid = 0; existing_rmid <= RM_MAX_ID; existing_rmid++) + { + if (!RmgrIdExists(existing_rmid)) + continue; + + if (!pg_strcasecmp(RmgrTable[existing_rmid].rm_name, rmgr->rm_name)) + ereport(ERROR, + (errmsg("failed to register custom resource manager \"%s\" with ID %d", rmgr->rm_name, rmid), + errdetail("Existing resource manager with ID %d has the same name.", existing_rmid))); + } + + /* register it */ + RmgrTable[rmid] = *rmgr; + ereport(LOG, + (errmsg("registered custom resource manager \"%s\" with ID %d", + rmgr->rm_name, rmid))); +} + +/* SQL SRF showing loaded resource managers */ +Datum +pg_get_wal_resource_managers(PG_FUNCTION_ARGS) +{ +#define PG_GET_RESOURCE_MANAGERS_COLS 3 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + Datum values[PG_GET_RESOURCE_MANAGERS_COLS]; + bool nulls[PG_GET_RESOURCE_MANAGERS_COLS] = {0}; + + InitMaterializedSRF(fcinfo, 0); + + for (int rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (!RmgrIdExists(rmid)) + continue; + values[0] = Int32GetDatum(rmid); + values[1] = CStringGetTextDatum(GetRmgr(rmid).rm_name); + values[2] = BoolGetDatum(RmgrIdIsBuiltin(rmid)); + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + } + + return (Datum) 0; +} diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c new file mode 100644 index 0000000..af57fe9 --- /dev/null +++ b/src/backend/access/transam/slru.c @@ -0,0 +1,1615 @@ +/*------------------------------------------------------------------------- + * + * slru.c + * Simple LRU buffering for transaction status logfiles + * + * We use a simple least-recently-used scheme to manage a pool of page + * buffers. Under ordinary circumstances we expect that write + * traffic will occur mostly to the latest page (and to the just-prior + * page, soon after a page transition). Read traffic will probably touch + * a larger span of pages, but in any case a fairly small number of page + * buffers should be sufficient. So, we just search the buffers using plain + * linear search; there's no need for a hashtable or anything fancy. + * The management algorithm is straight LRU except that we will never swap + * out the latest page (since we know it's going to be hit again eventually). + * + * We use a control LWLock to protect the shared data structures, plus + * per-buffer LWLocks that synchronize I/O for each buffer. 
The control lock + * must be held to examine or modify any shared state. A process that is + * reading in or writing out a page buffer does not hold the control lock, + * only the per-buffer lock for the buffer it is working on. + * + * "Holding the control lock" means exclusive lock in all cases except for + * SimpleLruReadPage_ReadOnly(); see comments for SlruRecentlyUsed() for + * the implications of that. + * + * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively + * before releasing the control lock. The per-buffer lock is released after + * completing the I/O, re-acquiring the control lock, and updating the shared + * state. (Deadlock is not possible here, because we never try to initiate + * I/O when someone else is already doing I/O on the same buffer.) + * To wait for I/O to complete, release the control lock, acquire the + * per-buffer lock in shared mode, immediately release the per-buffer lock, + * reacquire the control lock, and then recheck state (since arbitrary things + * could have happened while we didn't have the lock). + * + * As with the regular buffer manager, it is possible for another process + * to re-dirty a page that is currently being written out. This is handled + * by re-setting the page's page_dirty flag. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/slru.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <fcntl.h> +#include <sys/stat.h> +#include <unistd.h> + +#include "access/slru.h" +#include "access/transam.h" +#include "access/xlog.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/fd.h" +#include "storage/shmem.h" + +#define SlruFileName(ctl, path, seg) \ + snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg) + +/* + * During SimpleLruWriteAll(), we will usually not need to write more than one + * or two physical files, but we may need to write several pages per file. We + * can consolidate the I/O requests by leaving files open until control returns + * to SimpleLruWriteAll(). This data structure remembers which files are open. + */ +#define MAX_WRITEALL_BUFFERS 16 + +typedef struct SlruWriteAllData +{ + int num_files; /* # files actually open */ + int fd[MAX_WRITEALL_BUFFERS]; /* their FD's */ + int segno[MAX_WRITEALL_BUFFERS]; /* their log seg#s */ +} SlruWriteAllData; + +typedef struct SlruWriteAllData *SlruWriteAll; + +/* + * Populate a file tag describing a segment file. We only use the segment + * number, since we can derive everything else we need by having separate + * sync handler functions for clog, multixact etc. + */ +#define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \ +( \ + memset(&(a), 0, sizeof(FileTag)), \ + (a).handler = (xx_handler), \ + (a).segno = (xx_segno) \ +) + +/* + * Macro to mark a buffer slot "most recently used". Note multiple evaluation + * of arguments! + * + * The reason for the if-test is that there are often many consecutive + * accesses to the same page (particularly the latest page). By suppressing + * useless increments of cur_lru_count, we reduce the probability that old + * pages' counts will "wrap around" and make them appear recently used. + * + * We allow this code to be executed concurrently by multiple processes within + * SimpleLruReadPage_ReadOnly(). 
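For concreteness, the SlruFileName() macro above together with the per-segment page count determines where a logical page lives on disk; the sketch below mirrors the arithmetic used later by SlruPhysicalReadPage() and SlruPhysicalWritePage(). It is an illustration only, and the helper name is hypothetical.

#include "postgres.h"

#include "access/slru.h"

/*
 * Sketch: map a logical SLRU page number to its segment file and byte offset.
 * Each segment holds SLRU_PAGES_PER_SEGMENT (32) pages of BLCKSZ bytes.
 */
static void
locate_slru_page(SlruCtl ctl, int pageno, char *path, off_t *offset)
{
	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;

	/* Same layout as the SlruFileName() macro: e.g. "pg_xact/0000". */
	snprintf(path, MAXPGPATH, "%s/%04X", ctl->Dir, segno);
	*offset = (off_t) rpageno * BLCKSZ;
}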
As long as int reads and writes are atomic, + * this should not cause any completely-bogus values to enter the computation. + * However, it is possible for either cur_lru_count or individual + * page_lru_count entries to be "reset" to lower values than they should have, + * in case a process is delayed while it executes this macro. With care in + * SlruSelectLRUPage(), this does little harm, and in any case the absolute + * worst possible consequence is a nonoptimal choice of page to evict. The + * gain from allowing concurrent reads of SLRU pages seems worth it. + */ +#define SlruRecentlyUsed(shared, slotno) \ + do { \ + int new_lru_count = (shared)->cur_lru_count; \ + if (new_lru_count != (shared)->page_lru_count[slotno]) { \ + (shared)->cur_lru_count = ++new_lru_count; \ + (shared)->page_lru_count[slotno] = new_lru_count; \ + } \ + } while (0) + +/* Saved info for SlruReportIOError */ +typedef enum +{ + SLRU_OPEN_FAILED, + SLRU_SEEK_FAILED, + SLRU_READ_FAILED, + SLRU_WRITE_FAILED, + SLRU_FSYNC_FAILED, + SLRU_CLOSE_FAILED +} SlruErrorCause; + +static SlruErrorCause slru_errcause; +static int slru_errno; + + +static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno); +static void SimpleLruWaitIO(SlruCtl ctl, int slotno); +static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata); +static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno); +static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, + SlruWriteAll fdata); +static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid); +static int SlruSelectLRUPage(SlruCtl ctl, int pageno); + +static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, + int segpage, void *data); +static void SlruInternalDeleteSegment(SlruCtl ctl, int segno); + +/* + * Initialization of shared memory + */ + +Size +SimpleLruShmemSize(int nslots, int nlsns) +{ + Size sz; + + /* we assume nslots isn't so large as to risk overflow */ + sz = MAXALIGN(sizeof(SlruSharedData)); + sz += MAXALIGN(nslots * sizeof(char *)); /* page_buffer[] */ + sz += MAXALIGN(nslots * sizeof(SlruPageStatus)); /* page_status[] */ + sz += MAXALIGN(nslots * sizeof(bool)); /* page_dirty[] */ + sz += MAXALIGN(nslots * sizeof(int)); /* page_number[] */ + sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */ + sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */ + + if (nlsns > 0) + sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */ + + return BUFFERALIGN(sz) + BLCKSZ * nslots; +} + +/* + * Initialize, or attach to, a simple LRU cache in shared memory. + * + * ctl: address of local (unshared) control structure. + * name: name of SLRU. (This is user-visible, pick with care!) + * nslots: number of page slots to use. + * nlsns: number of LSN groups per page (set to zero if not relevant). + * ctllock: LWLock to use to control access to the shared control structure. + * subdir: PGDATA-relative subdirectory that will contain the files. + * tranche_id: LWLock tranche ID to use for the SLRU's per-buffer LWLocks. 
+ * sync_handler: which set of functions to use to handle sync requests + */ +void +SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, + LWLock *ctllock, const char *subdir, int tranche_id, + SyncRequestHandler sync_handler) +{ + SlruShared shared; + bool found; + + shared = (SlruShared) ShmemInitStruct(name, + SimpleLruShmemSize(nslots, nlsns), + &found); + + if (!IsUnderPostmaster) + { + /* Initialize locks and shared memory area */ + char *ptr; + Size offset; + int slotno; + + Assert(!found); + + memset(shared, 0, sizeof(SlruSharedData)); + + shared->ControlLock = ctllock; + + shared->num_slots = nslots; + shared->lsn_groups_per_page = nlsns; + + shared->cur_lru_count = 0; + + /* shared->latest_page_number will be set later */ + + shared->slru_stats_idx = pgstat_get_slru_index(name); + + ptr = (char *) shared; + offset = MAXALIGN(sizeof(SlruSharedData)); + shared->page_buffer = (char **) (ptr + offset); + offset += MAXALIGN(nslots * sizeof(char *)); + shared->page_status = (SlruPageStatus *) (ptr + offset); + offset += MAXALIGN(nslots * sizeof(SlruPageStatus)); + shared->page_dirty = (bool *) (ptr + offset); + offset += MAXALIGN(nslots * sizeof(bool)); + shared->page_number = (int *) (ptr + offset); + offset += MAXALIGN(nslots * sizeof(int)); + shared->page_lru_count = (int *) (ptr + offset); + offset += MAXALIGN(nslots * sizeof(int)); + + /* Initialize LWLocks */ + shared->buffer_locks = (LWLockPadded *) (ptr + offset); + offset += MAXALIGN(nslots * sizeof(LWLockPadded)); + + if (nlsns > 0) + { + shared->group_lsn = (XLogRecPtr *) (ptr + offset); + offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); + } + + ptr += BUFFERALIGN(offset); + for (slotno = 0; slotno < nslots; slotno++) + { + LWLockInitialize(&shared->buffer_locks[slotno].lock, + tranche_id); + + shared->page_buffer[slotno] = ptr; + shared->page_status[slotno] = SLRU_PAGE_EMPTY; + shared->page_dirty[slotno] = false; + shared->page_lru_count[slotno] = 0; + ptr += BLCKSZ; + } + + /* Should fit to estimated shmem size */ + Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns)); + } + else + Assert(found); + + /* + * Initialize the unshared control struct, including directory path. We + * assume caller set PagePrecedes. + */ + ctl->shared = shared; + ctl->sync_handler = sync_handler; + strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir)); +} + +/* + * Initialize (or reinitialize) a page to zeroes. + * + * The page is not actually written, just set up in shared memory. + * The slot number of the new page is returned. + * + * Control lock must be held at entry, and will be held at exit. 
+ */ +int +SimpleLruZeroPage(SlruCtl ctl, int pageno) +{ + SlruShared shared = ctl->shared; + int slotno; + + /* Find a suitable buffer slot for the page */ + slotno = SlruSelectLRUPage(ctl, pageno); + Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY || + (shared->page_status[slotno] == SLRU_PAGE_VALID && + !shared->page_dirty[slotno]) || + shared->page_number[slotno] == pageno); + + /* Mark the slot as containing this page */ + shared->page_number[slotno] = pageno; + shared->page_status[slotno] = SLRU_PAGE_VALID; + shared->page_dirty[slotno] = true; + SlruRecentlyUsed(shared, slotno); + + /* Set the buffer to zeroes */ + MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + + /* Set the LSNs for this new page to zero */ + SimpleLruZeroLSNs(ctl, slotno); + + /* Assume this page is now the latest active page */ + shared->latest_page_number = pageno; + + /* update the stats counter of zeroed pages */ + pgstat_count_slru_page_zeroed(shared->slru_stats_idx); + + return slotno; +} + +/* + * Zero all the LSNs we store for this slru page. + * + * This should be called each time we create a new page, and each time we read + * in a page from disk into an existing buffer. (Such an old page cannot + * have any interesting LSNs, since we'd have flushed them before writing + * the page in the first place.) + * + * This assumes that InvalidXLogRecPtr is bitwise-all-0. + */ +static void +SimpleLruZeroLSNs(SlruCtl ctl, int slotno) +{ + SlruShared shared = ctl->shared; + + if (shared->lsn_groups_per_page > 0) + MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0, + shared->lsn_groups_per_page * sizeof(XLogRecPtr)); +} + +/* + * Wait for any active I/O on a page slot to finish. (This does not + * guarantee that new I/O hasn't been started before we return, though. + * In fact the slot might not even contain the same page anymore.) + * + * Control lock must be held at entry, and will be held at exit. + */ +static void +SimpleLruWaitIO(SlruCtl ctl, int slotno) +{ + SlruShared shared = ctl->shared; + + /* See notes at top of file */ + LWLockRelease(shared->ControlLock); + LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED); + LWLockRelease(&shared->buffer_locks[slotno].lock); + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + + /* + * If the slot is still in an io-in-progress state, then either someone + * already started a new I/O on the slot, or a previous I/O failed and + * neglected to reset the page state. That shouldn't happen, really, but + * it seems worth a few extra cycles to check and recover from it. We can + * cheaply test for failure by seeing if the buffer lock is still held (we + * assume that transaction abort would release the lock). + */ + if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS || + shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS) + { + if (LWLockConditionalAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED)) + { + /* indeed, the I/O must have failed */ + if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS) + shared->page_status[slotno] = SLRU_PAGE_EMPTY; + else /* write_in_progress */ + { + shared->page_status[slotno] = SLRU_PAGE_VALID; + shared->page_dirty[slotno] = true; + } + LWLockRelease(&shared->buffer_locks[slotno].lock); + } + } +} + +/* + * Find a page in a shared buffer, reading it in if necessary. + * The page number must correspond to an already-initialized page. 
+ * + * If write_ok is true then it is OK to return a page that is in + * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure + * that modification of the page is safe. If write_ok is false then we + * will not return the page until it is not undergoing active I/O. + * + * The passed-in xid is used only for error reporting, and may be + * InvalidTransactionId if no specific xid is associated with the action. + * + * Return value is the shared-buffer slot number now holding the page. + * The buffer's LRU access info is updated. + * + * Control lock must be held at entry, and will be held at exit. + */ +int +SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, + TransactionId xid) +{ + SlruShared shared = ctl->shared; + + /* Outer loop handles restart if we must wait for someone else's I/O */ + for (;;) + { + int slotno; + bool ok; + + /* See if page already is in memory; if not, pick victim slot */ + slotno = SlruSelectLRUPage(ctl, pageno); + + /* Did we find the page in memory? */ + if (shared->page_number[slotno] == pageno && + shared->page_status[slotno] != SLRU_PAGE_EMPTY) + { + /* + * If page is still being read in, we must wait for I/O. Likewise + * if the page is being written and the caller said that's not OK. + */ + if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS || + (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS && + !write_ok)) + { + SimpleLruWaitIO(ctl, slotno); + /* Now we must recheck state from the top */ + continue; + } + /* Otherwise, it's ready to use */ + SlruRecentlyUsed(shared, slotno); + + /* update the stats counter of pages found in the SLRU */ + pgstat_count_slru_page_hit(shared->slru_stats_idx); + + return slotno; + } + + /* We found no match; assert we selected a freeable slot */ + Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY || + (shared->page_status[slotno] == SLRU_PAGE_VALID && + !shared->page_dirty[slotno])); + + /* Mark the slot read-busy */ + shared->page_number[slotno] = pageno; + shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS; + shared->page_dirty[slotno] = false; + + /* Acquire per-buffer lock (cannot deadlock, see notes at top) */ + LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE); + + /* Release control lock while doing I/O */ + LWLockRelease(shared->ControlLock); + + /* Do the read */ + ok = SlruPhysicalReadPage(ctl, pageno, slotno); + + /* Set the LSNs for this newly read-in page to zero */ + SimpleLruZeroLSNs(ctl, slotno); + + /* Re-acquire control lock and update page state */ + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + + Assert(shared->page_number[slotno] == pageno && + shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS && + !shared->page_dirty[slotno]); + + shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY; + + LWLockRelease(&shared->buffer_locks[slotno].lock); + + /* Now it's okay to ereport if we failed */ + if (!ok) + SlruReportIOError(ctl, pageno, xid); + + SlruRecentlyUsed(shared, slotno); + + /* update the stats counter of pages not found in SLRU */ + pgstat_count_slru_page_read(shared->slru_stats_idx); + + return slotno; + } +} + +/* + * Find a page in a shared buffer, reading it in if necessary. + * The page number must correspond to an already-initialized page. + * The caller must intend only read-only access to the page. + * + * The passed-in xid is used only for error reporting, and may be + * InvalidTransactionId if no specific xid is associated with the action. 
+ * + * Return value is the shared-buffer slot number now holding the page. + * The buffer's LRU access info is updated. + * + * Control lock must NOT be held at entry, but will be held at exit. + * It is unspecified whether the lock will be shared or exclusive. + */ +int +SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid) +{ + SlruShared shared = ctl->shared; + int slotno; + + /* Try to find the page while holding only shared lock */ + LWLockAcquire(shared->ControlLock, LW_SHARED); + + /* See if page is already in a buffer */ + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + if (shared->page_number[slotno] == pageno && + shared->page_status[slotno] != SLRU_PAGE_EMPTY && + shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS) + { + /* See comments for SlruRecentlyUsed macro */ + SlruRecentlyUsed(shared, slotno); + + /* update the stats counter of pages found in the SLRU */ + pgstat_count_slru_page_hit(shared->slru_stats_idx); + + return slotno; + } + } + + /* No luck, so switch to normal exclusive lock and do regular read */ + LWLockRelease(shared->ControlLock); + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + + return SimpleLruReadPage(ctl, pageno, true, xid); +} + +/* + * Write a page from a shared buffer, if necessary. + * Does nothing if the specified slot is not dirty. + * + * NOTE: only one write attempt is made here. Hence, it is possible that + * the page is still dirty at exit (if someone else re-dirtied it during + * the write). However, we *do* attempt a fresh write even if the page + * is already being written; this is for checkpoints. + * + * Control lock must be held at entry, and will be held at exit. + */ +static void +SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata) +{ + SlruShared shared = ctl->shared; + int pageno = shared->page_number[slotno]; + bool ok; + + /* If a write is in progress, wait for it to finish */ + while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS && + shared->page_number[slotno] == pageno) + { + SimpleLruWaitIO(ctl, slotno); + } + + /* + * Do nothing if page is not dirty, or if buffer no longer contains the + * same page we were called for. + */ + if (!shared->page_dirty[slotno] || + shared->page_status[slotno] != SLRU_PAGE_VALID || + shared->page_number[slotno] != pageno) + return; + + /* + * Mark the slot write-busy, and clear the dirtybit. After this point, a + * transaction status update on this page will mark it dirty again. 
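A hedged sketch of the usual read-only caller pattern for SimpleLruReadPage_ReadOnly() above, modeled loosely on how clog.c fetches a transaction's status byte: the function acquires the control lock itself and leaves it held, so the caller reads from the returned slot's buffer and then releases the lock. The helper name and the page/byte arithmetic are placeholders.

#include "postgres.h"

#include "access/slru.h"
#include "storage/lwlock.h"

/*
 * Sketch: read one byte from an SLRU page without holding the control lock
 * beforehand.  SimpleLruReadPage_ReadOnly() returns with the control lock
 * held (shared or exclusive), so it must be released afterwards.
 */
static int
read_slru_byte(SlruCtl ctl, int pageno, int byteno, TransactionId xid)
{
	int			slotno;
	int			value;

	slotno = SimpleLruReadPage_ReadOnly(ctl, pageno, xid);
	value = ctl->shared->page_buffer[slotno][byteno];

	LWLockRelease(ctl->shared->ControlLock);

	return value;
}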
+ */ + shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS; + shared->page_dirty[slotno] = false; + + /* Acquire per-buffer lock (cannot deadlock, see notes at top) */ + LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE); + + /* Release control lock while doing I/O */ + LWLockRelease(shared->ControlLock); + + /* Do the write */ + ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata); + + /* If we failed, and we're in a flush, better close the files */ + if (!ok && fdata) + { + int i; + + for (i = 0; i < fdata->num_files; i++) + CloseTransientFile(fdata->fd[i]); + } + + /* Re-acquire control lock and update page state */ + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + + Assert(shared->page_number[slotno] == pageno && + shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS); + + /* If we failed to write, mark the page dirty again */ + if (!ok) + shared->page_dirty[slotno] = true; + + shared->page_status[slotno] = SLRU_PAGE_VALID; + + LWLockRelease(&shared->buffer_locks[slotno].lock); + + /* Now it's okay to ereport if we failed */ + if (!ok) + SlruReportIOError(ctl, pageno, InvalidTransactionId); + + /* If part of a checkpoint, count this as a buffer written. */ + if (fdata) + CheckpointStats.ckpt_bufs_written++; +} + +/* + * Wrapper of SlruInternalWritePage, for external callers. + * fdata is always passed a NULL here. + */ +void +SimpleLruWritePage(SlruCtl ctl, int slotno) +{ + SlruInternalWritePage(ctl, slotno, NULL); +} + +/* + * Return whether the given page exists on disk. + * + * A false return means that either the file does not exist, or that it's not + * large enough to contain the given page. + */ +bool +SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno) +{ + int segno = pageno / SLRU_PAGES_PER_SEGMENT; + int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; + int offset = rpageno * BLCKSZ; + char path[MAXPGPATH]; + int fd; + bool result; + off_t endpos; + + /* update the stats counter of checked pages */ + pgstat_count_slru_page_exists(ctl->shared->slru_stats_idx); + + SlruFileName(ctl, path, segno); + + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + { + /* expected: file doesn't exist */ + if (errno == ENOENT) + return false; + + /* report error normally */ + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + SlruReportIOError(ctl, pageno, 0); + } + + if ((endpos = lseek(fd, 0, SEEK_END)) < 0) + { + slru_errcause = SLRU_SEEK_FAILED; + slru_errno = errno; + SlruReportIOError(ctl, pageno, 0); + } + + result = endpos >= (off_t) (offset + BLCKSZ); + + if (CloseTransientFile(fd) != 0) + { + slru_errcause = SLRU_CLOSE_FAILED; + slru_errno = errno; + return false; + } + + return result; +} + +/* + * Physical read of a (previously existing) page into a buffer slot + * + * On failure, we cannot just ereport(ERROR) since caller has put state in + * shared memory that must be undone. So, we return false and save enough + * info in static variables to let SlruReportIOError make the report. + * + * For now, assume it's not worth keeping a file pointer open across + * read/write operations. We could cache one virtual file pointer ... 
+ */ +static bool +SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) +{ + SlruShared shared = ctl->shared; + int segno = pageno / SLRU_PAGES_PER_SEGMENT; + int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; + off_t offset = rpageno * BLCKSZ; + char path[MAXPGPATH]; + int fd; + + SlruFileName(ctl, path, segno); + + /* + * In a crash-and-restart situation, it's possible for us to receive + * commands to set the commit status of transactions whose bits are in + * already-truncated segments of the commit log (see notes in + * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case + * where the file doesn't exist, and return zeroes instead. + */ + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + { + if (errno != ENOENT || !InRecovery) + { + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + return false; + } + + ereport(LOG, + (errmsg("file \"%s\" doesn't exist, reading as zeroes", + path))); + MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + return true; + } + + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_SLRU_READ); + if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ) + { + pgstat_report_wait_end(); + slru_errcause = SLRU_READ_FAILED; + slru_errno = errno; + CloseTransientFile(fd); + return false; + } + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + { + slru_errcause = SLRU_CLOSE_FAILED; + slru_errno = errno; + return false; + } + + return true; +} + +/* + * Physical write of a page from a buffer slot + * + * On failure, we cannot just ereport(ERROR) since caller has put state in + * shared memory that must be undone. So, we return false and save enough + * info in static variables to let SlruReportIOError make the report. + * + * For now, assume it's not worth keeping a file pointer open across + * independent read/write operations. We do batch operations during + * SimpleLruWriteAll, though. + * + * fdata is NULL for a standalone write, pointer to open-file info during + * SimpleLruWriteAll. + */ +static bool +SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruWriteAll fdata) +{ + SlruShared shared = ctl->shared; + int segno = pageno / SLRU_PAGES_PER_SEGMENT; + int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; + off_t offset = rpageno * BLCKSZ; + char path[MAXPGPATH]; + int fd = -1; + + /* update the stats counter of written pages */ + pgstat_count_slru_page_written(shared->slru_stats_idx); + + /* + * Honor the write-WAL-before-data rule, if appropriate, so that we do not + * write out data before associated WAL records. This is the same action + * performed during FlushBuffer() in the main buffer manager. + */ + if (shared->group_lsn != NULL) + { + /* + * We must determine the largest async-commit LSN for the page. This + * is a bit tedious, but since this entire function is a slow path + * anyway, it seems better to do this here than to maintain a per-page + * LSN variable (which'd need an extra comparison in the + * transaction-commit path). + */ + XLogRecPtr max_lsn; + int lsnindex, + lsnoff; + + lsnindex = slotno * shared->lsn_groups_per_page; + max_lsn = shared->group_lsn[lsnindex++]; + for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++) + { + XLogRecPtr this_lsn = shared->group_lsn[lsnindex++]; + + if (max_lsn < this_lsn) + max_lsn = this_lsn; + } + + if (!XLogRecPtrIsInvalid(max_lsn)) + { + /* + * As noted above, elog(ERROR) is not acceptable here, so if + * XLogFlush were to fail, we must PANIC. 
This isn't much of a + * restriction because XLogFlush is just about all critical + * section anyway, but let's make sure. + */ + START_CRIT_SECTION(); + XLogFlush(max_lsn); + END_CRIT_SECTION(); + } + } + + /* + * During a WriteAll, we may already have the desired file open. + */ + if (fdata) + { + int i; + + for (i = 0; i < fdata->num_files; i++) + { + if (fdata->segno[i] == segno) + { + fd = fdata->fd[i]; + break; + } + } + } + + if (fd < 0) + { + /* + * If the file doesn't already exist, we should create it. It is + * possible for this to need to happen when writing a page that's not + * first in its segment; we assume the OS can cope with that. (Note: + * it might seem that it'd be okay to create files only when + * SimpleLruZeroPage is called for the first page of a segment. + * However, if after a crash and restart the REDO logic elects to + * replay the log from a checkpoint before the latest one, then it's + * possible that we will get commands to set transaction status of + * transactions that have already been truncated from the commit log. + * Easiest way to deal with that is to accept references to + * nonexistent files here and in SlruPhysicalReadPage.) + * + * Note: it is possible for more than one backend to be executing this + * code simultaneously for different pages of the same file. Hence, + * don't use O_EXCL or O_TRUNC or anything like that. + */ + SlruFileName(ctl, path, segno); + fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); + if (fd < 0) + { + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + return false; + } + + if (fdata) + { + if (fdata->num_files < MAX_WRITEALL_BUFFERS) + { + fdata->fd[fdata->num_files] = fd; + fdata->segno[fdata->num_files] = segno; + fdata->num_files++; + } + else + { + /* + * In the unlikely event that we exceed MAX_FLUSH_BUFFERS, + * fall back to treating it as a standalone write. + */ + fdata = NULL; + } + } + } + + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE); + if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ) + { + pgstat_report_wait_end(); + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + slru_errcause = SLRU_WRITE_FAILED; + slru_errno = errno; + if (!fdata) + CloseTransientFile(fd); + return false; + } + pgstat_report_wait_end(); + + /* Queue up a sync request for the checkpointer. */ + if (ctl->sync_handler != SYNC_HANDLER_NONE) + { + FileTag tag; + + INIT_SLRUFILETAG(tag, ctl->sync_handler, segno); + if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false)) + { + /* No space to enqueue sync request. Do it synchronously. */ + pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC); + if (pg_fsync(fd) != 0) + { + pgstat_report_wait_end(); + slru_errcause = SLRU_FSYNC_FAILED; + slru_errno = errno; + CloseTransientFile(fd); + return false; + } + pgstat_report_wait_end(); + } + } + + /* Close file, unless part of flush request. */ + if (!fdata) + { + if (CloseTransientFile(fd) != 0) + { + slru_errcause = SLRU_CLOSE_FAILED; + slru_errno = errno; + return false; + } + } + + return true; +} + +/* + * Issue the error message after failure of SlruPhysicalReadPage or + * SlruPhysicalWritePage. Call this after cleaning up shared-memory state. 
+ */ +static void +SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid) +{ + int segno = pageno / SLRU_PAGES_PER_SEGMENT; + int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; + int offset = rpageno * BLCKSZ; + char path[MAXPGPATH]; + + SlruFileName(ctl, path, segno); + errno = slru_errno; + switch (slru_errcause) + { + case SLRU_OPEN_FAILED: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access status of transaction %u", xid), + errdetail("Could not open file \"%s\": %m.", path))); + break; + case SLRU_SEEK_FAILED: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access status of transaction %u", xid), + errdetail("Could not seek in file \"%s\" to offset %d: %m.", + path, offset))); + break; + case SLRU_READ_FAILED: + if (errno) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access status of transaction %u", xid), + errdetail("Could not read from file \"%s\" at offset %d: %m.", + path, offset))); + else + ereport(ERROR, + (errmsg("could not access status of transaction %u", xid), + errdetail("Could not read from file \"%s\" at offset %d: read too few bytes.", path, offset))); + break; + case SLRU_WRITE_FAILED: + if (errno) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access status of transaction %u", xid), + errdetail("Could not write to file \"%s\" at offset %d: %m.", + path, offset))); + else + ereport(ERROR, + (errmsg("could not access status of transaction %u", xid), + errdetail("Could not write to file \"%s\" at offset %d: wrote too few bytes.", + path, offset))); + break; + case SLRU_FSYNC_FAILED: + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not access status of transaction %u", xid), + errdetail("Could not fsync file \"%s\": %m.", + path))); + break; + case SLRU_CLOSE_FAILED: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access status of transaction %u", xid), + errdetail("Could not close file \"%s\": %m.", + path))); + break; + default: + /* can't get here, we trust */ + elog(ERROR, "unrecognized SimpleLru error cause: %d", + (int) slru_errcause); + break; + } +} + +/* + * Select the slot to re-use when we need a free slot. + * + * The target page number is passed because we need to consider the + * possibility that some other process reads in the target page while + * we are doing I/O to free a slot. Hence, check or recheck to see if + * any slot already holds the target page, and return that slot if so. + * Thus, the returned slot is *either* a slot already holding the pageno + * (could be any state except EMPTY), *or* a freeable slot (state EMPTY + * or CLEAN). + * + * Control lock must be held at entry, and will be held at exit. + */ +static int +SlruSelectLRUPage(SlruCtl ctl, int pageno) +{ + SlruShared shared = ctl->shared; + + /* Outer loop handles restart after I/O */ + for (;;) + { + int slotno; + int cur_count; + int bestvalidslot = 0; /* keep compiler quiet */ + int best_valid_delta = -1; + int best_valid_page_number = 0; /* keep compiler quiet */ + int bestinvalidslot = 0; /* keep compiler quiet */ + int best_invalid_delta = -1; + int best_invalid_page_number = 0; /* keep compiler quiet */ + + /* See if page already has a buffer assigned */ + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + if (shared->page_number[slotno] == pageno && + shared->page_status[slotno] != SLRU_PAGE_EMPTY) + return slotno; + } + + /* + * If we find any EMPTY slot, just select that one. Else choose a + * victim page to replace. 
We normally take the least recently used + * valid page, but we will never take the slot containing + * latest_page_number, even if it appears least recently used. We + * will select a slot that is already I/O busy only if there is no + * other choice: a read-busy slot will not be least recently used once + * the read finishes, and waiting for an I/O on a write-busy slot is + * inferior to just picking some other slot. Testing shows the slot + * we pick instead will often be clean, allowing us to begin a read at + * once. + * + * Normally the page_lru_count values will all be different and so + * there will be a well-defined LRU page. But since we allow + * concurrent execution of SlruRecentlyUsed() within + * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages + * acquire the same lru_count values. In that case we break ties by + * choosing the furthest-back page. + * + * Notice that this next line forcibly advances cur_lru_count to a + * value that is certainly beyond any value that will be in the + * page_lru_count array after the loop finishes. This ensures that + * the next execution of SlruRecentlyUsed will mark the page newly + * used, even if it's for a page that has the current counter value. + * That gets us back on the path to having good data when there are + * multiple pages with the same lru_count. + */ + cur_count = (shared->cur_lru_count)++; + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + int this_delta; + int this_page_number; + + if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) + return slotno; + this_delta = cur_count - shared->page_lru_count[slotno]; + if (this_delta < 0) + { + /* + * Clean up in case shared updates have caused cur_count + * increments to get "lost". We back off the page counts, + * rather than trying to increase cur_count, to avoid any + * question of infinite loops or failure in the presence of + * wrapped-around counts. + */ + shared->page_lru_count[slotno] = cur_count; + this_delta = 0; + } + this_page_number = shared->page_number[slotno]; + if (this_page_number == shared->latest_page_number) + continue; + if (shared->page_status[slotno] == SLRU_PAGE_VALID) + { + if (this_delta > best_valid_delta || + (this_delta == best_valid_delta && + ctl->PagePrecedes(this_page_number, + best_valid_page_number))) + { + bestvalidslot = slotno; + best_valid_delta = this_delta; + best_valid_page_number = this_page_number; + } + } + else + { + if (this_delta > best_invalid_delta || + (this_delta == best_invalid_delta && + ctl->PagePrecedes(this_page_number, + best_invalid_page_number))) + { + bestinvalidslot = slotno; + best_invalid_delta = this_delta; + best_invalid_page_number = this_page_number; + } + } + } + + /* + * If all pages (except possibly the latest one) are I/O busy, we'll + * have to wait for an I/O to complete and then retry. In that + * unhappy case, we choose to wait for the I/O on the least recently + * used slot, on the assumption that it was likely initiated first of + * all the I/Os in progress and may therefore finish first. + */ + if (best_valid_delta < 0) + { + SimpleLruWaitIO(ctl, bestinvalidslot); + continue; + } + + /* + * If the selected page is clean, we're set. + */ + if (!shared->page_dirty[bestvalidslot]) + return bestvalidslot; + + /* + * Write the page. + */ + SlruInternalWritePage(ctl, bestvalidslot, NULL); + + /* + * Now loop back and try again. This is the easiest way of dealing + * with corner cases such as the victim page being re-dirtied while we + * wrote it. 
+ */ + } +} + +/* + * Write dirty pages to disk during checkpoint or database shutdown. Flushing + * is deferred until the next call to ProcessSyncRequests(), though we do fsync + * the containing directory here to make sure that newly created directory + * entries are on disk. + */ +void +SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied) +{ + SlruShared shared = ctl->shared; + SlruWriteAllData fdata; + int slotno; + int pageno = 0; + int i; + bool ok; + + /* update the stats counter of flushes */ + pgstat_count_slru_flush(shared->slru_stats_idx); + + /* + * Find and write dirty pages + */ + fdata.num_files = 0; + + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + SlruInternalWritePage(ctl, slotno, &fdata); + + /* + * In some places (e.g. checkpoints), we cannot assert that the slot + * is clean now, since another process might have re-dirtied it + * already. That's okay. + */ + Assert(allow_redirtied || + shared->page_status[slotno] == SLRU_PAGE_EMPTY || + (shared->page_status[slotno] == SLRU_PAGE_VALID && + !shared->page_dirty[slotno])); + } + + LWLockRelease(shared->ControlLock); + + /* + * Now close any files that were open + */ + ok = true; + for (i = 0; i < fdata.num_files; i++) + { + if (CloseTransientFile(fdata.fd[i]) != 0) + { + slru_errcause = SLRU_CLOSE_FAILED; + slru_errno = errno; + pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT; + ok = false; + } + } + if (!ok) + SlruReportIOError(ctl, pageno, InvalidTransactionId); + + /* Ensure that directory entries for new files are on disk. */ + if (ctl->sync_handler != SYNC_HANDLER_NONE) + fsync_fname(ctl->Dir, true); +} + +/* + * Remove all segments before the one holding the passed page number + * + * All SLRUs prevent concurrent calls to this function, either with an LWLock + * or by calling it only as part of a checkpoint. Mutual exclusion must begin + * before computing cutoffPage. Mutual exclusion must end after any limit + * update that would permit other backends to write fresh data into the + * segment immediately preceding the one containing cutoffPage. Otherwise, + * when the SLRU is quite full, SimpleLruTruncate() might delete that segment + * after it has accrued freshly-written data. + */ +void +SimpleLruTruncate(SlruCtl ctl, int cutoffPage) +{ + SlruShared shared = ctl->shared; + int slotno; + + /* update the stats counter of truncates */ + pgstat_count_slru_truncate(shared->slru_stats_idx); + + /* + * Scan shared memory and remove any pages preceding the cutoff page, to + * ensure we won't rewrite them later. (Since this is normally called in + * or just after a checkpoint, any dirty pages should have been flushed + * already ... we're just being extra careful here.) + */ + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + +restart:; + + /* + * While we are holding the lock, make an important safety check: the + * current endpoint page must not be eligible for removal. + */ + if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage)) + { + LWLockRelease(shared->ControlLock); + ereport(LOG, + (errmsg("could not truncate directory \"%s\": apparent wraparound", + ctl->Dir))); + return; + } + + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) + continue; + if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage)) + continue; + + /* + * If page is clean, just change state to EMPTY (expected case). 
+ */ + if (shared->page_status[slotno] == SLRU_PAGE_VALID && + !shared->page_dirty[slotno]) + { + shared->page_status[slotno] = SLRU_PAGE_EMPTY; + continue; + } + + /* + * Hmm, we have (or may have) I/O operations acting on the page, so + * we've got to wait for them to finish and then start again. This is + * the same logic as in SlruSelectLRUPage. (XXX if page is dirty, + * wouldn't it be OK to just discard it without writing it? + * SlruMayDeleteSegment() uses a stricter qualification, so we might + * not delete this page in the end; even if we don't delete it, we + * won't have cause to read its data again. For now, keep the logic + * the same as it was.) + */ + if (shared->page_status[slotno] == SLRU_PAGE_VALID) + SlruInternalWritePage(ctl, slotno, NULL); + else + SimpleLruWaitIO(ctl, slotno); + goto restart; + } + + LWLockRelease(shared->ControlLock); + + /* Now we can remove the old segment(s) */ + (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage); +} + +/* + * Delete an individual SLRU segment. + * + * NB: This does not touch the SLRU buffers themselves, callers have to ensure + * they either can't yet contain anything, or have already been cleaned out. + */ +static void +SlruInternalDeleteSegment(SlruCtl ctl, int segno) +{ + char path[MAXPGPATH]; + + /* Forget any fsync requests queued for this segment. */ + if (ctl->sync_handler != SYNC_HANDLER_NONE) + { + FileTag tag; + + INIT_SLRUFILETAG(tag, ctl->sync_handler, segno); + RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true); + } + + /* Unlink the file. */ + SlruFileName(ctl, path, segno); + ereport(DEBUG2, (errmsg_internal("removing file \"%s\"", path))); + unlink(path); +} + +/* + * Delete an individual SLRU segment, identified by the segment number. + */ +void +SlruDeleteSegment(SlruCtl ctl, int segno) +{ + SlruShared shared = ctl->shared; + int slotno; + bool did_write; + + /* Clean out any possibly existing references to the segment. */ + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); +restart: + did_write = false; + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + int pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT; + + if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) + continue; + + /* not the segment we're looking for */ + if (pagesegno != segno) + continue; + + /* If page is clean, just change state to EMPTY (expected case). */ + if (shared->page_status[slotno] == SLRU_PAGE_VALID && + !shared->page_dirty[slotno]) + { + shared->page_status[slotno] = SLRU_PAGE_EMPTY; + continue; + } + + /* Same logic as SimpleLruTruncate() */ + if (shared->page_status[slotno] == SLRU_PAGE_VALID) + SlruInternalWritePage(ctl, slotno, NULL); + else + SimpleLruWaitIO(ctl, slotno); + + did_write = true; + } + + /* + * Be extra careful and re-check. The IO functions release the control + * lock, so new pages could have been read in. + */ + if (did_write) + goto restart; + + SlruInternalDeleteSegment(ctl, segno); + + LWLockRelease(shared->ControlLock); +} + +/* + * Determine whether a segment is okay to delete. + * + * segpage is the first page of the segment, and cutoffPage is the oldest (in + * PagePrecedes order) page in the SLRU containing still-useful data. 
Since + * every core PagePrecedes callback implements "wrap around", check the + * segment's first and last pages: + * + * first<cutoff && last<cutoff: yes + * first<cutoff && last>=cutoff: no; cutoff falls inside this segment + * first>=cutoff && last<cutoff: no; wrap point falls inside this segment + * first>=cutoff && last>=cutoff: no; every page of this segment is too young + */ +static bool +SlruMayDeleteSegment(SlruCtl ctl, int segpage, int cutoffPage) +{ + int seg_last_page = segpage + SLRU_PAGES_PER_SEGMENT - 1; + + Assert(segpage % SLRU_PAGES_PER_SEGMENT == 0); + + return (ctl->PagePrecedes(segpage, cutoffPage) && + ctl->PagePrecedes(seg_last_page, cutoffPage)); +} + +#ifdef USE_ASSERT_CHECKING +static void +SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset) +{ + TransactionId lhs, + rhs; + int newestPage, + oldestPage; + TransactionId newestXact, + oldestXact; + + /* + * Compare an XID pair having undefined order (see RFC 1982), a pair at + * "opposite ends" of the XID space. TransactionIdPrecedes() treats each + * as preceding the other. If RHS is oldestXact, LHS is the first XID we + * must not assign. + */ + lhs = per_page + offset; /* skip first page to avoid non-normal XIDs */ + rhs = lhs + (1U << 31); + Assert(TransactionIdPrecedes(lhs, rhs)); + Assert(TransactionIdPrecedes(rhs, lhs)); + Assert(!TransactionIdPrecedes(lhs - 1, rhs)); + Assert(TransactionIdPrecedes(rhs, lhs - 1)); + Assert(TransactionIdPrecedes(lhs + 1, rhs)); + Assert(!TransactionIdPrecedes(rhs, lhs + 1)); + Assert(!TransactionIdFollowsOrEquals(lhs, rhs)); + Assert(!TransactionIdFollowsOrEquals(rhs, lhs)); + Assert(!ctl->PagePrecedes(lhs / per_page, lhs / per_page)); + Assert(!ctl->PagePrecedes(lhs / per_page, rhs / per_page)); + Assert(!ctl->PagePrecedes(rhs / per_page, lhs / per_page)); + Assert(!ctl->PagePrecedes((lhs - per_page) / per_page, rhs / per_page)); + Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page)); + Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page)); + Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page) + || (1U << 31) % per_page != 0); /* See CommitTsPagePrecedes() */ + Assert(ctl->PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page) + || (1U << 31) % per_page != 0); + Assert(ctl->PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page)); + Assert(ctl->PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page)); + Assert(!ctl->PagePrecedes(rhs / per_page, (lhs + per_page) / per_page)); + + /* + * GetNewTransactionId() has assigned the last XID it can safely use, and + * that XID is in the *LAST* page of the second segment. We must not + * delete that segment. + */ + newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1; + newestXact = newestPage * per_page + offset; + Assert(newestXact / per_page == newestPage); + oldestXact = newestXact + 1; + oldestXact -= 1U << 31; + oldestPage = oldestXact / per_page; + Assert(!SlruMayDeleteSegment(ctl, + (newestPage - + newestPage % SLRU_PAGES_PER_SEGMENT), + oldestPage)); + + /* + * GetNewTransactionId() has assigned the last XID it can safely use, and + * that XID is in the *FIRST* page of the second segment. We must not + * delete that segment. 
+ */ + newestPage = SLRU_PAGES_PER_SEGMENT; + newestXact = newestPage * per_page + offset; + Assert(newestXact / per_page == newestPage); + oldestXact = newestXact + 1; + oldestXact -= 1U << 31; + oldestPage = oldestXact / per_page; + Assert(!SlruMayDeleteSegment(ctl, + (newestPage - + newestPage % SLRU_PAGES_PER_SEGMENT), + oldestPage)); +} + +/* + * Unit-test a PagePrecedes function. + * + * This assumes every uint32 >= FirstNormalTransactionId is a valid key. It + * assumes each value occupies a contiguous, fixed-size region of SLRU bytes. + * (MultiXactMemberCtl separates flags from XIDs. AsyncCtl has + * variable-length entries, no keys, and no random access. These unit tests + * do not apply to them.) + */ +void +SlruPagePrecedesUnitTests(SlruCtl ctl, int per_page) +{ + /* Test first, middle and last entries of a page. */ + SlruPagePrecedesTestOffset(ctl, per_page, 0); + SlruPagePrecedesTestOffset(ctl, per_page, per_page / 2); + SlruPagePrecedesTestOffset(ctl, per_page, per_page - 1); +} +#endif + +/* + * SlruScanDirectory callback + * This callback reports true if there's any segment wholly prior to the + * one containing the page passed as "data". + */ +bool +SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int segpage, void *data) +{ + int cutoffPage = *(int *) data; + + if (SlruMayDeleteSegment(ctl, segpage, cutoffPage)) + return true; /* found one; don't iterate any more */ + + return false; /* keep going */ +} + +/* + * SlruScanDirectory callback. + * This callback deletes segments prior to the one passed in as "data". + */ +static bool +SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data) +{ + int cutoffPage = *(int *) data; + + if (SlruMayDeleteSegment(ctl, segpage, cutoffPage)) + SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT); + + return false; /* keep going */ +} + +/* + * SlruScanDirectory callback. + * This callback deletes all segments. + */ +bool +SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, void *data) +{ + SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT); + + return false; /* keep going */ +} + +/* + * Scan the SimpleLru directory and apply a callback to each file found in it. + * + * If the callback returns true, the scan is stopped. The last return value + * from the callback is returned. + * + * The callback receives the following arguments: 1. the SlruCtl struct for the + * slru being truncated; 2. the filename being considered; 3. the page number + * for the first page of that file; 4. a pointer to the opaque data given to us + * by the caller. + * + * Note that the ordering in which the directory is scanned is not guaranteed. + * + * Note that no locking is applied. + */ +bool +SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data) +{ + bool retval = false; + DIR *cldir; + struct dirent *clde; + int segno; + int segpage; + + cldir = AllocateDir(ctl->Dir); + while ((clde = ReadDir(cldir, ctl->Dir)) != NULL) + { + size_t len; + + len = strlen(clde->d_name); + + if ((len == 4 || len == 5 || len == 6) && + strspn(clde->d_name, "0123456789ABCDEF") == len) + { + segno = (int) strtol(clde->d_name, NULL, 16); + segpage = segno * SLRU_PAGES_PER_SEGMENT; + + elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s", + ctl->Dir, clde->d_name); + retval = callback(ctl, clde->d_name, segpage, data); + if (retval) + break; + } + } + FreeDir(cldir); + + return retval; +} + +/* + * Individual SLRUs (clog, ...) 
have to provide a sync.c handler function so + * that they can provide the correct "SlruCtl" (otherwise we don't know how to + * build the path), but they just forward to this common implementation that + * performs the fsync. + */ +int +SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path) +{ + int fd; + int save_errno; + int result; + + SlruFileName(ctl, path, ftag->segno); + + fd = OpenTransientFile(path, O_RDWR | PG_BINARY); + if (fd < 0) + return -1; + + pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC); + result = pg_fsync(fd); + pgstat_report_wait_end(); + save_errno = errno; + + CloseTransientFile(fd); + + errno = save_errno; + return result; +} diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c new file mode 100644 index 0000000..66d3548 --- /dev/null +++ b/src/backend/access/transam/subtrans.c @@ -0,0 +1,374 @@ +/*------------------------------------------------------------------------- + * + * subtrans.c + * PostgreSQL subtransaction-log manager + * + * The pg_subtrans manager is a pg_xact-like manager that stores the parent + * transaction Id for each transaction. It is a fundamental part of the + * nested transactions implementation. A main transaction has a parent + * of InvalidTransactionId, and each subtransaction has its immediate parent. + * The tree can easily be walked from child to parent, but not in the + * opposite direction. + * + * This code is based on xact.c, but the robustness requirements + * are completely different from pg_xact, because we only need to remember + * pg_subtrans information for currently-open transactions. Thus, there is + * no need to preserve data over a crash and restart. + * + * There are no XLOG interactions since we do not care about preserving + * data across crashes. During database startup, we simply force the + * currently-active page of SUBTRANS to zeroes. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/subtrans.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/slru.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "pg_trace.h" +#include "utils/snapmgr.h" + + +/* + * Defines for SubTrans page sizes. A page is the same BLCKSZ as is used + * everywhere else in Postgres. + * + * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, + * SubTrans page numbering also wraps around at + * 0xFFFFFFFF/SUBTRANS_XACTS_PER_PAGE, and segment numbering at + * 0xFFFFFFFF/SUBTRANS_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no + * explicit notice of that fact in this module, except when comparing segment + * and page numbers in TruncateSUBTRANS (see SubTransPagePrecedes) and zeroing + * them in StartupSUBTRANS. + */ + +/* We need four bytes per xact */ +#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId)) + +#define TransactionIdToPage(xid) ((xid) / (TransactionId) SUBTRANS_XACTS_PER_PAGE) +#define TransactionIdToEntry(xid) ((xid) % (TransactionId) SUBTRANS_XACTS_PER_PAGE) + + +/* + * Link to shared-memory data structures for SUBTRANS control + */ +static SlruCtlData SubTransCtlData; + +#define SubTransCtl (&SubTransCtlData) + + +static int ZeroSUBTRANSPage(int pageno); +static bool SubTransPagePrecedes(int page1, int page2); + + +/* + * Record the parent of a subtransaction in the subtrans log. 
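+ *
+ * As a worked illustration of the macros above (assuming the default
+ * BLCKSZ of 8192, so SUBTRANS_XACTS_PER_PAGE is 8192 / 4 = 2048): the
+ * parent of xid 100000 is kept on SUBTRANS page 100000 / 2048 = 48, in
+ * entry 100000 % 2048 = 1696, one 4-byte TransactionId slot per xid.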
+ */ +void +SubTransSetParent(TransactionId xid, TransactionId parent) +{ + int pageno = TransactionIdToPage(xid); + int entryno = TransactionIdToEntry(xid); + int slotno; + TransactionId *ptr; + + Assert(TransactionIdIsValid(parent)); + Assert(TransactionIdFollows(xid, parent)); + + LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); + + slotno = SimpleLruReadPage(SubTransCtl, pageno, true, xid); + ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno]; + ptr += entryno; + + /* + * It's possible we'll try to set the parent xid multiple times but we + * shouldn't ever be changing the xid from one valid xid to another valid + * xid, which would corrupt the data structure. + */ + if (*ptr != parent) + { + Assert(*ptr == InvalidTransactionId); + *ptr = parent; + SubTransCtl->shared->page_dirty[slotno] = true; + } + + LWLockRelease(SubtransSLRULock); +} + +/* + * Interrogate the parent of a transaction in the subtrans log. + */ +TransactionId +SubTransGetParent(TransactionId xid) +{ + int pageno = TransactionIdToPage(xid); + int entryno = TransactionIdToEntry(xid); + int slotno; + TransactionId *ptr; + TransactionId parent; + + /* Can't ask about stuff that might not be around anymore */ + Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); + + /* Bootstrap and frozen XIDs have no parent */ + if (!TransactionIdIsNormal(xid)) + return InvalidTransactionId; + + /* lock is acquired by SimpleLruReadPage_ReadOnly */ + + slotno = SimpleLruReadPage_ReadOnly(SubTransCtl, pageno, xid); + ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno]; + ptr += entryno; + + parent = *ptr; + + LWLockRelease(SubtransSLRULock); + + return parent; +} + +/* + * SubTransGetTopmostTransaction + * + * Returns the topmost transaction of the given transaction id. + * + * Because we cannot look back further than TransactionXmin, it is possible + * that this function will lie and return an intermediate subtransaction ID + * instead of the true topmost parent ID. This is OK, because in practice + * we only care about detecting whether the topmost parent is still running + * or is part of a current snapshot's list of still-running transactions. + * Therefore, any XID before TransactionXmin is as good as any other. + */ +TransactionId +SubTransGetTopmostTransaction(TransactionId xid) +{ + TransactionId parentXid = xid, + previousXid = xid; + + /* Can't ask about stuff that might not be around anymore */ + Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); + + while (TransactionIdIsValid(parentXid)) + { + previousXid = parentXid; + if (TransactionIdPrecedes(parentXid, TransactionXmin)) + break; + parentXid = SubTransGetParent(parentXid); + + /* + * By convention the parent xid gets allocated first, so should always + * precede the child xid. Anything else points to a corrupted data + * structure that could lead to an infinite loop, so exit. 
+ */ + if (!TransactionIdPrecedes(parentXid, previousXid)) + elog(ERROR, "pg_subtrans contains invalid entry: xid %u points to parent xid %u", + previousXid, parentXid); + } + + Assert(TransactionIdIsValid(previousXid)); + + return previousXid; +} + + +/* + * Initialization of shared memory for SUBTRANS + */ +Size +SUBTRANSShmemSize(void) +{ + return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS, 0); +} + +void +SUBTRANSShmemInit(void) +{ + SubTransCtl->PagePrecedes = SubTransPagePrecedes; + SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0, + SubtransSLRULock, "pg_subtrans", + LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE); + SlruPagePrecedesUnitTests(SubTransCtl, SUBTRANS_XACTS_PER_PAGE); +} + +/* + * This func must be called ONCE on system install. It creates + * the initial SUBTRANS segment. (The SUBTRANS directory is assumed to + * have been created by the initdb shell script, and SUBTRANSShmemInit + * must have been called already.) + * + * Note: it's not really necessary to create the initial segment now, + * since slru.c would create it on first write anyway. But we may as well + * do it to be sure the directory is set up correctly. + */ +void +BootStrapSUBTRANS(void) +{ + int slotno; + + LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); + + /* Create and zero the first page of the subtrans log */ + slotno = ZeroSUBTRANSPage(0); + + /* Make sure it's written out */ + SimpleLruWritePage(SubTransCtl, slotno); + Assert(!SubTransCtl->shared->page_dirty[slotno]); + + LWLockRelease(SubtransSLRULock); +} + +/* + * Initialize (or reinitialize) a page of SUBTRANS to zeroes. + * + * The page is not actually written, just set up in shared memory. + * The slot number of the new page is returned. + * + * Control lock must be held at entry, and will be held at exit. + */ +static int +ZeroSUBTRANSPage(int pageno) +{ + return SimpleLruZeroPage(SubTransCtl, pageno); +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup, + * after StartupXLOG has initialized ShmemVariableCache->nextXid. + * + * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid + * if there are none. + */ +void +StartupSUBTRANS(TransactionId oldestActiveXID) +{ + FullTransactionId nextXid; + int startPage; + int endPage; + + /* + * Since we don't expect pg_subtrans to be valid across crashes, we + * initialize the currently-active page(s) to zeroes during startup. + * Whenever we advance into a new page, ExtendSUBTRANS will likewise zero + * the new page without regard to whatever was previously on disk. + */ + LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); + + startPage = TransactionIdToPage(oldestActiveXID); + nextXid = ShmemVariableCache->nextXid; + endPage = TransactionIdToPage(XidFromFullTransactionId(nextXid)); + + while (startPage != endPage) + { + (void) ZeroSUBTRANSPage(startPage); + startPage++; + /* must account for wraparound */ + if (startPage > TransactionIdToPage(MaxTransactionId)) + startPage = 0; + } + (void) ZeroSUBTRANSPage(startPage); + + LWLockRelease(SubtransSLRULock); +} + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + */ +void +CheckPointSUBTRANS(void) +{ + /* + * Write dirty SUBTRANS pages to disk + * + * This is not actually necessary from a correctness point of view. We do + * it merely to improve the odds that writing of dirty pages is done by + * the checkpoint process and not by backends. 
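+	 *
+	 * (pg_subtrans is registered with SYNC_HANDLER_NONE, so unlike pg_xact
+	 * these writes are never followed by an fsync; that is acceptable
+	 * because StartupSUBTRANS re-zeroes the active pages after a crash.)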
+ */ + TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(true); + SimpleLruWriteAll(SubTransCtl, true); + TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true); +} + + +/* + * Make sure that SUBTRANS has room for a newly-allocated XID. + * + * NB: this is called while holding XidGenLock. We want it to be very fast + * most of the time; even when it's not so fast, no actual I/O need happen + * unless we're forced to write out a dirty subtrans page to make room + * in shared memory. + */ +void +ExtendSUBTRANS(TransactionId newestXact) +{ + int pageno; + + /* + * No work except at first XID of a page. But beware: just after + * wraparound, the first XID of page zero is FirstNormalTransactionId. + */ + if (TransactionIdToEntry(newestXact) != 0 && + !TransactionIdEquals(newestXact, FirstNormalTransactionId)) + return; + + pageno = TransactionIdToPage(newestXact); + + LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); + + /* Zero the page */ + ZeroSUBTRANSPage(pageno); + + LWLockRelease(SubtransSLRULock); +} + + +/* + * Remove all SUBTRANS segments before the one holding the passed transaction ID + * + * oldestXact is the oldest TransactionXmin of any running transaction. This + * is called only during checkpoint. + */ +void +TruncateSUBTRANS(TransactionId oldestXact) +{ + int cutoffPage; + + /* + * The cutoff point is the start of the segment containing oldestXact. We + * pass the *page* containing oldestXact to SimpleLruTruncate. We step + * back one transaction to avoid passing a cutoff page that hasn't been + * created yet in the rare case that oldestXact would be the first item on + * a page and oldestXact == next XID. In that case, if we didn't subtract + * one, we'd trigger SimpleLruTruncate's wraparound detection. + */ + TransactionIdRetreat(oldestXact); + cutoffPage = TransactionIdToPage(oldestXact); + + SimpleLruTruncate(SubTransCtl, cutoffPage); +} + + +/* + * Decide whether a SUBTRANS page number is "older" for truncation purposes. + * Analogous to CLOGPagePrecedes(). + */ +static bool +SubTransPagePrecedes(int page1, int page2) +{ + TransactionId xid1; + TransactionId xid2; + + xid1 = ((TransactionId) page1) * SUBTRANS_XACTS_PER_PAGE; + xid1 += FirstNormalTransactionId + 1; + xid2 = ((TransactionId) page2) * SUBTRANS_XACTS_PER_PAGE; + xid2 += FirstNormalTransactionId + 1; + + return (TransactionIdPrecedes(xid1, xid2) && + TransactionIdPrecedes(xid1, xid2 + SUBTRANS_XACTS_PER_PAGE - 1)); +} diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c new file mode 100644 index 0000000..be21968 --- /dev/null +++ b/src/backend/access/transam/timeline.c @@ -0,0 +1,600 @@ +/*------------------------------------------------------------------------- + * + * timeline.c + * Functions for reading and writing timeline history files. + * + * A timeline history file lists the timeline changes of the timeline, in + * a simple text format. They are archived along with the WAL segments. + * + * The files are named like "<tli>.history". For example, if the database + * starts up and switches to timeline 5, the timeline history file would be + * called "00000005.history". + * + * Each line in the file represents a timeline switch: + * + * <parentTLI> <switchpoint> <reason> + * + * parentTLI ID of the parent timeline + * switchpoint XLogRecPtr of the WAL location where the switch happened + * reason human-readable explanation of why the timeline was changed + * + * The fields are separated by tabs. Lines beginning with # are comments, and + * are ignored. Empty lines are also ignored. 
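+ *
+ * A purely illustrative example (hypothetical LSNs and reason strings):
+ * the file 00000003.history on a server whose timeline ancestry is
+ * 1 -> 2 -> 3 might contain
+ *
+ *	1	0/9561F18	no recovery target specified
+ *	2	0/B3C1E28	before 2021-05-12 10:00:00+00
+ *
+ * meaning timeline 2 branched off timeline 1 at 0/9561F18 and timeline 3
+ * branched off timeline 2 at 0/B3C1E28.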
+ * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/timeline.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <sys/stat.h> +#include <unistd.h> + +#include "access/timeline.h" +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogarchive.h" +#include "access/xlogdefs.h" +#include "pgstat.h" +#include "storage/fd.h" + +/* + * Copies all timeline history files with id's between 'begin' and 'end' + * from archive to pg_wal. + */ +void +restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end) +{ + char path[MAXPGPATH]; + char histfname[MAXFNAMELEN]; + TimeLineID tli; + + for (tli = begin; tli < end; tli++) + { + if (tli == 1) + continue; + + TLHistoryFileName(histfname, tli); + if (RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false)) + KeepFileRestoredFromArchive(path, histfname); + } +} + +/* + * Try to read a timeline's history file. + * + * If successful, return the list of component TLIs (the given TLI followed by + * its ancestor TLIs). If we can't find the history file, assume that the + * timeline has no parents, and return a list of just the specified timeline + * ID. + */ +List * +readTimeLineHistory(TimeLineID targetTLI) +{ + List *result; + char path[MAXPGPATH]; + char histfname[MAXFNAMELEN]; + FILE *fd; + TimeLineHistoryEntry *entry; + TimeLineID lasttli = 0; + XLogRecPtr prevend; + bool fromArchive = false; + + /* Timeline 1 does not have a history file, so no need to check */ + if (targetTLI == 1) + { + entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry->tli = targetTLI; + entry->begin = entry->end = InvalidXLogRecPtr; + return list_make1(entry); + } + + if (ArchiveRecoveryRequested) + { + TLHistoryFileName(histfname, targetTLI); + fromArchive = + RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false); + } + else + TLHistoryFilePath(path, targetTLI); + + fd = AllocateFile(path, "r"); + if (fd == NULL) + { + if (errno != ENOENT) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + /* Not there, so assume no parents */ + entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry->tli = targetTLI; + entry->begin = entry->end = InvalidXLogRecPtr; + return list_make1(entry); + } + + result = NIL; + + /* + * Parse the file... 
+ */ + prevend = InvalidXLogRecPtr; + for (;;) + { + char fline[MAXPGPATH]; + char *res; + char *ptr; + TimeLineID tli; + uint32 switchpoint_hi; + uint32 switchpoint_lo; + int nfields; + + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_READ); + res = fgets(fline, sizeof(fline), fd); + pgstat_report_wait_end(); + if (res == NULL) + { + if (ferror(fd)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", path))); + + break; + } + + /* skip leading whitespace and check for # comment */ + for (ptr = fline; *ptr; ptr++) + { + if (!isspace((unsigned char) *ptr)) + break; + } + if (*ptr == '\0' || *ptr == '#') + continue; + + nfields = sscanf(fline, "%u\t%X/%X", &tli, &switchpoint_hi, &switchpoint_lo); + + if (nfields < 1) + { + /* expect a numeric timeline ID as first field of line */ + ereport(FATAL, + (errmsg("syntax error in history file: %s", fline), + errhint("Expected a numeric timeline ID."))); + } + if (nfields != 3) + ereport(FATAL, + (errmsg("syntax error in history file: %s", fline), + errhint("Expected a write-ahead log switchpoint location."))); + + if (result && tli <= lasttli) + ereport(FATAL, + (errmsg("invalid data in history file: %s", fline), + errhint("Timeline IDs must be in increasing sequence."))); + + lasttli = tli; + + entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry->tli = tli; + entry->begin = prevend; + entry->end = ((uint64) (switchpoint_hi)) << 32 | (uint64) switchpoint_lo; + prevend = entry->end; + + /* Build list with newest item first */ + result = lcons(entry, result); + + /* we ignore the remainder of each line */ + } + + FreeFile(fd); + + if (result && targetTLI <= lasttli) + ereport(FATAL, + (errmsg("invalid data in history file \"%s\"", path), + errhint("Timeline IDs must be less than child timeline's ID."))); + + /* + * Create one more entry for the "tip" of the timeline, which has no entry + * in the history file. + */ + entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry->tli = targetTLI; + entry->begin = prevend; + entry->end = InvalidXLogRecPtr; + + result = lcons(entry, result); + + /* + * If the history file was fetched from archive, save it in pg_wal for + * future reference. + */ + if (fromArchive) + KeepFileRestoredFromArchive(path, histfname); + + return result; +} + +/* + * Probe whether a timeline history file exists for the given timeline ID + */ +bool +existsTimeLineHistory(TimeLineID probeTLI) +{ + char path[MAXPGPATH]; + char histfname[MAXFNAMELEN]; + FILE *fd; + + /* Timeline 1 does not have a history file, so no need to check */ + if (probeTLI == 1) + return false; + + if (ArchiveRecoveryRequested) + { + TLHistoryFileName(histfname, probeTLI); + RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false); + } + else + TLHistoryFilePath(path, probeTLI); + + fd = AllocateFile(path, "r"); + if (fd != NULL) + { + FreeFile(fd); + return true; + } + else + { + if (errno != ENOENT) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + return false; + } +} + +/* + * Find the newest existing timeline, assuming that startTLI exists. + * + * Note: while this is somewhat heuristic, it does positively guarantee + * that (result + 1) is not a known timeline, and therefore it should + * be safe to assign that ID to a new timeline. 
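+ *
+ * For example, if probing finds 00000002.history and 00000003.history
+ * but no 00000004.history, the result is 3 and timeline ID 4 is free
+ * for the caller to assign.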
+ */ +TimeLineID +findNewestTimeLine(TimeLineID startTLI) +{ + TimeLineID newestTLI; + TimeLineID probeTLI; + + /* + * The algorithm is just to probe for the existence of timeline history + * files. XXX is it useful to allow gaps in the sequence? + */ + newestTLI = startTLI; + + for (probeTLI = startTLI + 1;; probeTLI++) + { + if (existsTimeLineHistory(probeTLI)) + { + newestTLI = probeTLI; /* probeTLI exists */ + } + else + { + /* doesn't exist, assume we're done */ + break; + } + } + + return newestTLI; +} + +/* + * Create a new timeline history file. + * + * newTLI: ID of the new timeline + * parentTLI: ID of its immediate parent + * switchpoint: WAL location where the system switched to the new timeline + * reason: human-readable explanation of why the timeline was switched + * + * Currently this is only used at the end recovery, and so there are no locking + * considerations. But we should be just as tense as XLogFileInit to avoid + * emplacing a bogus file. + */ +void +writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, + XLogRecPtr switchpoint, char *reason) +{ + char path[MAXPGPATH]; + char tmppath[MAXPGPATH]; + char histfname[MAXFNAMELEN]; + char buffer[BLCKSZ]; + int srcfd; + int fd; + int nbytes; + + Assert(newTLI > parentTLI); /* else bad selection of newTLI */ + + /* + * Write into a temp file name. + */ + snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); + + unlink(tmppath); + + /* do not use get_sync_bit() here --- want to fsync only at end of fill */ + fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", tmppath))); + + /* + * If a history file exists for the parent, copy it verbatim + */ + if (ArchiveRecoveryRequested) + { + TLHistoryFileName(histfname, parentTLI); + RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false); + } + else + TLHistoryFilePath(path, parentTLI); + + srcfd = OpenTransientFile(path, O_RDONLY); + if (srcfd < 0) + { + if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + /* Not there, so assume parent has no parents */ + } + else + { + for (;;) + { + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_READ); + nbytes = (int) read(srcfd, buffer, sizeof(buffer)); + pgstat_report_wait_end(); + if (nbytes < 0 || errno != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", path))); + if (nbytes == 0) + break; + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_WRITE); + if ((int) write(fd, buffer, nbytes) != nbytes) + { + int save_errno = errno; + + /* + * If we fail to make the file, delete it to release disk + * space + */ + unlink(tmppath); + + /* + * if write didn't set errno, assume problem is no disk space + */ + errno = save_errno ? save_errno : ENOSPC; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + } + + if (CloseTransientFile(srcfd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + } + + /* + * Append one line with the details of this timeline split. + * + * If we did have a parent file, insert an extra newline just in case the + * parent file failed to end with one. + */ + snprintf(buffer, sizeof(buffer), + "%s%u\t%X/%X\t%s\n", + (srcfd < 0) ? 
"" : "\n", + parentTLI, + LSN_FORMAT_ARGS(switchpoint), + reason); + + nbytes = strlen(buffer); + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_WRITE); + if ((int) write(fd, buffer, nbytes) != nbytes) + { + int save_errno = errno; + + /* + * If we fail to make the file, delete it to release disk space + */ + unlink(tmppath); + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? save_errno : ENOSPC; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_SYNC); + if (pg_fsync(fd) != 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", tmppath))); + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", tmppath))); + + /* + * Now move the completed history file into place with its final name. + */ + TLHistoryFilePath(path, newTLI); + + /* + * Perform the rename using link if available, paranoidly trying to avoid + * overwriting an existing file (there shouldn't be one). + */ + durable_rename_excl(tmppath, path, ERROR); + + /* The history file can be archived immediately. */ + if (XLogArchivingActive()) + { + TLHistoryFileName(histfname, newTLI); + XLogArchiveNotify(histfname); + } +} + +/* + * Writes a history file for given timeline and contents. + * + * Currently this is only used in the walreceiver process, and so there are + * no locking considerations. But we should be just as tense as XLogFileInit + * to avoid emplacing a bogus file. + */ +void +writeTimeLineHistoryFile(TimeLineID tli, char *content, int size) +{ + char path[MAXPGPATH]; + char tmppath[MAXPGPATH]; + int fd; + + /* + * Write into a temp file name. + */ + snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); + + unlink(tmppath); + + /* do not use get_sync_bit() here --- want to fsync only at end of fill */ + fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", tmppath))); + + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_FILE_WRITE); + if ((int) write(fd, content, size) != size) + { + int save_errno = errno; + + /* + * If we fail to make the file, delete it to release disk space + */ + unlink(tmppath); + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? save_errno : ENOSPC; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC); + if (pg_fsync(fd) != 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", tmppath))); + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", tmppath))); + + /* + * Now move the completed history file into place with its final name. + */ + TLHistoryFilePath(path, tli); + + /* + * Perform the rename using link if available, paranoidly trying to avoid + * overwriting an existing file (there shouldn't be one). 
+ */ + durable_rename_excl(tmppath, path, ERROR); +} + +/* + * Returns true if 'expectedTLEs' contains a timeline with id 'tli' + */ +bool +tliInHistory(TimeLineID tli, List *expectedTLEs) +{ + ListCell *cell; + + foreach(cell, expectedTLEs) + { + if (((TimeLineHistoryEntry *) lfirst(cell))->tli == tli) + return true; + } + + return false; +} + +/* + * Returns the ID of the timeline in use at a particular point in time, in + * the given timeline history. + */ +TimeLineID +tliOfPointInHistory(XLogRecPtr ptr, List *history) +{ + ListCell *cell; + + foreach(cell, history) + { + TimeLineHistoryEntry *tle = (TimeLineHistoryEntry *) lfirst(cell); + + if ((XLogRecPtrIsInvalid(tle->begin) || tle->begin <= ptr) && + (XLogRecPtrIsInvalid(tle->end) || ptr < tle->end)) + { + /* found it */ + return tle->tli; + } + } + + /* shouldn't happen. */ + elog(ERROR, "timeline history was not contiguous"); + return 0; /* keep compiler quiet */ +} + +/* + * Returns the point in history where we branched off the given timeline, + * and the timeline we branched to (*nextTLI). Returns InvalidXLogRecPtr if + * the timeline is current, ie. we have not branched off from it, and throws + * an error if the timeline is not part of this server's history. + */ +XLogRecPtr +tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI) +{ + ListCell *cell; + + if (nextTLI) + *nextTLI = 0; + foreach(cell, history) + { + TimeLineHistoryEntry *tle = (TimeLineHistoryEntry *) lfirst(cell); + + if (tle->tli == tli) + return tle->end; + if (nextTLI) + *nextTLI = tle->tli; + } + + ereport(ERROR, + (errmsg("requested timeline %u is not in this server's history", + tli))); + return InvalidXLogRecPtr; /* keep compiler quiet */ +} diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c new file mode 100644 index 0000000..5865810 --- /dev/null +++ b/src/backend/access/transam/transam.c @@ -0,0 +1,398 @@ +/*------------------------------------------------------------------------- + * + * transam.c + * postgres transaction (commit) log interface routines + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/transam/transam.c + * + * NOTES + * This file contains the high level access-method interface to the + * transaction system. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/clog.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "utils/snapmgr.h" + +/* + * Single-item cache for results of TransactionLogFetch. It's worth having + * such a cache because we frequently find ourselves repeatedly checking the + * same XID, for example when scanning a table just after a bulk insert, + * update, or delete. 
+ */ +static TransactionId cachedFetchXid = InvalidTransactionId; +static XidStatus cachedFetchXidStatus; +static XLogRecPtr cachedCommitLSN; + +/* Local functions */ +static XidStatus TransactionLogFetch(TransactionId transactionId); + + +/* ---------------------------------------------------------------- + * Postgres log access method interface + * + * TransactionLogFetch + * ---------------------------------------------------------------- + */ + +/* + * TransactionLogFetch --- fetch commit status of specified transaction id + */ +static XidStatus +TransactionLogFetch(TransactionId transactionId) +{ + XidStatus xidstatus; + XLogRecPtr xidlsn; + + /* + * Before going to the commit log manager, check our single item cache to + * see if we didn't just check the transaction status a moment ago. + */ + if (TransactionIdEquals(transactionId, cachedFetchXid)) + return cachedFetchXidStatus; + + /* + * Also, check to see if the transaction ID is a permanent one. + */ + if (!TransactionIdIsNormal(transactionId)) + { + if (TransactionIdEquals(transactionId, BootstrapTransactionId)) + return TRANSACTION_STATUS_COMMITTED; + if (TransactionIdEquals(transactionId, FrozenTransactionId)) + return TRANSACTION_STATUS_COMMITTED; + return TRANSACTION_STATUS_ABORTED; + } + + /* + * Get the transaction status. + */ + xidstatus = TransactionIdGetStatus(transactionId, &xidlsn); + + /* + * Cache it, but DO NOT cache status for unfinished or sub-committed + * transactions! We only cache status that is guaranteed not to change. + */ + if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS && + xidstatus != TRANSACTION_STATUS_SUB_COMMITTED) + { + cachedFetchXid = transactionId; + cachedFetchXidStatus = xidstatus; + cachedCommitLSN = xidlsn; + } + + return xidstatus; +} + +/* ---------------------------------------------------------------- + * Interface functions + * + * TransactionIdDidCommit + * TransactionIdDidAbort + * ======== + * these functions test the transaction status of + * a specified transaction id. + * + * TransactionIdCommitTree + * TransactionIdAsyncCommitTree + * TransactionIdAbortTree + * ======== + * these functions set the transaction status of the specified + * transaction tree. + * + * See also TransactionIdIsInProgress, which once was in this module + * but now lives in procarray.c. + * ---------------------------------------------------------------- + */ + +/* + * TransactionIdDidCommit + * True iff transaction associated with the identifier did commit. + * + * Note: + * Assumes transaction identifier is valid and exists in clog. + */ +bool /* true if given transaction committed */ +TransactionIdDidCommit(TransactionId transactionId) +{ + XidStatus xidstatus; + + xidstatus = TransactionLogFetch(transactionId); + + /* + * If it's marked committed, it's committed. + */ + if (xidstatus == TRANSACTION_STATUS_COMMITTED) + return true; + + /* + * If it's marked subcommitted, we have to check the parent recursively. + * However, if it's older than TransactionXmin, we can't look at + * pg_subtrans; instead assume that the parent crashed without cleaning up + * its children. + * + * Originally we Assert'ed that the result of SubTransGetParent was not + * zero. However with the introduction of prepared transactions, there can + * be a window just after database startup where we do not have complete + * knowledge in pg_subtrans of the transactions after TransactionXmin. + * StartupSUBTRANS() has ensured that any missing information will be + * zeroed. 
Since this case should not happen under normal conditions, it + * seems reasonable to emit a WARNING for it. + */ + if (xidstatus == TRANSACTION_STATUS_SUB_COMMITTED) + { + TransactionId parentXid; + + if (TransactionIdPrecedes(transactionId, TransactionXmin)) + return false; + parentXid = SubTransGetParent(transactionId); + if (!TransactionIdIsValid(parentXid)) + { + elog(WARNING, "no pg_subtrans entry for subcommitted XID %u", + transactionId); + return false; + } + return TransactionIdDidCommit(parentXid); + } + + /* + * It's not committed. + */ + return false; +} + +/* + * TransactionIdDidAbort + * True iff transaction associated with the identifier did abort. + * + * Note: + * Assumes transaction identifier is valid and exists in clog. + */ +bool /* true if given transaction aborted */ +TransactionIdDidAbort(TransactionId transactionId) +{ + XidStatus xidstatus; + + xidstatus = TransactionLogFetch(transactionId); + + /* + * If it's marked aborted, it's aborted. + */ + if (xidstatus == TRANSACTION_STATUS_ABORTED) + return true; + + /* + * If it's marked subcommitted, we have to check the parent recursively. + * However, if it's older than TransactionXmin, we can't look at + * pg_subtrans; instead assume that the parent crashed without cleaning up + * its children. + */ + if (xidstatus == TRANSACTION_STATUS_SUB_COMMITTED) + { + TransactionId parentXid; + + if (TransactionIdPrecedes(transactionId, TransactionXmin)) + return true; + parentXid = SubTransGetParent(transactionId); + if (!TransactionIdIsValid(parentXid)) + { + /* see notes in TransactionIdDidCommit */ + elog(WARNING, "no pg_subtrans entry for subcommitted XID %u", + transactionId); + return true; + } + return TransactionIdDidAbort(parentXid); + } + + /* + * It's not aborted. + */ + return false; +} + +/* + * TransactionIdCommitTree + * Marks the given transaction and children as committed + * + * "xid" is a toplevel transaction commit, and the xids array contains its + * committed subtransactions. + * + * This commit operation is not guaranteed to be atomic, but if not, subxids + * are correctly marked subcommit first. + */ +void +TransactionIdCommitTree(TransactionId xid, int nxids, TransactionId *xids) +{ + TransactionIdSetTreeStatus(xid, nxids, xids, + TRANSACTION_STATUS_COMMITTED, + InvalidXLogRecPtr); +} + +/* + * TransactionIdAsyncCommitTree + * Same as above, but for async commits. The commit record LSN is needed. + */ +void +TransactionIdAsyncCommitTree(TransactionId xid, int nxids, TransactionId *xids, + XLogRecPtr lsn) +{ + TransactionIdSetTreeStatus(xid, nxids, xids, + TRANSACTION_STATUS_COMMITTED, lsn); +} + +/* + * TransactionIdAbortTree + * Marks the given transaction and children as aborted. + * + * "xid" is a toplevel transaction commit, and the xids array contains its + * committed subtransactions. + * + * We don't need to worry about the non-atomic behavior, since any onlookers + * will consider all the xacts as not-yet-committed anyway. + */ +void +TransactionIdAbortTree(TransactionId xid, int nxids, TransactionId *xids) +{ + TransactionIdSetTreeStatus(xid, nxids, xids, + TRANSACTION_STATUS_ABORTED, InvalidXLogRecPtr); +} + +/* + * TransactionIdPrecedes --- is id1 logically < id2? + */ +bool +TransactionIdPrecedes(TransactionId id1, TransactionId id2) +{ + /* + * If either ID is a permanent XID then we can just do unsigned + * comparison. If both are normal, do a modulo-2^32 comparison. 
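+	 *
+	 * Worked example (arbitrary values): with id1 = 4294967290 and
+	 * id2 = 100, id1 - id2 wraps to 4294967190, which as an int32 is
+	 * -106, so id1 "precedes" id2 -- an XID just before the 32-bit
+	 * wraparound point is treated as older than one just after it.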
+ */ + int32 diff; + + if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) + return (id1 < id2); + + diff = (int32) (id1 - id2); + return (diff < 0); +} + +/* + * TransactionIdPrecedesOrEquals --- is id1 logically <= id2? + */ +bool +TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2) +{ + int32 diff; + + if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) + return (id1 <= id2); + + diff = (int32) (id1 - id2); + return (diff <= 0); +} + +/* + * TransactionIdFollows --- is id1 logically > id2? + */ +bool +TransactionIdFollows(TransactionId id1, TransactionId id2) +{ + int32 diff; + + if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) + return (id1 > id2); + + diff = (int32) (id1 - id2); + return (diff > 0); +} + +/* + * TransactionIdFollowsOrEquals --- is id1 logically >= id2? + */ +bool +TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2) +{ + int32 diff; + + if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) + return (id1 >= id2); + + diff = (int32) (id1 - id2); + return (diff >= 0); +} + + +/* + * TransactionIdLatest --- get latest XID among a main xact and its children + */ +TransactionId +TransactionIdLatest(TransactionId mainxid, + int nxids, const TransactionId *xids) +{ + TransactionId result; + + /* + * In practice it is highly likely that the xids[] array is sorted, and so + * we could save some cycles by just taking the last child XID, but this + * probably isn't so performance-critical that it's worth depending on + * that assumption. But just to show we're not totally stupid, scan the + * array back-to-front to avoid useless assignments. + */ + result = mainxid; + while (--nxids >= 0) + { + if (TransactionIdPrecedes(result, xids[nxids])) + result = xids[nxids]; + } + return result; +} + + +/* + * TransactionIdGetCommitLSN + * + * This function returns an LSN that is late enough to be able + * to guarantee that if we flush up to the LSN returned then we + * will have flushed the transaction's commit record to disk. + * + * The result is not necessarily the exact LSN of the transaction's + * commit record! For example, for long-past transactions (those whose + * clog pages already migrated to disk), we'll return InvalidXLogRecPtr. + * Also, because we group transactions on the same clog page to conserve + * storage, we might return the LSN of a later transaction that falls into + * the same group. + */ +XLogRecPtr +TransactionIdGetCommitLSN(TransactionId xid) +{ + XLogRecPtr result; + + /* + * Currently, all uses of this function are for xids that were just + * reported to be committed by TransactionLogFetch, so we expect that + * checking TransactionLogFetch's cache will usually succeed and avoid an + * extra trip to shared memory. + */ + if (TransactionIdEquals(xid, cachedFetchXid)) + return cachedCommitLSN; + + /* Special XIDs are always known committed */ + if (!TransactionIdIsNormal(xid)) + return InvalidXLogRecPtr; + + /* + * Get the transaction status. + */ + (void) TransactionIdGetStatus(xid, &result); + + return result; +} diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c new file mode 100644 index 0000000..5293c69 --- /dev/null +++ b/src/backend/access/transam/twophase.c @@ -0,0 +1,2662 @@ +/*------------------------------------------------------------------------- + * + * twophase.c + * Two-phase commit support functions. 
+ * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/transam/twophase.c + * + * NOTES + * Each global transaction is associated with a global transaction + * identifier (GID). The client assigns a GID to a postgres + * transaction with the PREPARE TRANSACTION command. + * + * We keep all active global transactions in a shared memory array. + * When the PREPARE TRANSACTION command is issued, the GID is + * reserved for the transaction in the array. This is done before + * a WAL entry is made, because the reservation checks for duplicate + * GIDs and aborts the transaction if there already is a global + * transaction in prepared state with the same GID. + * + * A global transaction (gxact) also has dummy PGPROC; this is what keeps + * the XID considered running by TransactionIdIsInProgress. It is also + * convenient as a PGPROC to hook the gxact's locks to. + * + * Information to recover prepared transactions in case of crash is + * now stored in WAL for the common case. In some cases there will be + * an extended period between preparing a GXACT and commit/abort, in + * which case we need to separately record prepared transaction data + * in permanent storage. This includes locking information, pending + * notifications etc. All that state information is written to the + * per-transaction state file in the pg_twophase directory. + * All prepared transactions will be written prior to shutdown. + * + * Life track of state data is following: + * + * * On PREPARE TRANSACTION backend writes state data only to the WAL and + * stores pointer to the start of the WAL record in + * gxact->prepare_start_lsn. + * * If COMMIT occurs before checkpoint then backend reads data from WAL + * using prepare_start_lsn. + * * On checkpoint state data copied to files in pg_twophase directory and + * fsynced + * * If COMMIT happens after checkpoint then backend reads state data from + * files + * + * During replay and replication, TwoPhaseState also holds information + * about active prepared transactions that haven't been moved to disk yet. + * + * Replay of twophase records happens by the following rules: + * + * * At the beginning of recovery, pg_twophase is scanned once, filling + * TwoPhaseState with entries marked with gxact->inredo and + * gxact->ondisk. Two-phase file data older than the XID horizon of + * the redo position are discarded. + * * On PREPARE redo, the transaction is added to TwoPhaseState->prepXacts. + * gxact->inredo is set to true for such entries. + * * On Checkpoint we iterate through TwoPhaseState->prepXacts entries + * that have gxact->inredo set and are behind the redo_horizon. We + * save them to disk and then switch gxact->ondisk to true. + * * On COMMIT/ABORT we delete the entry from TwoPhaseState->prepXacts. + * If gxact->ondisk is true, the corresponding entry from the disk + * is additionally deleted. + * * RecoverPreparedTransactions(), StandbyRecoverPreparedTransactions() + * and PrescanPreparedTransactions() have been modified to go through + * gxact->inredo entries that have not made it to disk. 
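+ *
+ * One illustrative scenario combining these rules: a transaction that was
+ * prepared and checkpointed before a crash, and is committed only after
+ * the restart, is first picked up by the initial pg_twophase scan (its
+ * entry has inredo and ondisk set); the eventual COMMIT PREPARED then
+ * removes both the in-memory entry and the on-disk file.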
+ * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <fcntl.h> +#include <sys/stat.h> +#include <time.h> +#include <unistd.h> + +#include "access/commit_ts.h" +#include "access/htup_details.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/twophase_rmgr.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "access/xlogreader.h" +#include "access/xlogrecovery.h" +#include "access/xlogutils.h" +#include "catalog/pg_type.h" +#include "catalog/storage.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "replication/origin.h" +#include "replication/syncrep.h" +#include "replication/walsender.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/md.h" +#include "storage/predicate.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/sinvaladt.h" +#include "storage/smgr.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/timestamp.h" + +/* + * Directory where Two-phase commit files reside within PGDATA + */ +#define TWOPHASE_DIR "pg_twophase" + +/* GUC variable, can't be changed after startup */ +int max_prepared_xacts = 0; + +/* + * This struct describes one global transaction that is in prepared state + * or attempting to become prepared. + * + * The lifecycle of a global transaction is: + * + * 1. After checking that the requested GID is not in use, set up an entry in + * the TwoPhaseState->prepXacts array with the correct GID and valid = false, + * and mark it as locked by my backend. + * + * 2. After successfully completing prepare, set valid = true and enter the + * referenced PGPROC into the global ProcArray. + * + * 3. To begin COMMIT PREPARED or ROLLBACK PREPARED, check that the entry is + * valid and not locked, then mark the entry as locked by storing my current + * backend ID into locking_backend. This prevents concurrent attempts to + * commit or rollback the same prepared xact. + * + * 4. On completion of COMMIT PREPARED or ROLLBACK PREPARED, remove the entry + * from the ProcArray and the TwoPhaseState->prepXacts array and return it to + * the freelist. + * + * Note that if the preparing transaction fails between steps 1 and 2, the + * entry must be removed so that the GID and the GlobalTransaction struct + * can be reused. See AtAbort_Twophase(). + * + * typedef struct GlobalTransactionData *GlobalTransaction appears in + * twophase.h + */ + +typedef struct GlobalTransactionData +{ + GlobalTransaction next; /* list link for free list */ + int pgprocno; /* ID of associated dummy PGPROC */ + BackendId dummyBackendId; /* similar to backend id for backends */ + TimestampTz prepared_at; /* time of preparation */ + + /* + * Note that we need to keep track of two LSNs for each GXACT. We keep + * track of the start LSN because this is the address we must use to read + * state data back from WAL when committing a prepared GXACT. We keep + * track of the end LSN because that is the LSN we need to wait for prior + * to commit. 
+ */ + XLogRecPtr prepare_start_lsn; /* XLOG offset of prepare record start */ + XLogRecPtr prepare_end_lsn; /* XLOG offset of prepare record end */ + TransactionId xid; /* The GXACT id */ + + Oid owner; /* ID of user that executed the xact */ + BackendId locking_backend; /* backend currently working on the xact */ + bool valid; /* true if PGPROC entry is in proc array */ + bool ondisk; /* true if prepare state file is on disk */ + bool inredo; /* true if entry was added via xlog_redo */ + char gid[GIDSIZE]; /* The GID assigned to the prepared xact */ +} GlobalTransactionData; + +/* + * Two Phase Commit shared state. Access to this struct is protected + * by TwoPhaseStateLock. + */ +typedef struct TwoPhaseStateData +{ + /* Head of linked list of free GlobalTransactionData structs */ + GlobalTransaction freeGXacts; + + /* Number of valid prepXacts entries. */ + int numPrepXacts; + + /* There are max_prepared_xacts items in this array */ + GlobalTransaction prepXacts[FLEXIBLE_ARRAY_MEMBER]; +} TwoPhaseStateData; + +static TwoPhaseStateData *TwoPhaseState; + +/* + * Global transaction entry currently locked by us, if any. Note that any + * access to the entry pointed to by this variable must be protected by + * TwoPhaseStateLock, though obviously the pointer itself doesn't need to be + * (since it's just local memory). + */ +static GlobalTransaction MyLockedGxact = NULL; + +static bool twophaseExitRegistered = false; + +static void RecordTransactionCommitPrepared(TransactionId xid, + int nchildren, + TransactionId *children, + int nrels, + RelFileNode *rels, + int nstats, + xl_xact_stats_item *stats, + int ninvalmsgs, + SharedInvalidationMessage *invalmsgs, + bool initfileinval, + const char *gid); +static void RecordTransactionAbortPrepared(TransactionId xid, + int nchildren, + TransactionId *children, + int nrels, + RelFileNode *rels, + int nstats, + xl_xact_stats_item *stats, + const char *gid); +static void ProcessRecords(char *bufptr, TransactionId xid, + const TwoPhaseCallback callbacks[]); +static void RemoveGXact(GlobalTransaction gxact); + +static void XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len); +static char *ProcessTwoPhaseBuffer(TransactionId xid, + XLogRecPtr prepare_start_lsn, + bool fromdisk, bool setParent, bool setNextXid); +static void MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, + const char *gid, TimestampTz prepared_at, Oid owner, + Oid databaseid); +static void RemoveTwoPhaseFile(TransactionId xid, bool giveWarning); +static void RecreateTwoPhaseFile(TransactionId xid, void *content, int len); + +/* + * Initialization of shared memory + */ +Size +TwoPhaseShmemSize(void) +{ + Size size; + + /* Need the fixed struct, the array of pointers, and the GTD structs */ + size = offsetof(TwoPhaseStateData, prepXacts); + size = add_size(size, mul_size(max_prepared_xacts, + sizeof(GlobalTransaction))); + size = MAXALIGN(size); + size = add_size(size, mul_size(max_prepared_xacts, + sizeof(GlobalTransactionData))); + + return size; +} + +void +TwoPhaseShmemInit(void) +{ + bool found; + + TwoPhaseState = ShmemInitStruct("Prepared Transaction Table", + TwoPhaseShmemSize(), + &found); + if (!IsUnderPostmaster) + { + GlobalTransaction gxacts; + int i; + + Assert(!found); + TwoPhaseState->freeGXacts = NULL; + TwoPhaseState->numPrepXacts = 0; + + /* + * Initialize the linked list of free GlobalTransactionData structs + */ + gxacts = (GlobalTransaction) + ((char *) TwoPhaseState + + MAXALIGN(offsetof(TwoPhaseStateData, prepXacts) + + 
sizeof(GlobalTransaction) * max_prepared_xacts)); + for (i = 0; i < max_prepared_xacts; i++) + { + /* insert into linked list */ + gxacts[i].next = TwoPhaseState->freeGXacts; + TwoPhaseState->freeGXacts = &gxacts[i]; + + /* associate it with a PGPROC assigned by InitProcGlobal */ + gxacts[i].pgprocno = PreparedXactProcs[i].pgprocno; + + /* + * Assign a unique ID for each dummy proc, so that the range of + * dummy backend IDs immediately follows the range of normal + * backend IDs. We don't dare to assign a real backend ID to dummy + * procs, because prepared transactions don't take part in cache + * invalidation like a real backend ID would imply, but having a + * unique ID for them is nevertheless handy. This arrangement + * allows you to allocate an array of size (MaxBackends + + * max_prepared_xacts + 1), and have a slot for every backend and + * prepared transaction. Currently multixact.c uses that + * technique. + */ + gxacts[i].dummyBackendId = MaxBackends + 1 + i; + } + } + else + Assert(found); +} + +/* + * Exit hook to unlock the global transaction entry we're working on. + */ +static void +AtProcExit_Twophase(int code, Datum arg) +{ + /* same logic as abort */ + AtAbort_Twophase(); +} + +/* + * Abort hook to unlock the global transaction entry we're working on. + */ +void +AtAbort_Twophase(void) +{ + if (MyLockedGxact == NULL) + return; + + /* + * What to do with the locked global transaction entry? If we were in the + * process of preparing the transaction, but haven't written the WAL + * record and state file yet, the transaction must not be considered as + * prepared. Likewise, if we are in the process of finishing an + * already-prepared transaction, and fail after having already written the + * 2nd phase commit or rollback record to the WAL, the transaction should + * not be considered as prepared anymore. In those cases, just remove the + * entry from shared memory. + * + * Otherwise, the entry must be left in place so that the transaction can + * be finished later, so just unlock it. + * + * If we abort during prepare, after having written the WAL record, we + * might not have transferred all locks and other state to the prepared + * transaction yet. Likewise, if we abort during commit or rollback, + * after having written the WAL record, we might not have released all the + * resources held by the transaction yet. In those cases, the in-memory + * state can be wrong, but it's too late to back out. + */ + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + if (!MyLockedGxact->valid) + RemoveGXact(MyLockedGxact); + else + MyLockedGxact->locking_backend = InvalidBackendId; + LWLockRelease(TwoPhaseStateLock); + + MyLockedGxact = NULL; +} + +/* + * This is called after we have finished transferring state to the prepared + * PGPROC entry. + */ +void +PostPrepare_Twophase(void) +{ + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + MyLockedGxact->locking_backend = InvalidBackendId; + LWLockRelease(TwoPhaseStateLock); + + MyLockedGxact = NULL; +} + + +/* + * MarkAsPreparing + * Reserve the GID for the given transaction. 
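+ *
+ * In rough outline, the normal call sequence during PREPARE TRANSACTION
+ * (driven by PrepareTransaction() in xact.c) looks like this; the argument
+ * names are illustrative only:
+ *
+ *		gxact = MarkAsPreparing(xid, gid, now, owner, dbid);
+ *		StartPrepare(gxact);
+ *		... resource managers add data via RegisterTwoPhaseRecord() ...
+ *		EndPrepare(gxact);
+ *		PostPrepare_Twophase();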
+ */ +GlobalTransaction +MarkAsPreparing(TransactionId xid, const char *gid, + TimestampTz prepared_at, Oid owner, Oid databaseid) +{ + GlobalTransaction gxact; + int i; + + if (strlen(gid) >= GIDSIZE) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("transaction identifier \"%s\" is too long", + gid))); + + /* fail immediately if feature is disabled */ + if (max_prepared_xacts == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("prepared transactions are disabled"), + errhint("Set max_prepared_transactions to a nonzero value."))); + + /* on first call, register the exit hook */ + if (!twophaseExitRegistered) + { + before_shmem_exit(AtProcExit_Twophase, 0); + twophaseExitRegistered = true; + } + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + + /* Check for conflicting GID */ + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + gxact = TwoPhaseState->prepXacts[i]; + if (strcmp(gxact->gid, gid) == 0) + { + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("transaction identifier \"%s\" is already in use", + gid))); + } + } + + /* Get a free gxact from the freelist */ + if (TwoPhaseState->freeGXacts == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("maximum number of prepared transactions reached"), + errhint("Increase max_prepared_transactions (currently %d).", + max_prepared_xacts))); + gxact = TwoPhaseState->freeGXacts; + TwoPhaseState->freeGXacts = gxact->next; + + MarkAsPreparingGuts(gxact, xid, gid, prepared_at, owner, databaseid); + + gxact->ondisk = false; + + /* And insert it into the active array */ + Assert(TwoPhaseState->numPrepXacts < max_prepared_xacts); + TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts++] = gxact; + + LWLockRelease(TwoPhaseStateLock); + + return gxact; +} + +/* + * MarkAsPreparingGuts + * + * This uses a gxact struct and puts it into the active array. + * NOTE: this is also used when reloading a gxact after a crash; so avoid + * assuming that we can use very much backend context. + * + * Note: This function should be called with appropriate locks held. 
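+ *
+ * In practice "appropriate locks" means TwoPhaseStateLock held in exclusive
+ * mode; the Assert at the top of the function checks exactly that.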
+ */ +static void +MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, + TimestampTz prepared_at, Oid owner, Oid databaseid) +{ + PGPROC *proc; + int i; + + Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); + + Assert(gxact != NULL); + proc = &ProcGlobal->allProcs[gxact->pgprocno]; + + /* Initialize the PGPROC entry */ + MemSet(proc, 0, sizeof(PGPROC)); + proc->pgprocno = gxact->pgprocno; + SHMQueueElemInit(&(proc->links)); + proc->waitStatus = PROC_WAIT_STATUS_OK; + if (LocalTransactionIdIsValid(MyProc->lxid)) + { + /* clone VXID, for TwoPhaseGetXidByVirtualXID() to find */ + proc->lxid = MyProc->lxid; + proc->backendId = MyBackendId; + } + else + { + Assert(AmStartupProcess() || !IsPostmasterEnvironment); + /* GetLockConflicts() uses this to specify a wait on the XID */ + proc->lxid = xid; + proc->backendId = InvalidBackendId; + } + proc->xid = xid; + Assert(proc->xmin == InvalidTransactionId); + proc->delayChkptFlags = 0; + proc->statusFlags = 0; + proc->pid = 0; + proc->databaseId = databaseid; + proc->roleId = owner; + proc->tempNamespaceId = InvalidOid; + proc->isBackgroundWorker = false; + proc->lwWaiting = false; + proc->lwWaitMode = 0; + proc->waitLock = NULL; + proc->waitProcLock = NULL; + pg_atomic_init_u64(&proc->waitStart, 0); + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + SHMQueueInit(&(proc->myProcLocks[i])); + /* subxid data must be filled later by GXactLoadSubxactData */ + proc->subxidStatus.overflowed = false; + proc->subxidStatus.count = 0; + + gxact->prepared_at = prepared_at; + gxact->xid = xid; + gxact->owner = owner; + gxact->locking_backend = MyBackendId; + gxact->valid = false; + gxact->inredo = false; + strcpy(gxact->gid, gid); + + /* + * Remember that we have this GlobalTransaction entry locked for us. If we + * abort after this, we must release it. + */ + MyLockedGxact = gxact; +} + +/* + * GXactLoadSubxactData + * + * If the transaction being persisted had any subtransactions, this must + * be called before MarkAsPrepared() to load information into the dummy + * PGPROC. + */ +static void +GXactLoadSubxactData(GlobalTransaction gxact, int nsubxacts, + TransactionId *children) +{ + PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno]; + + /* We need no extra lock since the GXACT isn't valid yet */ + if (nsubxacts > PGPROC_MAX_CACHED_SUBXIDS) + { + proc->subxidStatus.overflowed = true; + nsubxacts = PGPROC_MAX_CACHED_SUBXIDS; + } + if (nsubxacts > 0) + { + memcpy(proc->subxids.xids, children, + nsubxacts * sizeof(TransactionId)); + proc->subxidStatus.count = nsubxacts; + } +} + +/* + * MarkAsPrepared + * Mark the GXACT as fully valid, and enter it into the global ProcArray. + * + * lock_held indicates whether caller already holds TwoPhaseStateLock. + */ +static void +MarkAsPrepared(GlobalTransaction gxact, bool lock_held) +{ + /* Lock here may be overkill, but I'm not convinced of that ... */ + if (!lock_held) + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + Assert(!gxact->valid); + gxact->valid = true; + if (!lock_held) + LWLockRelease(TwoPhaseStateLock); + + /* + * Put it into the global ProcArray so TransactionIdIsInProgress considers + * the XID as still running. + */ + ProcArrayAdd(&ProcGlobal->allProcs[gxact->pgprocno]); +} + +/* + * LockGXact + * Locate the prepared transaction and mark it busy for COMMIT or PREPARE. 
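+ *
+ * On success the entry is marked as locked by this backend and remembered
+ * in MyLockedGxact, so that AtAbort_Twophase() can unlock it again if we
+ * error out before the prepared transaction is finished.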
+ */ +static GlobalTransaction +LockGXact(const char *gid, Oid user) +{ + int i; + + /* on first call, register the exit hook */ + if (!twophaseExitRegistered) + { + before_shmem_exit(AtProcExit_Twophase, 0); + twophaseExitRegistered = true; + } + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno]; + + /* Ignore not-yet-valid GIDs */ + if (!gxact->valid) + continue; + if (strcmp(gxact->gid, gid) != 0) + continue; + + /* Found it, but has someone else got it locked? */ + if (gxact->locking_backend != InvalidBackendId) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("prepared transaction with identifier \"%s\" is busy", + gid))); + + if (user != gxact->owner && !superuser_arg(user)) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to finish prepared transaction"), + errhint("Must be superuser or the user that prepared the transaction."))); + + /* + * Note: it probably would be possible to allow committing from + * another database; but at the moment NOTIFY is known not to work and + * there may be some other issues as well. Hence disallow until + * someone gets motivated to make it work. + */ + if (MyDatabaseId != proc->databaseId) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("prepared transaction belongs to another database"), + errhint("Connect to the database where the transaction was prepared to finish it."))); + + /* OK for me to lock it */ + gxact->locking_backend = MyBackendId; + MyLockedGxact = gxact; + + LWLockRelease(TwoPhaseStateLock); + + return gxact; + } + + LWLockRelease(TwoPhaseStateLock); + + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("prepared transaction with identifier \"%s\" does not exist", + gid))); + + /* NOTREACHED */ + return NULL; +} + +/* + * RemoveGXact + * Remove the prepared transaction from the shared memory array. + * + * NB: caller should have already removed it from ProcArray + */ +static void +RemoveGXact(GlobalTransaction gxact) +{ + int i; + + Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); + + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + if (gxact == TwoPhaseState->prepXacts[i]) + { + /* remove from the active array */ + TwoPhaseState->numPrepXacts--; + TwoPhaseState->prepXacts[i] = TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts]; + + /* and put it back in the freelist */ + gxact->next = TwoPhaseState->freeGXacts; + TwoPhaseState->freeGXacts = gxact; + + return; + } + } + + elog(ERROR, "failed to find %p in GlobalTransaction array", gxact); +} + +/* + * Returns an array of all prepared transactions for the user-level + * function pg_prepared_xact. + * + * The returned array and all its elements are copies of internal data + * structures, to minimize the time we need to hold the TwoPhaseStateLock. + * + * WARNING -- we return even those transactions that are not fully prepared + * yet. The caller should filter them out if he doesn't want them. + * + * The returned array is palloc'd. 
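+ *
+ * Returns the number of entries copied; when there are none, *gxacts is
+ * set to NULL and 0 is returned.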
+ */ +static int +GetPreparedTransactionList(GlobalTransaction *gxacts) +{ + GlobalTransaction array; + int num; + int i; + + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + + if (TwoPhaseState->numPrepXacts == 0) + { + LWLockRelease(TwoPhaseStateLock); + + *gxacts = NULL; + return 0; + } + + num = TwoPhaseState->numPrepXacts; + array = (GlobalTransaction) palloc(sizeof(GlobalTransactionData) * num); + *gxacts = array; + for (i = 0; i < num; i++) + memcpy(array + i, TwoPhaseState->prepXacts[i], + sizeof(GlobalTransactionData)); + + LWLockRelease(TwoPhaseStateLock); + + return num; +} + + +/* Working status for pg_prepared_xact */ +typedef struct +{ + GlobalTransaction array; + int ngxacts; + int currIdx; +} Working_State; + +/* + * pg_prepared_xact + * Produce a view with one row per prepared transaction. + * + * This function is here so we don't have to export the + * GlobalTransactionData struct definition. + */ +Datum +pg_prepared_xact(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + Working_State *status; + + if (SRF_IS_FIRSTCALL()) + { + TupleDesc tupdesc; + MemoryContext oldcontext; + + /* create a function context for cross-call persistence */ + funcctx = SRF_FIRSTCALL_INIT(); + + /* + * Switch to memory context appropriate for multiple function calls + */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* build tupdesc for result tuples */ + /* this had better match pg_prepared_xacts view in system_views.sql */ + tupdesc = CreateTemplateTupleDesc(5); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "transaction", + XIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "gid", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "prepared", + TIMESTAMPTZOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "ownerid", + OIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 5, "dbid", + OIDOID, -1, 0); + + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + /* + * Collect all the 2PC status information that we will format and send + * out as a result set. + */ + status = (Working_State *) palloc(sizeof(Working_State)); + funcctx->user_fctx = (void *) status; + + status->ngxacts = GetPreparedTransactionList(&status->array); + status->currIdx = 0; + + MemoryContextSwitchTo(oldcontext); + } + + funcctx = SRF_PERCALL_SETUP(); + status = (Working_State *) funcctx->user_fctx; + + while (status->array != NULL && status->currIdx < status->ngxacts) + { + GlobalTransaction gxact = &status->array[status->currIdx++]; + PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno]; + Datum values[5]; + bool nulls[5]; + HeapTuple tuple; + Datum result; + + if (!gxact->valid) + continue; + + /* + * Form tuple with appropriate data. + */ + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = TransactionIdGetDatum(proc->xid); + values[1] = CStringGetTextDatum(gxact->gid); + values[2] = TimestampTzGetDatum(gxact->prepared_at); + values[3] = ObjectIdGetDatum(gxact->owner); + values[4] = ObjectIdGetDatum(proc->databaseId); + + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + result = HeapTupleGetDatum(tuple); + SRF_RETURN_NEXT(funcctx, result); + } + + SRF_RETURN_DONE(funcctx); +} + +/* + * TwoPhaseGetGXact + * Get the GlobalTransaction struct for a prepared transaction + * specified by XID + * + * If lock_held is set to true, TwoPhaseStateLock will not be taken, so the + * caller had better hold it. 
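+ *
+ * A one-entry static cache (cached_xid/cached_gxact below) avoids
+ * rescanning the array when we are called repeatedly for the same XID,
+ * as happens during recovery and COMMIT/ROLLBACK PREPARED.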
+ */ +static GlobalTransaction +TwoPhaseGetGXact(TransactionId xid, bool lock_held) +{ + GlobalTransaction result = NULL; + int i; + + static TransactionId cached_xid = InvalidTransactionId; + static GlobalTransaction cached_gxact = NULL; + + Assert(!lock_held || LWLockHeldByMe(TwoPhaseStateLock)); + + /* + * During a recovery, COMMIT PREPARED, or ABORT PREPARED, we'll be called + * repeatedly for the same XID. We can save work with a simple cache. + */ + if (xid == cached_xid) + return cached_gxact; + + if (!lock_held) + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + + if (gxact->xid == xid) + { + result = gxact; + break; + } + } + + if (!lock_held) + LWLockRelease(TwoPhaseStateLock); + + if (result == NULL) /* should not happen */ + elog(ERROR, "failed to find GlobalTransaction for xid %u", xid); + + cached_xid = xid; + cached_gxact = result; + + return result; +} + +/* + * TwoPhaseGetXidByVirtualXID + * Lookup VXID among xacts prepared since last startup. + * + * (This won't find recovered xacts.) If more than one matches, return any + * and set "have_more" to true. To witness multiple matches, a single + * BackendId must consume 2^32 LXIDs, with no intervening database restart. + */ +TransactionId +TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid, + bool *have_more) +{ + int i; + TransactionId result = InvalidTransactionId; + + Assert(VirtualTransactionIdIsValid(vxid)); + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + PGPROC *proc; + VirtualTransactionId proc_vxid; + + if (!gxact->valid) + continue; + proc = &ProcGlobal->allProcs[gxact->pgprocno]; + GET_VXID_FROM_PGPROC(proc_vxid, *proc); + if (VirtualTransactionIdEquals(vxid, proc_vxid)) + { + /* Startup process sets proc->backendId to InvalidBackendId. */ + Assert(!gxact->inredo); + + if (result != InvalidTransactionId) + { + *have_more = true; + break; + } + result = gxact->xid; + } + } + + LWLockRelease(TwoPhaseStateLock); + + return result; +} + +/* + * TwoPhaseGetDummyBackendId + * Get the dummy backend ID for prepared transaction specified by XID + * + * Dummy backend IDs are similar to real backend IDs of real backends. + * They start at MaxBackends + 1, and are unique across all currently active + * real backends and prepared transactions. If lock_held is set to true, + * TwoPhaseStateLock will not be taken, so the caller had better hold it. + */ +BackendId +TwoPhaseGetDummyBackendId(TransactionId xid, bool lock_held) +{ + GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held); + + return gxact->dummyBackendId; +} + +/* + * TwoPhaseGetDummyProc + * Get the PGPROC that represents a prepared transaction specified by XID + * + * If lock_held is set to true, TwoPhaseStateLock will not be taken, so the + * caller had better hold it. + */ +PGPROC * +TwoPhaseGetDummyProc(TransactionId xid, bool lock_held) +{ + GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held); + + return &ProcGlobal->allProcs[gxact->pgprocno]; +} + +/************************************************************************/ +/* State file support */ +/************************************************************************/ + +#define TwoPhaseFilePath(path, xid) \ + snprintf(path, MAXPGPATH, TWOPHASE_DIR "/%08X", xid) + +/* + * 2PC state file format: + * + * 1. TwoPhaseFileHeader + * 2. 
TransactionId[] (subtransactions) + * 3. RelFileNode[] (files to be deleted at commit) + * 4. RelFileNode[] (files to be deleted at abort) + * 5. SharedInvalidationMessage[] (inval messages to be sent at commit) + * 6. TwoPhaseRecordOnDisk + * 7. ... + * 8. TwoPhaseRecordOnDisk (end sentinel, rmid == TWOPHASE_RM_END_ID) + * 9. checksum (CRC-32C) + * + * Each segment except the final checksum is MAXALIGN'd. + */ + +/* + * Header for a 2PC state file + */ +#define TWOPHASE_MAGIC 0x57F94534 /* format identifier */ + +typedef xl_xact_prepare TwoPhaseFileHeader; + +/* + * Header for each record in a state file + * + * NOTE: len counts only the rmgr data, not the TwoPhaseRecordOnDisk header. + * The rmgr data will be stored starting on a MAXALIGN boundary. + */ +typedef struct TwoPhaseRecordOnDisk +{ + uint32 len; /* length of rmgr data */ + TwoPhaseRmgrId rmid; /* resource manager for this record */ + uint16 info; /* flag bits for use by rmgr */ +} TwoPhaseRecordOnDisk; + +/* + * During prepare, the state file is assembled in memory before writing it + * to WAL and the actual state file. We use a chain of StateFileChunk blocks + * for that. + */ +typedef struct StateFileChunk +{ + char *data; + uint32 len; + struct StateFileChunk *next; +} StateFileChunk; + +static struct xllist +{ + StateFileChunk *head; /* first data block in the chain */ + StateFileChunk *tail; /* last block in chain */ + uint32 num_chunks; + uint32 bytes_free; /* free bytes left in tail block */ + uint32 total_len; /* total data bytes in chain */ +} records; + + +/* + * Append a block of data to records data structure. + * + * NB: each block is padded to a MAXALIGN multiple. This must be + * accounted for when the file is later read! + * + * The data is copied, so the caller is free to modify it afterwards. + */ +static void +save_state_data(const void *data, uint32 len) +{ + uint32 padlen = MAXALIGN(len); + + if (padlen > records.bytes_free) + { + records.tail->next = palloc0(sizeof(StateFileChunk)); + records.tail = records.tail->next; + records.tail->len = 0; + records.tail->next = NULL; + records.num_chunks++; + + records.bytes_free = Max(padlen, 512); + records.tail->data = palloc(records.bytes_free); + } + + memcpy(((char *) records.tail->data) + records.tail->len, data, len); + records.tail->len += padlen; + records.bytes_free -= padlen; + records.total_len += padlen; +} + +/* + * Start preparing a state file. + * + * Initializes data structure and inserts the 2PC file header record. 
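+ *
+ * The caller is then expected to append any rmgr-specific records with
+ * RegisterTwoPhaseRecord() and finish with EndPrepare(), which writes the
+ * accumulated chain of chunks to WAL.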
+ */ +void +StartPrepare(GlobalTransaction gxact) +{ + PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno]; + TransactionId xid = gxact->xid; + TwoPhaseFileHeader hdr; + TransactionId *children; + RelFileNode *commitrels; + RelFileNode *abortrels; + xl_xact_stats_item *abortstats = NULL; + xl_xact_stats_item *commitstats = NULL; + SharedInvalidationMessage *invalmsgs; + + /* Initialize linked list */ + records.head = palloc0(sizeof(StateFileChunk)); + records.head->len = 0; + records.head->next = NULL; + + records.bytes_free = Max(sizeof(TwoPhaseFileHeader), 512); + records.head->data = palloc(records.bytes_free); + + records.tail = records.head; + records.num_chunks = 1; + + records.total_len = 0; + + /* Create header */ + hdr.magic = TWOPHASE_MAGIC; + hdr.total_len = 0; /* EndPrepare will fill this in */ + hdr.xid = xid; + hdr.database = proc->databaseId; + hdr.prepared_at = gxact->prepared_at; + hdr.owner = gxact->owner; + hdr.nsubxacts = xactGetCommittedChildren(&children); + hdr.ncommitrels = smgrGetPendingDeletes(true, &commitrels); + hdr.nabortrels = smgrGetPendingDeletes(false, &abortrels); + hdr.ncommitstats = + pgstat_get_transactional_drops(true, &commitstats); + hdr.nabortstats = + pgstat_get_transactional_drops(false, &abortstats); + hdr.ninvalmsgs = xactGetCommittedInvalidationMessages(&invalmsgs, + &hdr.initfileinval); + hdr.gidlen = strlen(gxact->gid) + 1; /* Include '\0' */ + /* EndPrepare will fill the origin data, if necessary */ + hdr.origin_lsn = InvalidXLogRecPtr; + hdr.origin_timestamp = 0; + + save_state_data(&hdr, sizeof(TwoPhaseFileHeader)); + save_state_data(gxact->gid, hdr.gidlen); + + /* + * Add the additional info about subxacts, deletable files and cache + * invalidation messages. + */ + if (hdr.nsubxacts > 0) + { + save_state_data(children, hdr.nsubxacts * sizeof(TransactionId)); + /* While we have the child-xact data, stuff it in the gxact too */ + GXactLoadSubxactData(gxact, hdr.nsubxacts, children); + } + if (hdr.ncommitrels > 0) + { + save_state_data(commitrels, hdr.ncommitrels * sizeof(RelFileNode)); + pfree(commitrels); + } + if (hdr.nabortrels > 0) + { + save_state_data(abortrels, hdr.nabortrels * sizeof(RelFileNode)); + pfree(abortrels); + } + if (hdr.ncommitstats > 0) + { + save_state_data(commitstats, + hdr.ncommitstats * sizeof(xl_xact_stats_item)); + pfree(commitstats); + } + if (hdr.nabortstats > 0) + { + save_state_data(abortstats, + hdr.nabortstats * sizeof(xl_xact_stats_item)); + pfree(abortstats); + } + if (hdr.ninvalmsgs > 0) + { + save_state_data(invalmsgs, + hdr.ninvalmsgs * sizeof(SharedInvalidationMessage)); + pfree(invalmsgs); + } +} + +/* + * Finish preparing state data and writing it to WAL. + */ +void +EndPrepare(GlobalTransaction gxact) +{ + TwoPhaseFileHeader *hdr; + StateFileChunk *record; + bool replorigin; + + /* Add the end sentinel to the list of 2PC records */ + RegisterTwoPhaseRecord(TWOPHASE_RM_END_ID, 0, + NULL, 0); + + /* Go back and fill in total_len in the file header record */ + hdr = (TwoPhaseFileHeader *) records.head->data; + Assert(hdr->magic == TWOPHASE_MAGIC); + hdr->total_len = records.total_len + sizeof(pg_crc32c); + + replorigin = (replorigin_session_origin != InvalidRepOriginId && + replorigin_session_origin != DoNotReplicateId); + + if (replorigin) + { + hdr->origin_lsn = replorigin_session_origin_lsn; + hdr->origin_timestamp = replorigin_session_origin_timestamp; + } + + /* + * If the data size exceeds MaxAllocSize, we won't be able to read it in + * ReadTwoPhaseFile. 
Check for that now, rather than fail in the case + * where we write data to file and then re-read at commit time. + */ + if (hdr->total_len > MaxAllocSize) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("two-phase state file maximum length exceeded"))); + + /* + * Now writing 2PC state data to WAL. We let the WAL's CRC protection + * cover us, so no need to calculate a separate CRC. + * + * We have to set DELAY_CHKPT_START here, too; otherwise a checkpoint + * starting immediately after the WAL record is inserted could complete + * without fsync'ing our state file. (This is essentially the same kind + * of race condition as the COMMIT-to-clog-write case that + * RecordTransactionCommit uses DELAY_CHKPT_START for; see notes there.) + * + * We save the PREPARE record's location in the gxact for later use by + * CheckPointTwoPhase. + */ + XLogEnsureRecordSpace(0, records.num_chunks); + + START_CRIT_SECTION(); + + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + + XLogBeginInsert(); + for (record = records.head; record != NULL; record = record->next) + XLogRegisterData(record->data, record->len); + + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + gxact->prepare_end_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE); + + if (replorigin) + { + /* Move LSNs forward for this replication origin */ + replorigin_session_advance(replorigin_session_origin_lsn, + gxact->prepare_end_lsn); + } + + XLogFlush(gxact->prepare_end_lsn); + + /* If we crash now, we have prepared: WAL replay will fix things */ + + /* Store record's start location to read that later on Commit */ + gxact->prepare_start_lsn = ProcLastRecPtr; + + /* + * Mark the prepared transaction as valid. As soon as xact.c marks MyProc + * as not running our XID (which it will do immediately after this + * function returns), others can commit/rollback the xact. + * + * NB: a side effect of this is to make a dummy ProcArray entry for the + * prepared XID. This must happen before we clear the XID from MyProc / + * ProcGlobal->xids[], else there is a window where the XID is not running + * according to TransactionIdIsInProgress, and onlookers would be entitled + * to assume the xact crashed. Instead we have a window where the same + * XID appears twice in ProcArray, which is OK. + */ + MarkAsPrepared(gxact, false); + + /* + * Now we can mark ourselves as out of the commit critical section: a + * checkpoint starting after this will certainly see the gxact as a + * candidate for fsyncing. + */ + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + + /* + * Remember that we have this GlobalTransaction entry locked for us. If + * we crash after this point, it's too late to abort, but we must unlock + * it so that the prepared transaction can be committed or rolled back. + */ + MyLockedGxact = gxact; + + END_CRIT_SECTION(); + + /* + * Wait for synchronous replication, if required. + * + * Note that at this stage we have marked the prepare, but still show as + * running in the procarray (twice!) and continue to hold locks. + */ + SyncRepWaitForLSN(gxact->prepare_end_lsn, false); + + records.tail = records.head = NULL; + records.num_chunks = 0; +} + +/* + * Register a 2PC record to be written to state file. 
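+ *
+ * For example, the lock manager's AtPrepare_Locks() registers records
+ * describing the locks held under TWOPHASE_RM_LOCK_ID; at COMMIT/ROLLBACK
+ * PREPARED or during recovery, ProcessRecords() dispatches each record to
+ * the matching callback in the tables defined in twophase_rmgr.c.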
+ */ +void +RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info, + const void *data, uint32 len) +{ + TwoPhaseRecordOnDisk record; + + record.rmid = rmid; + record.info = info; + record.len = len; + save_state_data(&record, sizeof(TwoPhaseRecordOnDisk)); + if (len > 0) + save_state_data(data, len); +} + + +/* + * Read and validate the state file for xid. + * + * If it looks OK (has a valid magic number and CRC), return the palloc'd + * contents of the file, issuing an error when finding corrupted data. If + * missing_ok is true, which indicates that missing files can be safely + * ignored, then return NULL. This state can be reached when doing recovery. + */ +static char * +ReadTwoPhaseFile(TransactionId xid, bool missing_ok) +{ + char path[MAXPGPATH]; + char *buf; + TwoPhaseFileHeader *hdr; + int fd; + struct stat stat; + uint32 crc_offset; + pg_crc32c calc_crc, + file_crc; + int r; + + TwoPhaseFilePath(path, xid); + + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + { + if (missing_ok && errno == ENOENT) + return NULL; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + } + + /* + * Check file length. We can determine a lower bound pretty easily. We + * set an upper bound to avoid palloc() failure on a corrupt file, though + * we can't guarantee that we won't get an out of memory error anyway, + * even on a valid file. + */ + if (fstat(fd, &stat)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", path))); + + if (stat.st_size < (MAXALIGN(sizeof(TwoPhaseFileHeader)) + + MAXALIGN(sizeof(TwoPhaseRecordOnDisk)) + + sizeof(pg_crc32c)) || + stat.st_size > MaxAllocSize) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_plural("incorrect size of file \"%s\": %lld byte", + "incorrect size of file \"%s\": %lld bytes", + (long long int) stat.st_size, path, + (long long int) stat.st_size))); + + crc_offset = stat.st_size - sizeof(pg_crc32c); + if (crc_offset != MAXALIGN(crc_offset)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("incorrect alignment of CRC offset for file \"%s\"", + path))); + + /* + * OK, slurp in the file. + */ + buf = (char *) palloc(stat.st_size); + + pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_READ); + r = read(fd, buf, stat.st_size); + if (r != stat.st_size) + { + if (r < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", path))); + else + ereport(ERROR, + (errmsg("could not read file \"%s\": read %d of %lld", + path, r, (long long int) stat.st_size))); + } + + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + + hdr = (TwoPhaseFileHeader *) buf; + if (hdr->magic != TWOPHASE_MAGIC) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid magic number stored in file \"%s\"", + path))); + + if (hdr->total_len != stat.st_size) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid size stored in file \"%s\"", + path))); + + INIT_CRC32C(calc_crc); + COMP_CRC32C(calc_crc, buf, crc_offset); + FIN_CRC32C(calc_crc); + + file_crc = *((pg_crc32c *) (buf + crc_offset)); + + if (!EQ_CRC32C(calc_crc, file_crc)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("calculated CRC checksum does not match value stored in file \"%s\"", + path))); + + return buf; +} + + +/* + * Reads 2PC data from xlog. 
During checkpoint this data will be moved to + * twophase files and ReadTwoPhaseFile should be used instead. + * + * Note clearly that this function can access WAL during normal operation, + * similarly to the way WALSender or Logical Decoding would do. + */ +static void +XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len) +{ + XLogRecord *record; + XLogReaderState *xlogreader; + char *errormsg; + + xlogreader = XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(.page_read = &read_local_xlog_page, + .segment_open = &wal_segment_open, + .segment_close = &wal_segment_close), + NULL); + if (!xlogreader) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed while allocating a WAL reading processor."))); + + XLogBeginRead(xlogreader, lsn); + record = XLogReadRecord(xlogreader, &errormsg); + + if (record == NULL) + { + if (errormsg) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read two-phase state from WAL at %X/%X: %s", + LSN_FORMAT_ARGS(lsn), errormsg))); + else + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read two-phase state from WAL at %X/%X", + LSN_FORMAT_ARGS(lsn)))); + } + + if (XLogRecGetRmid(xlogreader) != RM_XACT_ID || + (XLogRecGetInfo(xlogreader) & XLOG_XACT_OPMASK) != XLOG_XACT_PREPARE) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("expected two-phase state data is not present in WAL at %X/%X", + LSN_FORMAT_ARGS(lsn)))); + + if (len != NULL) + *len = XLogRecGetDataLen(xlogreader); + + *buf = palloc(sizeof(char) * XLogRecGetDataLen(xlogreader)); + memcpy(*buf, XLogRecGetData(xlogreader), sizeof(char) * XLogRecGetDataLen(xlogreader)); + + XLogReaderFree(xlogreader); +} + + +/* + * Confirms an xid is prepared, during recovery + */ +bool +StandbyTransactionIdIsPrepared(TransactionId xid) +{ + char *buf; + TwoPhaseFileHeader *hdr; + bool result; + + Assert(TransactionIdIsValid(xid)); + + if (max_prepared_xacts <= 0) + return false; /* nothing to do */ + + /* Read and validate file */ + buf = ReadTwoPhaseFile(xid, true); + if (buf == NULL) + return false; + + /* Check header also */ + hdr = (TwoPhaseFileHeader *) buf; + result = TransactionIdEquals(hdr->xid, xid); + pfree(buf); + + return result; +} + +/* + * FinishPreparedTransaction: execute COMMIT PREPARED or ROLLBACK PREPARED + */ +void +FinishPreparedTransaction(const char *gid, bool isCommit) +{ + GlobalTransaction gxact; + PGPROC *proc; + TransactionId xid; + char *buf; + char *bufptr; + TwoPhaseFileHeader *hdr; + TransactionId latestXid; + TransactionId *children; + RelFileNode *commitrels; + RelFileNode *abortrels; + RelFileNode *delrels; + int ndelrels; + xl_xact_stats_item *commitstats; + xl_xact_stats_item *abortstats; + SharedInvalidationMessage *invalmsgs; + + /* + * Validate the GID, and lock the GXACT to ensure that two backends do not + * try to commit the same GID at once. + */ + gxact = LockGXact(gid, GetUserId()); + proc = &ProcGlobal->allProcs[gxact->pgprocno]; + xid = gxact->xid; + + /* + * Read and validate 2PC state data. State data will typically be stored + * in WAL files if the LSN is after the last checkpoint record, or moved + * to disk if for some reason they have lived for a long time. 
+ */ + if (gxact->ondisk) + buf = ReadTwoPhaseFile(xid, false); + else + XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL); + + + /* + * Disassemble the header area + */ + hdr = (TwoPhaseFileHeader *) buf; + Assert(TransactionIdEquals(hdr->xid, xid)); + bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader)); + bufptr += MAXALIGN(hdr->gidlen); + children = (TransactionId *) bufptr; + bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId)); + commitrels = (RelFileNode *) bufptr; + bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode)); + abortrels = (RelFileNode *) bufptr; + bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode)); + commitstats = (xl_xact_stats_item *) bufptr; + bufptr += MAXALIGN(hdr->ncommitstats * sizeof(xl_xact_stats_item)); + abortstats = (xl_xact_stats_item *) bufptr; + bufptr += MAXALIGN(hdr->nabortstats * sizeof(xl_xact_stats_item)); + invalmsgs = (SharedInvalidationMessage *) bufptr; + bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage)); + + /* compute latestXid among all children */ + latestXid = TransactionIdLatest(xid, hdr->nsubxacts, children); + + /* Prevent cancel/die interrupt while cleaning up */ + HOLD_INTERRUPTS(); + + /* + * The order of operations here is critical: make the XLOG entry for + * commit or abort, then mark the transaction committed or aborted in + * pg_xact, then remove its PGPROC from the global ProcArray (which means + * TransactionIdIsInProgress will stop saying the prepared xact is in + * progress), then run the post-commit or post-abort callbacks. The + * callbacks will release the locks the transaction held. + */ + if (isCommit) + RecordTransactionCommitPrepared(xid, + hdr->nsubxacts, children, + hdr->ncommitrels, commitrels, + hdr->ncommitstats, + commitstats, + hdr->ninvalmsgs, invalmsgs, + hdr->initfileinval, gid); + else + RecordTransactionAbortPrepared(xid, + hdr->nsubxacts, children, + hdr->nabortrels, abortrels, + hdr->nabortstats, + abortstats, + gid); + + ProcArrayRemove(proc, latestXid); + + /* + * In case we fail while running the callbacks, mark the gxact invalid so + * no one else will try to commit/rollback, and so it will be recycled if + * we fail after this point. It is still locked by our backend so it + * won't go away yet. + * + * (We assume it's safe to do this without taking TwoPhaseStateLock.) + */ + gxact->valid = false; + + /* + * We have to remove any files that were supposed to be dropped. For + * consistency with the regular xact.c code paths, must do this before + * releasing locks, so do it before running the callbacks. + * + * NB: this code knows that we couldn't be dropping any temp rels ... + */ + if (isCommit) + { + delrels = commitrels; + ndelrels = hdr->ncommitrels; + } + else + { + delrels = abortrels; + ndelrels = hdr->nabortrels; + } + + /* Make sure files supposed to be dropped are dropped */ + DropRelationFiles(delrels, ndelrels, false); + + if (isCommit) + pgstat_execute_transactional_drops(hdr->ncommitstats, commitstats, false); + else + pgstat_execute_transactional_drops(hdr->nabortstats, abortstats, false); + + /* + * Handle cache invalidation messages. + * + * Relcache init file invalidation requires processing both before and + * after we send the SI messages, only when committing. See + * AtEOXact_Inval(). 
+ */ + if (isCommit) + { + if (hdr->initfileinval) + RelationCacheInitFilePreInvalidate(); + SendSharedInvalidMessages(invalmsgs, hdr->ninvalmsgs); + if (hdr->initfileinval) + RelationCacheInitFilePostInvalidate(); + } + + /* + * Acquire the two-phase lock. We want to work on the two-phase callbacks + * while holding it to avoid potential conflicts with other transactions + * attempting to use the same GID, so the lock is released once the shared + * memory state is cleared. + */ + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + + /* And now do the callbacks */ + if (isCommit) + ProcessRecords(bufptr, xid, twophase_postcommit_callbacks); + else + ProcessRecords(bufptr, xid, twophase_postabort_callbacks); + + PredicateLockTwoPhaseFinish(xid, isCommit); + + /* Clear shared memory state */ + RemoveGXact(gxact); + + /* + * Release the lock as all callbacks are called and shared memory cleanup + * is done. + */ + LWLockRelease(TwoPhaseStateLock); + + /* Count the prepared xact as committed or aborted */ + AtEOXact_PgStat(isCommit, false); + + /* + * And now we can clean up any files we may have left. + */ + if (gxact->ondisk) + RemoveTwoPhaseFile(xid, true); + + MyLockedGxact = NULL; + + RESUME_INTERRUPTS(); + + pfree(buf); +} + +/* + * Scan 2PC state data in memory and call the indicated callbacks for each 2PC record. + */ +static void +ProcessRecords(char *bufptr, TransactionId xid, + const TwoPhaseCallback callbacks[]) +{ + for (;;) + { + TwoPhaseRecordOnDisk *record = (TwoPhaseRecordOnDisk *) bufptr; + + Assert(record->rmid <= TWOPHASE_RM_MAX_ID); + if (record->rmid == TWOPHASE_RM_END_ID) + break; + + bufptr += MAXALIGN(sizeof(TwoPhaseRecordOnDisk)); + + if (callbacks[record->rmid] != NULL) + callbacks[record->rmid] (xid, record->info, + (void *) bufptr, record->len); + + bufptr += MAXALIGN(record->len); + } +} + +/* + * Remove the 2PC file for the specified XID. + * + * If giveWarning is false, do not complain about file-not-present; + * this is an expected case during WAL replay. + */ +static void +RemoveTwoPhaseFile(TransactionId xid, bool giveWarning) +{ + char path[MAXPGPATH]; + + TwoPhaseFilePath(path, xid); + if (unlink(path)) + if (errno != ENOENT || giveWarning) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", path))); +} + +/* + * Recreates a state file. This is used in WAL replay and during + * checkpoint creation. + * + * Note: content and len don't include CRC. 
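+ *
+ * A CRC-32C is computed over the content and appended here, matching what
+ * ReadTwoPhaseFile() expects to find at the end of the file.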
+ */ +static void +RecreateTwoPhaseFile(TransactionId xid, void *content, int len) +{ + char path[MAXPGPATH]; + pg_crc32c statefile_crc; + int fd; + + /* Recompute CRC */ + INIT_CRC32C(statefile_crc); + COMP_CRC32C(statefile_crc, content, len); + FIN_CRC32C(statefile_crc); + + TwoPhaseFilePath(path, xid); + + fd = OpenTransientFile(path, + O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not recreate file \"%s\": %m", path))); + + /* Write content and CRC */ + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_WRITE); + if (write(fd, content, len) != len) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", path))); + } + if (write(fd, &statefile_crc, sizeof(pg_crc32c)) != sizeof(pg_crc32c)) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", path))); + } + pgstat_report_wait_end(); + + /* + * We must fsync the file because the end-of-replay checkpoint will not do + * so, there being no GXACT in shared memory yet to tell it to. + */ + pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_SYNC); + if (pg_fsync(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", path))); + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); +} + +/* + * CheckPointTwoPhase -- handle 2PC component of checkpointing. + * + * We must fsync the state file of any GXACT that is valid or has been + * generated during redo and has a PREPARE LSN <= the checkpoint's redo + * horizon. (If the gxact isn't valid yet, has not been generated in + * redo, or has a later LSN, this checkpoint is not responsible for + * fsyncing it.) + * + * This is deliberately run as late as possible in the checkpoint sequence, + * because GXACTs ordinarily have short lifespans, and so it is quite + * possible that GXACTs that were valid at checkpoint start will no longer + * exist if we wait a little bit. With typical checkpoint settings this + * will be about 3 minutes for an online checkpoint, so as a result we + * expect that there will be no GXACTs that need to be copied to disk. + * + * If a GXACT remains valid across multiple checkpoints, it will already + * be on disk so we don't bother to repeat that write. + */ +void +CheckPointTwoPhase(XLogRecPtr redo_horizon) +{ + int i; + int serialized_xacts = 0; + + if (max_prepared_xacts <= 0) + return; /* nothing to do */ + + TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_START(); + + /* + * We are expecting there to be zero GXACTs that need to be copied to + * disk, so we perform all I/O while holding TwoPhaseStateLock for + * simplicity. This prevents any new xacts from preparing while this + * occurs, which shouldn't be a problem since the presence of long-lived + * prepared xacts indicates the transaction manager isn't active. + * + * It's also possible to move I/O out of the lock, but on every error we + * should check whether somebody committed our transaction in different + * backend. Let's leave this optimization for future, if somebody will + * spot that this place cause bottleneck. 
+ * + * Note that it isn't possible for there to be a GXACT with a + * prepare_end_lsn set prior to the last checkpoint yet is marked invalid, + * because of the efforts with delayChkptFlags. + */ + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + /* + * Note that we are using gxact not PGPROC so this works in recovery + * also + */ + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + + if ((gxact->valid || gxact->inredo) && + !gxact->ondisk && + gxact->prepare_end_lsn <= redo_horizon) + { + char *buf; + int len; + + XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, &len); + RecreateTwoPhaseFile(gxact->xid, buf, len); + gxact->ondisk = true; + gxact->prepare_start_lsn = InvalidXLogRecPtr; + gxact->prepare_end_lsn = InvalidXLogRecPtr; + pfree(buf); + serialized_xacts++; + } + } + LWLockRelease(TwoPhaseStateLock); + + /* + * Flush unconditionally the parent directory to make any information + * durable on disk. Two-phase files could have been removed and those + * removals need to be made persistent as well as any files newly created + * previously since the last checkpoint. + */ + fsync_fname(TWOPHASE_DIR, true); + + TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_DONE(); + + if (log_checkpoints && serialized_xacts > 0) + ereport(LOG, + (errmsg_plural("%u two-phase state file was written " + "for a long-running prepared transaction", + "%u two-phase state files were written " + "for long-running prepared transactions", + serialized_xacts, + serialized_xacts))); +} + +/* + * restoreTwoPhaseData + * + * Scan pg_twophase and fill TwoPhaseState depending on the on-disk data. + * This is called once at the beginning of recovery, saving any extra + * lookups in the future. Two-phase files that are newer than the + * minimum XID horizon are discarded on the way. + */ +void +restoreTwoPhaseData(void) +{ + DIR *cldir; + struct dirent *clde; + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + cldir = AllocateDir(TWOPHASE_DIR); + while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL) + { + if (strlen(clde->d_name) == 8 && + strspn(clde->d_name, "0123456789ABCDEF") == 8) + { + TransactionId xid; + char *buf; + + xid = (TransactionId) strtoul(clde->d_name, NULL, 16); + + buf = ProcessTwoPhaseBuffer(xid, InvalidXLogRecPtr, + true, false, false); + if (buf == NULL) + continue; + + PrepareRedoAdd(buf, InvalidXLogRecPtr, + InvalidXLogRecPtr, InvalidRepOriginId); + } + } + LWLockRelease(TwoPhaseStateLock); + FreeDir(cldir); +} + +/* + * PrescanPreparedTransactions + * + * Scan the shared memory entries of TwoPhaseState and determine the range + * of valid XIDs present. This is run during database startup, after we + * have completed reading WAL. ShmemVariableCache->nextXid has been set to + * one more than the highest XID for which evidence exists in WAL. + * + * We throw away any prepared xacts with main XID beyond nextXid --- if any + * are present, it suggests that the DBA has done a PITR recovery to an + * earlier point in time without cleaning out pg_twophase. We dare not + * try to recover such prepared xacts since they likely depend on database + * state that doesn't exist now. + * + * However, we will advance nextXid beyond any subxact XIDs belonging to + * valid prepared xacts. We need to do this since subxact commit doesn't + * write a WAL entry, and so there might be no evidence in WAL of those + * subxact XIDs. + * + * On corrupted two-phase files, fail immediately. 
Keeping around broken + * entries and let replay continue causes harm on the system, and a new + * backup should be rolled in. + * + * Our other responsibility is to determine and return the oldest valid XID + * among the prepared xacts (if none, return ShmemVariableCache->nextXid). + * This is needed to synchronize pg_subtrans startup properly. + * + * If xids_p and nxids_p are not NULL, pointer to a palloc'd array of all + * top-level xids is stored in *xids_p. The number of entries in the array + * is returned in *nxids_p. + */ +TransactionId +PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p) +{ + FullTransactionId nextXid = ShmemVariableCache->nextXid; + TransactionId origNextXid = XidFromFullTransactionId(nextXid); + TransactionId result = origNextXid; + TransactionId *xids = NULL; + int nxids = 0; + int allocsize = 0; + int i; + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + TransactionId xid; + char *buf; + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + + Assert(gxact->inredo); + + xid = gxact->xid; + + buf = ProcessTwoPhaseBuffer(xid, + gxact->prepare_start_lsn, + gxact->ondisk, false, true); + + if (buf == NULL) + continue; + + /* + * OK, we think this file is valid. Incorporate xid into the + * running-minimum result. + */ + if (TransactionIdPrecedes(xid, result)) + result = xid; + + if (xids_p) + { + if (nxids == allocsize) + { + if (nxids == 0) + { + allocsize = 10; + xids = palloc(allocsize * sizeof(TransactionId)); + } + else + { + allocsize = allocsize * 2; + xids = repalloc(xids, allocsize * sizeof(TransactionId)); + } + } + xids[nxids++] = xid; + } + + pfree(buf); + } + LWLockRelease(TwoPhaseStateLock); + + if (xids_p) + { + *xids_p = xids; + *nxids_p = nxids; + } + + return result; +} + +/* + * StandbyRecoverPreparedTransactions + * + * Scan the shared memory entries of TwoPhaseState and setup all the required + * information to allow standby queries to treat prepared transactions as still + * active. + * + * This is never called at the end of recovery - we use + * RecoverPreparedTransactions() at that point. + * + * The lack of calls to SubTransSetParent() calls here is by design; + * those calls are made by RecoverPreparedTransactions() at the end of recovery + * for those xacts that need this. + */ +void +StandbyRecoverPreparedTransactions(void) +{ + int i; + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + TransactionId xid; + char *buf; + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + + Assert(gxact->inredo); + + xid = gxact->xid; + + buf = ProcessTwoPhaseBuffer(xid, + gxact->prepare_start_lsn, + gxact->ondisk, false, false); + if (buf != NULL) + pfree(buf); + } + LWLockRelease(TwoPhaseStateLock); +} + +/* + * RecoverPreparedTransactions + * + * Scan the shared memory entries of TwoPhaseState and reload the state for + * each prepared transaction (reacquire locks, etc). + * + * This is run at the end of recovery, but before we allow backends to write + * WAL. + * + * At the end of recovery the way we take snapshots will change. We now need + * to mark all running transactions with their full SubTransSetParent() info + * to allow normal snapshots to work correctly if snapshots overflow. + * We do this here because by definition prepared transactions are the only + * type of write transaction still running, so this is necessary and + * complete. 
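+ *
+ * For each entry we re-create the dummy PGPROC and GXACT state via
+ * MarkAsPreparingGuts()/MarkAsPrepared(), then let the resource managers
+ * re-acquire locks and other state through twophase_recover_callbacks.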
+ */ +void +RecoverPreparedTransactions(void) +{ + int i; + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + TransactionId xid; + char *buf; + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + char *bufptr; + TwoPhaseFileHeader *hdr; + TransactionId *subxids; + const char *gid; + + xid = gxact->xid; + + /* + * Reconstruct subtrans state for the transaction --- needed because + * pg_subtrans is not preserved over a restart. Note that we are + * linking all the subtransactions directly to the top-level XID; + * there may originally have been a more complex hierarchy, but + * there's no need to restore that exactly. It's possible that + * SubTransSetParent has been set before, if the prepared transaction + * generated xid assignment records. + */ + buf = ProcessTwoPhaseBuffer(xid, + gxact->prepare_start_lsn, + gxact->ondisk, true, false); + if (buf == NULL) + continue; + + ereport(LOG, + (errmsg("recovering prepared transaction %u from shared memory", xid))); + + hdr = (TwoPhaseFileHeader *) buf; + Assert(TransactionIdEquals(hdr->xid, xid)); + bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader)); + gid = (const char *) bufptr; + bufptr += MAXALIGN(hdr->gidlen); + subxids = (TransactionId *) bufptr; + bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId)); + bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode)); + bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode)); + bufptr += MAXALIGN(hdr->ncommitstats * sizeof(xl_xact_stats_item)); + bufptr += MAXALIGN(hdr->nabortstats * sizeof(xl_xact_stats_item)); + bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage)); + + /* + * Recreate its GXACT and dummy PGPROC. But, check whether it was + * added in redo and already has a shmem entry for it. + */ + MarkAsPreparingGuts(gxact, xid, gid, + hdr->prepared_at, + hdr->owner, hdr->database); + + /* recovered, so reset the flag for entries generated by redo */ + gxact->inredo = false; + + GXactLoadSubxactData(gxact, hdr->nsubxacts, subxids); + MarkAsPrepared(gxact, true); + + LWLockRelease(TwoPhaseStateLock); + + /* + * Recover other state (notably locks) using resource managers. + */ + ProcessRecords(bufptr, xid, twophase_recover_callbacks); + + /* + * Release locks held by the standby process after we process each + * prepared transaction. As a result, we don't need too many + * additional locks at any one time. + */ + if (InHotStandby) + StandbyReleaseLockTree(xid, hdr->nsubxacts, subxids); + + /* + * We're done with recovering this transaction. Clear MyLockedGxact, + * like we do in PrepareTransaction() during normal operation. + */ + PostPrepare_Twophase(); + + pfree(buf); + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + } + + LWLockRelease(TwoPhaseStateLock); +} + +/* + * ProcessTwoPhaseBuffer + * + * Given a transaction id, read it either from disk or read it directly + * via shmem xlog record pointer using the provided "prepare_start_lsn". + * + * If setParent is true, set up subtransaction parent linkages. + * + * If setNextXid is true, set ShmemVariableCache->nextXid to the newest + * value scanned. 
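+ *
+ * Returns the palloc'd state data, or NULL (after discarding the stale
+ * entry) if the transaction has already committed/aborted or its XID is
+ * beyond nextXid.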
+ */ +static char * +ProcessTwoPhaseBuffer(TransactionId xid, + XLogRecPtr prepare_start_lsn, + bool fromdisk, + bool setParent, bool setNextXid) +{ + FullTransactionId nextXid = ShmemVariableCache->nextXid; + TransactionId origNextXid = XidFromFullTransactionId(nextXid); + TransactionId *subxids; + char *buf; + TwoPhaseFileHeader *hdr; + int i; + + Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); + + if (!fromdisk) + Assert(prepare_start_lsn != InvalidXLogRecPtr); + + /* Already processed? */ + if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) + { + if (fromdisk) + { + ereport(WARNING, + (errmsg("removing stale two-phase state file for transaction %u", + xid))); + RemoveTwoPhaseFile(xid, true); + } + else + { + ereport(WARNING, + (errmsg("removing stale two-phase state from memory for transaction %u", + xid))); + PrepareRedoRemove(xid, true); + } + return NULL; + } + + /* Reject XID if too new */ + if (TransactionIdFollowsOrEquals(xid, origNextXid)) + { + if (fromdisk) + { + ereport(WARNING, + (errmsg("removing future two-phase state file for transaction %u", + xid))); + RemoveTwoPhaseFile(xid, true); + } + else + { + ereport(WARNING, + (errmsg("removing future two-phase state from memory for transaction %u", + xid))); + PrepareRedoRemove(xid, true); + } + return NULL; + } + + if (fromdisk) + { + /* Read and validate file */ + buf = ReadTwoPhaseFile(xid, false); + } + else + { + /* Read xlog data */ + XlogReadTwoPhaseData(prepare_start_lsn, &buf, NULL); + } + + /* Deconstruct header */ + hdr = (TwoPhaseFileHeader *) buf; + if (!TransactionIdEquals(hdr->xid, xid)) + { + if (fromdisk) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted two-phase state file for transaction %u", + xid))); + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted two-phase state in memory for transaction %u", + xid))); + } + + /* + * Examine subtransaction XIDs ... they should all follow main XID, and + * they may force us to advance nextXid. + */ + subxids = (TransactionId *) (buf + + MAXALIGN(sizeof(TwoPhaseFileHeader)) + + MAXALIGN(hdr->gidlen)); + for (i = 0; i < hdr->nsubxacts; i++) + { + TransactionId subxid = subxids[i]; + + Assert(TransactionIdFollows(subxid, xid)); + + /* update nextXid if needed */ + if (setNextXid) + AdvanceNextFullTransactionIdPastXid(subxid); + + if (setParent) + SubTransSetParent(subxid, xid); + } + + return buf; +} + + +/* + * RecordTransactionCommitPrepared + * + * This is basically the same as RecordTransactionCommit (q.v. if you change + * this function): in particular, we must set DELAY_CHKPT_START to avoid a + * race condition. + * + * We know the transaction made at least one XLOG entry (its PREPARE), + * so it is never possible to optimize out the commit record. + */ +static void +RecordTransactionCommitPrepared(TransactionId xid, + int nchildren, + TransactionId *children, + int nrels, + RelFileNode *rels, + int nstats, + xl_xact_stats_item *stats, + int ninvalmsgs, + SharedInvalidationMessage *invalmsgs, + bool initfileinval, + const char *gid) +{ + XLogRecPtr recptr; + TimestampTz committs = GetCurrentTimestamp(); + bool replorigin; + + /* + * Are we using the replication origins feature? Or, in other words, are + * we replaying remote actions? 
+ */ + replorigin = (replorigin_session_origin != InvalidRepOriginId && + replorigin_session_origin != DoNotReplicateId); + + START_CRIT_SECTION(); + + /* See notes in RecordTransactionCommit */ + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + + /* + * Emit the XLOG commit record. Note that we mark 2PC commits as + * potentially having AccessExclusiveLocks since we don't know whether or + * not they do. + */ + recptr = XactLogCommitRecord(committs, + nchildren, children, nrels, rels, + nstats, stats, + ninvalmsgs, invalmsgs, + initfileinval, + MyXactFlags | XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK, + xid, gid); + + + if (replorigin) + /* Move LSNs forward for this replication origin */ + replorigin_session_advance(replorigin_session_origin_lsn, + XactLastRecEnd); + + /* + * Record commit timestamp. The value comes from plain commit timestamp + * if replorigin is not enabled, or replorigin already set a value for us + * in replorigin_session_origin_timestamp otherwise. + * + * We don't need to WAL-log anything here, as the commit record written + * above already contains the data. + */ + if (!replorigin || replorigin_session_origin_timestamp == 0) + replorigin_session_origin_timestamp = committs; + + TransactionTreeSetCommitTsData(xid, nchildren, children, + replorigin_session_origin_timestamp, + replorigin_session_origin); + + /* + * We don't currently try to sleep before flush here ... nor is there any + * support for async commit of a prepared xact (the very idea is probably + * a contradiction) + */ + + /* Flush XLOG to disk */ + XLogFlush(recptr); + + /* Mark the transaction committed in pg_xact */ + TransactionIdCommitTree(xid, nchildren, children); + + /* Checkpoint can proceed now */ + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + + END_CRIT_SECTION(); + + /* + * Wait for synchronous replication, if required. + * + * Note that at this stage we have marked clog, but still show as running + * in the procarray and continue to hold locks. + */ + SyncRepWaitForLSN(recptr, true); +} + +/* + * RecordTransactionAbortPrepared + * + * This is basically the same as RecordTransactionAbort. + * + * We know the transaction made at least one XLOG entry (its PREPARE), + * so it is never possible to optimize out the abort record. + */ +static void +RecordTransactionAbortPrepared(TransactionId xid, + int nchildren, + TransactionId *children, + int nrels, + RelFileNode *rels, + int nstats, + xl_xact_stats_item *stats, + const char *gid) +{ + XLogRecPtr recptr; + bool replorigin; + + /* + * Are we using the replication origins feature? Or, in other words, are + * we replaying remote actions? + */ + replorigin = (replorigin_session_origin != InvalidRepOriginId && + replorigin_session_origin != DoNotReplicateId); + + /* + * Catch the scenario where we aborted partway through + * RecordTransactionCommitPrepared ... + */ + if (TransactionIdDidCommit(xid)) + elog(PANIC, "cannot abort transaction %u, it was already committed", + xid); + + START_CRIT_SECTION(); + + /* + * Emit the XLOG commit record. Note that we mark 2PC aborts as + * potentially having AccessExclusiveLocks since we don't know whether or + * not they do. 
+ */ + recptr = XactLogAbortRecord(GetCurrentTimestamp(), + nchildren, children, + nrels, rels, + nstats, stats, + MyXactFlags | XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK, + xid, gid); + + if (replorigin) + /* Move LSNs forward for this replication origin */ + replorigin_session_advance(replorigin_session_origin_lsn, + XactLastRecEnd); + + /* Always flush, since we're about to remove the 2PC state file */ + XLogFlush(recptr); + + /* + * Mark the transaction aborted in clog. This is not absolutely necessary + * but we may as well do it while we are here. + */ + TransactionIdAbortTree(xid, nchildren, children); + + END_CRIT_SECTION(); + + /* + * Wait for synchronous replication, if required. + * + * Note that at this stage we have marked clog, but still show as running + * in the procarray and continue to hold locks. + */ + SyncRepWaitForLSN(recptr, false); +} + +/* + * PrepareRedoAdd + * + * Store pointers to the start/end of the WAL record along with the xid in + * a gxact entry in shared memory TwoPhaseState structure. If caller + * specifies InvalidXLogRecPtr as WAL location to fetch the two-phase + * data, the entry is marked as located on disk. + */ +void +PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, + XLogRecPtr end_lsn, RepOriginId origin_id) +{ + TwoPhaseFileHeader *hdr = (TwoPhaseFileHeader *) buf; + char *bufptr; + const char *gid; + GlobalTransaction gxact; + + Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); + Assert(RecoveryInProgress()); + + bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader)); + gid = (const char *) bufptr; + + /* + * Reserve the GID for the given transaction in the redo code path. + * + * This creates a gxact struct and puts it into the active array. + * + * In redo, this struct is mainly used to track PREPARE/COMMIT entries in + * shared memory. Hence, we only fill up the bare minimum contents here. + * The gxact also gets marked with gxact->inredo set to true to indicate + * that it got added in the redo phase + */ + + /* + * In the event of a crash while a checkpoint was running, it may be + * possible that some two-phase data found its way to disk while its + * corresponding record needs to be replayed in the follow-up recovery. As + * the 2PC data was on disk, it has already been restored at the beginning + * of recovery with restoreTwoPhaseData(), so skip this record to avoid + * duplicates in TwoPhaseState. If a consistent state has been reached, + * the record is added to TwoPhaseState and it should have no + * corresponding file in pg_twophase. + */ + if (!XLogRecPtrIsInvalid(start_lsn)) + { + char path[MAXPGPATH]; + + TwoPhaseFilePath(path, hdr->xid); + + if (access(path, F_OK) == 0) + { + ereport(reachedConsistency ? 
ERROR : WARNING, + (errmsg("could not recover two-phase state file for transaction %u", + hdr->xid), + errdetail("Two-phase state file has been found in WAL record %X/%X, but this transaction has already been restored from disk.", + LSN_FORMAT_ARGS(start_lsn)))); + return; + } + + if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access file \"%s\": %m", path))); + } + + /* Get a free gxact from the freelist */ + if (TwoPhaseState->freeGXacts == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("maximum number of prepared transactions reached"), + errhint("Increase max_prepared_transactions (currently %d).", + max_prepared_xacts))); + gxact = TwoPhaseState->freeGXacts; + TwoPhaseState->freeGXacts = gxact->next; + + gxact->prepared_at = hdr->prepared_at; + gxact->prepare_start_lsn = start_lsn; + gxact->prepare_end_lsn = end_lsn; + gxact->xid = hdr->xid; + gxact->owner = hdr->owner; + gxact->locking_backend = InvalidBackendId; + gxact->valid = false; + gxact->ondisk = XLogRecPtrIsInvalid(start_lsn); + gxact->inredo = true; /* yes, added in redo */ + strcpy(gxact->gid, gid); + + /* And insert it into the active array */ + Assert(TwoPhaseState->numPrepXacts < max_prepared_xacts); + TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts++] = gxact; + + if (origin_id != InvalidRepOriginId) + { + /* recover apply progress */ + replorigin_advance(origin_id, hdr->origin_lsn, end_lsn, + false /* backward */ , false /* WAL */ ); + } + + elog(DEBUG2, "added 2PC data in shared memory for transaction %u", gxact->xid); +} + +/* + * PrepareRedoRemove + * + * Remove the corresponding gxact entry from TwoPhaseState. Also remove + * the 2PC file if a prepared transaction was saved via an earlier checkpoint. + * + * Caller must hold TwoPhaseStateLock in exclusive mode, because TwoPhaseState + * is updated. + */ +void +PrepareRedoRemove(TransactionId xid, bool giveWarning) +{ + GlobalTransaction gxact = NULL; + int i; + bool found = false; + + Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); + Assert(RecoveryInProgress()); + + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + gxact = TwoPhaseState->prepXacts[i]; + + if (gxact->xid == xid) + { + Assert(gxact->inredo); + found = true; + break; + } + } + + /* + * Just leave if there is nothing, this is expected during WAL replay. + */ + if (!found) + return; + + /* + * And now we can clean up any files we may have left. + */ + elog(DEBUG2, "removing 2PC data for transaction %u", xid); + if (gxact->ondisk) + RemoveTwoPhaseFile(xid, giveWarning); + RemoveGXact(gxact); +} + +/* + * LookupGXact + * Check if the prepared transaction with the given GID, lsn and timestamp + * exists. + * + * Note that we always compare with the LSN where prepare ends because that is + * what is stored as origin_lsn in the 2PC file. + * + * This function is primarily used to check if the prepared transaction + * received from the upstream (remote node) already exists. Checking only GID + * is not sufficient because a different prepared xact with the same GID can + * exist on the same node. So, we are ensuring to match origin_lsn and + * origin_timestamp of prepared xact to avoid the possibility of a match of + * prepared xact from two different nodes. 
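+ *
+ * Illustrative match condition (a restatement of the checks performed
+ * below, not additional upstream logic): an entry is treated as the same
+ * prepared transaction only when all of the following hold:
+ *
+ *     strcmp(gxact->gid, gid) == 0
+ *     hdr->origin_lsn == prepare_end_lsn
+ *     hdr->origin_timestamp == origin_prepare_timestamp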
+ */ +bool +LookupGXact(const char *gid, XLogRecPtr prepare_end_lsn, + TimestampTz origin_prepare_timestamp) +{ + int i; + bool found = false; + + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + + /* Ignore not-yet-valid GIDs. */ + if (gxact->valid && strcmp(gxact->gid, gid) == 0) + { + char *buf; + TwoPhaseFileHeader *hdr; + + /* + * We are not expecting collisions of GXACTs (same gid) between + * publisher and subscribers, so we perform all I/O while holding + * TwoPhaseStateLock for simplicity. + * + * To move the I/O out of the lock, we need to ensure that no + * other backend commits the prepared xact in the meantime. We can + * do this optimization if we encounter many collisions in GID + * between publisher and subscriber. + */ + if (gxact->ondisk) + buf = ReadTwoPhaseFile(gxact->xid, false); + else + { + Assert(gxact->prepare_start_lsn); + XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL); + } + + hdr = (TwoPhaseFileHeader *) buf; + + if (hdr->origin_lsn == prepare_end_lsn && + hdr->origin_timestamp == origin_prepare_timestamp) + { + found = true; + pfree(buf); + break; + } + + pfree(buf); + } + } + LWLockRelease(TwoPhaseStateLock); + return found; +} diff --git a/src/backend/access/transam/twophase_rmgr.c b/src/backend/access/transam/twophase_rmgr.c new file mode 100644 index 0000000..35a9b32 --- /dev/null +++ b/src/backend/access/transam/twophase_rmgr.c @@ -0,0 +1,58 @@ +/*------------------------------------------------------------------------- + * + * twophase_rmgr.c + * Two-phase-commit resource managers tables + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/transam/twophase_rmgr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/multixact.h" +#include "access/twophase_rmgr.h" +#include "pgstat.h" +#include "storage/lock.h" +#include "storage/predicate.h" + + +const TwoPhaseCallback twophase_recover_callbacks[TWOPHASE_RM_MAX_ID + 1] = +{ + NULL, /* END ID */ + lock_twophase_recover, /* Lock */ + NULL, /* pgstat */ + multixact_twophase_recover, /* MultiXact */ + predicatelock_twophase_recover /* PredicateLock */ +}; + +const TwoPhaseCallback twophase_postcommit_callbacks[TWOPHASE_RM_MAX_ID + 1] = +{ + NULL, /* END ID */ + lock_twophase_postcommit, /* Lock */ + pgstat_twophase_postcommit, /* pgstat */ + multixact_twophase_postcommit, /* MultiXact */ + NULL /* PredicateLock */ +}; + +const TwoPhaseCallback twophase_postabort_callbacks[TWOPHASE_RM_MAX_ID + 1] = +{ + NULL, /* END ID */ + lock_twophase_postabort, /* Lock */ + pgstat_twophase_postabort, /* pgstat */ + multixact_twophase_postabort, /* MultiXact */ + NULL /* PredicateLock */ +}; + +const TwoPhaseCallback twophase_standby_recover_callbacks[TWOPHASE_RM_MAX_ID + 1] = +{ + NULL, /* END ID */ + lock_twophase_standby_recover, /* Lock */ + NULL, /* pgstat */ + NULL, /* MultiXact */ + NULL /* PredicateLock */ +}; diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c new file mode 100644 index 0000000..748120a --- /dev/null +++ b/src/backend/access/transam/varsup.c @@ -0,0 +1,678 @@ +/*------------------------------------------------------------------------- + * + * varsup.c + * postgres OID & XID variables support routines + * + * Copyright (c) 
2000-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/transam/varsup.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/clog.h" +#include "access/commit_ts.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xlogutils.h" +#include "commands/dbcommands.h" +#include "miscadmin.h" +#include "postmaster/autovacuum.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "utils/syscache.h" + + +/* Number of OIDs to prefetch (preallocate) per XLOG write */ +#define VAR_OID_PREFETCH 8192 + +/* pointer to "variable cache" in shared memory (set up by shmem.c) */ +VariableCache ShmemVariableCache = NULL; + + +/* + * Allocate the next FullTransactionId for a new transaction or + * subtransaction. + * + * The new XID is also stored into MyProc->xid/ProcGlobal->xids[] before + * returning. + * + * Note: when this is called, we are actually already inside a valid + * transaction, since XIDs are now not allocated until the transaction + * does something. So it is safe to do a database lookup if we want to + * issue a warning about XID wrap. + */ +FullTransactionId +GetNewTransactionId(bool isSubXact) +{ + FullTransactionId full_xid; + TransactionId xid; + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for new XIDs after that point. + */ + if (IsInParallelMode()) + elog(ERROR, "cannot assign TransactionIds during a parallel operation"); + + /* + * During bootstrap initialization, we return the special bootstrap + * transaction id. + */ + if (IsBootstrapProcessingMode()) + { + Assert(!isSubXact); + MyProc->xid = BootstrapTransactionId; + ProcGlobal->xids[MyProc->pgxactoff] = BootstrapTransactionId; + return FullTransactionIdFromEpochAndXid(0, BootstrapTransactionId); + } + + /* safety check, we should never get this far in a HS standby */ + if (RecoveryInProgress()) + elog(ERROR, "cannot assign TransactionIds during recovery"); + + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + + full_xid = ShmemVariableCache->nextXid; + xid = XidFromFullTransactionId(full_xid); + + /*---------- + * Check to see if it's safe to assign another XID. This protects against + * catastrophic data loss due to XID wraparound. The basic rules are: + * + * If we're past xidVacLimit, start trying to force autovacuum cycles. + * If we're past xidWarnLimit, start issuing warnings. + * If we're past xidStopLimit, refuse to execute transactions, unless + * we are running in single-user mode (which gives an escape hatch + * to the DBA who somehow got past the earlier defenses). + * + * Note that this coding also appears in GetNewMultiXactId. + *---------- + */ + if (TransactionIdFollowsOrEquals(xid, ShmemVariableCache->xidVacLimit)) + { + /* + * For safety's sake, we release XidGenLock while sending signals, + * warnings, etc. This is not so much because we care about + * preserving concurrency in this situation, as to avoid any + * possibility of deadlock while doing get_database_name(). First, + * copy all the shared values we'll need in this path. 
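+ *
+ * For orientation (a summary of the limits computed in
+ * SetTransactionIdLimit(), not additional upstream logic), the thresholds
+ * are ordered in circular XID space as
+ *
+ *     xidVacLimit < xidWarnLimit < xidStopLimit < xidWrapLimit
+ *
+ * so autovacuum is requested first, warnings come next, and refusing new
+ * XIDs is the last line of defense before wraparound.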
+ */ + TransactionId xidWarnLimit = ShmemVariableCache->xidWarnLimit; + TransactionId xidStopLimit = ShmemVariableCache->xidStopLimit; + TransactionId xidWrapLimit = ShmemVariableCache->xidWrapLimit; + Oid oldest_datoid = ShmemVariableCache->oldestXidDB; + + LWLockRelease(XidGenLock); + + /* + * To avoid swamping the postmaster with signals, we issue the autovac + * request only once per 64K transaction starts. This still gives + * plenty of chances before we get into real trouble. + */ + if (IsUnderPostmaster && (xid % 65536) == 0) + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + if (IsUnderPostmaster && + TransactionIdFollowsOrEquals(xid, xidStopLimit)) + { + char *oldest_datname = get_database_name(oldest_datoid); + + /* complain even if that DB has disappeared */ + if (oldest_datname) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database is not accepting commands to avoid wraparound data loss in database \"%s\"", + oldest_datname), + errhint("Stop the postmaster and vacuum that database in single-user mode.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + else + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database is not accepting commands to avoid wraparound data loss in database with OID %u", + oldest_datoid), + errhint("Stop the postmaster and vacuum that database in single-user mode.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + } + else if (TransactionIdFollowsOrEquals(xid, xidWarnLimit)) + { + char *oldest_datname = get_database_name(oldest_datoid); + + /* complain even if that DB has disappeared */ + if (oldest_datname) + ereport(WARNING, + (errmsg("database \"%s\" must be vacuumed within %u transactions", + oldest_datname, + xidWrapLimit - xid), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + else + ereport(WARNING, + (errmsg("database with OID %u must be vacuumed within %u transactions", + oldest_datoid, + xidWrapLimit - xid), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + } + + /* Re-acquire lock and start over */ + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + full_xid = ShmemVariableCache->nextXid; + xid = XidFromFullTransactionId(full_xid); + } + + /* + * If we are allocating the first XID of a new page of the commit log, + * zero out that commit-log page before returning. We must do this while + * holding XidGenLock, else another xact could acquire and commit a later + * XID before we zero the page. Fortunately, a page of the commit log + * holds 32K or more transactions, so we don't have to do this very often. + * + * Extend pg_subtrans and pg_commit_ts too. + */ + ExtendCLOG(xid); + ExtendCommitTs(xid); + ExtendSUBTRANS(xid); + + /* + * Now advance the nextXid counter. This must not happen until after we + * have successfully completed ExtendCLOG() --- if that routine fails, we + * want the next incoming transaction to try it again. We cannot assign + * more XIDs until there is CLOG space for them. + */ + FullTransactionIdAdvance(&ShmemVariableCache->nextXid); + + /* + * We must store the new XID into the shared ProcArray before releasing + * XidGenLock. 
This ensures that every active XID older than + * latestCompletedXid is present in the ProcArray, which is essential for + * correct OldestXmin tracking; see src/backend/access/transam/README. + * + * Note that readers of ProcGlobal->xids/PGPROC->xid should be careful to + * fetch the value for each proc only once, rather than assume they can + * read a value multiple times and get the same answer each time. Note we + * are assuming that TransactionId and int fetch/store are atomic. + * + * The same comments apply to the subxact xid count and overflow fields. + * + * Use of a write barrier prevents dangerous code rearrangement in this + * function; other backends could otherwise e.g. be examining my subxids + * info concurrently, and we don't want them to see an invalid + * intermediate state, such as an incremented nxids before the array entry + * is filled. + * + * Other processes that read nxids should do so before reading xids + * elements with a pg_read_barrier() in between, so that they can be sure + * not to read an uninitialized array element; see + * src/backend/storage/lmgr/README.barrier. + * + * If there's no room to fit a subtransaction XID into PGPROC, set the + * cache-overflowed flag instead. This forces readers to look in + * pg_subtrans to map subtransaction XIDs up to top-level XIDs. There is a + * race-condition window, in that the new XID will not appear as running + * until its parent link has been placed into pg_subtrans. However, that + * will happen before anyone could possibly have a reason to inquire about + * the status of the XID, so it seems OK. (Snapshots taken during this + * window *will* include the parent XID, so they will deliver the correct + * answer later on when someone does have a reason to inquire.) + */ + if (!isSubXact) + { + Assert(ProcGlobal->subxidStates[MyProc->pgxactoff].count == 0); + Assert(!ProcGlobal->subxidStates[MyProc->pgxactoff].overflowed); + Assert(MyProc->subxidStatus.count == 0); + Assert(!MyProc->subxidStatus.overflowed); + + /* LWLockRelease acts as barrier */ + MyProc->xid = xid; + ProcGlobal->xids[MyProc->pgxactoff] = xid; + } + else + { + XidCacheStatus *substat = &ProcGlobal->subxidStates[MyProc->pgxactoff]; + int nxids = MyProc->subxidStatus.count; + + Assert(substat->count == MyProc->subxidStatus.count); + Assert(substat->overflowed == MyProc->subxidStatus.overflowed); + + if (nxids < PGPROC_MAX_CACHED_SUBXIDS) + { + MyProc->subxids.xids[nxids] = xid; + pg_write_barrier(); + MyProc->subxidStatus.count = substat->count = nxids + 1; + } + else + MyProc->subxidStatus.overflowed = substat->overflowed = true; + } + + LWLockRelease(XidGenLock); + + return full_xid; +} + +/* + * Read nextXid but don't allocate it. + */ +FullTransactionId +ReadNextFullTransactionId(void) +{ + FullTransactionId fullXid; + + LWLockAcquire(XidGenLock, LW_SHARED); + fullXid = ShmemVariableCache->nextXid; + LWLockRelease(XidGenLock); + + return fullXid; +} + +/* + * Advance nextXid to the value after a given xid. The epoch is inferred. + * This must only be called during recovery or from two-phase start-up code. + */ +void +AdvanceNextFullTransactionIdPastXid(TransactionId xid) +{ + FullTransactionId newNextFullXid; + TransactionId next_xid; + uint32 epoch; + + /* + * It is safe to read nextXid without a lock, because this is only called + * from the startup process or single-process mode, meaning that no other + * process can modify it. 
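+ *
+ * Illustrative caller (not additional upstream code): during two-phase
+ * start-up, ProcessTwoPhaseBuffer() calls this for each subtransaction
+ * XID found in a 2PC state file when setNextXid is true, so that nextXid
+ * ends up past all such XIDs.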
+ */ + Assert(AmStartupProcess() || !IsUnderPostmaster); + + /* Fast return if this isn't an xid high enough to move the needle. */ + next_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + if (!TransactionIdFollowsOrEquals(xid, next_xid)) + return; + + /* + * Compute the FullTransactionId that comes after the given xid. To do + * this, we preserve the existing epoch, but detect when we've wrapped + * into a new epoch. This is necessary because WAL records and 2PC state + * currently contain 32 bit xids. The wrap logic is safe in those cases + * because the span of active xids cannot exceed one epoch at any given + * point in the WAL stream. + */ + TransactionIdAdvance(xid); + epoch = EpochFromFullTransactionId(ShmemVariableCache->nextXid); + if (unlikely(xid < next_xid)) + ++epoch; + newNextFullXid = FullTransactionIdFromEpochAndXid(epoch, xid); + + /* + * We still need to take a lock to modify the value when there are + * concurrent readers. + */ + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextXid = newNextFullXid; + LWLockRelease(XidGenLock); +} + +/* + * Advance the cluster-wide value for the oldest valid clog entry. + * + * We must acquire XactTruncationLock to advance the oldestClogXid. It's not + * necessary to hold the lock during the actual clog truncation, only when we + * advance the limit, as code looking up arbitrary xids is required to hold + * XactTruncationLock from when it tests oldestClogXid through to when it + * completes the clog lookup. + */ +void +AdvanceOldestClogXid(TransactionId oldest_datfrozenxid) +{ + LWLockAcquire(XactTruncationLock, LW_EXCLUSIVE); + if (TransactionIdPrecedes(ShmemVariableCache->oldestClogXid, + oldest_datfrozenxid)) + { + ShmemVariableCache->oldestClogXid = oldest_datfrozenxid; + } + LWLockRelease(XactTruncationLock); +} + +/* + * Determine the last safe XID to allocate using the currently oldest + * datfrozenxid (ie, the oldest XID that might exist in any database + * of our cluster), and the OID of the (or a) database with that value. + */ +void +SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid) +{ + TransactionId xidVacLimit; + TransactionId xidWarnLimit; + TransactionId xidStopLimit; + TransactionId xidWrapLimit; + TransactionId curXid; + + Assert(TransactionIdIsNormal(oldest_datfrozenxid)); + + /* + * The place where we actually get into deep trouble is halfway around + * from the oldest potentially-existing XID. (This calculation is + * probably off by one or two counts, because the special XIDs reduce the + * size of the loop a little bit. But we throw in plenty of slop below, + * so it doesn't matter.) + */ + xidWrapLimit = oldest_datfrozenxid + (MaxTransactionId >> 1); + if (xidWrapLimit < FirstNormalTransactionId) + xidWrapLimit += FirstNormalTransactionId; + + /* + * We'll refuse to continue assigning XIDs in interactive mode once we get + * within 3M transactions of data loss. This leaves lots of room for the + * DBA to fool around fixing things in a standalone backend, while not + * being significant compared to total XID space. (VACUUM requires an XID + * if it truncates at wal_level!=minimal. "VACUUM (ANALYZE)", which a DBA + * might do by reflex, assigns an XID. Hence, we had better be sure + * there's lots of XIDs left...) Also, at default BLCKSZ, this leaves two + * completely-idle segments. In the event of edge-case bugs involving + * page or segment arithmetic, idle segments render the bugs unreachable + * outside of single-user mode. 
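+ *
+ * Worked example (hypothetical numbers, for illustration only): if
+ * oldest_datfrozenxid were 1,000,000, the limits computed here would be
+ *
+ *     xidWrapLimit = 1,000,000 + (MaxTransactionId >> 1)   (~2.15 billion)
+ *     xidStopLimit = xidWrapLimit - 3,000,000
+ *     xidWarnLimit = xidWrapLimit - 40,000,000
+ *     xidVacLimit  = 1,000,000 + autovacuum_freeze_max_age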
+ */ + xidStopLimit = xidWrapLimit - 3000000; + if (xidStopLimit < FirstNormalTransactionId) + xidStopLimit -= FirstNormalTransactionId; + + /* + * We'll start complaining loudly when we get within 40M transactions of + * data loss. This is kind of arbitrary, but if you let your gas gauge + * get down to 2% of full, would you be looking for the next gas station? + * We need to be fairly liberal about this number because there are lots + * of scenarios where most transactions are done by automatic clients that + * won't pay attention to warnings. (No, we're not gonna make this + * configurable. If you know enough to configure it, you know enough to + * not get in this kind of trouble in the first place.) + */ + xidWarnLimit = xidWrapLimit - 40000000; + if (xidWarnLimit < FirstNormalTransactionId) + xidWarnLimit -= FirstNormalTransactionId; + + /* + * We'll start trying to force autovacuums when oldest_datfrozenxid gets + * to be more than autovacuum_freeze_max_age transactions old. + * + * Note: guc.c ensures that autovacuum_freeze_max_age is in a sane range, + * so that xidVacLimit will be well before xidWarnLimit. + * + * Note: autovacuum_freeze_max_age is a PGC_POSTMASTER parameter so that + * we don't have to worry about dealing with on-the-fly changes in its + * value. It doesn't look practical to update shared state from a GUC + * assign hook (too many processes would try to execute the hook, + * resulting in race conditions as well as crashes of those not connected + * to shared memory). Perhaps this can be improved someday. See also + * SetMultiXactIdLimit. + */ + xidVacLimit = oldest_datfrozenxid + autovacuum_freeze_max_age; + if (xidVacLimit < FirstNormalTransactionId) + xidVacLimit += FirstNormalTransactionId; + + /* Grab lock for just long enough to set the new limit values */ + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + ShmemVariableCache->oldestXid = oldest_datfrozenxid; + ShmemVariableCache->xidVacLimit = xidVacLimit; + ShmemVariableCache->xidWarnLimit = xidWarnLimit; + ShmemVariableCache->xidStopLimit = xidStopLimit; + ShmemVariableCache->xidWrapLimit = xidWrapLimit; + ShmemVariableCache->oldestXidDB = oldest_datoid; + curXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + LWLockRelease(XidGenLock); + + /* Log the info */ + ereport(DEBUG1, + (errmsg_internal("transaction ID wrap limit is %u, limited by database with OID %u", + xidWrapLimit, oldest_datoid))); + + /* + * If past the autovacuum force point, immediately signal an autovac + * request. The reason for this is that autovac only processes one + * database per invocation. Once it's finished cleaning up the oldest + * database, it'll call here, and we'll signal the postmaster to start + * another iteration immediately if there are still any old databases. + */ + if (TransactionIdFollowsOrEquals(curXid, xidVacLimit) && + IsUnderPostmaster && !InRecovery) + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + /* Give an immediate warning if past the wrap warn point */ + if (TransactionIdFollowsOrEquals(curXid, xidWarnLimit) && !InRecovery) + { + char *oldest_datname; + + /* + * We can be called when not inside a transaction, for example during + * StartupXLOG(). In such a case we cannot do database access, so we + * must just report the oldest DB's OID. + * + * Note: it's also possible that get_database_name fails and returns + * NULL, for example because the database just got dropped. We'll + * still warn, even though the warning might now be unnecessary. 
+ */ + if (IsTransactionState()) + oldest_datname = get_database_name(oldest_datoid); + else + oldest_datname = NULL; + + if (oldest_datname) + ereport(WARNING, + (errmsg("database \"%s\" must be vacuumed within %u transactions", + oldest_datname, + xidWrapLimit - curXid), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + else + ereport(WARNING, + (errmsg("database with OID %u must be vacuumed within %u transactions", + oldest_datoid, + xidWrapLimit - curXid), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + } +} + + +/* + * ForceTransactionIdLimitUpdate -- does the XID wrap-limit data need updating? + * + * We primarily check whether oldestXidDB is valid. The cases we have in + * mind are that that database was dropped, or the field was reset to zero + * by pg_resetwal. In either case we should force recalculation of the + * wrap limit. Also do it if oldestXid is old enough to be forcing + * autovacuums or other actions; this ensures we update our state as soon + * as possible once extra overhead is being incurred. + */ +bool +ForceTransactionIdLimitUpdate(void) +{ + TransactionId nextXid; + TransactionId xidVacLimit; + TransactionId oldestXid; + Oid oldestXidDB; + + /* Locking is probably not really necessary, but let's be careful */ + LWLockAcquire(XidGenLock, LW_SHARED); + nextXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + xidVacLimit = ShmemVariableCache->xidVacLimit; + oldestXid = ShmemVariableCache->oldestXid; + oldestXidDB = ShmemVariableCache->oldestXidDB; + LWLockRelease(XidGenLock); + + if (!TransactionIdIsNormal(oldestXid)) + return true; /* shouldn't happen, but just in case */ + if (!TransactionIdIsValid(xidVacLimit)) + return true; /* this shouldn't happen anymore either */ + if (TransactionIdFollowsOrEquals(nextXid, xidVacLimit)) + return true; /* past xidVacLimit, don't delay updating */ + if (!SearchSysCacheExists1(DATABASEOID, ObjectIdGetDatum(oldestXidDB))) + return true; /* could happen, per comments above */ + return false; +} + + +/* + * GetNewObjectId -- allocate a new OID + * + * OIDs are generated by a cluster-wide counter. Since they are only 32 bits + * wide, counter wraparound will occur eventually, and therefore it is unwise + * to assume they are unique unless precautions are taken to make them so. + * Hence, this routine should generally not be used directly. The only direct + * callers should be GetNewOidWithIndex() and GetNewRelFileNode() in + * catalog/catalog.c. + */ +Oid +GetNewObjectId(void) +{ + Oid result; + + /* safety check, we should never get this far in a HS standby */ + if (RecoveryInProgress()) + elog(ERROR, "cannot assign OIDs during recovery"); + + LWLockAcquire(OidGenLock, LW_EXCLUSIVE); + + /* + * Check for wraparound of the OID counter. We *must* not return 0 + * (InvalidOid), and in normal operation we mustn't return anything below + * FirstNormalObjectId since that range is reserved for initdb (see + * IsCatalogRelationOid()). Note we are relying on unsigned comparison. + * + * During initdb, we start the OID generator at FirstGenbkiObjectId, so we + * only wrap if before that point when in bootstrap or standalone mode. 
+ * The first time through this routine after normal postmaster start, the + * counter will be forced up to FirstNormalObjectId. This mechanism + * leaves the OIDs between FirstGenbkiObjectId and FirstNormalObjectId + * available for automatic assignment during initdb, while ensuring they + * will never conflict with user-assigned OIDs. + */ + if (ShmemVariableCache->nextOid < ((Oid) FirstNormalObjectId)) + { + if (IsPostmasterEnvironment) + { + /* wraparound, or first post-initdb assignment, in normal mode */ + ShmemVariableCache->nextOid = FirstNormalObjectId; + ShmemVariableCache->oidCount = 0; + } + else + { + /* we may be bootstrapping, so don't enforce the full range */ + if (ShmemVariableCache->nextOid < ((Oid) FirstGenbkiObjectId)) + { + /* wraparound in standalone mode (unlikely but possible) */ + ShmemVariableCache->nextOid = FirstNormalObjectId; + ShmemVariableCache->oidCount = 0; + } + } + } + + /* If we run out of logged for use oids then we must log more */ + if (ShmemVariableCache->oidCount == 0) + { + XLogPutNextOid(ShmemVariableCache->nextOid + VAR_OID_PREFETCH); + ShmemVariableCache->oidCount = VAR_OID_PREFETCH; + } + + result = ShmemVariableCache->nextOid; + + (ShmemVariableCache->nextOid)++; + (ShmemVariableCache->oidCount)--; + + LWLockRelease(OidGenLock); + + return result; +} + +/* + * SetNextObjectId + * + * This may only be called during initdb; it advances the OID counter + * to the specified value. + */ +static void +SetNextObjectId(Oid nextOid) +{ + /* Safety check, this is only allowable during initdb */ + if (IsPostmasterEnvironment) + elog(ERROR, "cannot advance OID counter anymore"); + + /* Taking the lock is, therefore, just pro forma; but do it anyway */ + LWLockAcquire(OidGenLock, LW_EXCLUSIVE); + + if (ShmemVariableCache->nextOid > nextOid) + elog(ERROR, "too late to advance OID counter to %u, it is now %u", + nextOid, ShmemVariableCache->nextOid); + + ShmemVariableCache->nextOid = nextOid; + ShmemVariableCache->oidCount = 0; + + LWLockRelease(OidGenLock); +} + +/* + * StopGeneratingPinnedObjectIds + * + * This is called once during initdb to force the OID counter up to + * FirstUnpinnedObjectId. This supports letting initdb's post-bootstrap + * processing create some pinned objects early on. Once it's done doing + * so, it calls this (via pg_stop_making_pinned_objects()) so that the + * remaining objects it makes will be considered un-pinned. + */ +void +StopGeneratingPinnedObjectIds(void) +{ + SetNextObjectId(FirstUnpinnedObjectId); +} + + +#ifdef USE_ASSERT_CHECKING + +/* + * Assert that xid is between [oldestXid, nextXid], which is the range we + * expect XIDs coming from tables etc to be in. + * + * As ShmemVariableCache->oldestXid could change just after this call without + * further precautions, and as a wrapped-around xid could again fall within + * the valid range, this assertion can only detect if something is definitely + * wrong, but not establish correctness. + * + * This intentionally does not expose a return value, to avoid code being + * introduced that depends on the return value. + */ +void +AssertTransactionIdInAllowableRange(TransactionId xid) +{ + TransactionId oldest_xid; + TransactionId next_xid; + + Assert(TransactionIdIsValid(xid)); + + /* we may see bootstrap / frozen */ + if (!TransactionIdIsNormal(xid)) + return; + + /* + * We can't acquire XidGenLock, as this may be called with XidGenLock + * already held (or with other locks that don't allow XidGenLock to be + * nested). 
That's ok for our purposes though, since we already rely on + * 32bit reads to be atomic. While nextXid is 64 bit, we only look at the + * lower 32bit, so a skewed read doesn't hurt. + * + * There's no increased danger of falling outside [oldest, next] by + * accessing them without a lock. xid needs to have been created with + * GetNewTransactionId() in the originating session, and the locks there + * pair with the memory barrier below. We do however accept xid to be <= + * to next_xid, instead of just <, as xid could be from the procarray, + * before we see the updated nextXid value. + */ + pg_memory_barrier(); + oldest_xid = ShmemVariableCache->oldestXid; + next_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + + Assert(TransactionIdFollowsOrEquals(xid, oldest_xid) || + TransactionIdPrecedesOrEquals(xid, next_xid)); +} +#endif diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c new file mode 100644 index 0000000..e0c7ad1 --- /dev/null +++ b/src/backend/access/transam/xact.c @@ -0,0 +1,6249 @@ +/*------------------------------------------------------------------------- + * + * xact.c + * top level transaction system support routines + * + * See src/backend/access/transam/README for more information. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/transam/xact.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <time.h> +#include <unistd.h> + +#include "access/commit_ts.h" +#include "access/multixact.h" +#include "access/parallel.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "access/xlogrecovery.h" +#include "access/xlogutils.h" +#include "catalog/index.h" +#include "catalog/namespace.h" +#include "catalog/pg_enum.h" +#include "catalog/storage.h" +#include "commands/async.h" +#include "commands/tablecmds.h" +#include "commands/trigger.h" +#include "common/pg_prng.h" +#include "executor/spi.h" +#include "libpq/be-fsstubs.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "replication/logical.h" +#include "replication/logicallauncher.h" +#include "replication/origin.h" +#include "replication/snapbuild.h" +#include "replication/syncrep.h" +#include "replication/walsender.h" +#include "storage/condition_variable.h" +#include "storage/fd.h" +#include "storage/lmgr.h" +#include "storage/md.h" +#include "storage/predicate.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/sinvaladt.h" +#include "storage/smgr.h" +#include "utils/builtins.h" +#include "utils/catcache.h" +#include "utils/combocid.h" +#include "utils/guc.h" +#include "utils/inval.h" +#include "utils/memutils.h" +#include "utils/relmapper.h" +#include "utils/snapmgr.h" +#include "utils/timeout.h" +#include "utils/timestamp.h" + +/* + * User-tweakable parameters + */ +int DefaultXactIsoLevel = XACT_READ_COMMITTED; +int XactIsoLevel; + +bool DefaultXactReadOnly = false; +bool XactReadOnly; + +bool DefaultXactDeferrable = false; +bool XactDeferrable; + +int synchronous_commit = SYNCHRONOUS_COMMIT_ON; + +/* + * CheckXidAlive is a xid value pointing to a possibly ongoing (sub) + * transaction. Currently, it is used in logical decoding. 
It's possible + * that such transactions can get aborted while the decoding is ongoing in + * which case we skip decoding that particular transaction. To ensure that we + * check whether the CheckXidAlive is aborted after fetching the tuple from + * system tables. We also ensure that during logical decoding we never + * directly access the tableam or heap APIs because we are checking for the + * concurrent aborts only in systable_* APIs. + */ +TransactionId CheckXidAlive = InvalidTransactionId; +bool bsysscan = false; + +/* + * When running as a parallel worker, we place only a single + * TransactionStateData on the parallel worker's state stack, and the XID + * reflected there will be that of the *innermost* currently-active + * subtransaction in the backend that initiated parallelism. However, + * GetTopTransactionId() and TransactionIdIsCurrentTransactionId() + * need to return the same answers in the parallel worker as they would have + * in the user backend, so we need some additional bookkeeping. + * + * XactTopFullTransactionId stores the XID of our toplevel transaction, which + * will be the same as TopTransactionStateData.fullTransactionId in an + * ordinary backend; but in a parallel backend, which does not have the entire + * transaction state, it will instead be copied from the backend that started + * the parallel operation. + * + * nParallelCurrentXids will be 0 and ParallelCurrentXids NULL in an ordinary + * backend, but in a parallel backend, nParallelCurrentXids will contain the + * number of XIDs that need to be considered current, and ParallelCurrentXids + * will contain the XIDs themselves. This includes all XIDs that were current + * or sub-committed in the parent at the time the parallel operation began. + * The XIDs are stored sorted in numerical order (not logical order) to make + * lookups as fast as possible. + */ +static FullTransactionId XactTopFullTransactionId = {InvalidTransactionId}; +static int nParallelCurrentXids = 0; +static TransactionId *ParallelCurrentXids; + +/* + * Miscellaneous flag bits to record events which occur on the top level + * transaction. These flags are only persisted in MyXactFlags and are intended + * so we remember to do certain things later on in the transaction. This is + * globally accessible, so can be set from anywhere in the code that requires + * recording flags. + */ +int MyXactFlags; + +/* + * transaction states - transaction state from server perspective + */ +typedef enum TransState +{ + TRANS_DEFAULT, /* idle */ + TRANS_START, /* transaction starting */ + TRANS_INPROGRESS, /* inside a valid transaction */ + TRANS_COMMIT, /* commit in progress */ + TRANS_ABORT, /* abort in progress */ + TRANS_PREPARE /* prepare in progress */ +} TransState; + +/* + * transaction block states - transaction state of client queries + * + * Note: the subtransaction states are used only for non-topmost + * transactions; the others appear only in the topmost transaction. 
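+ *
+ * Illustrative lifecycle (a rough sketch, not an exhaustive transition
+ * list): a simple explicit transaction block typically moves through
+ *
+ *     TBLOCK_DEFAULT -> TBLOCK_STARTED -> TBLOCK_BEGIN -> TBLOCK_INPROGRESS
+ *                    -> TBLOCK_END -> TBLOCK_DEFAULT
+ *
+ * as BEGIN, the queries inside the block, and COMMIT are processed.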
+ */ +typedef enum TBlockState +{ + /* not-in-transaction-block states */ + TBLOCK_DEFAULT, /* idle */ + TBLOCK_STARTED, /* running single-query transaction */ + + /* transaction block states */ + TBLOCK_BEGIN, /* starting transaction block */ + TBLOCK_INPROGRESS, /* live transaction */ + TBLOCK_IMPLICIT_INPROGRESS, /* live transaction after implicit BEGIN */ + TBLOCK_PARALLEL_INPROGRESS, /* live transaction inside parallel worker */ + TBLOCK_END, /* COMMIT received */ + TBLOCK_ABORT, /* failed xact, awaiting ROLLBACK */ + TBLOCK_ABORT_END, /* failed xact, ROLLBACK received */ + TBLOCK_ABORT_PENDING, /* live xact, ROLLBACK received */ + TBLOCK_PREPARE, /* live xact, PREPARE received */ + + /* subtransaction states */ + TBLOCK_SUBBEGIN, /* starting a subtransaction */ + TBLOCK_SUBINPROGRESS, /* live subtransaction */ + TBLOCK_SUBRELEASE, /* RELEASE received */ + TBLOCK_SUBCOMMIT, /* COMMIT received while TBLOCK_SUBINPROGRESS */ + TBLOCK_SUBABORT, /* failed subxact, awaiting ROLLBACK */ + TBLOCK_SUBABORT_END, /* failed subxact, ROLLBACK received */ + TBLOCK_SUBABORT_PENDING, /* live subxact, ROLLBACK received */ + TBLOCK_SUBRESTART, /* live subxact, ROLLBACK TO received */ + TBLOCK_SUBABORT_RESTART /* failed subxact, ROLLBACK TO received */ +} TBlockState; + +/* + * transaction state structure + */ +typedef struct TransactionStateData +{ + FullTransactionId fullTransactionId; /* my FullTransactionId */ + SubTransactionId subTransactionId; /* my subxact ID */ + char *name; /* savepoint name, if any */ + int savepointLevel; /* savepoint level */ + TransState state; /* low-level state */ + TBlockState blockState; /* high-level state */ + int nestingLevel; /* transaction nesting depth */ + int gucNestLevel; /* GUC context nesting depth */ + MemoryContext curTransactionContext; /* my xact-lifetime context */ + ResourceOwner curTransactionOwner; /* my query resources */ + TransactionId *childXids; /* subcommitted child XIDs, in XID order */ + int nChildXids; /* # of subcommitted child XIDs */ + int maxChildXids; /* allocated size of childXids[] */ + Oid prevUser; /* previous CurrentUserId setting */ + int prevSecContext; /* previous SecurityRestrictionContext */ + bool prevXactReadOnly; /* entry-time xact r/o state */ + bool startedInRecovery; /* did we start in recovery? */ + bool didLogXid; /* has xid been included in WAL record? */ + int parallelModeLevel; /* Enter/ExitParallelMode counter */ + bool chain; /* start a new block after this one */ + bool topXidLogged; /* for a subxact: is top-level XID logged? */ + struct TransactionStateData *parent; /* back link to parent */ +} TransactionStateData; + +typedef TransactionStateData *TransactionState; + +/* + * Serialized representation used to transmit transaction state to parallel + * workers through shared memory. + */ +typedef struct SerializedTransactionState +{ + int xactIsoLevel; + bool xactDeferrable; + FullTransactionId topFullTransactionId; + FullTransactionId currentFullTransactionId; + CommandId currentCommandId; + int nParallelCurrentXids; + TransactionId parallelCurrentXids[FLEXIBLE_ARRAY_MEMBER]; +} SerializedTransactionState; + +/* The size of SerializedTransactionState, not including the final array. */ +#define SerializedTransactionStateHeaderSize \ + offsetof(SerializedTransactionState, parallelCurrentXids) + +/* + * CurrentTransactionState always points to the current transaction state + * block. It will point to TopTransactionStateData when not in a + * transaction at all, or when in a top-level transaction. 
+ */ +static TransactionStateData TopTransactionStateData = { + .state = TRANS_DEFAULT, + .blockState = TBLOCK_DEFAULT, + .topXidLogged = false, +}; + +/* + * unreportedXids holds XIDs of all subtransactions that have not yet been + * reported in an XLOG_XACT_ASSIGNMENT record. + */ +static int nUnreportedXids; +static TransactionId unreportedXids[PGPROC_MAX_CACHED_SUBXIDS]; + +static TransactionState CurrentTransactionState = &TopTransactionStateData; + +/* + * The subtransaction ID and command ID assignment counters are global + * to a whole transaction, so we do not keep them in the state stack. + */ +static SubTransactionId currentSubTransactionId; +static CommandId currentCommandId; +static bool currentCommandIdUsed; + +/* + * xactStartTimestamp is the value of transaction_timestamp(). + * stmtStartTimestamp is the value of statement_timestamp(). + * xactStopTimestamp is the time at which we log a commit or abort WAL record. + * These do not change as we enter and exit subtransactions, so we don't + * keep them inside the TransactionState stack. + */ +static TimestampTz xactStartTimestamp; +static TimestampTz stmtStartTimestamp; +static TimestampTz xactStopTimestamp; + +/* + * GID to be used for preparing the current transaction. This is also + * global to a whole transaction, so we don't keep it in the state stack. + */ +static char *prepareGID; + +/* + * Some commands want to force synchronous commit. + */ +static bool forceSyncCommit = false; + +/* Flag for logging statements in a transaction. */ +bool xact_is_sampled = false; + +/* + * Private context for transaction-abort work --- we reserve space for this + * at startup to ensure that AbortTransaction and AbortSubTransaction can work + * when we've run out of memory. + */ +static MemoryContext TransactionAbortContext = NULL; + +/* + * List of add-on start- and end-of-xact callbacks + */ +typedef struct XactCallbackItem +{ + struct XactCallbackItem *next; + XactCallback callback; + void *arg; +} XactCallbackItem; + +static XactCallbackItem *Xact_callbacks = NULL; + +/* + * List of add-on start- and end-of-subxact callbacks + */ +typedef struct SubXactCallbackItem +{ + struct SubXactCallbackItem *next; + SubXactCallback callback; + void *arg; +} SubXactCallbackItem; + +static SubXactCallbackItem *SubXact_callbacks = NULL; + + +/* local function prototypes */ +static void AssignTransactionId(TransactionState s); +static void AbortTransaction(void); +static void AtAbort_Memory(void); +static void AtCleanup_Memory(void); +static void AtAbort_ResourceOwner(void); +static void AtCCI_LocalCache(void); +static void AtCommit_Memory(void); +static void AtStart_Cache(void); +static void AtStart_Memory(void); +static void AtStart_ResourceOwner(void); +static void CallXactCallbacks(XactEvent event); +static void CallSubXactCallbacks(SubXactEvent event, + SubTransactionId mySubid, + SubTransactionId parentSubid); +static void CleanupTransaction(void); +static void CheckTransactionBlock(bool isTopLevel, bool throwError, + const char *stmtType); +static void CommitTransaction(void); +static TransactionId RecordTransactionAbort(bool isSubXact); +static void StartTransaction(void); + +static void StartSubTransaction(void); +static void CommitSubTransaction(void); +static void AbortSubTransaction(void); +static void CleanupSubTransaction(void); +static void PushTransaction(void); +static void PopTransaction(void); + +static void AtSubAbort_Memory(void); +static void AtSubCleanup_Memory(void); +static void AtSubAbort_ResourceOwner(void); +static 
void AtSubCommit_Memory(void); +static void AtSubStart_Memory(void); +static void AtSubStart_ResourceOwner(void); + +static void ShowTransactionState(const char *str); +static void ShowTransactionStateRec(const char *str, TransactionState state); +static const char *BlockStateAsString(TBlockState blockState); +static const char *TransStateAsString(TransState state); + + +/* ---------------------------------------------------------------- + * transaction state accessors + * ---------------------------------------------------------------- + */ + +/* + * IsTransactionState + * + * This returns true if we are inside a valid transaction; that is, + * it is safe to initiate database access, take heavyweight locks, etc. + */ +bool +IsTransactionState(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * TRANS_DEFAULT and TRANS_ABORT are obviously unsafe states. However, we + * also reject the startup/shutdown states TRANS_START, TRANS_COMMIT, + * TRANS_PREPARE since it might be too soon or too late within those + * transition states to do anything interesting. Hence, the only "valid" + * state is TRANS_INPROGRESS. + */ + return (s->state == TRANS_INPROGRESS); +} + +/* + * IsAbortedTransactionBlockState + * + * This returns true if we are within an aborted transaction block. + */ +bool +IsAbortedTransactionBlockState(void) +{ + TransactionState s = CurrentTransactionState; + + if (s->blockState == TBLOCK_ABORT || + s->blockState == TBLOCK_SUBABORT) + return true; + + return false; +} + + +/* + * GetTopTransactionId + * + * This will return the XID of the main transaction, assigning one if + * it's not yet set. Be careful to call this only inside a valid xact. + */ +TransactionId +GetTopTransactionId(void) +{ + if (!FullTransactionIdIsValid(XactTopFullTransactionId)) + AssignTransactionId(&TopTransactionStateData); + return XidFromFullTransactionId(XactTopFullTransactionId); +} + +/* + * GetTopTransactionIdIfAny + * + * This will return the XID of the main transaction, if one is assigned. + * It will return InvalidTransactionId if we are not currently inside a + * transaction, or inside a transaction that hasn't yet been assigned an XID. + */ +TransactionId +GetTopTransactionIdIfAny(void) +{ + return XidFromFullTransactionId(XactTopFullTransactionId); +} + +/* + * GetCurrentTransactionId + * + * This will return the XID of the current transaction (main or sub + * transaction), assigning one if it's not yet set. Be careful to call this + * only inside a valid xact. + */ +TransactionId +GetCurrentTransactionId(void) +{ + TransactionState s = CurrentTransactionState; + + if (!FullTransactionIdIsValid(s->fullTransactionId)) + AssignTransactionId(s); + return XidFromFullTransactionId(s->fullTransactionId); +} + +/* + * GetCurrentTransactionIdIfAny + * + * This will return the XID of the current sub xact, if one is assigned. + * It will return InvalidTransactionId if we are not currently inside a + * transaction, or inside a transaction that hasn't been assigned an XID yet. + */ +TransactionId +GetCurrentTransactionIdIfAny(void) +{ + return XidFromFullTransactionId(CurrentTransactionState->fullTransactionId); +} + +/* + * GetTopFullTransactionId + * + * This will return the FullTransactionId of the main transaction, assigning + * one if it's not yet set. Be careful to call this only inside a valid xact. 
+ */ +FullTransactionId +GetTopFullTransactionId(void) +{ + if (!FullTransactionIdIsValid(XactTopFullTransactionId)) + AssignTransactionId(&TopTransactionStateData); + return XactTopFullTransactionId; +} + +/* + * GetTopFullTransactionIdIfAny + * + * This will return the FullTransactionId of the main transaction, if one is + * assigned. It will return InvalidFullTransactionId if we are not currently + * inside a transaction, or inside a transaction that hasn't yet been assigned + * one. + */ +FullTransactionId +GetTopFullTransactionIdIfAny(void) +{ + return XactTopFullTransactionId; +} + +/* + * GetCurrentFullTransactionId + * + * This will return the FullTransactionId of the current transaction (main or + * sub transaction), assigning one if it's not yet set. Be careful to call + * this only inside a valid xact. + */ +FullTransactionId +GetCurrentFullTransactionId(void) +{ + TransactionState s = CurrentTransactionState; + + if (!FullTransactionIdIsValid(s->fullTransactionId)) + AssignTransactionId(s); + return s->fullTransactionId; +} + +/* + * GetCurrentFullTransactionIdIfAny + * + * This will return the FullTransactionId of the current sub xact, if one is + * assigned. It will return InvalidFullTransactionId if we are not currently + * inside a transaction, or inside a transaction that hasn't been assigned one + * yet. + */ +FullTransactionId +GetCurrentFullTransactionIdIfAny(void) +{ + return CurrentTransactionState->fullTransactionId; +} + +/* + * MarkCurrentTransactionIdLoggedIfAny + * + * Remember that the current xid - if it is assigned - now has been wal logged. + */ +void +MarkCurrentTransactionIdLoggedIfAny(void) +{ + if (FullTransactionIdIsValid(CurrentTransactionState->fullTransactionId)) + CurrentTransactionState->didLogXid = true; +} + +/* + * IsSubxactTopXidLogPending + * + * This is used to decide whether we need to WAL log the top-level XID for + * operation in a subtransaction. We require that for logical decoding, see + * LogicalDecodingProcessRecord. + * + * This returns true if wal_level >= logical and we are inside a valid + * subtransaction, for which the assignment was not yet written to any WAL + * record. + */ +bool +IsSubxactTopXidLogPending(void) +{ + /* check whether it is already logged */ + if (CurrentTransactionState->topXidLogged) + return false; + + /* wal_level has to be logical */ + if (!XLogLogicalInfoActive()) + return false; + + /* we need to be in a transaction state */ + if (!IsTransactionState()) + return false; + + /* it has to be a subtransaction */ + if (!IsSubTransaction()) + return false; + + /* the subtransaction has to have a XID assigned */ + if (!TransactionIdIsValid(GetCurrentTransactionIdIfAny())) + return false; + + return true; +} + +/* + * MarkSubxactTopXidLogged + * + * Remember that the top transaction id for the current subtransaction is WAL + * logged now. + */ +void +MarkSubxactTopXidLogged(void) +{ + Assert(IsSubxactTopXidLogPending()); + + CurrentTransactionState->topXidLogged = true; +} + +/* + * GetStableLatestTransactionId + * + * Get the transaction's XID if it has one, else read the next-to-be-assigned + * XID. Once we have a value, return that same value for the remainder of the + * current transaction. This is meant to provide the reference point for the + * age(xid) function, but might be useful for other maintenance tasks as well. 
+ */ +TransactionId +GetStableLatestTransactionId(void) +{ + static LocalTransactionId lxid = InvalidLocalTransactionId; + static TransactionId stablexid = InvalidTransactionId; + + if (lxid != MyProc->lxid) + { + lxid = MyProc->lxid; + stablexid = GetTopTransactionIdIfAny(); + if (!TransactionIdIsValid(stablexid)) + stablexid = ReadNextTransactionId(); + } + + Assert(TransactionIdIsValid(stablexid)); + + return stablexid; +} + +/* + * AssignTransactionId + * + * Assigns a new permanent FullTransactionId to the given TransactionState. + * We do not assign XIDs to transactions until/unless this is called. + * Also, any parent TransactionStates that don't yet have XIDs are assigned + * one; this maintains the invariant that a child transaction has an XID + * following its parent's. + */ +static void +AssignTransactionId(TransactionState s) +{ + bool isSubXact = (s->parent != NULL); + ResourceOwner currentOwner; + bool log_unknown_top = false; + + /* Assert that caller didn't screw up */ + Assert(!FullTransactionIdIsValid(s->fullTransactionId)); + Assert(s->state == TRANS_INPROGRESS); + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for new XIDs at this point. + */ + if (IsInParallelMode() || IsParallelWorker()) + elog(ERROR, "cannot assign XIDs during a parallel operation"); + + /* + * Ensure parent(s) have XIDs, so that a child always has an XID later + * than its parent. Mustn't recurse here, or we might get a stack + * overflow if we're at the bottom of a huge stack of subtransactions none + * of which have XIDs yet. + */ + if (isSubXact && !FullTransactionIdIsValid(s->parent->fullTransactionId)) + { + TransactionState p = s->parent; + TransactionState *parents; + size_t parentOffset = 0; + + parents = palloc(sizeof(TransactionState) * s->nestingLevel); + while (p != NULL && !FullTransactionIdIsValid(p->fullTransactionId)) + { + parents[parentOffset++] = p; + p = p->parent; + } + + /* + * This is technically a recursive call, but the recursion will never + * be more than one layer deep. + */ + while (parentOffset != 0) + AssignTransactionId(parents[--parentOffset]); + + pfree(parents); + } + + /* + * When wal_level=logical, guarantee that a subtransaction's xid can only + * be seen in the WAL stream if its toplevel xid has been logged before. + * If necessary we log an xact_assignment record with fewer than + * PGPROC_MAX_CACHED_SUBXIDS. Note that it is fine if didLogXid isn't set + * for a transaction even though it appears in a WAL record, we just might + * superfluously log something. That can happen when an xid is included + * somewhere inside a wal record, but not in XLogRecord->xl_xid, like in + * xl_standby_locks. + */ + if (isSubXact && XLogLogicalInfoActive() && + !TopTransactionStateData.didLogXid) + log_unknown_top = true; + + /* + * Generate a new FullTransactionId and record its xid in PG_PROC and + * pg_subtrans. + * + * NB: we must make the subtrans entry BEFORE the Xid appears anywhere in + * shared storage other than PG_PROC; because if there's no room for it in + * PG_PROC, the subtrans entry is needed to ensure that other backends see + * the Xid as "running". See GetNewTransactionId. 
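The parent-handling loop above sidesteps deep recursion by first collecting the unassigned ancestors into an array and then assigning them outermost-first, which keeps the invariant that a child's XID follows its parent's. A standalone sketch of that pattern with simplified, made-up types:

#include <stdlib.h>

typedef struct DemoXact
{
    struct DemoXact *parent;
    int         id;             /* 0 means "not assigned yet" */
} DemoXact;

static int  demo_next_id = 1;

static void
demo_assign_parent_first(DemoXact *x, int nesting_level)
{
    DemoXact  **pending = malloc(sizeof(DemoXact *) * nesting_level);
    int         npending = 0;

    /* Collect x and every ancestor that has no id, innermost first. */
    for (DemoXact *p = x; p != NULL && p->id == 0; p = p->parent)
        pending[npending++] = p;

    /* Assign outermost-first, so a parent always gets a smaller id than its child. */
    while (npending > 0)
        pending[--npending]->id = demo_next_id++;

    free(pending);
}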
+ */ + s->fullTransactionId = GetNewTransactionId(isSubXact); + if (!isSubXact) + XactTopFullTransactionId = s->fullTransactionId; + + if (isSubXact) + SubTransSetParent(XidFromFullTransactionId(s->fullTransactionId), + XidFromFullTransactionId(s->parent->fullTransactionId)); + + /* + * If it's a top-level transaction, the predicate locking system needs to + * be told about it too. + */ + if (!isSubXact) + RegisterPredicateLockingXid(XidFromFullTransactionId(s->fullTransactionId)); + + /* + * Acquire lock on the transaction XID. (We assume this cannot block.) We + * have to ensure that the lock is assigned to the transaction's own + * ResourceOwner. + */ + currentOwner = CurrentResourceOwner; + CurrentResourceOwner = s->curTransactionOwner; + + XactLockTableInsert(XidFromFullTransactionId(s->fullTransactionId)); + + CurrentResourceOwner = currentOwner; + + /* + * Every PGPROC_MAX_CACHED_SUBXIDS assigned transaction ids within each + * top-level transaction we issue a WAL record for the assignment. We + * include the top-level xid and all the subxids that have not yet been + * reported using XLOG_XACT_ASSIGNMENT records. + * + * This is required to limit the amount of shared memory required in a hot + * standby server to keep track of in-progress XIDs. See notes for + * RecordKnownAssignedTransactionIds(). + * + * We don't keep track of the immediate parent of each subxid, only the + * top-level transaction that each subxact belongs to. This is correct in + * recovery only because aborted subtransactions are separately WAL + * logged. + * + * This is correct even for the case where several levels above us didn't + * have an xid assigned as we recursed up to them beforehand. + */ + if (isSubXact && XLogStandbyInfoActive()) + { + unreportedXids[nUnreportedXids] = XidFromFullTransactionId(s->fullTransactionId); + nUnreportedXids++; + + /* + * ensure this test matches similar one in + * RecoverPreparedTransactions() + */ + if (nUnreportedXids >= PGPROC_MAX_CACHED_SUBXIDS || + log_unknown_top) + { + xl_xact_assignment xlrec; + + /* + * xtop is always set by now because we recurse up transaction + * stack to the highest unassigned xid and then come back down + */ + xlrec.xtop = GetTopTransactionId(); + Assert(TransactionIdIsValid(xlrec.xtop)); + xlrec.nsubxacts = nUnreportedXids; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, MinSizeOfXactAssignment); + XLogRegisterData((char *) unreportedXids, + nUnreportedXids * sizeof(TransactionId)); + + (void) XLogInsert(RM_XACT_ID, XLOG_XACT_ASSIGNMENT); + + nUnreportedXids = 0; + /* mark top, not current xact as having been logged */ + TopTransactionStateData.didLogXid = true; + } + } +} + +/* + * GetCurrentSubTransactionId + */ +SubTransactionId +GetCurrentSubTransactionId(void) +{ + TransactionState s = CurrentTransactionState; + + return s->subTransactionId; +} + +/* + * SubTransactionIsActive + * + * Test if the specified subxact ID is still active. Note caller is + * responsible for checking whether this ID is relevant to the current xact. + */ +bool +SubTransactionIsActive(SubTransactionId subxid) +{ + TransactionState s; + + for (s = CurrentTransactionState; s != NULL; s = s->parent) + { + if (s->state == TRANS_ABORT) + continue; + if (s->subTransactionId == subxid) + return true; + } + return false; +} + + +/* + * GetCurrentCommandId + * + * "used" must be true if the caller intends to use the command ID to mark + * inserted/updated/deleted tuples. 
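Note how subtransaction XIDs are not reported to a standby one at a time: they accumulate in unreportedXids and a single XLOG_XACT_ASSIGNMENT record is emitted once PGPROC_MAX_CACHED_SUBXIDS of them are pending (or when a top-level xid must be made known for logical decoding). That batching shape, reduced to a standalone sketch with hypothetical names:

#include <stddef.h>

#define DEMO_BATCH_SIZE 64          /* stands in for PGPROC_MAX_CACHED_SUBXIDS */

static unsigned demo_pending[DEMO_BATCH_SIZE];
static size_t   demo_npending = 0;

/* Hypothetical sink standing in for emitting one assignment record. */
extern void demo_flush_batch(const unsigned *items, size_t n);

static void
demo_report_assignment(unsigned subxid)
{
    demo_pending[demo_npending++] = subxid;

    /* Emit one record per full batch instead of one per subtransaction. */
    if (demo_npending >= DEMO_BATCH_SIZE)
    {
        demo_flush_batch(demo_pending, demo_npending);
        demo_npending = 0;
    }
}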
false means the ID is being fetched + * for read-only purposes (ie, as a snapshot validity cutoff). See + * CommandCounterIncrement() for discussion. + */ +CommandId +GetCurrentCommandId(bool used) +{ + /* this is global to a transaction, not subtransaction-local */ + if (used) + { + /* + * Forbid setting currentCommandIdUsed in a parallel worker, because + * we have no provision for communicating this back to the leader. We + * could relax this restriction when currentCommandIdUsed was already + * true at the start of the parallel operation. + */ + Assert(!IsParallelWorker()); + currentCommandIdUsed = true; + } + return currentCommandId; +} + +/* + * SetParallelStartTimestamps + * + * In a parallel worker, we should inherit the parent transaction's + * timestamps rather than setting our own. The parallel worker + * infrastructure must call this to provide those values before + * calling StartTransaction() or SetCurrentStatementStartTimestamp(). + */ +void +SetParallelStartTimestamps(TimestampTz xact_ts, TimestampTz stmt_ts) +{ + Assert(IsParallelWorker()); + xactStartTimestamp = xact_ts; + stmtStartTimestamp = stmt_ts; +} + +/* + * GetCurrentTransactionStartTimestamp + */ +TimestampTz +GetCurrentTransactionStartTimestamp(void) +{ + return xactStartTimestamp; +} + +/* + * GetCurrentStatementStartTimestamp + */ +TimestampTz +GetCurrentStatementStartTimestamp(void) +{ + return stmtStartTimestamp; +} + +/* + * GetCurrentTransactionStopTimestamp + * + * We return current time if the transaction stop time hasn't been set + * (which can happen if we decide we don't need to log an XLOG record). + */ +TimestampTz +GetCurrentTransactionStopTimestamp(void) +{ + if (xactStopTimestamp != 0) + return xactStopTimestamp; + return GetCurrentTimestamp(); +} + +/* + * SetCurrentStatementStartTimestamp + * + * In a parallel worker, this should already have been provided by a call + * to SetParallelStartTimestamps(). + */ +void +SetCurrentStatementStartTimestamp(void) +{ + if (!IsParallelWorker()) + stmtStartTimestamp = GetCurrentTimestamp(); + else + Assert(stmtStartTimestamp != 0); +} + +/* + * SetCurrentTransactionStopTimestamp + */ +static inline void +SetCurrentTransactionStopTimestamp(void) +{ + xactStopTimestamp = GetCurrentTimestamp(); +} + +/* + * GetCurrentTransactionNestLevel + * + * Note: this will return zero when not inside any transaction, one when + * inside a top-level transaction, etc. + */ +int +GetCurrentTransactionNestLevel(void) +{ + TransactionState s = CurrentTransactionState; + + return s->nestingLevel; +} + + +/* + * TransactionIdIsCurrentTransactionId + */ +bool +TransactionIdIsCurrentTransactionId(TransactionId xid) +{ + TransactionState s; + + /* + * We always say that BootstrapTransactionId is "not my transaction ID" + * even when it is (ie, during bootstrap). Along with the fact that + * transam.c always treats BootstrapTransactionId as already committed, + * this causes the heapam_visibility.c routines to see all tuples as + * committed, which is what we need during bootstrap. (Bootstrap mode + * only inserts tuples, it never updates or deletes them, so all tuples + * can be presumed good immediately.) + * + * Likewise, InvalidTransactionId and FrozenTransactionId are certainly + * not my transaction ID, so we can just return "false" immediately for + * any non-normal XID. 
+ */ + if (!TransactionIdIsNormal(xid)) + return false; + + if (TransactionIdEquals(xid, GetTopTransactionIdIfAny())) + return true; + + /* + * In parallel workers, the XIDs we must consider as current are stored in + * ParallelCurrentXids rather than the transaction-state stack. Note that + * the XIDs in this array are sorted numerically rather than according to + * transactionIdPrecedes order. + */ + if (nParallelCurrentXids > 0) + { + int low, + high; + + low = 0; + high = nParallelCurrentXids - 1; + while (low <= high) + { + int middle; + TransactionId probe; + + middle = low + (high - low) / 2; + probe = ParallelCurrentXids[middle]; + if (probe == xid) + return true; + else if (probe < xid) + low = middle + 1; + else + high = middle - 1; + } + return false; + } + + /* + * We will return true for the Xid of the current subtransaction, any of + * its subcommitted children, any of its parents, or any of their + * previously subcommitted children. However, a transaction being aborted + * is no longer "current", even though it may still have an entry on the + * state stack. + */ + for (s = CurrentTransactionState; s != NULL; s = s->parent) + { + int low, + high; + + if (s->state == TRANS_ABORT) + continue; + if (!FullTransactionIdIsValid(s->fullTransactionId)) + continue; /* it can't have any child XIDs either */ + if (TransactionIdEquals(xid, XidFromFullTransactionId(s->fullTransactionId))) + return true; + /* As the childXids array is ordered, we can use binary search */ + low = 0; + high = s->nChildXids - 1; + while (low <= high) + { + int middle; + TransactionId probe; + + middle = low + (high - low) / 2; + probe = s->childXids[middle]; + if (TransactionIdEquals(probe, xid)) + return true; + else if (TransactionIdPrecedes(probe, xid)) + low = middle + 1; + else + high = middle - 1; + } + } + + return false; +} + +/* + * TransactionStartedDuringRecovery + * + * Returns true if the current transaction started while recovery was still + * in progress. Recovery might have ended since so RecoveryInProgress() might + * return false already. + */ +bool +TransactionStartedDuringRecovery(void) +{ + return CurrentTransactionState->startedInRecovery; +} + +/* + * EnterParallelMode + */ +void +EnterParallelMode(void) +{ + TransactionState s = CurrentTransactionState; + + Assert(s->parallelModeLevel >= 0); + + ++s->parallelModeLevel; +} + +/* + * ExitParallelMode + */ +void +ExitParallelMode(void) +{ + TransactionState s = CurrentTransactionState; + + Assert(s->parallelModeLevel > 0); + Assert(s->parallelModeLevel > 1 || !ParallelContextActive()); + + --s->parallelModeLevel; +} + +/* + * IsInParallelMode + * + * Are we in a parallel operation, as either the leader or a worker? Check + * this to prohibit operations that change backend-local state expected to + * match across all workers. Mere caches usually don't require such a + * restriction. State modified in a strict push/pop fashion, such as the + * active snapshot stack, is often fine. + */ +bool +IsInParallelMode(void) +{ + return CurrentTransactionState->parallelModeLevel != 0; +} + +/* + * CommandCounterIncrement + */ +void +CommandCounterIncrement(void) +{ + /* + * If the current value of the command counter hasn't been "used" to mark + * tuples, we need not increment it, since there's no need to distinguish + * a read-only command from others. This helps postpone command counter + * overflow, and keeps no-op CommandCounterIncrement operations cheap. 
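Both membership tests above depend on the XID arrays being kept in sorted order, so each lookup is a plain binary search. Stripped of the transaction-specific types, the search reduces to the following standalone sketch:

#include <stdbool.h>
#include <stddef.h>

static bool
demo_xid_member(const unsigned *xids, size_t n, unsigned key)
{
    size_t  low = 0;

    /* Classic binary search over a sorted array; O(log n) probes. */
    while (low < n)
    {
        size_t      middle = low + (n - low) / 2;
        unsigned    probe = xids[middle];

        if (probe == key)
            return true;
        if (probe < key)
            low = middle + 1;
        else
            n = middle;
    }
    return false;
}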
+ */ + if (currentCommandIdUsed) + { + /* + * Workers synchronize transaction state at the beginning of each + * parallel operation, so we can't account for new commands after that + * point. + */ + if (IsInParallelMode() || IsParallelWorker()) + elog(ERROR, "cannot start commands during a parallel operation"); + + currentCommandId += 1; + if (currentCommandId == InvalidCommandId) + { + currentCommandId -= 1; + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than 2^32-2 commands in a transaction"))); + } + currentCommandIdUsed = false; + + /* Propagate new command ID into static snapshots */ + SnapshotSetCommandId(currentCommandId); + + /* + * Make any catalog changes done by the just-completed command visible + * in the local syscache. We obviously don't need to do this after a + * read-only command. (But see hacks in inval.c to make real sure we + * don't think a command that queued inval messages was read-only.) + */ + AtCCI_LocalCache(); + } +} + +/* + * ForceSyncCommit + * + * Interface routine to allow commands to force a synchronous commit of the + * current top-level transaction. Currently, two-phase commit does not + * persist and restore this variable. So long as all callers use + * PreventInTransactionBlock(), that omission has no consequences. + */ +void +ForceSyncCommit(void) +{ + forceSyncCommit = true; +} + + +/* ---------------------------------------------------------------- + * StartTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * AtStart_Cache + */ +static void +AtStart_Cache(void) +{ + AcceptInvalidationMessages(); +} + +/* + * AtStart_Memory + */ +static void +AtStart_Memory(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * If this is the first time through, create a private context for + * AbortTransaction to work in. By reserving some space now, we can + * insulate AbortTransaction from out-of-memory scenarios. Like + * ErrorContext, we set it up with slow growth rate and a nonzero minimum + * size, so that space will be reserved immediately. + */ + if (TransactionAbortContext == NULL) + TransactionAbortContext = + AllocSetContextCreate(TopMemoryContext, + "TransactionAbortContext", + 32 * 1024, + 32 * 1024, + 32 * 1024); + + /* + * We shouldn't have a transaction context already. + */ + Assert(TopTransactionContext == NULL); + + /* + * Create a toplevel context for the transaction. + */ + TopTransactionContext = + AllocSetContextCreate(TopMemoryContext, + "TopTransactionContext", + ALLOCSET_DEFAULT_SIZES); + + /* + * In a top-level transaction, CurTransactionContext is the same as + * TopTransactionContext. + */ + CurTransactionContext = TopTransactionContext; + s->curTransactionContext = CurTransactionContext; + + /* Make the CurTransactionContext active. */ + MemoryContextSwitchTo(CurTransactionContext); +} + +/* + * AtStart_ResourceOwner + */ +static void +AtStart_ResourceOwner(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * We shouldn't have a transaction resource owner already. + */ + Assert(TopTransactionResourceOwner == NULL); + + /* + * Create a toplevel resource owner for the transaction. 
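The practical upshot of CommandCounterIncrement() is that a command's own changes become visible to later steps in the same transaction only after the counter has been bumped. A hedged sketch of a command relying on that; both helper calls are placeholders, not real PostgreSQL functions:

#include "postgres.h"
#include "access/xact.h"

/* Hypothetical placeholders, not real PostgreSQL functions. */
extern void make_a_catalog_change(void);
extern void read_back_that_change(void);

/*
 * Hedged sketch: a command that modifies the catalogs and then needs its own
 * next step to see that modification.
 */
static void
demo_two_step_command(void)
{
    make_a_catalog_change();    /* the current command id is now "used" */

    CommandCounterIncrement();  /* make the change visible to what follows */

    read_back_that_change();    /* later steps now see it */
}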
+ */ + s->curTransactionOwner = ResourceOwnerCreate(NULL, "TopTransaction"); + + TopTransactionResourceOwner = s->curTransactionOwner; + CurTransactionResourceOwner = s->curTransactionOwner; + CurrentResourceOwner = s->curTransactionOwner; +} + +/* ---------------------------------------------------------------- + * StartSubTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * AtSubStart_Memory + */ +static void +AtSubStart_Memory(void) +{ + TransactionState s = CurrentTransactionState; + + Assert(CurTransactionContext != NULL); + + /* + * Create a CurTransactionContext, which will be used to hold data that + * survives subtransaction commit but disappears on subtransaction abort. + * We make it a child of the immediate parent's CurTransactionContext. + */ + CurTransactionContext = AllocSetContextCreate(CurTransactionContext, + "CurTransactionContext", + ALLOCSET_DEFAULT_SIZES); + s->curTransactionContext = CurTransactionContext; + + /* Make the CurTransactionContext active. */ + MemoryContextSwitchTo(CurTransactionContext); +} + +/* + * AtSubStart_ResourceOwner + */ +static void +AtSubStart_ResourceOwner(void) +{ + TransactionState s = CurrentTransactionState; + + Assert(s->parent != NULL); + + /* + * Create a resource owner for the subtransaction. We make it a child of + * the immediate parent's resource owner. + */ + s->curTransactionOwner = + ResourceOwnerCreate(s->parent->curTransactionOwner, + "SubTransaction"); + + CurTransactionResourceOwner = s->curTransactionOwner; + CurrentResourceOwner = s->curTransactionOwner; +} + +/* ---------------------------------------------------------------- + * CommitTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * RecordTransactionCommit + * + * Returns latest XID among xact and its children, or InvalidTransactionId + * if the xact has no XID. (We compute that here just because it's easier.) + * + * If you change this function, see RecordTransactionCommitPrepared also. + */ +static TransactionId +RecordTransactionCommit(void) +{ + TransactionId xid = GetTopTransactionIdIfAny(); + bool markXidCommitted = TransactionIdIsValid(xid); + TransactionId latestXid = InvalidTransactionId; + int nrels; + RelFileNode *rels; + int nchildren; + TransactionId *children; + int ndroppedstats = 0; + xl_xact_stats_item *droppedstats = NULL; + int nmsgs = 0; + SharedInvalidationMessage *invalMessages = NULL; + bool RelcacheInitFileInval = false; + bool wrote_xlog; + + /* + * Log pending invalidations for logical decoding of in-progress + * transactions. Normally for DDLs, we log this at each command end, + * however, for certain cases where we directly update the system table + * without a transaction block, the invalidations are not logged till this + * time. + */ + if (XLogLogicalInfoActive()) + LogLogicalInvalidations(); + + /* Get data needed for commit record */ + nrels = smgrGetPendingDeletes(true, &rels); + nchildren = xactGetCommittedChildren(&children); + ndroppedstats = pgstat_get_transactional_drops(true, &droppedstats); + if (XLogStandbyInfoActive()) + nmsgs = xactGetCommittedInvalidationMessages(&invalMessages, + &RelcacheInitFileInval); + wrote_xlog = (XactLastRecEnd != 0); + + /* + * If we haven't been assigned an XID yet, we neither can, nor do we want + * to write a COMMIT record. 
+ */ + if (!markXidCommitted) + { + /* + * We expect that every RelationDropStorage is followed by a catalog + * update, and hence XID assignment, so we shouldn't get here with any + * pending deletes. Same is true for dropping stats. + * + * Use a real test not just an Assert to check this, since it's a bit + * fragile. + */ + if (nrels != 0 || ndroppedstats != 0) + elog(ERROR, "cannot commit a transaction that deleted files but has no xid"); + + /* Can't have child XIDs either; AssignTransactionId enforces this */ + Assert(nchildren == 0); + + /* + * Transactions without an assigned xid can contain invalidation + * messages (e.g. explicit relcache invalidations or catcache + * invalidations for inplace updates); standbys need to process those. + * We can't emit a commit record without an xid, and we don't want to + * force assigning an xid, because that'd be problematic for e.g. + * vacuum. Hence we emit a bespoke record for the invalidations. We + * don't want to use that in case a commit record is emitted, so they + * happen synchronously with commits (besides not wanting to emit more + * WAL records). + */ + if (nmsgs != 0) + { + LogStandbyInvalidations(nmsgs, invalMessages, + RelcacheInitFileInval); + wrote_xlog = true; /* not strictly necessary */ + } + + /* + * If we didn't create XLOG entries, we're done here; otherwise we + * should trigger flushing those entries the same as a commit record + * would. This will primarily happen for HOT pruning and the like; we + * want these to be flushed to disk in due time. + */ + if (!wrote_xlog) + goto cleanup; + } + else + { + bool replorigin; + + /* + * Are we using the replication origins feature? Or, in other words, + * are we replaying remote actions? + */ + replorigin = (replorigin_session_origin != InvalidRepOriginId && + replorigin_session_origin != DoNotReplicateId); + + /* + * Begin commit critical section and insert the commit XLOG record. + */ + /* Tell bufmgr and smgr to prepare for commit */ + BufmgrCommit(); + + /* + * Mark ourselves as within our "commit critical section". This + * forces any concurrent checkpoint to wait until we've updated + * pg_xact. Without this, it is possible for the checkpoint to set + * REDO after the XLOG record but fail to flush the pg_xact update to + * disk, leading to loss of the transaction commit if the system + * crashes a little later. + * + * Note: we could, but don't bother to, set this flag in + * RecordTransactionAbort. That's because loss of a transaction abort + * is noncritical; the presumption would be that it aborted, anyway. + * + * It's safe to change the delayChkptFlags flag of our own backend + * without holding the ProcArrayLock, since we're the only one + * modifying it. This makes checkpoint's determination of which xacts + * are delaying the checkpoint a bit fuzzy, but it doesn't matter. + */ + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); + START_CRIT_SECTION(); + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + + SetCurrentTransactionStopTimestamp(); + + XactLogCommitRecord(xactStopTimestamp, + nchildren, children, nrels, rels, + ndroppedstats, droppedstats, + nmsgs, invalMessages, + RelcacheInitFileInval, + MyXactFlags, + InvalidTransactionId, NULL /* plain commit */ ); + + if (replorigin) + /* Move LSNs forward for this replication origin */ + replorigin_session_advance(replorigin_session_origin_lsn, + XactLastRecEnd); + + /* + * Record commit timestamp. 
The value comes from plain commit + * timestamp if there's no replication origin; otherwise, the + * timestamp was already set in replorigin_session_origin_timestamp by + * replication. + * + * We don't need to WAL-log anything here, as the commit record + * written above already contains the data. + */ + + if (!replorigin || replorigin_session_origin_timestamp == 0) + replorigin_session_origin_timestamp = xactStopTimestamp; + + TransactionTreeSetCommitTsData(xid, nchildren, children, + replorigin_session_origin_timestamp, + replorigin_session_origin); + } + + /* + * Check if we want to commit asynchronously. We can allow the XLOG flush + * to happen asynchronously if synchronous_commit=off, or if the current + * transaction has not performed any WAL-logged operation or didn't assign + * an xid. The transaction can end up not writing any WAL, even if it has + * an xid, if it only wrote to temporary and/or unlogged tables. It can + * end up having written WAL without an xid if it did HOT pruning. In + * case of a crash, the loss of such a transaction will be irrelevant; + * temp tables will be lost anyway, unlogged tables will be truncated and + * HOT pruning will be done again later. (Given the foregoing, you might + * think that it would be unnecessary to emit the XLOG record at all in + * this case, but we don't currently try to do that. It would certainly + * cause problems at least in Hot Standby mode, where the + * KnownAssignedXids machinery requires tracking every XID assignment. It + * might be OK to skip it only when wal_level < replica, but for now we + * don't.) + * + * However, if we're doing cleanup of any non-temp rels or committing any + * command that wanted to force sync commit, then we must flush XLOG + * immediately. (We must not allow asynchronous commit if there are any + * non-temp tables to be deleted, because we might delete the files before + * the COMMIT record is flushed to disk. We do allow asynchronous commit + * if all to-be-deleted tables are temporary though, since they are lost + * anyway if we crash.) + */ + if ((wrote_xlog && markXidCommitted && + synchronous_commit > SYNCHRONOUS_COMMIT_OFF) || + forceSyncCommit || nrels > 0) + { + XLogFlush(XactLastRecEnd); + + /* + * Now we may update the CLOG, if we wrote a COMMIT record above + */ + if (markXidCommitted) + TransactionIdCommitTree(xid, nchildren, children); + } + else + { + /* + * Asynchronous commit case: + * + * This enables possible committed transaction loss in the case of a + * postmaster crash because WAL buffers are left unwritten. Ideally we + * could issue the WAL write without the fsync, but some + * wal_sync_methods do not allow separate write/fsync. + * + * Report the latest async commit LSN, so that the WAL writer knows to + * flush this commit. + */ + XLogSetAsyncXactLSN(XactLastRecEnd); + + /* + * We must not immediately update the CLOG, since we didn't flush the + * XLOG. Instead, we store the LSN up to which the XLOG must be + * flushed before the CLOG may be updated. + */ + if (markXidCommitted) + TransactionIdAsyncCommitTree(xid, nchildren, children, XactLastRecEnd); + } + + /* + * If we entered a commit critical section, leave it now, and let + * checkpoints proceed. + */ + if (markXidCommitted) + { + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + END_CRIT_SECTION(); + } + + /* Compute latestXid while we have the child XIDs handy */ + latestXid = TransactionIdLatest(xid, nchildren, children); + + /* + * Wait for synchronous replication, if required. 
Similar to the decision + * above about using committing asynchronously we only want to wait if + * this backend assigned an xid and wrote WAL. No need to wait if an xid + * was assigned due to temporary/unlogged tables or due to HOT pruning. + * + * Note that at this stage we have marked clog, but still show as running + * in the procarray and continue to hold locks. + */ + if (wrote_xlog && markXidCommitted) + SyncRepWaitForLSN(XactLastRecEnd, true); + + /* remember end of last commit record */ + XactLastCommitEnd = XactLastRecEnd; + + /* Reset XactLastRecEnd until the next transaction writes something */ + XactLastRecEnd = 0; +cleanup: + /* Clean up local data */ + if (rels) + pfree(rels); + if (ndroppedstats) + pfree(droppedstats); + + return latestXid; +} + + +/* + * AtCCI_LocalCache + */ +static void +AtCCI_LocalCache(void) +{ + /* + * Make any pending relation map changes visible. We must do this before + * processing local sinval messages, so that the map changes will get + * reflected into the relcache when relcache invals are processed. + */ + AtCCI_RelationMap(); + + /* + * Make catalog changes visible to me for the next command. + */ + CommandEndInvalidationMessages(); +} + +/* + * AtCommit_Memory + */ +static void +AtCommit_Memory(void) +{ + /* + * Now that we're "out" of a transaction, have the system allocate things + * in the top memory context instead of per-transaction contexts. + */ + MemoryContextSwitchTo(TopMemoryContext); + + /* + * Release all transaction-local memory. + */ + Assert(TopTransactionContext != NULL); + MemoryContextDelete(TopTransactionContext); + TopTransactionContext = NULL; + CurTransactionContext = NULL; + CurrentTransactionState->curTransactionContext = NULL; +} + +/* ---------------------------------------------------------------- + * CommitSubTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * AtSubCommit_Memory + */ +static void +AtSubCommit_Memory(void) +{ + TransactionState s = CurrentTransactionState; + + Assert(s->parent != NULL); + + /* Return to parent transaction level's memory context. */ + CurTransactionContext = s->parent->curTransactionContext; + MemoryContextSwitchTo(CurTransactionContext); + + /* + * Ordinarily we cannot throw away the child's CurTransactionContext, + * since the data it contains will be needed at upper commit. However, if + * there isn't actually anything in it, we can throw it away. This avoids + * a small memory leak in the common case of "trivial" subxacts. + */ + if (MemoryContextIsEmpty(s->curTransactionContext)) + { + MemoryContextDelete(s->curTransactionContext); + s->curTransactionContext = NULL; + } +} + +/* + * AtSubCommit_childXids + * + * Pass my own XID and my child XIDs up to my parent as committed children. + */ +static void +AtSubCommit_childXids(void) +{ + TransactionState s = CurrentTransactionState; + int new_nChildXids; + + Assert(s->parent != NULL); + + /* + * The parent childXids array will need to hold my XID and all my + * childXids, in addition to the XIDs already there. + */ + new_nChildXids = s->parent->nChildXids + s->nChildXids + 1; + + /* Allocate or enlarge the parent array if necessary */ + if (s->parent->maxChildXids < new_nChildXids) + { + int new_maxChildXids; + TransactionId *new_childXids; + + /* + * Make it 2x what's needed right now, to avoid having to enlarge it + * repeatedly. But we can't go above MaxAllocSize. 
(The latter limit + * is what ensures that we don't need to worry about integer overflow + * here or in the calculation of new_nChildXids.) + */ + new_maxChildXids = Min(new_nChildXids * 2, + (int) (MaxAllocSize / sizeof(TransactionId))); + + if (new_maxChildXids < new_nChildXids) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("maximum number of committed subtransactions (%d) exceeded", + (int) (MaxAllocSize / sizeof(TransactionId))))); + + /* + * We keep the child-XID arrays in TopTransactionContext; this avoids + * setting up child-transaction contexts for what might be just a few + * bytes of grandchild XIDs. + */ + if (s->parent->childXids == NULL) + new_childXids = + MemoryContextAlloc(TopTransactionContext, + new_maxChildXids * sizeof(TransactionId)); + else + new_childXids = repalloc(s->parent->childXids, + new_maxChildXids * sizeof(TransactionId)); + + s->parent->childXids = new_childXids; + s->parent->maxChildXids = new_maxChildXids; + } + + /* + * Copy all my XIDs to parent's array. + * + * Note: We rely on the fact that the XID of a child always follows that + * of its parent. By copying the XID of this subtransaction before the + * XIDs of its children, we ensure that the array stays ordered. Likewise, + * all XIDs already in the array belong to subtransactions started and + * subcommitted before us, so their XIDs must precede ours. + */ + s->parent->childXids[s->parent->nChildXids] = XidFromFullTransactionId(s->fullTransactionId); + + if (s->nChildXids > 0) + memcpy(&s->parent->childXids[s->parent->nChildXids + 1], + s->childXids, + s->nChildXids * sizeof(TransactionId)); + + s->parent->nChildXids = new_nChildXids; + + /* Release child's array to avoid leakage */ + if (s->childXids != NULL) + pfree(s->childXids); + /* We must reset these to avoid double-free if fail later in commit */ + s->childXids = NULL; + s->nChildXids = 0; + s->maxChildXids = 0; +} + +/* ---------------------------------------------------------------- + * AbortTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * RecordTransactionAbort + * + * Returns latest XID among xact and its children, or InvalidTransactionId + * if the xact has no XID. (We compute that here just because it's easier.) + */ +static TransactionId +RecordTransactionAbort(bool isSubXact) +{ + TransactionId xid = GetCurrentTransactionIdIfAny(); + TransactionId latestXid; + int nrels; + RelFileNode *rels; + int ndroppedstats = 0; + xl_xact_stats_item *droppedstats = NULL; + int nchildren; + TransactionId *children; + TimestampTz xact_time; + + /* + * If we haven't been assigned an XID, nobody will care whether we aborted + * or not. Hence, we're done in that case. It does not matter if we have + * rels to delete (note that this routine is not responsible for actually + * deleting 'em). We cannot have any child XIDs, either. + */ + if (!TransactionIdIsValid(xid)) + { + /* Reset XactLastRecEnd until the next transaction writes something */ + if (!isSubXact) + XactLastRecEnd = 0; + return InvalidTransactionId; + } + + /* + * We have a valid XID, so we should write an ABORT record for it. + * + * We do not flush XLOG to disk here, since the default assumption after a + * crash would be that we aborted, anyway. For the same reason, we don't + * need to worry about interlocking against checkpoint start. + */ + + /* + * Check that we haven't aborted halfway through RecordTransactionCommit. 
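The enlargement above is ordinary amortized doubling, clamped so the array can never exceed MaxAllocSize worth of TransactionIds; if even the clamped size is too small, the "maximum number of committed subtransactions" error is raised. The arithmetic in isolation, as a standalone sketch with a made-up cap:

#include <stdio.h>

#define DEMO_MAX_ELEMS 1024     /* stands in for MaxAllocSize / sizeof(TransactionId) */

/* Return the new capacity for an array that must now hold "needed" entries. */
static int
demo_new_capacity(int needed)
{
    int     cap = needed * 2;   /* double, to amortize repeated enlargements */

    if (cap > DEMO_MAX_ELEMS)
        cap = DEMO_MAX_ELEMS;   /* but never beyond the allocation limit */
    return cap;                 /* caller errors out if cap < needed */
}

int
main(void)
{
    printf("%d\n", demo_new_capacity(300)); /* 600 */
    printf("%d\n", demo_new_capacity(700)); /* clamped to 1024 */
    return 0;
}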
+ */ + if (TransactionIdDidCommit(xid)) + elog(PANIC, "cannot abort transaction %u, it was already committed", + xid); + + /* Fetch the data we need for the abort record */ + nrels = smgrGetPendingDeletes(false, &rels); + nchildren = xactGetCommittedChildren(&children); + ndroppedstats = pgstat_get_transactional_drops(false, &droppedstats); + + /* XXX do we really need a critical section here? */ + START_CRIT_SECTION(); + + /* Write the ABORT record */ + if (isSubXact) + xact_time = GetCurrentTimestamp(); + else + { + SetCurrentTransactionStopTimestamp(); + xact_time = xactStopTimestamp; + } + + XactLogAbortRecord(xact_time, + nchildren, children, + nrels, rels, + ndroppedstats, droppedstats, + MyXactFlags, InvalidTransactionId, + NULL); + + /* + * Report the latest async abort LSN, so that the WAL writer knows to + * flush this abort. There's nothing to be gained by delaying this, since + * WALWriter may as well do this when it can. This is important with + * streaming replication because if we don't flush WAL regularly we will + * find that large aborts leave us with a long backlog for when commits + * occur after the abort, increasing our window of data loss should + * problems occur at that point. + */ + if (!isSubXact) + XLogSetAsyncXactLSN(XactLastRecEnd); + + /* + * Mark the transaction aborted in clog. This is not absolutely necessary + * but we may as well do it while we are here; also, in the subxact case + * it is helpful because XactLockTableWait makes use of it to avoid + * waiting for already-aborted subtransactions. It is OK to do it without + * having flushed the ABORT record to disk, because in event of a crash + * we'd be assumed to have aborted anyway. + */ + TransactionIdAbortTree(xid, nchildren, children); + + END_CRIT_SECTION(); + + /* Compute latestXid while we have the child XIDs handy */ + latestXid = TransactionIdLatest(xid, nchildren, children); + + /* + * If we're aborting a subtransaction, we can immediately remove failed + * XIDs from PGPROC's cache of running child XIDs. We do that here for + * subxacts, because we already have the child XID array at hand. For + * main xacts, the equivalent happens just after this function returns. + */ + if (isSubXact) + XidCacheRemoveRunningXids(xid, nchildren, children, latestXid); + + /* Reset XactLastRecEnd until the next transaction writes something */ + if (!isSubXact) + XactLastRecEnd = 0; + + /* And clean up local data */ + if (rels) + pfree(rels); + if (ndroppedstats) + pfree(droppedstats); + + return latestXid; +} + +/* + * AtAbort_Memory + */ +static void +AtAbort_Memory(void) +{ + /* + * Switch into TransactionAbortContext, which should have some free space + * even if nothing else does. We'll work in this context until we've + * finished cleaning up. + * + * It is barely possible to get here when we've not been able to create + * TransactionAbortContext yet; if so use TopMemoryContext. 
+ */ + if (TransactionAbortContext != NULL) + MemoryContextSwitchTo(TransactionAbortContext); + else + MemoryContextSwitchTo(TopMemoryContext); +} + +/* + * AtSubAbort_Memory + */ +static void +AtSubAbort_Memory(void) +{ + Assert(TransactionAbortContext != NULL); + + MemoryContextSwitchTo(TransactionAbortContext); +} + + +/* + * AtAbort_ResourceOwner + */ +static void +AtAbort_ResourceOwner(void) +{ + /* + * Make sure we have a valid ResourceOwner, if possible (else it will be + * NULL, which is OK) + */ + CurrentResourceOwner = TopTransactionResourceOwner; +} + +/* + * AtSubAbort_ResourceOwner + */ +static void +AtSubAbort_ResourceOwner(void) +{ + TransactionState s = CurrentTransactionState; + + /* Make sure we have a valid ResourceOwner */ + CurrentResourceOwner = s->curTransactionOwner; +} + + +/* + * AtSubAbort_childXids + */ +static void +AtSubAbort_childXids(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * We keep the child-XID arrays in TopTransactionContext (see + * AtSubCommit_childXids). This means we'd better free the array + * explicitly at abort to avoid leakage. + */ + if (s->childXids != NULL) + pfree(s->childXids); + s->childXids = NULL; + s->nChildXids = 0; + s->maxChildXids = 0; + + /* + * We could prune the unreportedXids array here. But we don't bother. That + * would potentially reduce number of XLOG_XACT_ASSIGNMENT records but it + * would likely introduce more CPU time into the more common paths, so we + * choose not to do that. + */ +} + +/* ---------------------------------------------------------------- + * CleanupTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * AtCleanup_Memory + */ +static void +AtCleanup_Memory(void) +{ + Assert(CurrentTransactionState->parent == NULL); + + /* + * Now that we're "out" of a transaction, have the system allocate things + * in the top memory context instead of per-transaction contexts. + */ + MemoryContextSwitchTo(TopMemoryContext); + + /* + * Clear the special abort context for next time. + */ + if (TransactionAbortContext != NULL) + MemoryContextResetAndDeleteChildren(TransactionAbortContext); + + /* + * Release all transaction-local memory. + */ + if (TopTransactionContext != NULL) + MemoryContextDelete(TopTransactionContext); + TopTransactionContext = NULL; + CurTransactionContext = NULL; + CurrentTransactionState->curTransactionContext = NULL; +} + + +/* ---------------------------------------------------------------- + * CleanupSubTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * AtSubCleanup_Memory + */ +static void +AtSubCleanup_Memory(void) +{ + TransactionState s = CurrentTransactionState; + + Assert(s->parent != NULL); + + /* Make sure we're not in an about-to-be-deleted context */ + MemoryContextSwitchTo(s->parent->curTransactionContext); + CurTransactionContext = s->parent->curTransactionContext; + + /* + * Clear the special abort context for next time. + */ + if (TransactionAbortContext != NULL) + MemoryContextResetAndDeleteChildren(TransactionAbortContext); + + /* + * Delete the subxact local memory contexts. Its CurTransactionContext can + * go too (note this also kills CurTransactionContexts from any children + * of the subxact). 
+ */ + if (s->curTransactionContext) + MemoryContextDelete(s->curTransactionContext); + s->curTransactionContext = NULL; +} + +/* ---------------------------------------------------------------- + * interface routines + * ---------------------------------------------------------------- + */ + +/* + * StartTransaction + */ +static void +StartTransaction(void) +{ + TransactionState s; + VirtualTransactionId vxid; + + /* + * Let's just make sure the state stack is empty + */ + s = &TopTransactionStateData; + CurrentTransactionState = s; + + Assert(!FullTransactionIdIsValid(XactTopFullTransactionId)); + + /* check the current transaction state */ + Assert(s->state == TRANS_DEFAULT); + + /* + * Set the current transaction state information appropriately during + * start processing. Note that once the transaction status is switched + * this process cannot fail until the user ID and the security context + * flags are fetched below. + */ + s->state = TRANS_START; + s->fullTransactionId = InvalidFullTransactionId; /* until assigned */ + + /* Determine if statements are logged in this transaction */ + xact_is_sampled = log_xact_sample_rate != 0 && + (log_xact_sample_rate == 1 || + pg_prng_double(&pg_global_prng_state) <= log_xact_sample_rate); + + /* + * initialize current transaction state fields + * + * note: prevXactReadOnly is not used at the outermost level + */ + s->nestingLevel = 1; + s->gucNestLevel = 1; + s->childXids = NULL; + s->nChildXids = 0; + s->maxChildXids = 0; + + /* + * Once the current user ID and the security context flags are fetched, + * both will be properly reset even if transaction startup fails. + */ + GetUserIdAndSecContext(&s->prevUser, &s->prevSecContext); + + /* SecurityRestrictionContext should never be set outside a transaction */ + Assert(s->prevSecContext == 0); + + /* + * Make sure we've reset xact state variables + * + * If recovery is still in progress, mark this transaction as read-only. + * We have lower level defences in XLogInsert and elsewhere to stop us + * from modifying data during recovery, but this gives the normal + * indication to the user that the transaction is read-only. + */ + if (RecoveryInProgress()) + { + s->startedInRecovery = true; + XactReadOnly = true; + } + else + { + s->startedInRecovery = false; + XactReadOnly = DefaultXactReadOnly; + } + XactDeferrable = DefaultXactDeferrable; + XactIsoLevel = DefaultXactIsoLevel; + forceSyncCommit = false; + MyXactFlags = 0; + + /* + * reinitialize within-transaction counters + */ + s->subTransactionId = TopSubTransactionId; + currentSubTransactionId = TopSubTransactionId; + currentCommandId = FirstCommandId; + currentCommandIdUsed = false; + + /* + * initialize reported xid accounting + */ + nUnreportedXids = 0; + s->didLogXid = false; + + /* + * must initialize resource-management stuff first + */ + AtStart_Memory(); + AtStart_ResourceOwner(); + + /* + * Assign a new LocalTransactionId, and combine it with the backendId to + * form a virtual transaction id. + */ + vxid.backendId = MyBackendId; + vxid.localTransactionId = GetNextLocalTransactionId(); + + /* + * Lock the virtual transaction id before we announce it in the proc array + */ + VirtualXactLockTableInsert(vxid); + + /* + * Advertise it in the proc array. We assume assignment of + * localTransactionId is atomic, and the backendId should be set already. 
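The xact_is_sampled computation in StartTransaction above is a three-way decision: a log_xact_sample_rate of 0 never samples, 1 always samples, and anything in between compares one uniform random draw against the rate. The same decision in standalone form, using rand() rather than PostgreSQL's PRNG:

#include <stdbool.h>
#include <stdlib.h>

static bool
demo_sample_this_xact(double rate)
{
    if (rate == 0)
        return false;           /* never log */
    if (rate == 1)
        return true;            /* always log, no random draw needed */

    /* Uniform draw in [0,1]; log the transaction with probability "rate". */
    return ((double) rand() / RAND_MAX) <= rate;
}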
+ */ + Assert(MyProc->backendId == vxid.backendId); + MyProc->lxid = vxid.localTransactionId; + + TRACE_POSTGRESQL_TRANSACTION_START(vxid.localTransactionId); + + /* + * set transaction_timestamp() (a/k/a now()). Normally, we want this to + * be the same as the first command's statement_timestamp(), so don't do a + * fresh GetCurrentTimestamp() call (which'd be expensive anyway). But + * for transactions started inside procedures (i.e., nonatomic SPI + * contexts), we do need to advance the timestamp. Also, in a parallel + * worker, the timestamp should already have been provided by a call to + * SetParallelStartTimestamps(). + */ + if (!IsParallelWorker()) + { + if (!SPI_inside_nonatomic_context()) + xactStartTimestamp = stmtStartTimestamp; + else + xactStartTimestamp = GetCurrentTimestamp(); + } + else + Assert(xactStartTimestamp != 0); + pgstat_report_xact_timestamp(xactStartTimestamp); + /* Mark xactStopTimestamp as unset. */ + xactStopTimestamp = 0; + + /* + * initialize other subsystems for new transaction + */ + AtStart_GUC(); + AtStart_Cache(); + AfterTriggerBeginXact(); + + /* + * done with start processing, set current transaction state to "in + * progress" + */ + s->state = TRANS_INPROGRESS; + + ShowTransactionState("StartTransaction"); +} + + +/* + * CommitTransaction + * + * NB: if you change this routine, better look at PrepareTransaction too! + */ +static void +CommitTransaction(void) +{ + TransactionState s = CurrentTransactionState; + TransactionId latestXid; + bool is_parallel_worker; + + is_parallel_worker = (s->blockState == TBLOCK_PARALLEL_INPROGRESS); + + /* Enforce parallel mode restrictions during parallel worker commit. */ + if (is_parallel_worker) + EnterParallelMode(); + + ShowTransactionState("CommitTransaction"); + + /* + * check the current transaction state + */ + if (s->state != TRANS_INPROGRESS) + elog(WARNING, "CommitTransaction while in %s state", + TransStateAsString(s->state)); + Assert(s->parent == NULL); + + /* + * Do pre-commit processing that involves calling user-defined code, such + * as triggers. SECURITY_RESTRICTED_OPERATION contexts must not queue an + * action that would run here, because that would bypass the sandbox. + * Since closing cursors could queue trigger actions, triggers could open + * cursors, etc, we have to keep looping until there's nothing left to do. + */ + for (;;) + { + /* + * Fire all currently pending deferred triggers. + */ + AfterTriggerFireDeferred(); + + /* + * Close open portals (converting holdable ones into static portals). + * If there weren't any, we are done ... otherwise loop back to check + * if they queued deferred triggers. Lather, rinse, repeat. + */ + if (!PreCommit_Portals(false)) + break; + } + + /* + * The remaining actions cannot call any user-defined code, so it's safe + * to start shutting down within-transaction services. But note that most + * of this stuff could still throw an error, which would switch us into + * the transaction-abort path. + */ + + CallXactCallbacks(is_parallel_worker ? XACT_EVENT_PARALLEL_PRE_COMMIT + : XACT_EVENT_PRE_COMMIT); + + /* If we might have parallel workers, clean them up now. */ + if (IsInParallelMode()) + AtEOXact_Parallel(true); + + /* Shut down the deferred-trigger manager */ + AfterTriggerEndXact(true); + + /* + * Let ON COMMIT management do its thing (must happen after closing + * cursors, to avoid dangling-reference problems) + */ + PreCommit_on_commit_actions(); + + /* + * Synchronize files that are created and not WAL-logged during this + * transaction. 
This must happen before AtEOXact_RelationMap(), so that we + * don't see committed-but-broken files after a crash. + */ + smgrDoPendingSyncs(true, is_parallel_worker); + + /* close large objects before lower-level cleanup */ + AtEOXact_LargeObject(true); + + /* + * Insert notifications sent by NOTIFY commands into the queue. This + * should be late in the pre-commit sequence to minimize time spent + * holding the notify-insertion lock. However, this could result in + * creating a snapshot, so we must do it before serializable cleanup. + */ + PreCommit_Notify(); + + /* + * Mark serializable transaction as complete for predicate locking + * purposes. This should be done as late as we can put it and still allow + * errors to be raised for failure patterns found at commit. This is not + * appropriate in a parallel worker however, because we aren't committing + * the leader's transaction and its serializable state will live on. + */ + if (!is_parallel_worker) + PreCommit_CheckForSerializationFailure(); + + /* Prevent cancel/die interrupt while cleaning up */ + HOLD_INTERRUPTS(); + + /* Commit updates to the relation map --- do this as late as possible */ + AtEOXact_RelationMap(true, is_parallel_worker); + + /* + * set the current transaction state information appropriately during + * commit processing + */ + s->state = TRANS_COMMIT; + s->parallelModeLevel = 0; + + if (!is_parallel_worker) + { + /* + * We need to mark our XIDs as committed in pg_xact. This is where we + * durably commit. + */ + latestXid = RecordTransactionCommit(); + } + else + { + /* + * We must not mark our XID committed; the parallel leader is + * responsible for that. + */ + latestXid = InvalidTransactionId; + + /* + * Make sure the leader will know about any WAL we wrote before it + * commits. + */ + ParallelWorkerReportLastRecEnd(XactLastRecEnd); + } + + TRACE_POSTGRESQL_TRANSACTION_COMMIT(MyProc->lxid); + + /* + * Let others know about no transaction in progress by me. Note that this + * must be done _before_ releasing locks we hold and _after_ + * RecordTransactionCommit. + */ + ProcArrayEndTransaction(MyProc, latestXid); + + /* + * This is all post-commit cleanup. Note that if an error is raised here, + * it's too late to abort the transaction. This should be just + * noncritical resource releasing. + * + * The ordering of operations is not entirely random. The idea is: + * release resources visible to other backends (eg, files, buffer pins); + * then release locks; then release backend-local resources. We want to + * release locks at the point where any backend waiting for us will see + * our transaction as being fully cleaned up. + * + * Resources that can be associated with individual queries are handled by + * the ResourceOwner mechanism. The other calls here are for backend-wide + * state. + */ + + CallXactCallbacks(is_parallel_worker ? XACT_EVENT_PARALLEL_COMMIT + : XACT_EVENT_COMMIT); + + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_BEFORE_LOCKS, + true, true); + + /* Check we've released all buffer pins */ + AtEOXact_Buffers(true); + + /* Clean up the relation cache */ + AtEOXact_RelationCache(true); + + /* + * Make catalog changes visible to all backends. This has to happen after + * relcache references are dropped (see comments for + * AtEOXact_RelationCache), but before locks are released (if anyone is + * waiting for lock on a relation we've modified, we want them to know + * about the catalog change before they start using the relation). 
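Back near the top of CommitTransaction, deferred triggers and open portals are processed in a loop because each pass can create work for the other: firing triggers may open portals, and closing portals may queue more triggers. The control flow is a simple fixpoint loop, sketched standalone below with hypothetical step functions:

#include <stdbool.h>

extern void demo_fire_pending_triggers(void);   /* hypothetical */
extern bool demo_close_open_portals(void);      /* hypothetical: true if any were closed */

static void
demo_precommit_fixpoint(void)
{
    for (;;)
    {
        demo_fire_pending_triggers();

        /* Stop once a pass closes nothing, i.e. no new work can have been queued. */
        if (!demo_close_open_portals())
            break;
    }
}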
+ */ + AtEOXact_Inval(true); + + AtEOXact_MultiXact(); + + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_LOCKS, + true, true); + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_AFTER_LOCKS, + true, true); + + /* + * Likewise, dropping of files deleted during the transaction is best done + * after releasing relcache and buffer pins. (This is not strictly + * necessary during commit, since such pins should have been released + * already, but this ordering is definitely critical during abort.) Since + * this may take many seconds, also delay until after releasing locks. + * Other backends will observe the attendant catalog changes and not + * attempt to access affected files. + */ + smgrDoPendingDeletes(true); + + /* + * Send out notification signals to other backends (and do other + * post-commit NOTIFY cleanup). This must not happen until after our + * transaction is fully done from the viewpoint of other backends. + */ + AtCommit_Notify(); + + /* + * Everything after this should be purely internal-to-this-backend + * cleanup. + */ + AtEOXact_GUC(true, 1); + AtEOXact_SPI(true); + AtEOXact_Enum(); + AtEOXact_on_commit_actions(true); + AtEOXact_Namespace(true, is_parallel_worker); + AtEOXact_SMgr(); + AtEOXact_Files(true); + AtEOXact_ComboCid(); + AtEOXact_HashTables(true); + AtEOXact_PgStat(true, is_parallel_worker); + AtEOXact_Snapshot(true, false); + AtEOXact_ApplyLauncher(true); + pgstat_report_xact_timestamp(0); + + CurrentResourceOwner = NULL; + ResourceOwnerDelete(TopTransactionResourceOwner); + s->curTransactionOwner = NULL; + CurTransactionResourceOwner = NULL; + TopTransactionResourceOwner = NULL; + + AtCommit_Memory(); + + s->fullTransactionId = InvalidFullTransactionId; + s->subTransactionId = InvalidSubTransactionId; + s->nestingLevel = 0; + s->gucNestLevel = 0; + s->childXids = NULL; + s->nChildXids = 0; + s->maxChildXids = 0; + + XactTopFullTransactionId = InvalidFullTransactionId; + nParallelCurrentXids = 0; + + /* + * done with commit processing, set current transaction state back to + * default + */ + s->state = TRANS_DEFAULT; + + RESUME_INTERRUPTS(); +} + + +/* + * PrepareTransaction + * + * NB: if you change this routine, better look at CommitTransaction too! + */ +static void +PrepareTransaction(void) +{ + TransactionState s = CurrentTransactionState; + TransactionId xid = GetCurrentTransactionId(); + GlobalTransaction gxact; + TimestampTz prepared_at; + + Assert(!IsInParallelMode()); + + ShowTransactionState("PrepareTransaction"); + + /* + * check the current transaction state + */ + if (s->state != TRANS_INPROGRESS) + elog(WARNING, "PrepareTransaction while in %s state", + TransStateAsString(s->state)); + Assert(s->parent == NULL); + + /* + * Do pre-commit processing that involves calling user-defined code, such + * as triggers. Since closing cursors could queue trigger actions, + * triggers could open cursors, etc, we have to keep looping until there's + * nothing left to do. + */ + for (;;) + { + /* + * Fire all currently pending deferred triggers. + */ + AfterTriggerFireDeferred(); + + /* + * Close open portals (converting holdable ones into static portals). + * If there weren't any, we are done ... otherwise loop back to check + * if they queued deferred triggers. Lather, rinse, repeat. + */ + if (!PreCommit_Portals(true)) + break; + } + + CallXactCallbacks(XACT_EVENT_PRE_PREPARE); + + /* + * The remaining actions cannot call any user-defined code, so it's safe + * to start shutting down within-transaction services. 
But note that most + * of this stuff could still throw an error, which would switch us into + * the transaction-abort path. + */ + + /* Shut down the deferred-trigger manager */ + AfterTriggerEndXact(true); + + /* + * Let ON COMMIT management do its thing (must happen after closing + * cursors, to avoid dangling-reference problems) + */ + PreCommit_on_commit_actions(); + + /* + * Synchronize files that are created and not WAL-logged during this + * transaction. This must happen before EndPrepare(), so that we don't see + * committed-but-broken files after a crash and COMMIT PREPARED. + */ + smgrDoPendingSyncs(true, false); + + /* close large objects before lower-level cleanup */ + AtEOXact_LargeObject(true); + + /* NOTIFY requires no work at this point */ + + /* + * Mark serializable transaction as complete for predicate locking + * purposes. This should be done as late as we can put it and still allow + * errors to be raised for failure patterns found at commit. + */ + PreCommit_CheckForSerializationFailure(); + + /* + * Don't allow PREPARE TRANSACTION if we've accessed a temporary table in + * this transaction. Having the prepared xact hold locks on another + * backend's temp table seems a bad idea --- for instance it would prevent + * the backend from exiting. There are other problems too, such as how to + * clean up the source backend's local buffers and ON COMMIT state if the + * prepared xact includes a DROP of a temp table. + * + * Other objects types, like functions, operators or extensions, share the + * same restriction as they should not be created, locked or dropped as + * this can mess up with this session or even a follow-up session trying + * to use the same temporary namespace. + * + * We must check this after executing any ON COMMIT actions, because they + * might still access a temp relation. + * + * XXX In principle this could be relaxed to allow some useful special + * cases, such as a temp table created and dropped all within the + * transaction. That seems to require much more bookkeeping though. + */ + if ((MyXactFlags & XACT_FLAGS_ACCESSEDTEMPNAMESPACE)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot PREPARE a transaction that has operated on temporary objects"))); + + /* + * Likewise, don't allow PREPARE after pg_export_snapshot. This could be + * supported if we added cleanup logic to twophase.c, but for now it + * doesn't seem worth the trouble. + */ + if (XactHasExportedSnapshots()) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot PREPARE a transaction that has exported snapshots"))); + + /* Prevent cancel/die interrupt while cleaning up */ + HOLD_INTERRUPTS(); + + /* + * set the current transaction state information appropriately during + * prepare processing + */ + s->state = TRANS_PREPARE; + + prepared_at = GetCurrentTimestamp(); + + /* Tell bufmgr and smgr to prepare for commit */ + BufmgrCommit(); + + /* + * Reserve the GID for this transaction. This could fail if the requested + * GID is invalid or already in use. + */ + gxact = MarkAsPreparing(xid, prepareGID, prepared_at, + GetUserId(), MyDatabaseId); + prepareGID = NULL; + + /* + * Collect data for the 2PC state file. Note that in general, no actual + * state change should happen in the called modules during this step, + * since it's still possible to fail before commit, and in that case we + * want transaction abort to be able to clean up. (In particular, the + * AtPrepare routines may error out if they find cases they cannot + * handle.) 
State cleanup should happen in the PostPrepare routines + * below. However, some modules can go ahead and clear state here because + * they wouldn't do anything with it during abort anyway. + * + * Note: because the 2PC state file records will be replayed in the same + * order they are made, the order of these calls has to match the order in + * which we want things to happen during COMMIT PREPARED or ROLLBACK + * PREPARED; in particular, pay attention to whether things should happen + * before or after releasing the transaction's locks. + */ + StartPrepare(gxact); + + AtPrepare_Notify(); + AtPrepare_Locks(); + AtPrepare_PredicateLocks(); + AtPrepare_PgStat(); + AtPrepare_MultiXact(); + AtPrepare_RelationMap(); + + /* + * Here is where we really truly prepare. + * + * We have to record transaction prepares even if we didn't make any + * updates, because the transaction manager might get confused if we lose + * a global transaction. + */ + EndPrepare(gxact); + + /* + * Now we clean up backend-internal state and release internal resources. + */ + + /* Reset XactLastRecEnd until the next transaction writes something */ + XactLastRecEnd = 0; + + /* + * Transfer our locks to a dummy PGPROC. This has to be done before + * ProcArrayClearTransaction(). Otherwise, a GetLockConflicts() would + * conclude "xact already committed or aborted" for our locks. + */ + PostPrepare_Locks(xid); + + /* + * Let others know about no transaction in progress by me. This has to be + * done *after* the prepared transaction has been marked valid, else + * someone may think it is unlocked and recyclable. + */ + ProcArrayClearTransaction(MyProc); + + /* + * In normal commit-processing, this is all non-critical post-transaction + * cleanup. When the transaction is prepared, however, it's important + * that the locks and other per-backend resources are transferred to the + * prepared transaction's PGPROC entry. Note that if an error is raised + * here, it's too late to abort the transaction. XXX: This probably should + * be in a critical section, to force a PANIC if any of this fails, but + * that cure could be worse than the disease. + */ + + CallXactCallbacks(XACT_EVENT_PREPARE); + + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_BEFORE_LOCKS, + true, true); + + /* Check we've released all buffer pins */ + AtEOXact_Buffers(true); + + /* Clean up the relation cache */ + AtEOXact_RelationCache(true); + + /* notify doesn't need a postprepare call */ + + PostPrepare_PgStat(); + + PostPrepare_Inval(); + + PostPrepare_smgr(); + + PostPrepare_MultiXact(xid); + + PostPrepare_PredicateLocks(xid); + + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_LOCKS, + true, true); + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_AFTER_LOCKS, + true, true); + + /* + * Allow another backend to finish the transaction. After + * PostPrepare_Twophase(), the transaction is completely detached from our + * backend. The rest is just non-critical cleanup of backend-local state. 
+ */ + PostPrepare_Twophase(); + + /* PREPARE acts the same as COMMIT as far as GUC is concerned */ + AtEOXact_GUC(true, 1); + AtEOXact_SPI(true); + AtEOXact_Enum(); + AtEOXact_on_commit_actions(true); + AtEOXact_Namespace(true, false); + AtEOXact_SMgr(); + AtEOXact_Files(true); + AtEOXact_ComboCid(); + AtEOXact_HashTables(true); + /* don't call AtEOXact_PgStat here; we fixed pgstat state above */ + AtEOXact_Snapshot(true, true); + pgstat_report_xact_timestamp(0); + + CurrentResourceOwner = NULL; + ResourceOwnerDelete(TopTransactionResourceOwner); + s->curTransactionOwner = NULL; + CurTransactionResourceOwner = NULL; + TopTransactionResourceOwner = NULL; + + AtCommit_Memory(); + + s->fullTransactionId = InvalidFullTransactionId; + s->subTransactionId = InvalidSubTransactionId; + s->nestingLevel = 0; + s->gucNestLevel = 0; + s->childXids = NULL; + s->nChildXids = 0; + s->maxChildXids = 0; + + XactTopFullTransactionId = InvalidFullTransactionId; + nParallelCurrentXids = 0; + + /* + * done with 1st phase commit processing, set current transaction state + * back to default + */ + s->state = TRANS_DEFAULT; + + RESUME_INTERRUPTS(); +} + + +/* + * AbortTransaction + */ +static void +AbortTransaction(void) +{ + TransactionState s = CurrentTransactionState; + TransactionId latestXid; + bool is_parallel_worker; + + /* Prevent cancel/die interrupt while cleaning up */ + HOLD_INTERRUPTS(); + + /* Make sure we have a valid memory context and resource owner */ + AtAbort_Memory(); + AtAbort_ResourceOwner(); + + /* + * Release any LW locks we might be holding as quickly as possible. + * (Regular locks, however, must be held till we finish aborting.) + * Releasing LW locks is critical since we might try to grab them again + * while cleaning up! + */ + LWLockReleaseAll(); + + /* Clear wait information and command progress indicator */ + pgstat_report_wait_end(); + pgstat_progress_end_command(); + + /* Clean up buffer I/O and buffer context locks, too */ + AbortBufferIO(); + UnlockBuffers(); + + /* Reset WAL record construction state */ + XLogResetInsertion(); + + /* Cancel condition variable sleep */ + ConditionVariableCancelSleep(); + + /* + * Also clean up any open wait for lock, since the lock manager will choke + * if we try to wait for another lock before doing this. + */ + LockErrorCleanup(); + + /* + * If any timeout events are still active, make sure the timeout interrupt + * is scheduled. This covers possible loss of a timeout interrupt due to + * longjmp'ing out of the SIGINT handler (see notes in handle_sig_alarm). + * We delay this till after LockErrorCleanup so that we don't uselessly + * reschedule lock or deadlock check timeouts. + */ + reschedule_timeouts(); + + /* + * Re-enable signals, in case we got here by longjmp'ing out of a signal + * handler. We do this fairly early in the sequence so that the timeout + * infrastructure will be functional if needed while aborting. + */ + PG_SETMASK(&UnBlockSig); + + /* + * check the current transaction state + */ + is_parallel_worker = (s->blockState == TBLOCK_PARALLEL_INPROGRESS); + if (s->state != TRANS_INPROGRESS && s->state != TRANS_PREPARE) + elog(WARNING, "AbortTransaction while in %s state", + TransStateAsString(s->state)); + Assert(s->parent == NULL); + + /* + * set the current transaction state information appropriately during the + * abort processing + */ + s->state = TRANS_ABORT; + + /* + * Reset user ID which might have been changed transiently. 
We need this + * to clean up in case control escaped out of a SECURITY DEFINER function + * or other local change of CurrentUserId; therefore, the prior value of + * SecurityRestrictionContext also needs to be restored. + * + * (Note: it is not necessary to restore session authorization or role + * settings here because those can only be changed via GUC, and GUC will + * take care of rolling them back if need be.) + */ + SetUserIdAndSecContext(s->prevUser, s->prevSecContext); + + /* Forget about any active REINDEX. */ + ResetReindexState(s->nestingLevel); + + /* Reset logical streaming state. */ + ResetLogicalStreamingState(); + + /* Reset snapshot export state. */ + SnapBuildResetExportedSnapshotState(); + + /* If in parallel mode, clean up workers and exit parallel mode. */ + if (IsInParallelMode()) + { + AtEOXact_Parallel(false); + s->parallelModeLevel = 0; + } + + /* + * do abort processing + */ + AfterTriggerEndXact(false); /* 'false' means it's abort */ + AtAbort_Portals(); + smgrDoPendingSyncs(false, is_parallel_worker); + AtEOXact_LargeObject(false); + AtAbort_Notify(); + AtEOXact_RelationMap(false, is_parallel_worker); + AtAbort_Twophase(); + + /* + * Advertise the fact that we aborted in pg_xact (assuming that we got as + * far as assigning an XID to advertise). But if we're inside a parallel + * worker, skip this; the user backend must be the one to write the abort + * record. + */ + if (!is_parallel_worker) + latestXid = RecordTransactionAbort(false); + else + { + latestXid = InvalidTransactionId; + + /* + * Since the parallel leader won't get our value of XactLastRecEnd in + * this case, we nudge WAL-writer ourselves in this case. See related + * comments in RecordTransactionAbort for why this matters. + */ + XLogSetAsyncXactLSN(XactLastRecEnd); + } + + TRACE_POSTGRESQL_TRANSACTION_ABORT(MyProc->lxid); + + /* + * Let others know about no transaction in progress by me. Note that this + * must be done _before_ releasing locks we hold and _after_ + * RecordTransactionAbort. + */ + ProcArrayEndTransaction(MyProc, latestXid); + + /* + * Post-abort cleanup. See notes in CommitTransaction() concerning + * ordering. We can skip all of it if the transaction failed before + * creating a resource owner. + */ + if (TopTransactionResourceOwner != NULL) + { + if (is_parallel_worker) + CallXactCallbacks(XACT_EVENT_PARALLEL_ABORT); + else + CallXactCallbacks(XACT_EVENT_ABORT); + + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_BEFORE_LOCKS, + false, true); + AtEOXact_Buffers(false); + AtEOXact_RelationCache(false); + AtEOXact_Inval(false); + AtEOXact_MultiXact(); + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_LOCKS, + false, true); + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_AFTER_LOCKS, + false, true); + smgrDoPendingDeletes(false); + + AtEOXact_GUC(false, 1); + AtEOXact_SPI(false); + AtEOXact_Enum(); + AtEOXact_on_commit_actions(false); + AtEOXact_Namespace(false, is_parallel_worker); + AtEOXact_SMgr(); + AtEOXact_Files(false); + AtEOXact_ComboCid(); + AtEOXact_HashTables(false); + AtEOXact_PgStat(false, is_parallel_worker); + AtEOXact_ApplyLauncher(false); + pgstat_report_xact_timestamp(0); + } + + /* + * State remains TRANS_ABORT until CleanupTransaction(). + */ + RESUME_INTERRUPTS(); +} + +/* + * CleanupTransaction + */ +static void +CleanupTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * State should still be TRANS_ABORT from AbortTransaction(). 
+ */ + if (s->state != TRANS_ABORT) + elog(FATAL, "CleanupTransaction: unexpected state %s", + TransStateAsString(s->state)); + + /* + * do abort cleanup processing + */ + AtCleanup_Portals(); /* now safe to release portal memory */ + AtEOXact_Snapshot(false, true); /* and release the transaction's snapshots */ + + CurrentResourceOwner = NULL; /* and resource owner */ + if (TopTransactionResourceOwner) + ResourceOwnerDelete(TopTransactionResourceOwner); + s->curTransactionOwner = NULL; + CurTransactionResourceOwner = NULL; + TopTransactionResourceOwner = NULL; + + AtCleanup_Memory(); /* and transaction memory */ + + s->fullTransactionId = InvalidFullTransactionId; + s->subTransactionId = InvalidSubTransactionId; + s->nestingLevel = 0; + s->gucNestLevel = 0; + s->childXids = NULL; + s->nChildXids = 0; + s->maxChildXids = 0; + s->parallelModeLevel = 0; + + XactTopFullTransactionId = InvalidFullTransactionId; + nParallelCurrentXids = 0; + + /* + * done with abort processing, set current transaction state back to + * default + */ + s->state = TRANS_DEFAULT; +} + +/* + * StartTransactionCommand + */ +void +StartTransactionCommand(void) +{ + TransactionState s = CurrentTransactionState; + + switch (s->blockState) + { + /* + * if we aren't in a transaction block, we just do our usual start + * transaction. + */ + case TBLOCK_DEFAULT: + StartTransaction(); + s->blockState = TBLOCK_STARTED; + break; + + /* + * We are somewhere in a transaction block or subtransaction and + * about to start a new command. For now we do nothing, but + * someday we may do command-local resource initialization. (Note + * that any needed CommandCounterIncrement was done by the + * previous CommitTransactionCommand.) + */ + case TBLOCK_INPROGRESS: + case TBLOCK_IMPLICIT_INPROGRESS: + case TBLOCK_SUBINPROGRESS: + break; + + /* + * Here we are in a failed transaction block (one of the commands + * caused an abort) so we do nothing but remain in the abort + * state. Eventually we will get a ROLLBACK command which will + * get us out of this state. (It is up to other code to ensure + * that no commands other than ROLLBACK will be processed in these + * states.) + */ + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + break; + + /* These cases are invalid. */ + case TBLOCK_STARTED: + case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(ERROR, "StartTransactionCommand: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + /* + * We must switch to CurTransactionContext before returning. This is + * already done if we called StartTransaction, otherwise not. + */ + Assert(CurTransactionContext != NULL); + MemoryContextSwitchTo(CurTransactionContext); +} + + +/* + * Simple system for saving and restoring transaction characteristics + * (isolation level, read only, deferrable). We need this for transaction + * chaining, so that we can set the characteristics of the new transaction to + * be the same as the previous one. (We need something like this because the + * GUC system resets the characteristics at transaction end, so for example + * just skipping the reset in StartTransaction() won't work.) 
+ */ +void +SaveTransactionCharacteristics(SavedTransactionCharacteristics *s) +{ + s->save_XactIsoLevel = XactIsoLevel; + s->save_XactReadOnly = XactReadOnly; + s->save_XactDeferrable = XactDeferrable; +} + +void +RestoreTransactionCharacteristics(const SavedTransactionCharacteristics *s) +{ + XactIsoLevel = s->save_XactIsoLevel; + XactReadOnly = s->save_XactReadOnly; + XactDeferrable = s->save_XactDeferrable; +} + + +/* + * CommitTransactionCommand + */ +void +CommitTransactionCommand(void) +{ + TransactionState s = CurrentTransactionState; + SavedTransactionCharacteristics savetc; + + /* Must save in case we need to restore below */ + SaveTransactionCharacteristics(&savetc); + + switch (s->blockState) + { + /* + * These shouldn't happen. TBLOCK_DEFAULT means the previous + * StartTransactionCommand didn't set the STARTED state + * appropriately, while TBLOCK_PARALLEL_INPROGRESS should be ended + * by EndParallelWorkerTransaction(), not this function. + */ + case TBLOCK_DEFAULT: + case TBLOCK_PARALLEL_INPROGRESS: + elog(FATAL, "CommitTransactionCommand: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + + /* + * If we aren't in a transaction block, just do our usual + * transaction commit, and return to the idle state. + */ + case TBLOCK_STARTED: + CommitTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * We are completing a "BEGIN TRANSACTION" command, so we change + * to the "transaction block in progress" state and return. (We + * assume the BEGIN did nothing to the database, so we need no + * CommandCounterIncrement.) + */ + case TBLOCK_BEGIN: + s->blockState = TBLOCK_INPROGRESS; + break; + + /* + * This is the case when we have finished executing a command + * someplace within a transaction block. We increment the command + * counter and return. + */ + case TBLOCK_INPROGRESS: + case TBLOCK_IMPLICIT_INPROGRESS: + case TBLOCK_SUBINPROGRESS: + CommandCounterIncrement(); + break; + + /* + * We are completing a "COMMIT" command. Do it and return to the + * idle state. + */ + case TBLOCK_END: + CommitTransaction(); + s->blockState = TBLOCK_DEFAULT; + if (s->chain) + { + StartTransaction(); + s->blockState = TBLOCK_INPROGRESS; + s->chain = false; + RestoreTransactionCharacteristics(&savetc); + } + break; + + /* + * Here we are in the middle of a transaction block but one of the + * commands caused an abort so we do nothing but remain in the + * abort state. Eventually we will get a ROLLBACK command. + */ + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + break; + + /* + * Here we were in an aborted transaction block and we just got + * the ROLLBACK command from the user, so clean up the + * already-aborted transaction and return to the idle state. + */ + case TBLOCK_ABORT_END: + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + if (s->chain) + { + StartTransaction(); + s->blockState = TBLOCK_INPROGRESS; + s->chain = false; + RestoreTransactionCharacteristics(&savetc); + } + break; + + /* + * Here we were in a perfectly good transaction block but the user + * told us to ROLLBACK anyway. We have to abort the transaction + * and then clean up. + */ + case TBLOCK_ABORT_PENDING: + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + if (s->chain) + { + StartTransaction(); + s->blockState = TBLOCK_INPROGRESS; + s->chain = false; + RestoreTransactionCharacteristics(&savetc); + } + break; + + /* + * We are completing a "PREPARE TRANSACTION" command. Do it and + * return to the idle state. 
+ */ + case TBLOCK_PREPARE: + PrepareTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * We were just issued a SAVEPOINT inside a transaction block. + * Start a subtransaction. (DefineSavepoint already did + * PushTransaction, so as to have someplace to put the SUBBEGIN + * state.) + */ + case TBLOCK_SUBBEGIN: + StartSubTransaction(); + s->blockState = TBLOCK_SUBINPROGRESS; + break; + + /* + * We were issued a RELEASE command, so we end the current + * subtransaction and return to the parent transaction. The parent + * might be ended too, so repeat till we find an INPROGRESS + * transaction or subtransaction. + */ + case TBLOCK_SUBRELEASE: + do + { + CommitSubTransaction(); + s = CurrentTransactionState; /* changed by pop */ + } while (s->blockState == TBLOCK_SUBRELEASE); + + Assert(s->blockState == TBLOCK_INPROGRESS || + s->blockState == TBLOCK_SUBINPROGRESS); + break; + + /* + * We were issued a COMMIT, so we end the current subtransaction + * hierarchy and perform final commit. We do this by rolling up + * any subtransactions into their parent, which leads to O(N^2) + * operations with respect to resource owners - this isn't that + * bad until we approach a thousands of savepoints but is + * necessary for correctness should after triggers create new + * resource owners. + */ + case TBLOCK_SUBCOMMIT: + do + { + CommitSubTransaction(); + s = CurrentTransactionState; /* changed by pop */ + } while (s->blockState == TBLOCK_SUBCOMMIT); + /* If we had a COMMIT command, finish off the main xact too */ + if (s->blockState == TBLOCK_END) + { + Assert(s->parent == NULL); + CommitTransaction(); + s->blockState = TBLOCK_DEFAULT; + if (s->chain) + { + StartTransaction(); + s->blockState = TBLOCK_INPROGRESS; + s->chain = false; + RestoreTransactionCharacteristics(&savetc); + } + } + else if (s->blockState == TBLOCK_PREPARE) + { + Assert(s->parent == NULL); + PrepareTransaction(); + s->blockState = TBLOCK_DEFAULT; + } + else + elog(ERROR, "CommitTransactionCommand: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + + /* + * The current already-failed subtransaction is ending due to a + * ROLLBACK or ROLLBACK TO command, so pop it and recursively + * examine the parent (which could be in any of several states). + */ + case TBLOCK_SUBABORT_END: + CleanupSubTransaction(); + CommitTransactionCommand(); + break; + + /* + * As above, but it's not dead yet, so abort first. + */ + case TBLOCK_SUBABORT_PENDING: + AbortSubTransaction(); + CleanupSubTransaction(); + CommitTransactionCommand(); + break; + + /* + * The current subtransaction is the target of a ROLLBACK TO + * command. Abort and pop it, then start a new subtransaction + * with the same name. + */ + case TBLOCK_SUBRESTART: + { + char *name; + int savepointLevel; + + /* save name and keep Cleanup from freeing it */ + name = s->name; + s->name = NULL; + savepointLevel = s->savepointLevel; + + AbortSubTransaction(); + CleanupSubTransaction(); + + DefineSavepoint(NULL); + s = CurrentTransactionState; /* changed by push */ + s->name = name; + s->savepointLevel = savepointLevel; + + /* This is the same as TBLOCK_SUBBEGIN case */ + AssertState(s->blockState == TBLOCK_SUBBEGIN); + StartSubTransaction(); + s->blockState = TBLOCK_SUBINPROGRESS; + } + break; + + /* + * Same as above, but the subtransaction had already failed, so we + * don't need AbortSubTransaction. 
+ */ + case TBLOCK_SUBABORT_RESTART: + { + char *name; + int savepointLevel; + + /* save name and keep Cleanup from freeing it */ + name = s->name; + s->name = NULL; + savepointLevel = s->savepointLevel; + + CleanupSubTransaction(); + + DefineSavepoint(NULL); + s = CurrentTransactionState; /* changed by push */ + s->name = name; + s->savepointLevel = savepointLevel; + + /* This is the same as TBLOCK_SUBBEGIN case */ + AssertState(s->blockState == TBLOCK_SUBBEGIN); + StartSubTransaction(); + s->blockState = TBLOCK_SUBINPROGRESS; + } + break; + } +} + +/* + * AbortCurrentTransaction + */ +void +AbortCurrentTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + switch (s->blockState) + { + case TBLOCK_DEFAULT: + if (s->state == TRANS_DEFAULT) + { + /* we are idle, so nothing to do */ + } + else + { + /* + * We can get here after an error during transaction start + * (state will be TRANS_START). Need to clean up the + * incompletely started transaction. First, adjust the + * low-level state to suppress warning message from + * AbortTransaction. + */ + if (s->state == TRANS_START) + s->state = TRANS_INPROGRESS; + AbortTransaction(); + CleanupTransaction(); + } + break; + + /* + * If we aren't in a transaction block, we just do the basic abort + * & cleanup transaction. For this purpose, we treat an implicit + * transaction block as if it were a simple statement. + */ + case TBLOCK_STARTED: + case TBLOCK_IMPLICIT_INPROGRESS: + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * If we are in TBLOCK_BEGIN it means something screwed up right + * after reading "BEGIN TRANSACTION". We assume that the user + * will interpret the error as meaning the BEGIN failed to get him + * into a transaction block, so we should abort and return to idle + * state. + */ + case TBLOCK_BEGIN: + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * We are somewhere in a transaction block and we've gotten a + * failure, so we abort the transaction and set up the persistent + * ABORT state. We will stay in ABORT until we get a ROLLBACK. + */ + case TBLOCK_INPROGRESS: + case TBLOCK_PARALLEL_INPROGRESS: + AbortTransaction(); + s->blockState = TBLOCK_ABORT; + /* CleanupTransaction happens when we exit TBLOCK_ABORT_END */ + break; + + /* + * Here, we failed while trying to COMMIT. Clean up the + * transaction and return to idle state (we do not want to stay in + * the transaction). + */ + case TBLOCK_END: + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * Here, we are already in an aborted transaction state and are + * waiting for a ROLLBACK, but for some reason we failed again! So + * we just remain in the abort state. + */ + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + break; + + /* + * We are in a failed transaction and we got the ROLLBACK command. + * We have already aborted, we just need to cleanup and go to idle + * state. + */ + case TBLOCK_ABORT_END: + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * We are in a live transaction and we got a ROLLBACK command. + * Abort, cleanup, go to idle state. + */ + case TBLOCK_ABORT_PENDING: + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * Here, we failed while trying to PREPARE. Clean up the + * transaction and return to idle state (we do not want to stay in + * the transaction). 
+ */ + case TBLOCK_PREPARE: + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * We got an error inside a subtransaction. Abort just the + * subtransaction, and go to the persistent SUBABORT state until + * we get ROLLBACK. + */ + case TBLOCK_SUBINPROGRESS: + AbortSubTransaction(); + s->blockState = TBLOCK_SUBABORT; + break; + + /* + * If we failed while trying to create a subtransaction, clean up + * the broken subtransaction and abort the parent. The same + * applies if we get a failure while ending a subtransaction. + */ + case TBLOCK_SUBBEGIN: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + AbortSubTransaction(); + CleanupSubTransaction(); + AbortCurrentTransaction(); + break; + + /* + * Same as above, except the Abort() was already done. + */ + case TBLOCK_SUBABORT_END: + case TBLOCK_SUBABORT_RESTART: + CleanupSubTransaction(); + AbortCurrentTransaction(); + break; + } +} + +/* + * PreventInTransactionBlock + * + * This routine is to be called by statements that must not run inside + * a transaction block, typically because they have non-rollback-able + * side effects or do internal commits. + * + * If this routine completes successfully, then the calling statement is + * guaranteed that if it completes without error, its results will be + * committed immediately. + * + * If we have already started a transaction block, issue an error; also issue + * an error if we appear to be running inside a user-defined function (which + * could issue more commands and possibly cause a failure after the statement + * completes). Subtransactions are verboten too. + * + * We must also set XACT_FLAGS_NEEDIMMEDIATECOMMIT in MyXactFlags, to ensure + * that postgres.c follows through by committing after the statement is done. + * + * isTopLevel: passed down from ProcessUtility to determine whether we are + * inside a function. (We will always fail if this is false, but it's + * convenient to centralize the check here instead of making callers do it.) + * stmtType: statement type name, for error messages. + */ +void +PreventInTransactionBlock(bool isTopLevel, const char *stmtType) +{ + /* + * xact block already started? + */ + if (IsTransactionBlock()) + ereport(ERROR, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s cannot run inside a transaction block", + stmtType))); + + /* + * subtransaction? + */ + if (IsSubTransaction()) + ereport(ERROR, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s cannot run inside a subtransaction", + stmtType))); + + /* + * inside a pipeline that has started an implicit transaction? + */ + if (MyXactFlags & XACT_FLAGS_PIPELINING) + ereport(ERROR, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s cannot be executed within a pipeline", + stmtType))); + + /* + * inside a function call? + */ + if (!isTopLevel) + ereport(ERROR, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s cannot be executed from a function", stmtType))); + + /* If we got past IsTransactionBlock test, should be in default state */ + if (CurrentTransactionState->blockState != TBLOCK_DEFAULT && + CurrentTransactionState->blockState != TBLOCK_STARTED) + elog(FATAL, "cannot prevent transaction chain"); + + /* All okay. 
Set the flag to make sure the right thing happens later. */ + MyXactFlags |= XACT_FLAGS_NEEDIMMEDIATECOMMIT; +} + +/* + * WarnNoTransactionBlock + * RequireTransactionBlock + * + * These two functions allow for warnings or errors if a command is executed + * outside of a transaction block. This is useful for commands that have no + * effects that persist past transaction end (and so calling them outside a + * transaction block is presumably an error). DECLARE CURSOR is an example. + * While top-level transaction control commands (BEGIN/COMMIT/ABORT) and SET + * that have no effect issue warnings, all other no-effect commands generate + * errors. + * + * If we appear to be running inside a user-defined function, we do not + * issue anything, since the function could issue more commands that make + * use of the current statement's results. Likewise subtransactions. + * Thus these are inverses for PreventInTransactionBlock. + * + * isTopLevel: passed down from ProcessUtility to determine whether we are + * inside a function. + * stmtType: statement type name, for warning or error messages. + */ +void +WarnNoTransactionBlock(bool isTopLevel, const char *stmtType) +{ + CheckTransactionBlock(isTopLevel, false, stmtType); +} + +void +RequireTransactionBlock(bool isTopLevel, const char *stmtType) +{ + CheckTransactionBlock(isTopLevel, true, stmtType); +} + +/* + * This is the implementation of the above two. + */ +static void +CheckTransactionBlock(bool isTopLevel, bool throwError, const char *stmtType) +{ + /* + * xact block already started? + */ + if (IsTransactionBlock()) + return; + + /* + * subtransaction? + */ + if (IsSubTransaction()) + return; + + /* + * inside a function call? + */ + if (!isTopLevel) + return; + + ereport(throwError ? ERROR : WARNING, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + stmtType))); +} + +/* + * IsInTransactionBlock + * + * This routine is for statements that need to behave differently inside + * a transaction block than when running as single commands. ANALYZE is + * currently the only example. + * + * If this routine returns "false", then the calling statement is allowed + * to perform internal transaction-commit-and-start cycles; there is not a + * risk of messing up any transaction already in progress. (Note that this + * is not the identical guarantee provided by PreventInTransactionBlock, + * since we will not force a post-statement commit.) + * + * isTopLevel: passed down from ProcessUtility to determine whether we are + * inside a function. + */ +bool +IsInTransactionBlock(bool isTopLevel) +{ + /* + * Return true on same conditions that would make + * PreventInTransactionBlock error out + */ + if (IsTransactionBlock()) + return true; + + if (IsSubTransaction()) + return true; + + if (MyXactFlags & XACT_FLAGS_PIPELINING) + return true; + + if (!isTopLevel) + return true; + + if (CurrentTransactionState->blockState != TBLOCK_DEFAULT && + CurrentTransactionState->blockState != TBLOCK_STARTED) + return true; + + return false; +} + + +/* + * Register or deregister callback functions for start- and end-of-xact + * operations. + * + * These functions are intended for use by dynamically loaded modules. + * For built-in modules we generally just hardwire the appropriate calls + * (mainly because it's easier to control the order that way, where needed). 
+ * + * At transaction end, the callback occurs post-commit or post-abort, so the + * callback functions can only do noncritical cleanup. + */ +void +RegisterXactCallback(XactCallback callback, void *arg) +{ + XactCallbackItem *item; + + item = (XactCallbackItem *) + MemoryContextAlloc(TopMemoryContext, sizeof(XactCallbackItem)); + item->callback = callback; + item->arg = arg; + item->next = Xact_callbacks; + Xact_callbacks = item; +} + +void +UnregisterXactCallback(XactCallback callback, void *arg) +{ + XactCallbackItem *item; + XactCallbackItem *prev; + + prev = NULL; + for (item = Xact_callbacks; item; prev = item, item = item->next) + { + if (item->callback == callback && item->arg == arg) + { + if (prev) + prev->next = item->next; + else + Xact_callbacks = item->next; + pfree(item); + break; + } + } +} + +static void +CallXactCallbacks(XactEvent event) +{ + XactCallbackItem *item; + + for (item = Xact_callbacks; item; item = item->next) + item->callback(event, item->arg); +} + + +/* + * Register or deregister callback functions for start- and end-of-subxact + * operations. + * + * Pretty much same as above, but for subtransaction events. + * + * At subtransaction end, the callback occurs post-subcommit or post-subabort, + * so the callback functions can only do noncritical cleanup. At + * subtransaction start, the callback is called when the subtransaction has + * finished initializing. + */ +void +RegisterSubXactCallback(SubXactCallback callback, void *arg) +{ + SubXactCallbackItem *item; + + item = (SubXactCallbackItem *) + MemoryContextAlloc(TopMemoryContext, sizeof(SubXactCallbackItem)); + item->callback = callback; + item->arg = arg; + item->next = SubXact_callbacks; + SubXact_callbacks = item; +} + +void +UnregisterSubXactCallback(SubXactCallback callback, void *arg) +{ + SubXactCallbackItem *item; + SubXactCallbackItem *prev; + + prev = NULL; + for (item = SubXact_callbacks; item; prev = item, item = item->next) + { + if (item->callback == callback && item->arg == arg) + { + if (prev) + prev->next = item->next; + else + SubXact_callbacks = item->next; + pfree(item); + break; + } + } +} + +static void +CallSubXactCallbacks(SubXactEvent event, + SubTransactionId mySubid, + SubTransactionId parentSubid) +{ + SubXactCallbackItem *item; + + for (item = SubXact_callbacks; item; item = item->next) + item->callback(event, mySubid, parentSubid, item->arg); +} + + +/* ---------------------------------------------------------------- + * transaction block support + * ---------------------------------------------------------------- + */ + +/* + * BeginTransactionBlock + * This executes a BEGIN command. + */ +void +BeginTransactionBlock(void) +{ + TransactionState s = CurrentTransactionState; + + switch (s->blockState) + { + /* + * We are not inside a transaction block, so allow one to begin. + */ + case TBLOCK_STARTED: + s->blockState = TBLOCK_BEGIN; + break; + + /* + * BEGIN converts an implicit transaction block to a regular one. + * (Note that we allow this even if we've already done some + * commands, which is a bit odd but matches historical practice.) + */ + case TBLOCK_IMPLICIT_INPROGRESS: + s->blockState = TBLOCK_BEGIN; + break; + + /* + * Already a transaction block in progress. + */ + case TBLOCK_INPROGRESS: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBINPROGRESS: + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + ereport(WARNING, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + errmsg("there is already a transaction in progress"))); + break; + + /* These cases are invalid. 
*/ + case TBLOCK_DEFAULT: + case TBLOCK_BEGIN: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "BeginTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } +} + +/* + * PrepareTransactionBlock + * This executes a PREPARE command. + * + * Since PREPARE may actually do a ROLLBACK, the result indicates what + * happened: true for PREPARE, false for ROLLBACK. + * + * Note that we don't actually do anything here except change blockState. + * The real work will be done in the upcoming PrepareTransaction(). + * We do it this way because it's not convenient to change memory context, + * resource owner, etc while executing inside a Portal. + */ +bool +PrepareTransactionBlock(const char *gid) +{ + TransactionState s; + bool result; + + /* Set up to commit the current transaction */ + result = EndTransactionBlock(false); + + /* If successful, change outer tblock state to PREPARE */ + if (result) + { + s = CurrentTransactionState; + + while (s->parent != NULL) + s = s->parent; + + if (s->blockState == TBLOCK_END) + { + /* Save GID where PrepareTransaction can find it again */ + prepareGID = MemoryContextStrdup(TopTransactionContext, gid); + + s->blockState = TBLOCK_PREPARE; + } + else + { + /* + * ignore case where we are not in a transaction; + * EndTransactionBlock already issued a warning. + */ + Assert(s->blockState == TBLOCK_STARTED || + s->blockState == TBLOCK_IMPLICIT_INPROGRESS); + /* Don't send back a PREPARE result tag... */ + result = false; + } + } + + return result; +} + +/* + * EndTransactionBlock + * This executes a COMMIT command. + * + * Since COMMIT may actually do a ROLLBACK, the result indicates what + * happened: true for COMMIT, false for ROLLBACK. + * + * Note that we don't actually do anything here except change blockState. + * The real work will be done in the upcoming CommitTransactionCommand(). + * We do it this way because it's not convenient to change memory context, + * resource owner, etc while executing inside a Portal. + */ +bool +EndTransactionBlock(bool chain) +{ + TransactionState s = CurrentTransactionState; + bool result = false; + + switch (s->blockState) + { + /* + * We are in a transaction block, so tell CommitTransactionCommand + * to COMMIT. + */ + case TBLOCK_INPROGRESS: + s->blockState = TBLOCK_END; + result = true; + break; + + /* + * We are in an implicit transaction block. If AND CHAIN was + * specified, error. Otherwise commit, but issue a warning + * because there was no explicit BEGIN before this. + */ + case TBLOCK_IMPLICIT_INPROGRESS: + if (chain) + ereport(ERROR, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + "COMMIT AND CHAIN"))); + else + ereport(WARNING, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + errmsg("there is no transaction in progress"))); + s->blockState = TBLOCK_END; + result = true; + break; + + /* + * We are in a failed transaction block. Tell + * CommitTransactionCommand it's time to exit the block. + */ + case TBLOCK_ABORT: + s->blockState = TBLOCK_ABORT_END; + break; + + /* + * We are in a live subtransaction block. Set up to subcommit all + * open subtransactions and then commit the main transaction. 
+ */ + case TBLOCK_SUBINPROGRESS: + while (s->parent != NULL) + { + if (s->blockState == TBLOCK_SUBINPROGRESS) + s->blockState = TBLOCK_SUBCOMMIT; + else + elog(FATAL, "EndTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + s = s->parent; + } + if (s->blockState == TBLOCK_INPROGRESS) + s->blockState = TBLOCK_END; + else + elog(FATAL, "EndTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + result = true; + break; + + /* + * Here we are inside an aborted subtransaction. Treat the COMMIT + * as ROLLBACK: set up to abort everything and exit the main + * transaction. + */ + case TBLOCK_SUBABORT: + while (s->parent != NULL) + { + if (s->blockState == TBLOCK_SUBINPROGRESS) + s->blockState = TBLOCK_SUBABORT_PENDING; + else if (s->blockState == TBLOCK_SUBABORT) + s->blockState = TBLOCK_SUBABORT_END; + else + elog(FATAL, "EndTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + s = s->parent; + } + if (s->blockState == TBLOCK_INPROGRESS) + s->blockState = TBLOCK_ABORT_PENDING; + else if (s->blockState == TBLOCK_ABORT) + s->blockState = TBLOCK_ABORT_END; + else + elog(FATAL, "EndTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + + /* + * The user issued COMMIT when not inside a transaction. For + * COMMIT without CHAIN, issue a WARNING, staying in + * TBLOCK_STARTED state. The upcoming call to + * CommitTransactionCommand() will then close the transaction and + * put us back into the default state. For COMMIT AND CHAIN, + * error. + */ + case TBLOCK_STARTED: + if (chain) + ereport(ERROR, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + "COMMIT AND CHAIN"))); + else + ereport(WARNING, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + errmsg("there is no transaction in progress"))); + result = true; + break; + + /* + * The user issued a COMMIT that somehow ran inside a parallel + * worker. We can't cope with that. + */ + case TBLOCK_PARALLEL_INPROGRESS: + ereport(FATAL, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot commit during a parallel operation"))); + break; + + /* These cases are invalid. */ + case TBLOCK_DEFAULT: + case TBLOCK_BEGIN: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "EndTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + Assert(s->blockState == TBLOCK_STARTED || + s->blockState == TBLOCK_END || + s->blockState == TBLOCK_ABORT_END || + s->blockState == TBLOCK_ABORT_PENDING); + + s->chain = chain; + + return result; +} + +/* + * UserAbortTransactionBlock + * This executes a ROLLBACK command. + * + * As above, we don't actually do anything here except change blockState. + */ +void +UserAbortTransactionBlock(bool chain) +{ + TransactionState s = CurrentTransactionState; + + switch (s->blockState) + { + /* + * We are inside a transaction block and we got a ROLLBACK command + * from the user, so tell CommitTransactionCommand to abort and + * exit the transaction block. + */ + case TBLOCK_INPROGRESS: + s->blockState = TBLOCK_ABORT_PENDING; + break; + + /* + * We are inside a failed transaction block and we got a ROLLBACK + * command from the user. 
Abort processing is already done, so + * CommitTransactionCommand just has to cleanup and go back to + * idle state. + */ + case TBLOCK_ABORT: + s->blockState = TBLOCK_ABORT_END; + break; + + /* + * We are inside a subtransaction. Mark everything up to top + * level as exitable. + */ + case TBLOCK_SUBINPROGRESS: + case TBLOCK_SUBABORT: + while (s->parent != NULL) + { + if (s->blockState == TBLOCK_SUBINPROGRESS) + s->blockState = TBLOCK_SUBABORT_PENDING; + else if (s->blockState == TBLOCK_SUBABORT) + s->blockState = TBLOCK_SUBABORT_END; + else + elog(FATAL, "UserAbortTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + s = s->parent; + } + if (s->blockState == TBLOCK_INPROGRESS) + s->blockState = TBLOCK_ABORT_PENDING; + else if (s->blockState == TBLOCK_ABORT) + s->blockState = TBLOCK_ABORT_END; + else + elog(FATAL, "UserAbortTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + + /* + * The user issued ABORT when not inside a transaction. For + * ROLLBACK without CHAIN, issue a WARNING and go to abort state. + * The upcoming call to CommitTransactionCommand() will then put + * us back into the default state. For ROLLBACK AND CHAIN, error. + * + * We do the same thing with ABORT inside an implicit transaction, + * although in this case we might be rolling back actual database + * state changes. (It's debatable whether we should issue a + * WARNING in this case, but we have done so historically.) + */ + case TBLOCK_STARTED: + case TBLOCK_IMPLICIT_INPROGRESS: + if (chain) + ereport(ERROR, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + "ROLLBACK AND CHAIN"))); + else + ereport(WARNING, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + errmsg("there is no transaction in progress"))); + s->blockState = TBLOCK_ABORT_PENDING; + break; + + /* + * The user issued an ABORT that somehow ran inside a parallel + * worker. We can't cope with that. + */ + case TBLOCK_PARALLEL_INPROGRESS: + ereport(FATAL, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot abort during a parallel operation"))); + break; + + /* These cases are invalid. */ + case TBLOCK_DEFAULT: + case TBLOCK_BEGIN: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "UserAbortTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + Assert(s->blockState == TBLOCK_ABORT_END || + s->blockState == TBLOCK_ABORT_PENDING); + + s->chain = chain; +} + +/* + * BeginImplicitTransactionBlock + * Start an implicit transaction block if we're not already in one. + * + * Unlike BeginTransactionBlock, this is called directly from the main loop + * in postgres.c, not within a Portal. So we can just change blockState + * without a lot of ceremony. We do not expect caller to do + * CommitTransactionCommand/StartTransactionCommand. + */ +void +BeginImplicitTransactionBlock(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * If we are in STARTED state (that is, no transaction block is open), + * switch to IMPLICIT_INPROGRESS state, creating an implicit transaction + * block. 
+ * + * For caller convenience, we consider all other transaction states as + * legal here; otherwise the caller would need its own state check, which + * seems rather pointless. + */ + if (s->blockState == TBLOCK_STARTED) + s->blockState = TBLOCK_IMPLICIT_INPROGRESS; +} + +/* + * EndImplicitTransactionBlock + * End an implicit transaction block, if we're in one. + * + * Like EndTransactionBlock, we just make any needed blockState change here. + * The real work will be done in the upcoming CommitTransactionCommand(). + */ +void +EndImplicitTransactionBlock(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * If we are in IMPLICIT_INPROGRESS state, switch back to STARTED state, + * allowing CommitTransactionCommand to commit whatever happened during + * the implicit transaction block as though it were a single statement. + * + * For caller convenience, we consider all other transaction states as + * legal here; otherwise the caller would need its own state check, which + * seems rather pointless. + */ + if (s->blockState == TBLOCK_IMPLICIT_INPROGRESS) + s->blockState = TBLOCK_STARTED; +} + +/* + * DefineSavepoint + * This executes a SAVEPOINT command. + */ +void +DefineSavepoint(const char *name) +{ + TransactionState s = CurrentTransactionState; + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for new subtransactions after that + * point. (Note that this check will certainly error out if s->blockState + * is TBLOCK_PARALLEL_INPROGRESS, so we can treat that as an invalid case + * below.) + */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot define savepoints during a parallel operation"))); + + switch (s->blockState) + { + case TBLOCK_INPROGRESS: + case TBLOCK_SUBINPROGRESS: + /* Normal subtransaction start */ + PushTransaction(); + s = CurrentTransactionState; /* changed by push */ + + /* + * Savepoint names, like the TransactionState block itself, live + * in TopTransactionContext. + */ + if (name) + s->name = MemoryContextStrdup(TopTransactionContext, name); + break; + + /* + * We disallow savepoint commands in implicit transaction blocks. + * There would be no great difficulty in allowing them so far as + * this module is concerned, but a savepoint seems inconsistent + * with exec_simple_query's behavior of abandoning the whole query + * string upon error. Also, the point of an implicit transaction + * block (as opposed to a regular one) is to automatically close + * after an error, so it's hard to see how a savepoint would fit + * into that. + * + * The error messages for this are phrased as if there were no + * active transaction block at all, which is historical but + * perhaps could be improved. + */ + case TBLOCK_IMPLICIT_INPROGRESS: + ereport(ERROR, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + "SAVEPOINT"))); + break; + + /* These cases are invalid. 
*/ + case TBLOCK_DEFAULT: + case TBLOCK_STARTED: + case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "DefineSavepoint: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } +} + +/* + * ReleaseSavepoint + * This executes a RELEASE command. + * + * As above, we don't actually do anything here except change blockState. + */ +void +ReleaseSavepoint(const char *name) +{ + TransactionState s = CurrentTransactionState; + TransactionState target, + xact; + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for transaction state change after that + * point. (Note that this check will certainly error out if s->blockState + * is TBLOCK_PARALLEL_INPROGRESS, so we can treat that as an invalid case + * below.) + */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot release savepoints during a parallel operation"))); + + switch (s->blockState) + { + /* + * We can't release a savepoint if there is no savepoint defined. + */ + case TBLOCK_INPROGRESS: + ereport(ERROR, + (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), + errmsg("savepoint \"%s\" does not exist", name))); + break; + + case TBLOCK_IMPLICIT_INPROGRESS: + /* See comment about implicit transactions in DefineSavepoint */ + ereport(ERROR, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + "RELEASE SAVEPOINT"))); + break; + + /* + * We are in a non-aborted subtransaction. This is the only valid + * case. + */ + case TBLOCK_SUBINPROGRESS: + break; + + /* These cases are invalid. */ + case TBLOCK_DEFAULT: + case TBLOCK_STARTED: + case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "ReleaseSavepoint: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + for (target = s; PointerIsValid(target); target = target->parent) + { + if (PointerIsValid(target->name) && strcmp(target->name, name) == 0) + break; + } + + if (!PointerIsValid(target)) + ereport(ERROR, + (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), + errmsg("savepoint \"%s\" does not exist", name))); + + /* disallow crossing savepoint level boundaries */ + if (target->savepointLevel != s->savepointLevel) + ereport(ERROR, + (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), + errmsg("savepoint \"%s\" does not exist within current savepoint level", name))); + + /* + * Mark "commit pending" all subtransactions up to the target + * subtransaction. The actual commits will happen when control gets to + * CommitTransactionCommand. 
+ */ + xact = CurrentTransactionState; + for (;;) + { + Assert(xact->blockState == TBLOCK_SUBINPROGRESS); + xact->blockState = TBLOCK_SUBRELEASE; + if (xact == target) + break; + xact = xact->parent; + Assert(PointerIsValid(xact)); + } +} + +/* + * RollbackToSavepoint + * This executes a ROLLBACK TO <savepoint> command. + * + * As above, we don't actually do anything here except change blockState. + */ +void +RollbackToSavepoint(const char *name) +{ + TransactionState s = CurrentTransactionState; + TransactionState target, + xact; + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for transaction state change after that + * point. (Note that this check will certainly error out if s->blockState + * is TBLOCK_PARALLEL_INPROGRESS, so we can treat that as an invalid case + * below.) + */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot rollback to savepoints during a parallel operation"))); + + switch (s->blockState) + { + /* + * We can't rollback to a savepoint if there is no savepoint + * defined. + */ + case TBLOCK_INPROGRESS: + case TBLOCK_ABORT: + ereport(ERROR, + (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), + errmsg("savepoint \"%s\" does not exist", name))); + break; + + case TBLOCK_IMPLICIT_INPROGRESS: + /* See comment about implicit transactions in DefineSavepoint */ + ereport(ERROR, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + "ROLLBACK TO SAVEPOINT"))); + break; + + /* + * There is at least one savepoint, so proceed. + */ + case TBLOCK_SUBINPROGRESS: + case TBLOCK_SUBABORT: + break; + + /* These cases are invalid. */ + case TBLOCK_DEFAULT: + case TBLOCK_STARTED: + case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "RollbackToSavepoint: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + for (target = s; PointerIsValid(target); target = target->parent) + { + if (PointerIsValid(target->name) && strcmp(target->name, name) == 0) + break; + } + + if (!PointerIsValid(target)) + ereport(ERROR, + (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), + errmsg("savepoint \"%s\" does not exist", name))); + + /* disallow crossing savepoint level boundaries */ + if (target->savepointLevel != s->savepointLevel) + ereport(ERROR, + (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), + errmsg("savepoint \"%s\" does not exist within current savepoint level", name))); + + /* + * Mark "abort pending" all subtransactions up to the target + * subtransaction. The actual aborts will happen when control gets to + * CommitTransactionCommand. 
+ */ + xact = CurrentTransactionState; + for (;;) + { + if (xact == target) + break; + if (xact->blockState == TBLOCK_SUBINPROGRESS) + xact->blockState = TBLOCK_SUBABORT_PENDING; + else if (xact->blockState == TBLOCK_SUBABORT) + xact->blockState = TBLOCK_SUBABORT_END; + else + elog(FATAL, "RollbackToSavepoint: unexpected state %s", + BlockStateAsString(xact->blockState)); + xact = xact->parent; + Assert(PointerIsValid(xact)); + } + + /* And mark the target as "restart pending" */ + if (xact->blockState == TBLOCK_SUBINPROGRESS) + xact->blockState = TBLOCK_SUBRESTART; + else if (xact->blockState == TBLOCK_SUBABORT) + xact->blockState = TBLOCK_SUBABORT_RESTART; + else + elog(FATAL, "RollbackToSavepoint: unexpected state %s", + BlockStateAsString(xact->blockState)); +} + +/* + * BeginInternalSubTransaction + * This is the same as DefineSavepoint except it allows TBLOCK_STARTED, + * TBLOCK_IMPLICIT_INPROGRESS, TBLOCK_END, and TBLOCK_PREPARE states, + * and therefore it can safely be used in functions that might be called + * when not inside a BEGIN block or when running deferred triggers at + * COMMIT/PREPARE time. Also, it automatically does + * CommitTransactionCommand/StartTransactionCommand instead of expecting + * the caller to do it. + */ +void +BeginInternalSubTransaction(const char *name) +{ + TransactionState s = CurrentTransactionState; + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for new subtransactions after that + * point. We might be able to make an exception for the type of + * subtransaction established by this function, which is typically used in + * contexts where we're going to release or roll back the subtransaction + * before proceeding further, so that no enduring change to the + * transaction state occurs. For now, however, we prohibit this case along + * with all the others. + */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot start subtransactions during a parallel operation"))); + + switch (s->blockState) + { + case TBLOCK_STARTED: + case TBLOCK_INPROGRESS: + case TBLOCK_IMPLICIT_INPROGRESS: + case TBLOCK_END: + case TBLOCK_PREPARE: + case TBLOCK_SUBINPROGRESS: + /* Normal subtransaction start */ + PushTransaction(); + s = CurrentTransactionState; /* changed by push */ + + /* + * Savepoint names, like the TransactionState block itself, live + * in TopTransactionContext. + */ + if (name) + s->name = MemoryContextStrdup(TopTransactionContext, name); + break; + + /* These cases are invalid. */ + case TBLOCK_DEFAULT: + case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBBEGIN: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + elog(FATAL, "BeginInternalSubTransaction: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + CommitTransactionCommand(); + StartTransactionCommand(); +} + +/* + * ReleaseCurrentSubTransaction + * + * RELEASE (ie, commit) the innermost subtransaction, regardless of its + * savepoint name (if any). + * NB: do NOT use CommitTransactionCommand/StartTransactionCommand with this. 
+ */ +void +ReleaseCurrentSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for commit of subtransactions after that + * point. This should not happen anyway. Code calling this would + * typically have called BeginInternalSubTransaction() first, failing + * there. + */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot commit subtransactions during a parallel operation"))); + + if (s->blockState != TBLOCK_SUBINPROGRESS) + elog(ERROR, "ReleaseCurrentSubTransaction: unexpected state %s", + BlockStateAsString(s->blockState)); + Assert(s->state == TRANS_INPROGRESS); + MemoryContextSwitchTo(CurTransactionContext); + CommitSubTransaction(); + s = CurrentTransactionState; /* changed by pop */ + Assert(s->state == TRANS_INPROGRESS); +} + +/* + * RollbackAndReleaseCurrentSubTransaction + * + * ROLLBACK and RELEASE (ie, abort) the innermost subtransaction, regardless + * of its savepoint name (if any). + * NB: do NOT use CommitTransactionCommand/StartTransactionCommand with this. + */ +void +RollbackAndReleaseCurrentSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * Unlike ReleaseCurrentSubTransaction(), this is nominally permitted + * during parallel operations. That's because we may be in the leader, + * recovering from an error thrown while we were in parallel mode. We + * won't reach here in a worker, because BeginInternalSubTransaction() + * will have failed. + */ + + switch (s->blockState) + { + /* Must be in a subtransaction */ + case TBLOCK_SUBINPROGRESS: + case TBLOCK_SUBABORT: + break; + + /* These cases are invalid. */ + case TBLOCK_DEFAULT: + case TBLOCK_STARTED: + case TBLOCK_BEGIN: + case TBLOCK_IMPLICIT_INPROGRESS: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBBEGIN: + case TBLOCK_INPROGRESS: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "RollbackAndReleaseCurrentSubTransaction: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + /* + * Abort the current subtransaction, if needed. + */ + if (s->blockState == TBLOCK_SUBINPROGRESS) + AbortSubTransaction(); + + /* And clean it up, too */ + CleanupSubTransaction(); + + s = CurrentTransactionState; /* changed by pop */ + AssertState(s->blockState == TBLOCK_SUBINPROGRESS || + s->blockState == TBLOCK_INPROGRESS || + s->blockState == TBLOCK_IMPLICIT_INPROGRESS || + s->blockState == TBLOCK_STARTED); +} + +/* + * AbortOutOfAnyTransaction + * + * This routine is provided for error recovery purposes. It aborts any + * active transaction or transaction block, leaving the system in a known + * idle state. + */ +void +AbortOutOfAnyTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + /* Ensure we're not running in a doomed memory context */ + AtAbort_Memory(); + + /* + * Get out of any transaction or nested transaction + */ + do + { + switch (s->blockState) + { + case TBLOCK_DEFAULT: + if (s->state == TRANS_DEFAULT) + { + /* Not in a transaction, do nothing */ + } + else + { + /* + * We can get here after an error during transaction start + * (state will be TRANS_START). 
Need to clean up the + * incompletely started transaction. First, adjust the + * low-level state to suppress warning message from + * AbortTransaction. + */ + if (s->state == TRANS_START) + s->state = TRANS_INPROGRESS; + AbortTransaction(); + CleanupTransaction(); + } + break; + case TBLOCK_STARTED: + case TBLOCK_BEGIN: + case TBLOCK_INPROGRESS: + case TBLOCK_IMPLICIT_INPROGRESS: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_PREPARE: + /* In a transaction, so clean up */ + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + case TBLOCK_ABORT: + case TBLOCK_ABORT_END: + + /* + * AbortTransaction is already done, still need Cleanup. + * However, if we failed partway through running ROLLBACK, + * there will be an active portal running that command, which + * we need to shut down before doing CleanupTransaction. + */ + AtAbort_Portals(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * In a subtransaction, so clean it up and abort parent too + */ + case TBLOCK_SUBBEGIN: + case TBLOCK_SUBINPROGRESS: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + AbortSubTransaction(); + CleanupSubTransaction(); + s = CurrentTransactionState; /* changed by pop */ + break; + + case TBLOCK_SUBABORT: + case TBLOCK_SUBABORT_END: + case TBLOCK_SUBABORT_RESTART: + /* As above, but AbortSubTransaction already done */ + if (s->curTransactionOwner) + { + /* As in TBLOCK_ABORT, might have a live portal to zap */ + AtSubAbort_Portals(s->subTransactionId, + s->parent->subTransactionId, + s->curTransactionOwner, + s->parent->curTransactionOwner); + } + CleanupSubTransaction(); + s = CurrentTransactionState; /* changed by pop */ + break; + } + } while (s->blockState != TBLOCK_DEFAULT); + + /* Should be out of all subxacts now */ + Assert(s->parent == NULL); + + /* If we didn't actually have anything to do, revert to TopMemoryContext */ + AtCleanup_Memory(); +} + +/* + * IsTransactionBlock --- are we within a transaction block? + */ +bool +IsTransactionBlock(void) +{ + TransactionState s = CurrentTransactionState; + + if (s->blockState == TBLOCK_DEFAULT || s->blockState == TBLOCK_STARTED) + return false; + + return true; +} + +/* + * IsTransactionOrTransactionBlock --- are we within either a transaction + * or a transaction block? (The backend is only really "idle" when this + * returns false.) + * + * This should match up with IsTransactionBlock and IsTransactionState. 
+ */ +bool +IsTransactionOrTransactionBlock(void) +{ + TransactionState s = CurrentTransactionState; + + if (s->blockState == TBLOCK_DEFAULT) + return false; + + return true; +} + +/* + * TransactionBlockStatusCode - return status code to send in ReadyForQuery + */ +char +TransactionBlockStatusCode(void) +{ + TransactionState s = CurrentTransactionState; + + switch (s->blockState) + { + case TBLOCK_DEFAULT: + case TBLOCK_STARTED: + return 'I'; /* idle --- not in transaction */ + case TBLOCK_BEGIN: + case TBLOCK_SUBBEGIN: + case TBLOCK_INPROGRESS: + case TBLOCK_IMPLICIT_INPROGRESS: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBINPROGRESS: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_PREPARE: + return 'T'; /* in transaction */ + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + return 'E'; /* in failed transaction */ + } + + /* should never get here */ + elog(FATAL, "invalid transaction block state: %s", + BlockStateAsString(s->blockState)); + return 0; /* keep compiler quiet */ +} + +/* + * IsSubTransaction + */ +bool +IsSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + if (s->nestingLevel >= 2) + return true; + + return false; +} + +/* + * StartSubTransaction + * + * If you're wondering why this is separate from PushTransaction: it's because + * we can't conveniently do this stuff right inside DefineSavepoint. The + * SAVEPOINT utility command will be executed inside a Portal, and if we + * muck with CurrentMemoryContext or CurrentResourceOwner then exit from + * the Portal will undo those settings. So we make DefineSavepoint just + * push a dummy transaction block, and when control returns to the main + * idle loop, CommitTransactionCommand will be called, and we'll come here + * to finish starting the subtransaction. + */ +static void +StartSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + if (s->state != TRANS_DEFAULT) + elog(WARNING, "StartSubTransaction while in %s state", + TransStateAsString(s->state)); + + s->state = TRANS_START; + + /* + * Initialize subsystems for new subtransaction + * + * must initialize resource-management stuff first + */ + AtSubStart_Memory(); + AtSubStart_ResourceOwner(); + AfterTriggerBeginSubXact(); + + s->state = TRANS_INPROGRESS; + + /* + * Call start-of-subxact callbacks + */ + CallSubXactCallbacks(SUBXACT_EVENT_START_SUB, s->subTransactionId, + s->parent->subTransactionId); + + ShowTransactionState("StartSubTransaction"); +} + +/* + * CommitSubTransaction + * + * The caller has to make sure to always reassign CurrentTransactionState + * if it has a local pointer to it after calling this function. + */ +static void +CommitSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + ShowTransactionState("CommitSubTransaction"); + + if (s->state != TRANS_INPROGRESS) + elog(WARNING, "CommitSubTransaction while in %s state", + TransStateAsString(s->state)); + + /* Pre-commit processing goes here */ + + CallSubXactCallbacks(SUBXACT_EVENT_PRE_COMMIT_SUB, s->subTransactionId, + s->parent->subTransactionId); + + /* If in parallel mode, clean up workers and exit parallel mode. 
*/ + if (IsInParallelMode()) + { + AtEOSubXact_Parallel(true, s->subTransactionId); + s->parallelModeLevel = 0; + } + + /* Do the actual "commit", such as it is */ + s->state = TRANS_COMMIT; + + /* Must CCI to ensure commands of subtransaction are seen as done */ + CommandCounterIncrement(); + + /* + * Prior to 8.4 we marked subcommit in clog at this point. We now only + * perform that step, if required, as part of the atomic update of the + * whole transaction tree at top level commit or abort. + */ + + /* Post-commit cleanup */ + if (FullTransactionIdIsValid(s->fullTransactionId)) + AtSubCommit_childXids(); + AfterTriggerEndSubXact(true); + AtSubCommit_Portals(s->subTransactionId, + s->parent->subTransactionId, + s->parent->nestingLevel, + s->parent->curTransactionOwner); + AtEOSubXact_LargeObject(true, s->subTransactionId, + s->parent->subTransactionId); + AtSubCommit_Notify(); + + CallSubXactCallbacks(SUBXACT_EVENT_COMMIT_SUB, s->subTransactionId, + s->parent->subTransactionId); + + ResourceOwnerRelease(s->curTransactionOwner, + RESOURCE_RELEASE_BEFORE_LOCKS, + true, false); + AtEOSubXact_RelationCache(true, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_Inval(true); + AtSubCommit_smgr(); + + /* + * The only lock we actually release here is the subtransaction XID lock. + */ + CurrentResourceOwner = s->curTransactionOwner; + if (FullTransactionIdIsValid(s->fullTransactionId)) + XactLockTableDelete(XidFromFullTransactionId(s->fullTransactionId)); + + /* + * Other locks should get transferred to their parent resource owner. + */ + ResourceOwnerRelease(s->curTransactionOwner, + RESOURCE_RELEASE_LOCKS, + true, false); + ResourceOwnerRelease(s->curTransactionOwner, + RESOURCE_RELEASE_AFTER_LOCKS, + true, false); + + AtEOXact_GUC(true, s->gucNestLevel); + AtEOSubXact_SPI(true, s->subTransactionId); + AtEOSubXact_on_commit_actions(true, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_Namespace(true, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_Files(true, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_HashTables(true, s->nestingLevel); + AtEOSubXact_PgStat(true, s->nestingLevel); + AtSubCommit_Snapshot(s->nestingLevel); + + /* + * We need to restore the upper transaction's read-only state, in case the + * upper is read-write while the child is read-only; GUC will incorrectly + * think it should leave the child state in place. + */ + XactReadOnly = s->prevXactReadOnly; + + CurrentResourceOwner = s->parent->curTransactionOwner; + CurTransactionResourceOwner = s->parent->curTransactionOwner; + ResourceOwnerDelete(s->curTransactionOwner); + s->curTransactionOwner = NULL; + + AtSubCommit_Memory(); + + s->state = TRANS_DEFAULT; + + PopTransaction(); +} + +/* + * AbortSubTransaction + */ +static void +AbortSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + /* Prevent cancel/die interrupt while cleaning up */ + HOLD_INTERRUPTS(); + + /* Make sure we have a valid memory context and resource owner */ + AtSubAbort_Memory(); + AtSubAbort_ResourceOwner(); + + /* + * Release any LW locks we might be holding as quickly as possible. + * (Regular locks, however, must be held till we finish aborting.) + * Releasing LW locks is critical since we might try to grab them again + * while cleaning up! + * + * FIXME This may be incorrect --- Are there some locks we should keep? + * Buffer locks, for example? I don't think so but I'm not sure. 
+ */ + LWLockReleaseAll(); + + pgstat_report_wait_end(); + pgstat_progress_end_command(); + AbortBufferIO(); + UnlockBuffers(); + + /* Reset WAL record construction state */ + XLogResetInsertion(); + + /* Cancel condition variable sleep */ + ConditionVariableCancelSleep(); + + /* + * Also clean up any open wait for lock, since the lock manager will choke + * if we try to wait for another lock before doing this. + */ + LockErrorCleanup(); + + /* + * If any timeout events are still active, make sure the timeout interrupt + * is scheduled. This covers possible loss of a timeout interrupt due to + * longjmp'ing out of the SIGINT handler (see notes in handle_sig_alarm). + * We delay this till after LockErrorCleanup so that we don't uselessly + * reschedule lock or deadlock check timeouts. + */ + reschedule_timeouts(); + + /* + * Re-enable signals, in case we got here by longjmp'ing out of a signal + * handler. We do this fairly early in the sequence so that the timeout + * infrastructure will be functional if needed while aborting. + */ + PG_SETMASK(&UnBlockSig); + + /* + * check the current transaction state + */ + ShowTransactionState("AbortSubTransaction"); + + if (s->state != TRANS_INPROGRESS) + elog(WARNING, "AbortSubTransaction while in %s state", + TransStateAsString(s->state)); + + s->state = TRANS_ABORT; + + /* + * Reset user ID which might have been changed transiently. (See notes in + * AbortTransaction.) + */ + SetUserIdAndSecContext(s->prevUser, s->prevSecContext); + + /* Forget about any active REINDEX. */ + ResetReindexState(s->nestingLevel); + + /* Reset logical streaming state. */ + ResetLogicalStreamingState(); + + /* + * No need for SnapBuildResetExportedSnapshotState() here, snapshot + * exports are not supported in subtransactions. + */ + + /* Exit from parallel mode, if necessary. */ + if (IsInParallelMode()) + { + AtEOSubXact_Parallel(false, s->subTransactionId); + s->parallelModeLevel = 0; + } + + /* + * We can skip all this stuff if the subxact failed before creating a + * ResourceOwner... + */ + if (s->curTransactionOwner) + { + AfterTriggerEndSubXact(false); + AtSubAbort_Portals(s->subTransactionId, + s->parent->subTransactionId, + s->curTransactionOwner, + s->parent->curTransactionOwner); + AtEOSubXact_LargeObject(false, s->subTransactionId, + s->parent->subTransactionId); + AtSubAbort_Notify(); + + /* Advertise the fact that we aborted in pg_xact. 
*/ + (void) RecordTransactionAbort(true); + + /* Post-abort cleanup */ + if (FullTransactionIdIsValid(s->fullTransactionId)) + AtSubAbort_childXids(); + + CallSubXactCallbacks(SUBXACT_EVENT_ABORT_SUB, s->subTransactionId, + s->parent->subTransactionId); + + ResourceOwnerRelease(s->curTransactionOwner, + RESOURCE_RELEASE_BEFORE_LOCKS, + false, false); + AtEOSubXact_RelationCache(false, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_Inval(false); + ResourceOwnerRelease(s->curTransactionOwner, + RESOURCE_RELEASE_LOCKS, + false, false); + ResourceOwnerRelease(s->curTransactionOwner, + RESOURCE_RELEASE_AFTER_LOCKS, + false, false); + AtSubAbort_smgr(); + + AtEOXact_GUC(false, s->gucNestLevel); + AtEOSubXact_SPI(false, s->subTransactionId); + AtEOSubXact_on_commit_actions(false, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_Namespace(false, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_Files(false, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_HashTables(false, s->nestingLevel); + AtEOSubXact_PgStat(false, s->nestingLevel); + AtSubAbort_Snapshot(s->nestingLevel); + } + + /* + * Restore the upper transaction's read-only state, too. This should be + * redundant with GUC's cleanup but we may as well do it for consistency + * with the commit case. + */ + XactReadOnly = s->prevXactReadOnly; + + RESUME_INTERRUPTS(); +} + +/* + * CleanupSubTransaction + * + * The caller has to make sure to always reassign CurrentTransactionState + * if it has a local pointer to it after calling this function. + */ +static void +CleanupSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + ShowTransactionState("CleanupSubTransaction"); + + if (s->state != TRANS_ABORT) + elog(WARNING, "CleanupSubTransaction while in %s state", + TransStateAsString(s->state)); + + AtSubCleanup_Portals(s->subTransactionId); + + CurrentResourceOwner = s->parent->curTransactionOwner; + CurTransactionResourceOwner = s->parent->curTransactionOwner; + if (s->curTransactionOwner) + ResourceOwnerDelete(s->curTransactionOwner); + s->curTransactionOwner = NULL; + + AtSubCleanup_Memory(); + + s->state = TRANS_DEFAULT; + + PopTransaction(); +} + +/* + * PushTransaction + * Create transaction state stack entry for a subtransaction + * + * The caller has to make sure to always reassign CurrentTransactionState + * if it has a local pointer to it after calling this function. + */ +static void +PushTransaction(void) +{ + TransactionState p = CurrentTransactionState; + TransactionState s; + + /* + * We keep subtransaction state nodes in TopTransactionContext. + */ + s = (TransactionState) + MemoryContextAllocZero(TopTransactionContext, + sizeof(TransactionStateData)); + + /* + * Assign a subtransaction ID, watching out for counter wraparound. + */ + currentSubTransactionId += 1; + if (currentSubTransactionId == InvalidSubTransactionId) + { + currentSubTransactionId -= 1; + pfree(s); + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than 2^32-1 subtransactions in a transaction"))); + } + + /* + * We can now stack a minimally valid subtransaction without fear of + * failure. 
+ */ + s->fullTransactionId = InvalidFullTransactionId; /* until assigned */ + s->subTransactionId = currentSubTransactionId; + s->parent = p; + s->nestingLevel = p->nestingLevel + 1; + s->gucNestLevel = NewGUCNestLevel(); + s->savepointLevel = p->savepointLevel; + s->state = TRANS_DEFAULT; + s->blockState = TBLOCK_SUBBEGIN; + GetUserIdAndSecContext(&s->prevUser, &s->prevSecContext); + s->prevXactReadOnly = XactReadOnly; + s->parallelModeLevel = 0; + s->topXidLogged = false; + + CurrentTransactionState = s; + + /* + * AbortSubTransaction and CleanupSubTransaction have to be able to cope + * with the subtransaction from here on out; in particular they should not + * assume that it necessarily has a transaction context, resource owner, + * or XID. + */ +} + +/* + * PopTransaction + * Pop back to parent transaction state + * + * The caller has to make sure to always reassign CurrentTransactionState + * if it has a local pointer to it after calling this function. + */ +static void +PopTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + if (s->state != TRANS_DEFAULT) + elog(WARNING, "PopTransaction while in %s state", + TransStateAsString(s->state)); + + if (s->parent == NULL) + elog(FATAL, "PopTransaction with no parent"); + + CurrentTransactionState = s->parent; + + /* Let's just make sure CurTransactionContext is good */ + CurTransactionContext = s->parent->curTransactionContext; + MemoryContextSwitchTo(CurTransactionContext); + + /* Ditto for ResourceOwner links */ + CurTransactionResourceOwner = s->parent->curTransactionOwner; + CurrentResourceOwner = s->parent->curTransactionOwner; + + /* Free the old child structure */ + if (s->name) + pfree(s->name); + pfree(s); +} + +/* + * EstimateTransactionStateSpace + * Estimate the amount of space that will be needed by + * SerializeTransactionState. It would be OK to overestimate slightly, + * but it's simple for us to work out the precise value, so we do. + */ +Size +EstimateTransactionStateSpace(void) +{ + TransactionState s; + Size nxids = 0; + Size size = SerializedTransactionStateHeaderSize; + + for (s = CurrentTransactionState; s != NULL; s = s->parent) + { + if (FullTransactionIdIsValid(s->fullTransactionId)) + nxids = add_size(nxids, 1); + nxids = add_size(nxids, s->nChildXids); + } + + return add_size(size, mul_size(sizeof(TransactionId), nxids)); +} + +/* + * SerializeTransactionState + * Write out relevant details of our transaction state that will be + * needed by a parallel worker. + * + * We need to save and restore XactDeferrable, XactIsoLevel, and the XIDs + * associated with this transaction. These are serialized into a + * caller-supplied buffer big enough to hold the number of bytes reported by + * EstimateTransactionStateSpace(). We emit the XIDs in sorted order for the + * convenience of the receiving process. + */ +void +SerializeTransactionState(Size maxsize, char *start_address) +{ + TransactionState s; + Size nxids = 0; + Size i = 0; + TransactionId *workspace; + SerializedTransactionState *result; + + result = (SerializedTransactionState *) start_address; + + result->xactIsoLevel = XactIsoLevel; + result->xactDeferrable = XactDeferrable; + result->topFullTransactionId = XactTopFullTransactionId; + result->currentFullTransactionId = + CurrentTransactionState->fullTransactionId; + result->currentCommandId = currentCommandId; + + /* + * If we're running in a parallel worker and launching a parallel worker + * of our own, we can just pass along the information that was passed to + * us. 
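+ * Either way, the result has the same shape: a SerializedTransactionState
+ * header followed by nParallelCurrentXids TransactionIds in sorted order,
+ * so that workers can binary-search the array (see
+ * TransactionIdIsCurrentTransactionId).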
+ */ + if (nParallelCurrentXids > 0) + { + result->nParallelCurrentXids = nParallelCurrentXids; + memcpy(&result->parallelCurrentXids[0], ParallelCurrentXids, + nParallelCurrentXids * sizeof(TransactionId)); + return; + } + + /* + * OK, we need to generate a sorted list of XIDs that our workers should + * view as current. First, figure out how many there are. + */ + for (s = CurrentTransactionState; s != NULL; s = s->parent) + { + if (FullTransactionIdIsValid(s->fullTransactionId)) + nxids = add_size(nxids, 1); + nxids = add_size(nxids, s->nChildXids); + } + Assert(SerializedTransactionStateHeaderSize + nxids * sizeof(TransactionId) + <= maxsize); + + /* Copy them to our scratch space. */ + workspace = palloc(nxids * sizeof(TransactionId)); + for (s = CurrentTransactionState; s != NULL; s = s->parent) + { + if (FullTransactionIdIsValid(s->fullTransactionId)) + workspace[i++] = XidFromFullTransactionId(s->fullTransactionId); + if (s->nChildXids > 0) + memcpy(&workspace[i], s->childXids, + s->nChildXids * sizeof(TransactionId)); + i += s->nChildXids; + } + Assert(i == nxids); + + /* Sort them. */ + qsort(workspace, nxids, sizeof(TransactionId), xidComparator); + + /* Copy data into output area. */ + result->nParallelCurrentXids = nxids; + memcpy(&result->parallelCurrentXids[0], workspace, + nxids * sizeof(TransactionId)); +} + +/* + * StartParallelWorkerTransaction + * Start a parallel worker transaction, restoring the relevant + * transaction state serialized by SerializeTransactionState. + */ +void +StartParallelWorkerTransaction(char *tstatespace) +{ + SerializedTransactionState *tstate; + + Assert(CurrentTransactionState->blockState == TBLOCK_DEFAULT); + StartTransaction(); + + tstate = (SerializedTransactionState *) tstatespace; + XactIsoLevel = tstate->xactIsoLevel; + XactDeferrable = tstate->xactDeferrable; + XactTopFullTransactionId = tstate->topFullTransactionId; + CurrentTransactionState->fullTransactionId = + tstate->currentFullTransactionId; + currentCommandId = tstate->currentCommandId; + nParallelCurrentXids = tstate->nParallelCurrentXids; + ParallelCurrentXids = &tstate->parallelCurrentXids[0]; + + CurrentTransactionState->blockState = TBLOCK_PARALLEL_INPROGRESS; +} + +/* + * EndParallelWorkerTransaction + * End a parallel worker transaction. + */ +void +EndParallelWorkerTransaction(void) +{ + Assert(CurrentTransactionState->blockState == TBLOCK_PARALLEL_INPROGRESS); + CommitTransaction(); + CurrentTransactionState->blockState = TBLOCK_DEFAULT; +} + +/* + * ShowTransactionState + * Debug support + */ +static void +ShowTransactionState(const char *str) +{ + /* skip work if message will definitely not be printed */ + if (message_level_is_interesting(DEBUG5)) + ShowTransactionStateRec(str, CurrentTransactionState); +} + +/* + * ShowTransactionStateRec + * Recursive subroutine for ShowTransactionState + */ +static void +ShowTransactionStateRec(const char *str, TransactionState s) +{ + StringInfoData buf; + + initStringInfo(&buf); + + if (s->nChildXids > 0) + { + int i; + + appendStringInfo(&buf, ", children: %u", s->childXids[0]); + for (i = 1; i < s->nChildXids; i++) + appendStringInfo(&buf, " %u", s->childXids[i]); + } + + if (s->parent) + ShowTransactionStateRec(str, s->parent); + + ereport(DEBUG5, + (errmsg_internal("%s(%d) name: %s; blockState: %s; state: %s, xid/subid/cid: %u/%u/%u%s%s", + str, s->nestingLevel, + PointerIsValid(s->name) ? 
s->name : "unnamed", + BlockStateAsString(s->blockState), + TransStateAsString(s->state), + (unsigned int) XidFromFullTransactionId(s->fullTransactionId), + (unsigned int) s->subTransactionId, + (unsigned int) currentCommandId, + currentCommandIdUsed ? " (used)" : "", + buf.data))); + + pfree(buf.data); +} + +/* + * BlockStateAsString + * Debug support + */ +static const char * +BlockStateAsString(TBlockState blockState) +{ + switch (blockState) + { + case TBLOCK_DEFAULT: + return "DEFAULT"; + case TBLOCK_STARTED: + return "STARTED"; + case TBLOCK_BEGIN: + return "BEGIN"; + case TBLOCK_INPROGRESS: + return "INPROGRESS"; + case TBLOCK_IMPLICIT_INPROGRESS: + return "IMPLICIT_INPROGRESS"; + case TBLOCK_PARALLEL_INPROGRESS: + return "PARALLEL_INPROGRESS"; + case TBLOCK_END: + return "END"; + case TBLOCK_ABORT: + return "ABORT"; + case TBLOCK_ABORT_END: + return "ABORT_END"; + case TBLOCK_ABORT_PENDING: + return "ABORT_PENDING"; + case TBLOCK_PREPARE: + return "PREPARE"; + case TBLOCK_SUBBEGIN: + return "SUBBEGIN"; + case TBLOCK_SUBINPROGRESS: + return "SUBINPROGRESS"; + case TBLOCK_SUBRELEASE: + return "SUBRELEASE"; + case TBLOCK_SUBCOMMIT: + return "SUBCOMMIT"; + case TBLOCK_SUBABORT: + return "SUBABORT"; + case TBLOCK_SUBABORT_END: + return "SUBABORT_END"; + case TBLOCK_SUBABORT_PENDING: + return "SUBABORT_PENDING"; + case TBLOCK_SUBRESTART: + return "SUBRESTART"; + case TBLOCK_SUBABORT_RESTART: + return "SUBABORT_RESTART"; + } + return "UNRECOGNIZED"; +} + +/* + * TransStateAsString + * Debug support + */ +static const char * +TransStateAsString(TransState state) +{ + switch (state) + { + case TRANS_DEFAULT: + return "DEFAULT"; + case TRANS_START: + return "START"; + case TRANS_INPROGRESS: + return "INPROGRESS"; + case TRANS_COMMIT: + return "COMMIT"; + case TRANS_ABORT: + return "ABORT"; + case TRANS_PREPARE: + return "PREPARE"; + } + return "UNRECOGNIZED"; +} + +/* + * xactGetCommittedChildren + * + * Gets the list of committed children of the current transaction. The return + * value is the number of child transactions. *ptr is set to point to an + * array of TransactionIds. The array is allocated in TopTransactionContext; + * the caller should *not* pfree() it (this is a change from pre-8.4 code!). + * If there are no subxacts, *ptr is set to NULL. + */ +int +xactGetCommittedChildren(TransactionId **ptr) +{ + TransactionState s = CurrentTransactionState; + + if (s->nChildXids == 0) + *ptr = NULL; + else + *ptr = s->childXids; + + return s->nChildXids; +} + +/* + * XLOG support routines + */ + + +/* + * Log the commit record for a plain or twophase transaction commit. + * + * A 2pc commit will be emitted when twophase_xid is valid, a plain one + * otherwise. 
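+ *
+ * The record starts with a fixed xl_xact_commit struct; everything else is
+ * optional.  When any optional data is present, XLOG_XACT_HAS_INFO is set
+ * and an xl_xact_xinfo word follows, whose XACT_XINFO_HAS_* flags say which
+ * of the remaining chunks (dbinfo, subxacts, relfilenodes, dropped stats,
+ * invalidation messages, twophase xid/GID, origin) were appended.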
+ */ +XLogRecPtr +XactLogCommitRecord(TimestampTz commit_time, + int nsubxacts, TransactionId *subxacts, + int nrels, RelFileNode *rels, + int ndroppedstats, xl_xact_stats_item *droppedstats, + int nmsgs, SharedInvalidationMessage *msgs, + bool relcacheInval, + int xactflags, TransactionId twophase_xid, + const char *twophase_gid) +{ + xl_xact_commit xlrec; + xl_xact_xinfo xl_xinfo; + xl_xact_dbinfo xl_dbinfo; + xl_xact_subxacts xl_subxacts; + xl_xact_relfilenodes xl_relfilenodes; + xl_xact_stats_items xl_dropped_stats; + xl_xact_invals xl_invals; + xl_xact_twophase xl_twophase; + xl_xact_origin xl_origin; + uint8 info; + + Assert(CritSectionCount > 0); + + xl_xinfo.xinfo = 0; + + /* decide between a plain and 2pc commit */ + if (!TransactionIdIsValid(twophase_xid)) + info = XLOG_XACT_COMMIT; + else + info = XLOG_XACT_COMMIT_PREPARED; + + /* First figure out and collect all the information needed */ + + xlrec.xact_time = commit_time; + + if (relcacheInval) + xl_xinfo.xinfo |= XACT_COMPLETION_UPDATE_RELCACHE_FILE; + if (forceSyncCommit) + xl_xinfo.xinfo |= XACT_COMPLETION_FORCE_SYNC_COMMIT; + if ((xactflags & XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK)) + xl_xinfo.xinfo |= XACT_XINFO_HAS_AE_LOCKS; + + /* + * Check if the caller would like to ask standbys for immediate feedback + * once this commit is applied. + */ + if (synchronous_commit >= SYNCHRONOUS_COMMIT_REMOTE_APPLY) + xl_xinfo.xinfo |= XACT_COMPLETION_APPLY_FEEDBACK; + + /* + * Relcache invalidations requires information about the current database + * and so does logical decoding. + */ + if (nmsgs > 0 || XLogLogicalInfoActive()) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_DBINFO; + xl_dbinfo.dbId = MyDatabaseId; + xl_dbinfo.tsId = MyDatabaseTableSpace; + } + + if (nsubxacts > 0) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_SUBXACTS; + xl_subxacts.nsubxacts = nsubxacts; + } + + if (nrels > 0) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_RELFILENODES; + xl_relfilenodes.nrels = nrels; + info |= XLR_SPECIAL_REL_UPDATE; + } + + if (ndroppedstats > 0) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_DROPPED_STATS; + xl_dropped_stats.nitems = ndroppedstats; + } + + if (nmsgs > 0) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_INVALS; + xl_invals.nmsgs = nmsgs; + } + + if (TransactionIdIsValid(twophase_xid)) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_TWOPHASE; + xl_twophase.xid = twophase_xid; + Assert(twophase_gid != NULL); + + if (XLogLogicalInfoActive()) + xl_xinfo.xinfo |= XACT_XINFO_HAS_GID; + } + + /* dump transaction origin information */ + if (replorigin_session_origin != InvalidRepOriginId) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_ORIGIN; + + xl_origin.origin_lsn = replorigin_session_origin_lsn; + xl_origin.origin_timestamp = replorigin_session_origin_timestamp; + } + + if (xl_xinfo.xinfo != 0) + info |= XLOG_XACT_HAS_INFO; + + /* Then include all the collected data into the commit record. 
*/ + + XLogBeginInsert(); + + XLogRegisterData((char *) (&xlrec), sizeof(xl_xact_commit)); + + if (xl_xinfo.xinfo != 0) + XLogRegisterData((char *) (&xl_xinfo.xinfo), sizeof(xl_xinfo.xinfo)); + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_DBINFO) + XLogRegisterData((char *) (&xl_dbinfo), sizeof(xl_dbinfo)); + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_SUBXACTS) + { + XLogRegisterData((char *) (&xl_subxacts), + MinSizeOfXactSubxacts); + XLogRegisterData((char *) subxacts, + nsubxacts * sizeof(TransactionId)); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_RELFILENODES) + { + XLogRegisterData((char *) (&xl_relfilenodes), + MinSizeOfXactRelfilenodes); + XLogRegisterData((char *) rels, + nrels * sizeof(RelFileNode)); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_DROPPED_STATS) + { + XLogRegisterData((char *) (&xl_dropped_stats), + MinSizeOfXactStatsItems); + XLogRegisterData((char *) droppedstats, + ndroppedstats * sizeof(xl_xact_stats_item)); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_INVALS) + { + XLogRegisterData((char *) (&xl_invals), MinSizeOfXactInvals); + XLogRegisterData((char *) msgs, + nmsgs * sizeof(SharedInvalidationMessage)); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_TWOPHASE) + { + XLogRegisterData((char *) (&xl_twophase), sizeof(xl_xact_twophase)); + if (xl_xinfo.xinfo & XACT_XINFO_HAS_GID) + XLogRegisterData(unconstify(char *, twophase_gid), strlen(twophase_gid) + 1); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_ORIGIN) + XLogRegisterData((char *) (&xl_origin), sizeof(xl_xact_origin)); + + /* we allow filtering by xacts */ + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + return XLogInsert(RM_XACT_ID, info); +} + +/* + * Log the commit record for a plain or twophase transaction abort. + * + * A 2pc abort will be emitted when twophase_xid is valid, a plain one + * otherwise. 
+ */ +XLogRecPtr +XactLogAbortRecord(TimestampTz abort_time, + int nsubxacts, TransactionId *subxacts, + int nrels, RelFileNode *rels, + int ndroppedstats, xl_xact_stats_item *droppedstats, + int xactflags, TransactionId twophase_xid, + const char *twophase_gid) +{ + xl_xact_abort xlrec; + xl_xact_xinfo xl_xinfo; + xl_xact_subxacts xl_subxacts; + xl_xact_relfilenodes xl_relfilenodes; + xl_xact_stats_items xl_dropped_stats; + xl_xact_twophase xl_twophase; + xl_xact_dbinfo xl_dbinfo; + xl_xact_origin xl_origin; + + uint8 info; + + Assert(CritSectionCount > 0); + + xl_xinfo.xinfo = 0; + + /* decide between a plain and 2pc abort */ + if (!TransactionIdIsValid(twophase_xid)) + info = XLOG_XACT_ABORT; + else + info = XLOG_XACT_ABORT_PREPARED; + + + /* First figure out and collect all the information needed */ + + xlrec.xact_time = abort_time; + + if ((xactflags & XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK)) + xl_xinfo.xinfo |= XACT_XINFO_HAS_AE_LOCKS; + + if (nsubxacts > 0) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_SUBXACTS; + xl_subxacts.nsubxacts = nsubxacts; + } + + if (nrels > 0) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_RELFILENODES; + xl_relfilenodes.nrels = nrels; + info |= XLR_SPECIAL_REL_UPDATE; + } + + if (ndroppedstats > 0) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_DROPPED_STATS; + xl_dropped_stats.nitems = ndroppedstats; + } + + if (TransactionIdIsValid(twophase_xid)) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_TWOPHASE; + xl_twophase.xid = twophase_xid; + Assert(twophase_gid != NULL); + + if (XLogLogicalInfoActive()) + xl_xinfo.xinfo |= XACT_XINFO_HAS_GID; + } + + if (TransactionIdIsValid(twophase_xid) && XLogLogicalInfoActive()) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_DBINFO; + xl_dbinfo.dbId = MyDatabaseId; + xl_dbinfo.tsId = MyDatabaseTableSpace; + } + + /* + * Dump transaction origin information only for abort prepared. We need + * this during recovery to update the replication origin progress. + */ + if ((replorigin_session_origin != InvalidRepOriginId) && + TransactionIdIsValid(twophase_xid)) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_ORIGIN; + + xl_origin.origin_lsn = replorigin_session_origin_lsn; + xl_origin.origin_timestamp = replorigin_session_origin_timestamp; + } + + if (xl_xinfo.xinfo != 0) + info |= XLOG_XACT_HAS_INFO; + + /* Then include all the collected data into the abort record. 
*/ + + XLogBeginInsert(); + + XLogRegisterData((char *) (&xlrec), MinSizeOfXactAbort); + + if (xl_xinfo.xinfo != 0) + XLogRegisterData((char *) (&xl_xinfo), sizeof(xl_xinfo)); + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_DBINFO) + XLogRegisterData((char *) (&xl_dbinfo), sizeof(xl_dbinfo)); + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_SUBXACTS) + { + XLogRegisterData((char *) (&xl_subxacts), + MinSizeOfXactSubxacts); + XLogRegisterData((char *) subxacts, + nsubxacts * sizeof(TransactionId)); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_RELFILENODES) + { + XLogRegisterData((char *) (&xl_relfilenodes), + MinSizeOfXactRelfilenodes); + XLogRegisterData((char *) rels, + nrels * sizeof(RelFileNode)); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_DROPPED_STATS) + { + XLogRegisterData((char *) (&xl_dropped_stats), + MinSizeOfXactStatsItems); + XLogRegisterData((char *) droppedstats, + ndroppedstats * sizeof(xl_xact_stats_item)); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_TWOPHASE) + { + XLogRegisterData((char *) (&xl_twophase), sizeof(xl_xact_twophase)); + if (xl_xinfo.xinfo & XACT_XINFO_HAS_GID) + XLogRegisterData(unconstify(char *, twophase_gid), strlen(twophase_gid) + 1); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_ORIGIN) + XLogRegisterData((char *) (&xl_origin), sizeof(xl_xact_origin)); + + if (TransactionIdIsValid(twophase_xid)) + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + return XLogInsert(RM_XACT_ID, info); +} + +/* + * Before 9.0 this was a fairly short function, but now it performs many + * actions for which the order of execution is critical. + */ +static void +xact_redo_commit(xl_xact_parsed_commit *parsed, + TransactionId xid, + XLogRecPtr lsn, + RepOriginId origin_id) +{ + TransactionId max_xid; + TimestampTz commit_time; + + Assert(TransactionIdIsValid(xid)); + + max_xid = TransactionIdLatest(xid, parsed->nsubxacts, parsed->subxacts); + + /* Make sure nextXid is beyond any XID mentioned in the record. */ + AdvanceNextFullTransactionIdPastXid(max_xid); + + Assert(((parsed->xinfo & XACT_XINFO_HAS_ORIGIN) == 0) == + (origin_id == InvalidRepOriginId)); + + if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN) + commit_time = parsed->origin_timestamp; + else + commit_time = parsed->xact_time; + + /* Set the transaction commit timestamp and metadata */ + TransactionTreeSetCommitTsData(xid, parsed->nsubxacts, parsed->subxacts, + commit_time, origin_id); + + if (standbyState == STANDBY_DISABLED) + { + /* + * Mark the transaction committed in pg_xact. + */ + TransactionIdCommitTree(xid, parsed->nsubxacts, parsed->subxacts); + } + else + { + /* + * If a transaction completion record arrives that has as-yet + * unobserved subtransactions then this will not have been fully + * handled by the call to RecordKnownAssignedTransactionIds() in the + * main recovery loop in xlog.c. So we need to do bookkeeping again to + * cover that case. This is confusing and it is easy to think this + * call is irrelevant, which has happened three times in development + * already. Leave it in. + */ + RecordKnownAssignedTransactionIds(max_xid); + + /* + * Mark the transaction committed in pg_xact. We use async commit + * protocol during recovery to provide information on database + * consistency for when users try to set hint bits. It is important + * that we do not set hint bits until the minRecoveryPoint is past + * this commit record. This ensures that if we crash we don't see hint + * bits set on changes made by transactions that haven't yet + * recovered. It's unlikely but it's good to be safe. 
+ */ + TransactionIdAsyncCommitTree(xid, parsed->nsubxacts, parsed->subxacts, lsn); + + /* + * We must mark clog before we update the ProcArray. + */ + ExpireTreeKnownAssignedTransactionIds(xid, parsed->nsubxacts, parsed->subxacts, max_xid); + + /* + * Send any cache invalidations attached to the commit. We must + * maintain the same order of invalidation then release locks as + * occurs in CommitTransaction(). + */ + ProcessCommittedInvalidationMessages(parsed->msgs, parsed->nmsgs, + XactCompletionRelcacheInitFileInval(parsed->xinfo), + parsed->dbId, parsed->tsId); + + /* + * Release locks, if any. We do this for both two phase and normal one + * phase transactions. In effect we are ignoring the prepare phase and + * just going straight to lock release. + */ + if (parsed->xinfo & XACT_XINFO_HAS_AE_LOCKS) + StandbyReleaseLockTree(xid, parsed->nsubxacts, parsed->subxacts); + } + + if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN) + { + /* recover apply progress */ + replorigin_advance(origin_id, parsed->origin_lsn, lsn, + false /* backward */ , false /* WAL */ ); + } + + /* Make sure files supposed to be dropped are dropped */ + if (parsed->nrels > 0) + { + /* + * First update minimum recovery point to cover this WAL record. Once + * a relation is deleted, there's no going back. The buffer manager + * enforces the WAL-first rule for normal updates to relation files, + * so that the minimum recovery point is always updated before the + * corresponding change in the data file is flushed to disk, but we + * have to do the same here since we're bypassing the buffer manager. + * + * Doing this before deleting the files means that if a deletion fails + * for some reason, you cannot start up the system even after restart, + * until you fix the underlying situation so that the deletion will + * succeed. Alternatively, we could update the minimum recovery point + * after deletion, but that would leave a small window where the + * WAL-first rule would be violated. + */ + XLogFlush(lsn); + + /* Make sure files supposed to be dropped are dropped */ + DropRelationFiles(parsed->xnodes, parsed->nrels, true); + } + + if (parsed->nstats > 0) + { + /* see equivalent call for relations above */ + XLogFlush(lsn); + + pgstat_execute_transactional_drops(parsed->nstats, parsed->stats, true); + } + + /* + * We issue an XLogFlush() for the same reason we emit ForceSyncCommit() + * in normal operation. For example, in CREATE DATABASE, we copy all files + * from the template database, and then commit the transaction. If we + * crash after all the files have been copied but before the commit, you + * have files in the data directory without an entry in pg_database. To + * minimize the window for that, we use ForceSyncCommit() to rush the + * commit record to disk as quick as possible. We have the same window + * during recovery, and forcing an XLogFlush() (which updates + * minRecoveryPoint during recovery) helps to reduce that problem window, + * for any user that requested ForceSyncCommit(). + */ + if (XactCompletionForceSyncCommit(parsed->xinfo)) + XLogFlush(lsn); + + /* + * If asked by the primary (because someone is waiting for a synchronous + * commit = remote_apply), we will need to ask walreceiver to send a reply + * immediately. + */ + if (XactCompletionApplyFeedback(parsed->xinfo)) + XLogRequestWalReceiverReply(); +} + +/* + * Be careful with the order of execution, as with xact_redo_commit(). + * The two functions are similar but differ in key places. 
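+ * In particular, an abort sets no commit timestamps, sends no invalidation
+ * messages, and has no equivalent of the ForceSyncCommit or remote_apply
+ * feedback handling done for commits.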
+ * + * Note also that an abort can be for a subtransaction and its children, + * not just for a top level abort. That means we have to consider + * topxid != xid, whereas in commit we would find topxid == xid always + * because subtransaction commit is never WAL logged. + */ +static void +xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid, + XLogRecPtr lsn, RepOriginId origin_id) +{ + TransactionId max_xid; + + Assert(TransactionIdIsValid(xid)); + + /* Make sure nextXid is beyond any XID mentioned in the record. */ + max_xid = TransactionIdLatest(xid, + parsed->nsubxacts, + parsed->subxacts); + AdvanceNextFullTransactionIdPastXid(max_xid); + + if (standbyState == STANDBY_DISABLED) + { + /* Mark the transaction aborted in pg_xact, no need for async stuff */ + TransactionIdAbortTree(xid, parsed->nsubxacts, parsed->subxacts); + } + else + { + /* + * If a transaction completion record arrives that has as-yet + * unobserved subtransactions then this will not have been fully + * handled by the call to RecordKnownAssignedTransactionIds() in the + * main recovery loop in xlog.c. So we need to do bookkeeping again to + * cover that case. This is confusing and it is easy to think this + * call is irrelevant, which has happened three times in development + * already. Leave it in. + */ + RecordKnownAssignedTransactionIds(max_xid); + + /* Mark the transaction aborted in pg_xact, no need for async stuff */ + TransactionIdAbortTree(xid, parsed->nsubxacts, parsed->subxacts); + + /* + * We must update the ProcArray after we have marked clog. + */ + ExpireTreeKnownAssignedTransactionIds(xid, parsed->nsubxacts, parsed->subxacts, max_xid); + + /* + * There are no invalidation messages to send or undo. + */ + + /* + * Release locks, if any. There are no invalidations to send. + */ + if (parsed->xinfo & XACT_XINFO_HAS_AE_LOCKS) + StandbyReleaseLockTree(xid, parsed->nsubxacts, parsed->subxacts); + } + + if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN) + { + /* recover apply progress */ + replorigin_advance(origin_id, parsed->origin_lsn, lsn, + false /* backward */ , false /* WAL */ ); + } + + /* Make sure files supposed to be dropped are dropped */ + if (parsed->nrels > 0) + { + /* + * See comments about update of minimum recovery point on truncation, + * in xact_redo_commit(). + */ + XLogFlush(lsn); + + DropRelationFiles(parsed->xnodes, parsed->nrels, true); + } + + if (parsed->nstats > 0) + { + /* see equivalent call for relations above */ + XLogFlush(lsn); + + pgstat_execute_transactional_drops(parsed->nstats, parsed->stats, true); + } +} + +void +xact_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; + + /* Backup blocks are not used in xact records */ + Assert(!XLogRecHasAnyBlockRefs(record)); + + if (info == XLOG_XACT_COMMIT) + { + xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); + xl_xact_parsed_commit parsed; + + ParseCommitRecord(XLogRecGetInfo(record), xlrec, &parsed); + xact_redo_commit(&parsed, XLogRecGetXid(record), + record->EndRecPtr, XLogRecGetOrigin(record)); + } + else if (info == XLOG_XACT_COMMIT_PREPARED) + { + xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); + xl_xact_parsed_commit parsed; + + ParseCommitRecord(XLogRecGetInfo(record), xlrec, &parsed); + xact_redo_commit(&parsed, parsed.twophase_xid, + record->EndRecPtr, XLogRecGetOrigin(record)); + + /* Delete TwoPhaseState gxact entry and/or 2PC file. 
*/ + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + PrepareRedoRemove(parsed.twophase_xid, false); + LWLockRelease(TwoPhaseStateLock); + } + else if (info == XLOG_XACT_ABORT) + { + xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); + xl_xact_parsed_abort parsed; + + ParseAbortRecord(XLogRecGetInfo(record), xlrec, &parsed); + xact_redo_abort(&parsed, XLogRecGetXid(record), + record->EndRecPtr, XLogRecGetOrigin(record)); + } + else if (info == XLOG_XACT_ABORT_PREPARED) + { + xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); + xl_xact_parsed_abort parsed; + + ParseAbortRecord(XLogRecGetInfo(record), xlrec, &parsed); + xact_redo_abort(&parsed, parsed.twophase_xid, + record->EndRecPtr, XLogRecGetOrigin(record)); + + /* Delete TwoPhaseState gxact entry and/or 2PC file. */ + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + PrepareRedoRemove(parsed.twophase_xid, false); + LWLockRelease(TwoPhaseStateLock); + } + else if (info == XLOG_XACT_PREPARE) + { + /* + * Store xid and start/end pointers of the WAL record in TwoPhaseState + * gxact entry. + */ + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + PrepareRedoAdd(XLogRecGetData(record), + record->ReadRecPtr, + record->EndRecPtr, + XLogRecGetOrigin(record)); + LWLockRelease(TwoPhaseStateLock); + } + else if (info == XLOG_XACT_ASSIGNMENT) + { + xl_xact_assignment *xlrec = (xl_xact_assignment *) XLogRecGetData(record); + + if (standbyState >= STANDBY_INITIALIZED) + ProcArrayApplyXidAssignment(xlrec->xtop, + xlrec->nsubxacts, xlrec->xsub); + } + else if (info == XLOG_XACT_INVALIDATIONS) + { + /* + * XXX we do ignore this for now, what matters are invalidations + * written into the commit record. + */ + } + else + elog(PANIC, "xact_redo: unknown op code %u", info); +} diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c new file mode 100644 index 0000000..59f94b0 --- /dev/null +++ b/src/backend/access/transam/xlog.c @@ -0,0 +1,8906 @@ +/*------------------------------------------------------------------------- + * + * xlog.c + * PostgreSQL write-ahead log manager + * + * The Write-Ahead Log (WAL) functionality is split into several source + * files, in addition to this one: + * + * xloginsert.c - Functions for constructing WAL records + * xlogrecovery.c - WAL recovery and standby code + * xlogreader.c - Facility for reading WAL files and parsing WAL records + * xlogutils.c - Helper functions for WAL redo routines + * + * This file contains functions for coordinating database startup and + * checkpointing, and managing the write-ahead log buffers when the + * system is running. + * + * StartupXLOG() is the main entry point of the startup process. It + * coordinates database startup, performing WAL recovery, and the + * transition from WAL recovery into normal operations. + * + * XLogInsertRecord() inserts a WAL record into the WAL buffers. Most + * callers should not call this directly, but use the functions in + * xloginsert.c to construct the WAL record. XLogFlush() can be used + * to force the WAL to disk. + * + * In addition to those, there are many other functions for interrogating + * the current system state, and for starting/stopping backups. 
+ * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/xlog.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <ctype.h> +#include <math.h> +#include <time.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <unistd.h> + +#include "access/clog.h" +#include "access/commit_ts.h" +#include "access/heaptoast.h" +#include "access/multixact.h" +#include "access/rewriteheap.h" +#include "access/subtrans.h" +#include "access/timeline.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "access/xlogarchive.h" +#include "access/xloginsert.h" +#include "access/xlogprefetcher.h" +#include "access/xlogreader.h" +#include "access/xlogrecovery.h" +#include "access/xlogutils.h" +#include "backup/basebackup.h" +#include "catalog/catversion.h" +#include "catalog/pg_control.h" +#include "catalog/pg_database.h" +#include "common/controldata_utils.h" +#include "common/file_utils.h" +#include "executor/instrument.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "port/atomics.h" +#include "port/pg_iovec.h" +#include "postmaster/bgwriter.h" +#include "postmaster/startup.h" +#include "postmaster/walwriter.h" +#include "replication/logical.h" +#include "replication/origin.h" +#include "replication/slot.h" +#include "replication/snapbuild.h" +#include "replication/walreceiver.h" +#include "replication/walsender.h" +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/large_object.h" +#include "storage/latch.h" +#include "storage/pmsignal.h" +#include "storage/predicate.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/reinit.h" +#include "storage/smgr.h" +#include "storage/spin.h" +#include "storage/sync.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" +#include "utils/relmapper.h" +#include "utils/pg_rusage.h" +#include "utils/snapmgr.h" +#include "utils/timeout.h" +#include "utils/timestamp.h" + +extern uint32 bootstrap_data_checksum_version; + +/* timeline ID to be used when bootstrapping */ +#define BootstrapTimeLineID 1 + +/* User-settable parameters */ +int max_wal_size_mb = 1024; /* 1 GB */ +int min_wal_size_mb = 80; /* 80 MB */ +int wal_keep_size_mb = 0; +int XLOGbuffers = -1; +int XLogArchiveTimeout = 0; +int XLogArchiveMode = ARCHIVE_MODE_OFF; +char *XLogArchiveCommand = NULL; +bool EnableHotStandby = false; +bool fullPageWrites = true; +bool wal_log_hints = false; +int wal_compression = WAL_COMPRESSION_NONE; +char *wal_consistency_checking_string = NULL; +bool *wal_consistency_checking = NULL; +bool wal_init_zero = true; +bool wal_recycle = true; +bool log_checkpoints = true; +int sync_method = DEFAULT_SYNC_METHOD; +int wal_level = WAL_LEVEL_MINIMAL; +int CommitDelay = 0; /* precommit delay in microseconds */ +int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ +int wal_retrieve_retry_interval = 5000; +int max_slot_wal_keep_size_mb = -1; +int wal_decode_buffer_size = 512 * 1024; +bool track_wal_io_timing = false; + +#ifdef WAL_DEBUG +bool XLOG_DEBUG = false; +#endif + +int wal_segment_size = DEFAULT_XLOG_SEG_SIZE; + +/* + * Number of WAL insertion locks to use. 
A higher value allows more insertions + * to happen concurrently, but adds some CPU overhead to flushing the WAL, + * which needs to iterate all the locks. + */ +#define NUM_XLOGINSERT_LOCKS 8 + +/* + * Max distance from last checkpoint, before triggering a new xlog-based + * checkpoint. + */ +int CheckPointSegments; + +/* Estimated distance between checkpoints, in bytes */ +static double CheckPointDistanceEstimate = 0; +static double PrevCheckPointDistance = 0; + +/* + * GUC support + */ +const struct config_enum_entry sync_method_options[] = { + {"fsync", SYNC_METHOD_FSYNC, false}, +#ifdef HAVE_FSYNC_WRITETHROUGH + {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false}, +#endif +#ifdef HAVE_FDATASYNC + {"fdatasync", SYNC_METHOD_FDATASYNC, false}, +#endif +#ifdef OPEN_SYNC_FLAG + {"open_sync", SYNC_METHOD_OPEN, false}, +#endif +#ifdef OPEN_DATASYNC_FLAG + {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false}, +#endif + {NULL, 0, false} +}; + + +/* + * Although only "on", "off", and "always" are documented, + * we accept all the likely variants of "on" and "off". + */ +const struct config_enum_entry archive_mode_options[] = { + {"always", ARCHIVE_MODE_ALWAYS, false}, + {"on", ARCHIVE_MODE_ON, false}, + {"off", ARCHIVE_MODE_OFF, false}, + {"true", ARCHIVE_MODE_ON, true}, + {"false", ARCHIVE_MODE_OFF, true}, + {"yes", ARCHIVE_MODE_ON, true}, + {"no", ARCHIVE_MODE_OFF, true}, + {"1", ARCHIVE_MODE_ON, true}, + {"0", ARCHIVE_MODE_OFF, true}, + {NULL, 0, false} +}; + +/* + * Statistics for current checkpoint are collected in this global struct. + * Because only the checkpointer or a stand-alone backend can perform + * checkpoints, this will be unused in normal backends. + */ +CheckpointStatsData CheckpointStats; + +/* + * During recovery, lastFullPageWrites keeps track of full_page_writes that + * the replayed WAL records indicate. It's initialized with full_page_writes + * that the recovery starting checkpoint record indicates, and then updated + * each time XLOG_FPW_CHANGE record is replayed. + */ +static bool lastFullPageWrites; + +/* + * Local copy of the state tracked by SharedRecoveryState in shared memory, + * It is false if SharedRecoveryState is RECOVERY_STATE_DONE. True actually + * means "not known, need to check the shared state". + */ +static bool LocalRecoveryInProgress = true; + +/* + * Local state for XLogInsertAllowed(): + * 1: unconditionally allowed to insert XLOG + * 0: unconditionally not allowed to insert XLOG + * -1: must check RecoveryInProgress(); disallow until it is false + * Most processes start with -1 and transition to 1 after seeing that recovery + * is not in progress. But we can also force the value for special cases. + * The coding in XLogInsertAllowed() depends on the first two of these states + * being numerically the same as bool true and false. + */ +static int LocalXLogInsertAllowed = -1; + +/* + * ProcLastRecPtr points to the start of the last XLOG record inserted by the + * current backend. It is updated for all inserts. XactLastRecEnd points to + * end+1 of the last record, and is reset when we end a top-level transaction, + * or start a new one; so it can be used to tell if the current transaction has + * created any XLOG records. + * + * While in parallel mode, this may not be fully up to date. When committing, + * a transaction can assume this covers all xlog records written either by the + * user backend or by any parallel worker which was present at any point during + * the transaction. 
But when aborting, or when still in parallel mode, other + * parallel backends may have written WAL records at later LSNs than the value + * stored here. The parallel leader advances its own copy, when necessary, + * in WaitForParallelWorkersToFinish. + */ +XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr; +XLogRecPtr XactLastRecEnd = InvalidXLogRecPtr; +XLogRecPtr XactLastCommitEnd = InvalidXLogRecPtr; + +/* + * RedoRecPtr is this backend's local copy of the REDO record pointer + * (which is almost but not quite the same as a pointer to the most recent + * CHECKPOINT record). We update this from the shared-memory copy, + * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we + * hold an insertion lock). See XLogInsertRecord for details. We are also + * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck; + * see GetRedoRecPtr. + * + * NB: Code that uses this variable must be prepared not only for the + * possibility that it may be arbitrarily out of date, but also for the + * possibility that it might be set to InvalidXLogRecPtr. We used to + * initialize it as a side effect of the first call to RecoveryInProgress(), + * which meant that most code that might use it could assume that it had a + * real if perhaps stale value. That's no longer the case. + */ +static XLogRecPtr RedoRecPtr; + +/* + * doPageWrites is this backend's local copy of (forcePageWrites || + * fullPageWrites). It is used together with RedoRecPtr to decide whether + * a full-page image of a page need to be taken. + * + * NB: Initially this is false, and there's no guarantee that it will be + * initialized to any other value before it is first used. Any code that + * makes use of it must recheck the value after obtaining a WALInsertLock, + * and respond appropriately if it turns out that the previous value wasn't + * accurate. + */ +static bool doPageWrites; + +/*---------- + * Shared-memory data structures for XLOG control + * + * LogwrtRqst indicates a byte position that we need to write and/or fsync + * the log up to (all records before that point must be written or fsynced). + * LogwrtResult indicates the byte positions we have already written/fsynced. + * These structs are identical but are declared separately to indicate their + * slightly different functions. + * + * To read XLogCtl->LogwrtResult, you must hold either info_lck or + * WALWriteLock. To update it, you need to hold both locks. The point of + * this arrangement is that the value can be examined by code that already + * holds WALWriteLock without needing to grab info_lck as well. In addition + * to the shared variable, each backend has a private copy of LogwrtResult, + * which is updated when convenient. + * + * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst + * (protected by info_lck), but we don't need to cache any copies of it. + * + * info_lck is only held long enough to read/update the protected variables, + * so it's a plain spinlock. The other locks are held longer (potentially + * over I/O operations), so we use LWLocks for them. These locks are: + * + * WALBufMappingLock: must be held to replace a page in the WAL buffer cache. + * It is only held while initializing and changing the mapping. If the + * contents of the buffer being replaced haven't been written yet, the mapping + * lock is released while the write is done, and reacquired afterwards. + * + * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or + * XLogFlush). 
+ *
+ * ControlFileLock: must be held to read/update control file or create
+ * new log file.
+ *
+ *----------
+ */
+
+typedef struct XLogwrtRqst
+{
+ XLogRecPtr Write; /* last byte + 1 to write out */
+ XLogRecPtr Flush; /* last byte + 1 to flush */
+} XLogwrtRqst;
+
+typedef struct XLogwrtResult
+{
+ XLogRecPtr Write; /* last byte + 1 written out */
+ XLogRecPtr Flush; /* last byte + 1 flushed */
+} XLogwrtResult;
+
+/*
+ * Inserting to WAL is protected by a small fixed number of WAL insertion
+ * locks. To insert to the WAL, you must hold one of the locks - it doesn't
+ * matter which one. To lock out other concurrent insertions, you must hold
+ * all of them. Each WAL insertion lock consists of a lightweight lock, plus an
+ * indicator of how far the insertion has progressed (insertingAt).
+ *
+ * The insertingAt values are read when a process wants to flush WAL from
+ * the in-memory buffers to disk, to check that all the insertions to the
+ * region the process is about to write out have finished. You could simply
+ * wait for all currently in-progress insertions to finish, but the
+ * insertingAt indicator allows you to ignore insertions to later in the WAL,
+ * so that you only wait for the insertions that are modifying the buffers
+ * you're about to write out.
+ *
+ * This isn't just an optimization. If all the WAL buffers are dirty, an
+ * inserter that's holding a WAL insert lock might need to evict an old WAL
+ * buffer, which requires flushing the WAL. If it's possible for an inserter
+ * to block on another inserter unnecessarily, deadlock can arise when two
+ * inserters holding a WAL insert lock wait for each other to finish their
+ * insertion.
+ *
+ * Small WAL records that don't cross a page boundary never update the value,
+ * the WAL record is just copied to the page and the lock is released. But
+ * to avoid the deadlock-scenario explained above, the indicator is always
+ * updated before sleeping while holding an insertion lock.
+ *
+ * lastImportantAt contains the LSN of the last important WAL record inserted
+ * using a given lock. This value is used to detect if there has been
+ * important WAL activity since the last time some action, like a checkpoint,
+ * was performed, allowing the action to be skipped if there has been none. The LSN is
+ * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
+ * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
+ * records. Tracking the WAL activity directly in WALInsertLock has the
+ * advantage of not needing any additional locks to update the value.
+ */
+typedef struct
+{
+ LWLock lock;
+ XLogRecPtr insertingAt;
+ XLogRecPtr lastImportantAt;
+} WALInsertLock;
+
+/*
+ * All the WAL insertion locks are allocated as an array in shared memory. We
+ * force the array stride to be a power of 2, which saves a few cycles in
+ * indexing, but more importantly also ensures that individual slots don't
+ * cross cache line boundaries. (Of course, we have to also ensure that the
+ * array start address is suitably aligned.)
+ */
+typedef union WALInsertLockPadded
+{
+ WALInsertLock l;
+ char pad[PG_CACHE_LINE_SIZE];
+} WALInsertLockPadded;
+
+/*
+ * Session status of running backup, used for sanity checks in SQL-callable
+ * functions to start and stop backups.
+ */
+static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
+
+/*
+ * Shared state data for WAL insertion.
+ */ +typedef struct XLogCtlInsert +{ + slock_t insertpos_lck; /* protects CurrBytePos and PrevBytePos */ + + /* + * CurrBytePos is the end of reserved WAL. The next record will be + * inserted at that position. PrevBytePos is the start position of the + * previously inserted (or rather, reserved) record - it is copied to the + * prev-link of the next record. These are stored as "usable byte + * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()). + */ + uint64 CurrBytePos; + uint64 PrevBytePos; + + /* + * Make sure the above heavily-contended spinlock and byte positions are + * on their own cache line. In particular, the RedoRecPtr and full page + * write variables below should be on a different cache line. They are + * read on every WAL insertion, but updated rarely, and we don't want + * those reads to steal the cache line containing Curr/PrevBytePos. + */ + char pad[PG_CACHE_LINE_SIZE]; + + /* + * fullPageWrites is the authoritative value used by all backends to + * determine whether to write full-page image to WAL. This shared value, + * instead of the process-local fullPageWrites, is required because, when + * full_page_writes is changed by SIGHUP, we must WAL-log it before it + * actually affects WAL-logging by backends. Checkpointer sets at startup + * or after SIGHUP. + * + * To read these fields, you must hold an insertion lock. To modify them, + * you must hold ALL the locks. + */ + XLogRecPtr RedoRecPtr; /* current redo point for insertions */ + bool forcePageWrites; /* forcing full-page writes for PITR? */ + bool fullPageWrites; + + /* + * runningBackups is a counter indicating the number of backups currently + * in progress. forcePageWrites is set to true when runningBackups is + * non-zero. lastBackupStart is the latest checkpoint redo location used + * as a starting point for an online backup. + */ + int runningBackups; + XLogRecPtr lastBackupStart; + + /* + * WAL insertion locks. + */ + WALInsertLockPadded *WALInsertLocks; +} XLogCtlInsert; + +/* + * Total shared-memory state for XLOG. + */ +typedef struct XLogCtlData +{ + XLogCtlInsert Insert; + + /* Protected by info_lck: */ + XLogwrtRqst LogwrtRqst; + XLogRecPtr RedoRecPtr; /* a recent copy of Insert->RedoRecPtr */ + FullTransactionId ckptFullXid; /* nextXid of latest checkpoint */ + XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */ + XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */ + + XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG segment */ + + /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */ + XLogRecPtr unloggedLSN; + slock_t ulsn_lck; + + /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */ + pg_time_t lastSegSwitchTime; + XLogRecPtr lastSegSwitchLSN; + + /* + * Protected by info_lck and WALWriteLock (you must hold either lock to + * read it, but both to update) + */ + XLogwrtResult LogwrtResult; + + /* + * Latest initialized page in the cache (last byte position + 1). + * + * To change the identity of a buffer (and InitializedUpTo), you need to + * hold WALBufMappingLock. To change the identity of a buffer that's + * still dirty, the old page needs to be written out first, and for that + * you need WALWriteLock, and you need to ensure that there are no + * in-progress insertions to the page by calling + * WaitXLogInsertionsToFinish(). + */ + XLogRecPtr InitializedUpTo; + + /* + * These values do not change after startup, although the pointed-to pages + * and xlblocks values certainly do. 
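Both the pad[] member of XLogCtlInsert and the WALInsertLockPadded union above rely on the same cache-line-padding trick. The following standalone sketch (hypothetical DemoInsertLock layout, assumed 128-byte cache line; not part of xlog.c) shows the idea in isolation and can be compiled on its own.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_CACHE_LINE_SIZE 128    /* stand-in for PG_CACHE_LINE_SIZE */

typedef struct
{
    uint64_t    lock_word;          /* stand-in for the LWLock itself */
    uint64_t    inserting_at;       /* progress indicator */
    uint64_t    last_important_at;  /* LSN of last important record */
} DemoInsertLock;

typedef union
{
    DemoInsertLock l;
    char        pad[DEMO_CACHE_LINE_SIZE];
} DemoInsertLockPadded;

int
main(void)
{
    /* The union forces every array element onto its own cache line (given a
     * suitably aligned base), so two processes hammering on different slots
     * never share a line. */
    assert(sizeof(DemoInsertLockPadded) == DEMO_CACHE_LINE_SIZE);
    printf("slot stride = %zu bytes\n", sizeof(DemoInsertLockPadded));
    return 0;
}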
xlblocks values are protected by + * WALBufMappingLock. + */ + char *pages; /* buffers for unwritten XLOG pages */ + XLogRecPtr *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */ + int XLogCacheBlck; /* highest allocated xlog buffer index */ + + /* + * InsertTimeLineID is the timeline into which new WAL is being inserted + * and flushed. It is zero during recovery, and does not change once set. + * + * If we create a new timeline when the system was started up, + * PrevTimeLineID is the old timeline's ID that we forked off from. + * Otherwise it's equal to InsertTimeLineID. + */ + TimeLineID InsertTimeLineID; + TimeLineID PrevTimeLineID; + + /* + * SharedRecoveryState indicates if we're still in crash or archive + * recovery. Protected by info_lck. + */ + RecoveryState SharedRecoveryState; + + /* + * InstallXLogFileSegmentActive indicates whether the checkpointer should + * arrange for future segments by recycling and/or PreallocXlogFiles(). + * Protected by ControlFileLock. Only the startup process changes it. If + * true, anyone can use InstallXLogFileSegment(). If false, the startup + * process owns the exclusive right to install segments, by reading from + * the archive and possibly replacing existing files. + */ + bool InstallXLogFileSegmentActive; + + /* + * WalWriterSleeping indicates whether the WAL writer is currently in + * low-power mode (and hence should be nudged if an async commit occurs). + * Protected by info_lck. + */ + bool WalWriterSleeping; + + /* + * During recovery, we keep a copy of the latest checkpoint record here. + * lastCheckPointRecPtr points to start of checkpoint record and + * lastCheckPointEndPtr points to end+1 of checkpoint record. Used by the + * checkpointer when it wants to create a restartpoint. + * + * Protected by info_lck. + */ + XLogRecPtr lastCheckPointRecPtr; + XLogRecPtr lastCheckPointEndPtr; + CheckPoint lastCheckPoint; + + /* + * lastFpwDisableRecPtr points to the start of the last replayed + * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled. + */ + XLogRecPtr lastFpwDisableRecPtr; + + slock_t info_lck; /* locks shared variables shown above */ +} XLogCtlData; + +static XLogCtlData *XLogCtl = NULL; + +/* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */ +static WALInsertLockPadded *WALInsertLocks = NULL; + +/* + * We maintain an image of pg_control in shared memory. + */ +static ControlFileData *ControlFile = NULL; + +/* + * Calculate the amount of space left on the page after 'endptr'. Beware + * multiple evaluation! + */ +#define INSERT_FREESPACE(endptr) \ + (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ)) + +/* Macro to advance to next buffer index. */ +#define NextBufIdx(idx) \ + (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1)) + +/* + * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or + * would hold if it was in cache, the page containing 'recptr'. + */ +#define XLogRecPtrToBufIdx(recptr) \ + (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1)) + +/* + * These are the number of bytes in a WAL page usable for WAL data. + */ +#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD) + +/* + * Convert values of GUCs measured in megabytes to equiv. segment count. + * Rounds down. + */ +#define ConvertToXSegs(x, segsize) XLogMBVarToSegs((x), (segsize)) + +/* The number of bytes in a WAL segment usable for WAL data. */ +static int UsableBytesInSegment; + +/* + * Private, possibly out-of-date copy of shared LogwrtResult. + * See discussion above. 
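As a standalone illustration of the INSERT_FREESPACE and XLogRecPtrToBufIdx arithmetic above (assumed 8 kB pages and a made-up buffer count; not part of xlog.c): every WAL page has exactly one buffer slot it can occupy, so the slot and the remaining space on the page follow directly from the LSN.

#include <stdint.h>
#include <stdio.h>

#define DEMO_BLCKSZ     8192    /* stand-in for XLOG_BLCKSZ */
#define DEMO_NBUFFERS   512     /* stand-in for XLogCacheBlck + 1 */

int
main(void)
{
    uint64_t    recptr = 81930000;  /* arbitrary example LSN */
    uint64_t    idx = (recptr / DEMO_BLCKSZ) % DEMO_NBUFFERS;
    uint64_t    freespace = (recptr % DEMO_BLCKSZ == 0) ?
        0 : DEMO_BLCKSZ - recptr % DEMO_BLCKSZ;

    /* The page is found by dividing out the block size and wrapping around
     * the buffer array; free space is whatever is left after this offset. */
    printf("buffer index %llu, free space %llu bytes\n",
           (unsigned long long) idx, (unsigned long long) freespace);
    return 0;
}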
+ */ +static XLogwrtResult LogwrtResult = {0, 0}; + +/* + * openLogFile is -1 or a kernel FD for an open log file segment. + * openLogSegNo identifies the segment, and openLogTLI the corresponding TLI. + * These variables are only used to write the XLOG, and so will normally refer + * to the active segment. + * + * Note: call Reserve/ReleaseExternalFD to track consumption of this FD. + */ +static int openLogFile = -1; +static XLogSegNo openLogSegNo = 0; +static TimeLineID openLogTLI = 0; + +/* + * Local copies of equivalent fields in the control file. When running + * crash recovery, LocalMinRecoveryPoint is set to InvalidXLogRecPtr as we + * expect to replay all the WAL available, and updateMinRecoveryPoint is + * switched to false to prevent any updates while replaying records. + * Those values are kept consistent as long as crash recovery runs. + */ +static XLogRecPtr LocalMinRecoveryPoint; +static TimeLineID LocalMinRecoveryPointTLI; +static bool updateMinRecoveryPoint = true; + +/* For WALInsertLockAcquire/Release functions */ +static int MyLockNo = 0; +static bool holdingAllLocks = false; + +#ifdef WAL_DEBUG +static MemoryContext walDebugCxt = NULL; +#endif + +static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, + XLogRecPtr EndOfLog, + TimeLineID newTLI); +static void CheckRequiredParameterValues(void); +static void XLogReportParameters(void); +static int LocalSetXLogInsertAllowed(void); +static void CreateEndOfRecoveryRecord(void); +static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, + XLogRecPtr missingContrecPtr, + TimeLineID newTLI); +static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags); +static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo); +static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void); + +static void AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, + bool opportunistic); +static void XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible); +static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, + bool find_free, XLogSegNo max_segno, + TimeLineID tli); +static void XLogFileClose(void); +static void PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli); +static void RemoveTempXlogFiles(void); +static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, + XLogRecPtr endptr, TimeLineID insertTLI); +static void RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo, + XLogSegNo *endlogSegNo, TimeLineID insertTLI); +static void UpdateLastRemovedPtr(char *filename); +static void ValidateXLOGDirectoryStructure(void); +static void CleanupBackupHistory(void); +static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force); +static bool PerformRecoveryXLogAction(void); +static void InitControlFile(uint64 sysidentifier); +static void WriteControlFile(void); +static void ReadControlFile(void); +static void UpdateControlFile(void); +static char *str_time(pg_time_t tnow); + +static void pg_backup_start_callback(int code, Datum arg); + +static int get_sync_bit(int method); + +static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch, + XLogRecData *rdata, + XLogRecPtr StartPos, XLogRecPtr EndPos, + TimeLineID tli); +static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, + XLogRecPtr *EndPos, XLogRecPtr *PrevPtr); +static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, + XLogRecPtr *PrevPtr); +static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto); +static char *GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli); +static XLogRecPtr 
XLogBytePosToRecPtr(uint64 bytepos); +static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos); +static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr); + +static void WALInsertLockAcquire(void); +static void WALInsertLockAcquireExclusive(void); +static void WALInsertLockRelease(void); +static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); + +/* + * Insert an XLOG record represented by an already-constructed chain of data + * chunks. This is a low-level routine; to construct the WAL record header + * and data, use the higher-level routines in xloginsert.c. + * + * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this + * WAL record applies to, that were not included in the record as full page + * images. If fpw_lsn <= RedoRecPtr, the function does not perform the + * insertion and returns InvalidXLogRecPtr. The caller can then recalculate + * which pages need a full-page image, and retry. If fpw_lsn is invalid, the + * record is always inserted. + * + * 'flags' gives more in-depth control on the record being inserted. See + * XLogSetRecordFlags() for details. + * + * 'topxid_included' tells whether the top-transaction id is logged along with + * current subtransaction. See XLogRecordAssemble(). + * + * The first XLogRecData in the chain must be for the record header, and its + * data must be MAXALIGNed. XLogInsertRecord fills in the xl_prev and + * xl_crc fields in the header, the rest of the header must already be filled + * by the caller. + * + * Returns XLOG pointer to end of record (beginning of next record). + * This can be used as LSN for data pages affected by the logged action. + * (LSN is the XLOG point up to which the XLOG must be flushed to disk + * before the data page can be written out. This implements the basic + * WAL rule "write the log before the data".) + */ +XLogRecPtr +XLogInsertRecord(XLogRecData *rdata, + XLogRecPtr fpw_lsn, + uint8 flags, + int num_fpi, + bool topxid_included) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + pg_crc32c rdata_crc; + bool inserted; + XLogRecord *rechdr = (XLogRecord *) rdata->data; + uint8 info = rechdr->xl_info & ~XLR_INFO_MASK; + bool isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID && + info == XLOG_SWITCH); + XLogRecPtr StartPos; + XLogRecPtr EndPos; + bool prevDoPageWrites = doPageWrites; + TimeLineID insertTLI; + + /* we assume that all of the record header is in the first chunk */ + Assert(rdata->len >= SizeOfXLogRecord); + + /* cross-check on whether we should be here or not */ + if (!XLogInsertAllowed()) + elog(ERROR, "cannot make new WAL entries during recovery"); + + /* + * Given that we're not in recovery, InsertTimeLineID is set and can't + * change, so we can read it without a lock. + */ + insertTLI = XLogCtl->InsertTimeLineID; + + /*---------- + * + * We have now done all the preparatory work we can without holding a + * lock or modifying shared state. From here on, inserting the new WAL + * record to the shared WAL buffer cache is a two-step process: + * + * 1. Reserve the right amount of space from the WAL. The current head of + * reserved space is kept in Insert->CurrBytePos, and is protected by + * insertpos_lck. + * + * 2. Copy the record to the reserved WAL space. This involves finding the + * correct WAL buffer containing the reserved space, and copying the + * record in place. This can be done concurrently in multiple processes. + * + * To keep track of which insertions are still in-progress, each concurrent + * inserter acquires an insertion lock. 
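A standalone sketch of the fpw_lsn retry contract described at the top of this comment, with made-up LSNs and hypothetical demo_* names standing in for the real machinery: when the insert routine bails out because a full-page image is now required, the caller recomputes its images and tries again.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t DemoRecPtr;
#define DEMO_INVALID_PTR ((DemoRecPtr) 0)

static DemoRecPtr demo_redo_ptr = 1000;     /* pretend checkpoint REDO LSN */

/* Pretend insert routine: refuses when the caller decided against a
 * full-page image based on a now-stale redo pointer. */
static DemoRecPtr
demo_insert(DemoRecPtr fpw_lsn)
{
    if (fpw_lsn != DEMO_INVALID_PTR && fpw_lsn <= demo_redo_ptr)
        return DEMO_INVALID_PTR;
    return 2000;                            /* pretend end-of-record LSN */
}

int
main(void)
{
    DemoRecPtr  fpw_lsn = 900;              /* oldest page LSN lacking an FPI */
    DemoRecPtr  end;

    /* Caller-side contract: on an invalid result, recompute which pages need
     * full-page images and retry the insertion. */
    while ((end = demo_insert(fpw_lsn)) == DEMO_INVALID_PTR)
        fpw_lsn = DEMO_INVALID_PTR;         /* pretend the FPIs were added */

    printf("record ends at %llu\n", (unsigned long long) end);
    return 0;
}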
In addition to just indicating that + * an insertion is in progress, the lock tells others how far the inserter + * has progressed. There is a small fixed number of insertion locks, + * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page + * boundary, it updates the value stored in the lock to the how far it has + * inserted, to allow the previous buffer to be flushed. + * + * Holding onto an insertion lock also protects RedoRecPtr and + * fullPageWrites from changing until the insertion is finished. + * + * Step 2 can usually be done completely in parallel. If the required WAL + * page is not initialized yet, you have to grab WALBufMappingLock to + * initialize it, but the WAL writer tries to do that ahead of insertions + * to avoid that from happening in the critical path. + * + *---------- + */ + START_CRIT_SECTION(); + if (isLogSwitch) + WALInsertLockAcquireExclusive(); + else + WALInsertLockAcquire(); + + /* + * Check to see if my copy of RedoRecPtr is out of date. If so, may have + * to go back and have the caller recompute everything. This can only + * happen just after a checkpoint, so it's better to be slow in this case + * and fast otherwise. + * + * Also check to see if fullPageWrites or forcePageWrites was just turned + * on; if we weren't already doing full-page writes then go back and + * recompute. + * + * If we aren't doing full-page writes then RedoRecPtr doesn't actually + * affect the contents of the XLOG record, so we'll update our local copy + * but not force a recomputation. (If doPageWrites was just turned off, + * we could recompute the record without full pages, but we choose not to + * bother.) + */ + if (RedoRecPtr != Insert->RedoRecPtr) + { + Assert(RedoRecPtr < Insert->RedoRecPtr); + RedoRecPtr = Insert->RedoRecPtr; + } + doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites); + + if (doPageWrites && + (!prevDoPageWrites || + (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr))) + { + /* + * Oops, some buffer now needs to be backed up that the caller didn't + * back up. Start over. + */ + WALInsertLockRelease(); + END_CRIT_SECTION(); + return InvalidXLogRecPtr; + } + + /* + * Reserve space for the record in the WAL. This also sets the xl_prev + * pointer. + */ + if (isLogSwitch) + inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev); + else + { + ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos, + &rechdr->xl_prev); + inserted = true; + } + + if (inserted) + { + /* + * Now that xl_prev has been filled in, calculate CRC of the record + * header. + */ + rdata_crc = rechdr->xl_crc; + COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc)); + FIN_CRC32C(rdata_crc); + rechdr->xl_crc = rdata_crc; + + /* + * All the record data, including the header, is now ready to be + * inserted. Copy the record in the space reserved. + */ + CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata, + StartPos, EndPos, insertTLI); + + /* + * Unless record is flagged as not important, update LSN of last + * important record in the current slot. When holding all locks, just + * update the first one. + */ + if ((flags & XLOG_MARK_UNIMPORTANT) == 0) + { + int lockno = holdingAllLocks ? 0 : MyLockNo; + + WALInsertLocks[lockno].l.lastImportantAt = StartPos; + } + } + else + { + /* + * This was an xlog-switch record, but the current insert location was + * already exactly at the beginning of a segment, so there was no need + * to do anything. + */ + } + + /* + * Done! Let others know that we're finished. 
+ */ + WALInsertLockRelease(); + + END_CRIT_SECTION(); + + MarkCurrentTransactionIdLoggedIfAny(); + + /* + * Mark top transaction id is logged (if needed) so that we should not try + * to log it again with the next WAL record in the current subtransaction. + */ + if (topxid_included) + MarkSubxactTopXidLogged(); + + /* + * Update shared LogwrtRqst.Write, if we crossed page boundary. + */ + if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ) + { + SpinLockAcquire(&XLogCtl->info_lck); + /* advance global request to include new block(s) */ + if (XLogCtl->LogwrtRqst.Write < EndPos) + XLogCtl->LogwrtRqst.Write = EndPos; + /* update local result copy while I have the chance */ + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + } + + /* + * If this was an XLOG_SWITCH record, flush the record and the empty + * padding space that fills the rest of the segment, and perform + * end-of-segment actions (eg, notifying archiver). + */ + if (isLogSwitch) + { + TRACE_POSTGRESQL_WAL_SWITCH(); + XLogFlush(EndPos); + + /* + * Even though we reserved the rest of the segment for us, which is + * reflected in EndPos, we return a pointer to just the end of the + * xlog-switch record. + */ + if (inserted) + { + EndPos = StartPos + SizeOfXLogRecord; + if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ) + { + uint64 offset = XLogSegmentOffset(EndPos, wal_segment_size); + + if (offset == EndPos % XLOG_BLCKSZ) + EndPos += SizeOfXLogLongPHD; + else + EndPos += SizeOfXLogShortPHD; + } + } + } + +#ifdef WAL_DEBUG + if (XLOG_DEBUG) + { + static XLogReaderState *debug_reader = NULL; + XLogRecord *record; + DecodedXLogRecord *decoded; + StringInfoData buf; + StringInfoData recordBuf; + char *errormsg = NULL; + MemoryContext oldCxt; + + oldCxt = MemoryContextSwitchTo(walDebugCxt); + + initStringInfo(&buf); + appendStringInfo(&buf, "INSERT @ %X/%X: ", LSN_FORMAT_ARGS(EndPos)); + + /* + * We have to piece together the WAL record data from the XLogRecData + * entries, so that we can pass it to the rm_desc function as one + * contiguous chunk. + */ + initStringInfo(&recordBuf); + for (; rdata != NULL; rdata = rdata->next) + appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len); + + /* We also need temporary space to decode the record. */ + record = (XLogRecord *) recordBuf.data; + decoded = (DecodedXLogRecord *) + palloc(DecodeXLogRecordRequiredSpace(record->xl_tot_len)); + + if (!debug_reader) + debug_reader = XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(), NULL); + + if (!debug_reader) + { + appendStringInfoString(&buf, "error decoding record: out of memory while allocating a WAL reading processor"); + } + else if (!DecodeXLogRecord(debug_reader, + decoded, + record, + EndPos, + &errormsg)) + { + appendStringInfo(&buf, "error decoding record: %s", + errormsg ? errormsg : "no error message"); + } + else + { + appendStringInfoString(&buf, " - "); + + debug_reader->record = decoded; + xlog_outdesc(&buf, debug_reader); + debug_reader->record = NULL; + } + elog(LOG, "%s", buf.data); + + pfree(decoded); + pfree(buf.data); + pfree(recordBuf.data); + MemoryContextSwitchTo(oldCxt); + } +#endif + + /* + * Update our global variables + */ + ProcLastRecPtr = StartPos; + XactLastRecEnd = EndPos; + + /* Report WAL traffic to the instrumentation. */ + if (inserted) + { + pgWalUsage.wal_bytes += rechdr->xl_tot_len; + pgWalUsage.wal_records++; + pgWalUsage.wal_fpi += num_fpi; + } + + return EndPos; +} + +/* + * Reserves the right amount of space for a record of given size from the WAL. 
+ * *StartPos is set to the beginning of the reserved section, *EndPos to + * its end+1. *PrevPtr is set to the beginning of the previous record; it is + * used to set the xl_prev of this record. + * + * This is the performance critical part of XLogInsert that must be serialized + * across backends. The rest can happen mostly in parallel. Try to keep this + * section as short as possible, insertpos_lck can be heavily contended on a + * busy system. + * + * NB: The space calculation here must match the code in CopyXLogRecordToWAL, + * where we actually copy the record to the reserved space. + */ +static void +ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos, + XLogRecPtr *PrevPtr) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + uint64 startbytepos; + uint64 endbytepos; + uint64 prevbytepos; + + size = MAXALIGN(size); + + /* All (non xlog-switch) records should contain data. */ + Assert(size > SizeOfXLogRecord); + + /* + * The duration the spinlock needs to be held is minimized by minimizing + * the calculations that have to be done while holding the lock. The + * current tip of reserved WAL is kept in CurrBytePos, as a byte position + * that only counts "usable" bytes in WAL, that is, it excludes all WAL + * page headers. The mapping between "usable" byte positions and physical + * positions (XLogRecPtrs) can be done outside the locked region, and + * because the usable byte position doesn't include any headers, reserving + * X bytes from WAL is almost as simple as "CurrBytePos += X". + */ + SpinLockAcquire(&Insert->insertpos_lck); + + startbytepos = Insert->CurrBytePos; + endbytepos = startbytepos + size; + prevbytepos = Insert->PrevBytePos; + Insert->CurrBytePos = endbytepos; + Insert->PrevBytePos = startbytepos; + + SpinLockRelease(&Insert->insertpos_lck); + + *StartPos = XLogBytePosToRecPtr(startbytepos); + *EndPos = XLogBytePosToEndRecPtr(endbytepos); + *PrevPtr = XLogBytePosToRecPtr(prevbytepos); + + /* + * Check that the conversions between "usable byte positions" and + * XLogRecPtrs work consistently in both directions. + */ + Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos); + Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos); + Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos); +} + +/* + * Like ReserveXLogInsertLocation(), but for an xlog-switch record. + * + * A log-switch record is handled slightly differently. The rest of the + * segment will be reserved for this insertion, as indicated by the returned + * *EndPos value. However, if we are already at the beginning of the current + * segment, *StartPos and *EndPos are set to the current location without + * reserving any space, and the function returns false. +*/ +static bool +ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + uint64 startbytepos; + uint64 endbytepos; + uint64 prevbytepos; + uint32 size = MAXALIGN(SizeOfXLogRecord); + XLogRecPtr ptr; + uint32 segleft; + + /* + * These calculations are a bit heavy-weight to be done while holding a + * spinlock, but since we're holding all the WAL insertion locks, there + * are no other inserters competing for it. GetXLogInsertRecPtr() does + * compete for it, but that's not called very frequently. 
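The heart of ReserveXLogInsertLocation above is just advancing a byte counter inside a very short critical section. A standalone sketch of that pattern, with a pthread mutex standing in for the spinlock and made-up byte positions (hypothetical names, not the real data structures):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t insertpos_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t curr_byte_pos = 512;    /* running tip of reserved WAL */
static uint64_t prev_byte_pos = 448;    /* start of the previous record */

static void
demo_reserve(uint64_t size, uint64_t *start, uint64_t *end, uint64_t *prev)
{
    pthread_mutex_lock(&insertpos_lock);
    *start = curr_byte_pos;
    *end = curr_byte_pos + size;
    *prev = prev_byte_pos;
    curr_byte_pos = *end;       /* reserve the range */
    prev_byte_pos = *start;     /* this record becomes the "previous" one */
    pthread_mutex_unlock(&insertpos_lock);
}

int
main(void)
{
    uint64_t    s, e, p;

    demo_reserve(64, &s, &e, &p);
    printf("reserved [%llu, %llu), prev record at %llu\n",
           (unsigned long long) s, (unsigned long long) e,
           (unsigned long long) p);
    return 0;
}

Everything else, converting the byte positions to physical LSNs and copying the data, can then happen outside the critical section.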
+ */ + SpinLockAcquire(&Insert->insertpos_lck); + + startbytepos = Insert->CurrBytePos; + + ptr = XLogBytePosToEndRecPtr(startbytepos); + if (XLogSegmentOffset(ptr, wal_segment_size) == 0) + { + SpinLockRelease(&Insert->insertpos_lck); + *EndPos = *StartPos = ptr; + return false; + } + + endbytepos = startbytepos + size; + prevbytepos = Insert->PrevBytePos; + + *StartPos = XLogBytePosToRecPtr(startbytepos); + *EndPos = XLogBytePosToEndRecPtr(endbytepos); + + segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size); + if (segleft != wal_segment_size) + { + /* consume the rest of the segment */ + *EndPos += segleft; + endbytepos = XLogRecPtrToBytePos(*EndPos); + } + Insert->CurrBytePos = endbytepos; + Insert->PrevBytePos = startbytepos; + + SpinLockRelease(&Insert->insertpos_lck); + + *PrevPtr = XLogBytePosToRecPtr(prevbytepos); + + Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0); + Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos); + Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos); + Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos); + + return true; +} + +/* + * Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved + * area in the WAL. + */ +static void +CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, + XLogRecPtr StartPos, XLogRecPtr EndPos, TimeLineID tli) +{ + char *currpos; + int freespace; + int written; + XLogRecPtr CurrPos; + XLogPageHeader pagehdr; + + /* + * Get a pointer to the right place in the right WAL buffer to start + * inserting to. + */ + CurrPos = StartPos; + currpos = GetXLogBuffer(CurrPos, tli); + freespace = INSERT_FREESPACE(CurrPos); + + /* + * there should be enough space for at least the first field (xl_tot_len) + * on this page. + */ + Assert(freespace >= sizeof(uint32)); + + /* Copy record data */ + written = 0; + while (rdata != NULL) + { + char *rdata_data = rdata->data; + int rdata_len = rdata->len; + + while (rdata_len > freespace) + { + /* + * Write what fits on this page, and continue on the next page. + */ + Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0); + memcpy(currpos, rdata_data, freespace); + rdata_data += freespace; + rdata_len -= freespace; + written += freespace; + CurrPos += freespace; + + /* + * Get pointer to beginning of next page, and set the xlp_rem_len + * in the page header. Set XLP_FIRST_IS_CONTRECORD. + * + * It's safe to set the contrecord flag and xlp_rem_len without a + * lock on the page. All the other flags were already set when the + * page was initialized, in AdvanceXLInsertBuffer, and we're the + * only backend that needs to set the contrecord flag. 
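A standalone arithmetic sketch of the continuation-record bookkeeping handled here: when a record spills past the end of a page, each following page's header records how many bytes of the record are still to come (xlp_rem_len). The block and header sizes below are assumed example values, not the real constants.

#include <stdint.h>
#include <stdio.h>

#define DEMO_BLCKSZ     8192    /* assumed page size */
#define DEMO_SHORT_PHD  24      /* assumed short page header size */

int
main(void)
{
    uint32_t    total_len = 3000;   /* whole record */
    uint32_t    freespace = 1800;   /* room left on the current page */
    uint32_t    written = 0;

    while (total_len - written > freespace)
    {
        written += freespace;       /* fill the current page */
        /* The next page starts with a continuation header whose xlp_rem_len
         * says how much of the record is still to come. */
        printf("new page: xlp_rem_len = %u\n", total_len - written);
        freespace = DEMO_BLCKSZ - DEMO_SHORT_PHD;
    }
    printf("final page holds the last %u bytes\n", total_len - written);
    return 0;
}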
+ */ + currpos = GetXLogBuffer(CurrPos, tli); + pagehdr = (XLogPageHeader) currpos; + pagehdr->xlp_rem_len = write_len - written; + pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD; + + /* skip over the page header */ + if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0) + { + CurrPos += SizeOfXLogLongPHD; + currpos += SizeOfXLogLongPHD; + } + else + { + CurrPos += SizeOfXLogShortPHD; + currpos += SizeOfXLogShortPHD; + } + freespace = INSERT_FREESPACE(CurrPos); + } + + Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0); + memcpy(currpos, rdata_data, rdata_len); + currpos += rdata_len; + CurrPos += rdata_len; + freespace -= rdata_len; + written += rdata_len; + + rdata = rdata->next; + } + Assert(written == write_len); + + /* + * If this was an xlog-switch, it's not enough to write the switch record, + * we also have to consume all the remaining space in the WAL segment. We + * have already reserved that space, but we need to actually fill it. + */ + if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0) + { + /* An xlog-switch record doesn't contain any data besides the header */ + Assert(write_len == SizeOfXLogRecord); + + /* Assert that we did reserve the right amount of space */ + Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0); + + /* Use up all the remaining space on the current page */ + CurrPos += freespace; + + /* + * Cause all remaining pages in the segment to be flushed, leaving the + * XLog position where it should be, at the start of the next segment. + * We do this one page at a time, to make sure we don't deadlock + * against ourselves if wal_buffers < wal_segment_size. + */ + while (CurrPos < EndPos) + { + /* + * The minimal action to flush the page would be to call + * WALInsertLockUpdateInsertingAt(CurrPos) followed by + * AdvanceXLInsertBuffer(...). The page would be left initialized + * mostly to zeros, except for the page header (always the short + * variant, as this is never a segment's first page). + * + * The large vistas of zeros are good for compressibility, but the + * headers interrupting them every XLOG_BLCKSZ (with values that + * differ from page to page) are not. The effect varies with + * compression tool, but bzip2 for instance compresses about an + * order of magnitude worse if those headers are left in place. + * + * Rather than complicating AdvanceXLInsertBuffer itself (which is + * called in heavily-loaded circumstances as well as this lightly- + * loaded one) with variant behavior, we just use GetXLogBuffer + * (which itself calls the two methods we need) to get the pointer + * and zero most of the page. Then we just zero the page header. + */ + currpos = GetXLogBuffer(CurrPos, tli); + MemSet(currpos, 0, SizeOfXLogShortPHD); + + CurrPos += XLOG_BLCKSZ; + } + } + else + { + /* Align the end position, so that the next record starts aligned */ + CurrPos = MAXALIGN64(CurrPos); + } + + if (CurrPos != EndPos) + elog(PANIC, "space reserved for WAL record does not match what was written"); +} + +/* + * Acquire a WAL insertion lock, for inserting to WAL. + */ +static void +WALInsertLockAcquire(void) +{ + bool immed; + + /* + * It doesn't matter which of the WAL insertion locks we acquire, so try + * the one we used last time. If the system isn't particularly busy, it's + * a good bet that it's still available, and it's good to have some + * affinity to a particular lock so that you don't unnecessarily bounce + * cache lines between processes when there's no contention. 
+ * + * If this is the first time through in this backend, pick a lock + * (semi-)randomly. This allows the locks to be used evenly if you have a + * lot of very short connections. + */ + static int lockToTry = -1; + + if (lockToTry == -1) + lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS; + MyLockNo = lockToTry; + + /* + * The insertingAt value is initially set to 0, as we don't know our + * insert location yet. + */ + immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE); + if (!immed) + { + /* + * If we couldn't get the lock immediately, try another lock next + * time. On a system with more insertion locks than concurrent + * inserters, this causes all the inserters to eventually migrate to a + * lock that no-one else is using. On a system with more inserters + * than locks, it still helps to distribute the inserters evenly + * across the locks. + */ + lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS; + } +} + +/* + * Acquire all WAL insertion locks, to prevent other backends from inserting + * to WAL. + */ +static void +WALInsertLockAcquireExclusive(void) +{ + int i; + + /* + * When holding all the locks, all but the last lock's insertingAt + * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real + * XLogRecPtr value, to make sure that no-one blocks waiting on those. + */ + for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++) + { + LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE); + LWLockUpdateVar(&WALInsertLocks[i].l.lock, + &WALInsertLocks[i].l.insertingAt, + PG_UINT64_MAX); + } + /* Variable value reset to 0 at release */ + LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE); + + holdingAllLocks = true; +} + +/* + * Release our insertion lock (or locks, if we're holding them all). + * + * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the + * next time the lock is acquired. + */ +static void +WALInsertLockRelease(void) +{ + if (holdingAllLocks) + { + int i; + + for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) + LWLockReleaseClearVar(&WALInsertLocks[i].l.lock, + &WALInsertLocks[i].l.insertingAt, + 0); + + holdingAllLocks = false; + } + else + { + LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock, + &WALInsertLocks[MyLockNo].l.insertingAt, + 0); + } +} + +/* + * Update our insertingAt value, to let others know that we've finished + * inserting up to that point. + */ +static void +WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt) +{ + if (holdingAllLocks) + { + /* + * We use the last lock to mark our actual position, see comments in + * WALInsertLockAcquireExclusive. + */ + LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock, + &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt, + insertingAt); + } + else + LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock, + &WALInsertLocks[MyLockNo].l.insertingAt, + insertingAt); +} + +/* + * Wait for any WAL insertions < upto to finish. + * + * Returns the location of the oldest insertion that is still in-progress. + * Any WAL prior to that point has been fully copied into WAL buffers, and + * can be flushed out to disk. Because this waits for any insertions older + * than 'upto' to finish, the return value is always >= 'upto'. + * + * Note: When you are about to write out WAL, you must call this function + * *before* acquiring WALWriteLock, to avoid deadlocks. 
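WALInsertLockAcquire above keeps some affinity to the last lock it used and only migrates when that lock turns out to be busy. A standalone sketch of the same policy, using pthread mutexes and hypothetical names (the real code uses LWLocks and MyProc->pgprocno):

#include <pthread.h>
#include <stdio.h>

#define DEMO_NLOCKS 8

static pthread_mutex_t pool[DEMO_NLOCKS];
static int  lock_to_try = -1;   /* per-backend state in the real thing */

static int
demo_lock_acquire(int backend_id)
{
    int         slot;

    if (lock_to_try == -1)
        lock_to_try = backend_id % DEMO_NLOCKS; /* first time: spread out */
    slot = lock_to_try;

    if (pthread_mutex_trylock(&pool[slot]) != 0)
    {
        /* Contended: wait for it this time, but prefer a different slot next
         * time so concurrent inserters drift apart across the pool. */
        pthread_mutex_lock(&pool[slot]);
        lock_to_try = (lock_to_try + 1) % DEMO_NLOCKS;
    }
    return slot;
}

int
main(void)
{
    int         i, slot;

    for (i = 0; i < DEMO_NLOCKS; i++)
        pthread_mutex_init(&pool[i], NULL);

    slot = demo_lock_acquire(1234);
    printf("acquired slot %d\n", slot);
    pthread_mutex_unlock(&pool[slot]);
    return 0;
}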
This function might + * need to wait for an insertion to finish (or at least advance to next + * uninitialized page), and the inserter might need to evict an old WAL buffer + * to make room for a new one, which in turn requires WALWriteLock. + */ +static XLogRecPtr +WaitXLogInsertionsToFinish(XLogRecPtr upto) +{ + uint64 bytepos; + XLogRecPtr reservedUpto; + XLogRecPtr finishedUpto; + XLogCtlInsert *Insert = &XLogCtl->Insert; + int i; + + if (MyProc == NULL) + elog(PANIC, "cannot wait without a PGPROC structure"); + + /* Read the current insert position */ + SpinLockAcquire(&Insert->insertpos_lck); + bytepos = Insert->CurrBytePos; + SpinLockRelease(&Insert->insertpos_lck); + reservedUpto = XLogBytePosToEndRecPtr(bytepos); + + /* + * No-one should request to flush a piece of WAL that hasn't even been + * reserved yet. However, it can happen if there is a block with a bogus + * LSN on disk, for example. XLogFlush checks for that situation and + * complains, but only after the flush. Here we just assume that to mean + * that all WAL that has been reserved needs to be finished. In this + * corner-case, the return value can be smaller than 'upto' argument. + */ + if (upto > reservedUpto) + { + ereport(LOG, + (errmsg("request to flush past end of generated WAL; request %X/%X, current position %X/%X", + LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto)))); + upto = reservedUpto; + } + + /* + * Loop through all the locks, sleeping on any in-progress insert older + * than 'upto'. + * + * finishedUpto is our return value, indicating the point upto which all + * the WAL insertions have been finished. Initialize it to the head of + * reserved WAL, and as we iterate through the insertion locks, back it + * out for any insertion that's still in progress. + */ + finishedUpto = reservedUpto; + for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) + { + XLogRecPtr insertingat = InvalidXLogRecPtr; + + do + { + /* + * See if this insertion is in progress. LWLockWaitForVar will + * wait for the lock to be released, or for the 'value' to be set + * by a LWLockUpdateVar call. When a lock is initially acquired, + * its value is 0 (InvalidXLogRecPtr), which means that we don't + * know where it's inserting yet. We will have to wait for it. If + * it's a small insertion, the record will most likely fit on the + * same page and the inserter will release the lock without ever + * calling LWLockUpdateVar. But if it has to sleep, it will + * advertise the insertion point with LWLockUpdateVar before + * sleeping. + */ + if (LWLockWaitForVar(&WALInsertLocks[i].l.lock, + &WALInsertLocks[i].l.insertingAt, + insertingat, &insertingat)) + { + /* the lock was free, so no insertion in progress */ + insertingat = InvalidXLogRecPtr; + break; + } + + /* + * This insertion is still in progress. Have to wait, unless the + * inserter has proceeded past 'upto'. + */ + } while (insertingat < upto); + + if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto) + finishedUpto = insertingat; + } + return finishedUpto; +} + +/* + * Get a pointer to the right location in the WAL buffer containing the + * given XLogRecPtr. + * + * If the page is not initialized yet, it is initialized. That might require + * evicting an old dirty buffer from the buffer cache, which means I/O. + * + * The caller must ensure that the page containing the requested location + * isn't evicted yet, and won't be evicted. The way to ensure that is to + * hold onto a WAL insertion lock with the insertingAt position set to + * something <= ptr. 
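Leaving the waiting aside, the core calculation in WaitXLogInsertionsToFinish above is a clamp: start from the reserved tip and pull it back to the oldest advertised in-progress insertion. A standalone sketch with made-up positions (a zero entry stands for "no insertion in progress on this lock"):

#include <stdint.h>
#include <stdio.h>

#define DEMO_NLOCKS  8
#define DEMO_INVALID 0          /* stand-in for InvalidXLogRecPtr */

int
main(void)
{
    /* Advertised insertingAt per slot; 0 means no insertion in progress. */
    uint64_t    inserting_at[DEMO_NLOCKS] = {0, 5200, 0, 4800, 0, 0, 6100, 0};
    uint64_t    reserved_upto = 7000;   /* tip of reserved WAL */
    uint64_t    finished_upto = reserved_upto;
    int         i;

    for (i = 0; i < DEMO_NLOCKS; i++)
    {
        if (inserting_at[i] != DEMO_INVALID &&
            inserting_at[i] < finished_upto)
            finished_upto = inserting_at[i];
    }
    /* Everything before finished_upto is fully copied into buffers and is
     * safe to write out; with these numbers that is 4800. */
    printf("can flush up to %llu\n", (unsigned long long) finished_upto);
    return 0;
}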
GetXLogBuffer() will update insertingAt if it needs + * to evict an old page from the buffer. (This means that once you call + * GetXLogBuffer() with a given 'ptr', you must not access anything before + * that point anymore, and must not call GetXLogBuffer() with an older 'ptr' + * later, because older buffers might be recycled already) + */ +static char * +GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli) +{ + int idx; + XLogRecPtr endptr; + static uint64 cachedPage = 0; + static char *cachedPos = NULL; + XLogRecPtr expectedEndPtr; + + /* + * Fast path for the common case that we need to access again the same + * page as last time. + */ + if (ptr / XLOG_BLCKSZ == cachedPage) + { + Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC); + Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ)); + return cachedPos + ptr % XLOG_BLCKSZ; + } + + /* + * The XLog buffer cache is organized so that a page is always loaded to a + * particular buffer. That way we can easily calculate the buffer a given + * page must be loaded into, from the XLogRecPtr alone. + */ + idx = XLogRecPtrToBufIdx(ptr); + + /* + * See what page is loaded in the buffer at the moment. It could be the + * page we're looking for, or something older. It can't be anything newer + * - that would imply the page we're looking for has already been written + * out to disk and evicted, and the caller is responsible for making sure + * that doesn't happen. + * + * However, we don't hold a lock while we read the value. If someone has + * just initialized the page, it's possible that we get a "torn read" of + * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In + * that case we will see a bogus value. That's ok, we'll grab the mapping + * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than + * the page we're looking for. But it means that when we do this unlocked + * read, we might see a value that appears to be ahead of the page we're + * looking for. Don't PANIC on that, until we've verified the value while + * holding the lock. + */ + expectedEndPtr = ptr; + expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ; + + endptr = XLogCtl->xlblocks[idx]; + if (expectedEndPtr != endptr) + { + XLogRecPtr initializedUpto; + + /* + * Before calling AdvanceXLInsertBuffer(), which can block, let others + * know how far we're finished with inserting the record. + * + * NB: If 'ptr' points to just after the page header, advertise a + * position at the beginning of the page rather than 'ptr' itself. If + * there are no other insertions running, someone might try to flush + * up to our advertised location. If we advertised a position after + * the page header, someone might try to flush the page header, even + * though page might actually not be initialized yet. As the first + * inserter on the page, we are effectively responsible for making + * sure that it's initialized, before we let insertingAt to move past + * the page header. 
+ */ + if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD && + XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ) + initializedUpto = ptr - SizeOfXLogShortPHD; + else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD && + XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ) + initializedUpto = ptr - SizeOfXLogLongPHD; + else + initializedUpto = ptr; + + WALInsertLockUpdateInsertingAt(initializedUpto); + + AdvanceXLInsertBuffer(ptr, tli, false); + endptr = XLogCtl->xlblocks[idx]; + + if (expectedEndPtr != endptr) + elog(PANIC, "could not find WAL buffer for %X/%X", + LSN_FORMAT_ARGS(ptr)); + } + else + { + /* + * Make sure the initialization of the page is visible to us, and + * won't arrive later to overwrite the WAL data we write on the page. + */ + pg_memory_barrier(); + } + + /* + * Found the buffer holding this page. Return a pointer to the right + * offset within the page. + */ + cachedPage = ptr / XLOG_BLCKSZ; + cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ; + + Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC); + Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ)); + + return cachedPos + ptr % XLOG_BLCKSZ; +} + +/* + * Converts a "usable byte position" to XLogRecPtr. A usable byte position + * is the position starting from the beginning of WAL, excluding all WAL + * page headers. + */ +static XLogRecPtr +XLogBytePosToRecPtr(uint64 bytepos) +{ + uint64 fullsegs; + uint64 fullpages; + uint64 bytesleft; + uint32 seg_offset; + XLogRecPtr result; + + fullsegs = bytepos / UsableBytesInSegment; + bytesleft = bytepos % UsableBytesInSegment; + + if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD) + { + /* fits on first page of segment */ + seg_offset = bytesleft + SizeOfXLogLongPHD; + } + else + { + /* account for the first page on segment with long header */ + seg_offset = XLOG_BLCKSZ; + bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD; + + fullpages = bytesleft / UsableBytesInPage; + bytesleft = bytesleft % UsableBytesInPage; + + seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + } + + XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result); + + return result; +} + +/* + * Like XLogBytePosToRecPtr, but if the position is at a page boundary, + * returns a pointer to the beginning of the page (ie. before page header), + * not to where the first xlog record on that page would go to. This is used + * when converting a pointer to the end of a record. + */ +static XLogRecPtr +XLogBytePosToEndRecPtr(uint64 bytepos) +{ + uint64 fullsegs; + uint64 fullpages; + uint64 bytesleft; + uint32 seg_offset; + XLogRecPtr result; + + fullsegs = bytepos / UsableBytesInSegment; + bytesleft = bytepos % UsableBytesInSegment; + + if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD) + { + /* fits on first page of segment */ + if (bytesleft == 0) + seg_offset = 0; + else + seg_offset = bytesleft + SizeOfXLogLongPHD; + } + else + { + /* account for the first page on segment with long header */ + seg_offset = XLOG_BLCKSZ; + bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD; + + fullpages = bytesleft / UsableBytesInPage; + bytesleft = bytesleft % UsableBytesInPage; + + if (bytesleft == 0) + seg_offset += fullpages * XLOG_BLCKSZ + bytesleft; + else + seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + } + + XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result); + + return result; +} + +/* + * Convert an XLogRecPtr to a "usable byte position". 
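The conversion routines above map "usable byte positions" (which pretend page headers do not exist) back to physical WAL positions by re-inserting the headers. A standalone worked example with assumed sizes (16 MB segments, 8 kB pages, 40-byte long and 24-byte short headers; the real values come from the page header structs), not a substitute for the real functions:

#include <stdint.h>
#include <stdio.h>

#define DEMO_BLCKSZ        8192
#define DEMO_SEG_SIZE      (16 * 1024 * 1024)
#define DEMO_LONG_PHD      40   /* assumed long (first-page) header size */
#define DEMO_SHORT_PHD     24   /* assumed short header size */

#define USABLE_IN_PAGE     (DEMO_BLCKSZ - DEMO_SHORT_PHD)
#define USABLE_IN_SEGMENT  \
    ((DEMO_SEG_SIZE / DEMO_BLCKSZ) * USABLE_IN_PAGE - \
     (DEMO_LONG_PHD - DEMO_SHORT_PHD))

static uint64_t
demo_bytepos_to_ptr(uint64_t bytepos)
{
    uint64_t    fullsegs = bytepos / USABLE_IN_SEGMENT;
    uint64_t    bytesleft = bytepos % USABLE_IN_SEGMENT;
    uint64_t    seg_offset;

    if (bytesleft < DEMO_BLCKSZ - DEMO_LONG_PHD)
        seg_offset = bytesleft + DEMO_LONG_PHD;     /* first page of segment */
    else
    {
        bytesleft -= DEMO_BLCKSZ - DEMO_LONG_PHD;   /* skip the first page */
        seg_offset = DEMO_BLCKSZ
            + (bytesleft / USABLE_IN_PAGE) * DEMO_BLCKSZ
            + bytesleft % USABLE_IN_PAGE + DEMO_SHORT_PHD;
    }
    return fullsegs * DEMO_SEG_SIZE + seg_offset;
}

int
main(void)
{
    /* 10000 usable bytes fall past the first page, so the physical offset
     * is 10000 plus one long header plus one short header: 10064. */
    printf("byte position 10000 -> physical offset %llu\n",
           (unsigned long long) demo_bytepos_to_ptr(10000));
    return 0;
}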
+ */ +static uint64 +XLogRecPtrToBytePos(XLogRecPtr ptr) +{ + uint64 fullsegs; + uint32 fullpages; + uint32 offset; + uint64 result; + + XLByteToSeg(ptr, fullsegs, wal_segment_size); + + fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ; + offset = ptr % XLOG_BLCKSZ; + + if (fullpages == 0) + { + result = fullsegs * UsableBytesInSegment; + if (offset > 0) + { + Assert(offset >= SizeOfXLogLongPHD); + result += offset - SizeOfXLogLongPHD; + } + } + else + { + result = fullsegs * UsableBytesInSegment + + (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */ + (fullpages - 1) * UsableBytesInPage; /* full pages */ + if (offset > 0) + { + Assert(offset >= SizeOfXLogShortPHD); + result += offset - SizeOfXLogShortPHD; + } + } + + return result; +} + +/* + * Initialize XLOG buffers, writing out old buffers if they still contain + * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is + * true, initialize as many pages as we can without having to write out + * unwritten data. Any new pages are initialized to zeros, with pages headers + * initialized properly. + */ +static void +AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + int nextidx; + XLogRecPtr OldPageRqstPtr; + XLogwrtRqst WriteRqst; + XLogRecPtr NewPageEndPtr = InvalidXLogRecPtr; + XLogRecPtr NewPageBeginPtr; + XLogPageHeader NewPage; + int npages pg_attribute_unused() = 0; + + LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE); + + /* + * Now that we have the lock, check if someone initialized the page + * already. + */ + while (upto >= XLogCtl->InitializedUpTo || opportunistic) + { + nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo); + + /* + * Get ending-offset of the buffer page we need to replace (this may + * be zero if the buffer hasn't been used yet). Fall through if it's + * already written out. + */ + OldPageRqstPtr = XLogCtl->xlblocks[nextidx]; + if (LogwrtResult.Write < OldPageRqstPtr) + { + /* + * Nope, got work to do. If we just want to pre-initialize as much + * as we can without flushing, give up now. + */ + if (opportunistic) + break; + + /* Before waiting, get info_lck and update LogwrtResult */ + SpinLockAcquire(&XLogCtl->info_lck); + if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr) + XLogCtl->LogwrtRqst.Write = OldPageRqstPtr; + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * Now that we have an up-to-date LogwrtResult value, see if we + * still need to write it or if someone else already did. + */ + if (LogwrtResult.Write < OldPageRqstPtr) + { + /* + * Must acquire write lock. Release WALBufMappingLock first, + * to make sure that all insertions that we need to wait for + * can finish (up to this same position). Otherwise we risk + * deadlock. 
+ */ + LWLockRelease(WALBufMappingLock); + + WaitXLogInsertionsToFinish(OldPageRqstPtr); + + LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); + + LogwrtResult = XLogCtl->LogwrtResult; + if (LogwrtResult.Write >= OldPageRqstPtr) + { + /* OK, someone wrote it already */ + LWLockRelease(WALWriteLock); + } + else + { + /* Have to write it ourselves */ + TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START(); + WriteRqst.Write = OldPageRqstPtr; + WriteRqst.Flush = 0; + XLogWrite(WriteRqst, tli, false); + LWLockRelease(WALWriteLock); + PendingWalStats.wal_buffers_full++; + TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); + } + /* Re-acquire WALBufMappingLock and retry */ + LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE); + continue; + } + } + + /* + * Now the next buffer slot is free and we can set it up to be the + * next output page. + */ + NewPageBeginPtr = XLogCtl->InitializedUpTo; + NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ; + + Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx); + + NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ); + + /* + * Be sure to re-zero the buffer so that bytes beyond what we've + * written will look like zeroes and not valid XLOG records... + */ + MemSet((char *) NewPage, 0, XLOG_BLCKSZ); + + /* + * Fill the new page's header + */ + NewPage->xlp_magic = XLOG_PAGE_MAGIC; + + /* NewPage->xlp_info = 0; */ /* done by memset */ + NewPage->xlp_tli = tli; + NewPage->xlp_pageaddr = NewPageBeginPtr; + + /* NewPage->xlp_rem_len = 0; */ /* done by memset */ + + /* + * If online backup is not in progress, mark the header to indicate + * that WAL records beginning in this page have removable backup + * blocks. This allows the WAL archiver to know whether it is safe to + * compress archived WAL data by transforming full-block records into + * the non-full-block format. It is sufficient to record this at the + * page level because we force a page switch (in fact a segment + * switch) when starting a backup, so the flag will be off before any + * records can be written during the backup. At the end of a backup, + * the last page will be marked as all unsafe when perhaps only part + * is unsafe, but at worst the archiver would miss the opportunity to + * compress a few records. + */ + if (!Insert->forcePageWrites) + NewPage->xlp_info |= XLP_BKP_REMOVABLE; + + /* + * If first page of an XLOG segment file, make it a long header. + */ + if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0) + { + XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage; + + NewLongPage->xlp_sysid = ControlFile->system_identifier; + NewLongPage->xlp_seg_size = wal_segment_size; + NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ; + NewPage->xlp_info |= XLP_LONG_HEADER; + } + + /* + * Make sure the initialization of the page becomes visible to others + * before the xlblocks update. GetXLogBuffer() reads xlblocks without + * holding a lock. + */ + pg_write_barrier(); + + *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr; + + XLogCtl->InitializedUpTo = NewPageEndPtr; + + npages++; + } + LWLockRelease(WALBufMappingLock); + +#ifdef WAL_DEBUG + if (XLOG_DEBUG && npages > 0) + { + elog(DEBUG1, "initialized %d pages, up to %X/%X", + npages, LSN_FORMAT_ARGS(NewPageEndPtr)); + } +#endif +} + +/* + * Calculate CheckPointSegments based on max_wal_size_mb and + * checkpoint_completion_target. 
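To put example numbers on the sizing rule implemented just below: with max_wal_size = 1 GB, 16 MB segments and checkpoint_completion_target = 0.9 (assumed settings), the trigger distance comes out to 33 segments. A standalone version of the arithmetic:

#include <stdio.h>

int
main(void)
{
    double      max_wal_size_mb = 1024;     /* example setting */
    double      segment_mb = 16;            /* example segment size */
    double      completion_target = 0.9;    /* example setting */
    double      target;
    int         checkpoint_segments;

    /* Segments of WAL allowed to accumulate before a checkpoint triggers,
     * leaving headroom for the WAL written while the checkpoint runs. */
    target = (max_wal_size_mb / segment_mb) / (1.0 + completion_target);
    checkpoint_segments = (int) target;     /* round down */
    if (checkpoint_segments < 1)
        checkpoint_segments = 1;

    printf("CheckPointSegments = %d\n", checkpoint_segments);
    return 0;
}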
+ */ +static void +CalculateCheckpointSegments(void) +{ + double target; + + /*------- + * Calculate the distance at which to trigger a checkpoint, to avoid + * exceeding max_wal_size_mb. This is based on two assumptions: + * + * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept + * WAL for two checkpoint cycles to allow us to recover from the + * secondary checkpoint if the first checkpoint failed, though we + * only did this on the primary anyway, not on standby. Keeping just + * one checkpoint simplifies processing and reduces disk space in + * many smaller databases.) + * b) during checkpoint, we consume checkpoint_completion_target * + * number of segments consumed between checkpoints. + *------- + */ + target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) / + (1.0 + CheckPointCompletionTarget); + + /* round down */ + CheckPointSegments = (int) target; + + if (CheckPointSegments < 1) + CheckPointSegments = 1; +} + +void +assign_max_wal_size(int newval, void *extra) +{ + max_wal_size_mb = newval; + CalculateCheckpointSegments(); +} + +void +assign_checkpoint_completion_target(double newval, void *extra) +{ + CheckPointCompletionTarget = newval; + CalculateCheckpointSegments(); +} + +/* + * At a checkpoint, how many WAL segments to recycle as preallocated future + * XLOG segments? Returns the highest segment that should be preallocated. + */ +static XLogSegNo +XLOGfileslop(XLogRecPtr lastredoptr) +{ + XLogSegNo minSegNo; + XLogSegNo maxSegNo; + double distance; + XLogSegNo recycleSegNo; + + /* + * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb + * correspond to. Always recycle enough segments to meet the minimum, and + * remove enough segments to stay below the maximum. + */ + minSegNo = lastredoptr / wal_segment_size + + ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1; + maxSegNo = lastredoptr / wal_segment_size + + ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1; + + /* + * Between those limits, recycle enough segments to get us through to the + * estimated end of next checkpoint. + * + * To estimate where the next checkpoint will finish, assume that the + * system runs steadily consuming CheckPointDistanceEstimate bytes between + * every checkpoint. + */ + distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate; + /* add 10% for good measure. */ + distance *= 1.10; + + recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) / + wal_segment_size); + + if (recycleSegNo < minSegNo) + recycleSegNo = minSegNo; + if (recycleSegNo > maxSegNo) + recycleSegNo = maxSegNo; + + return recycleSegNo; +} + +/* + * Check whether we've consumed enough xlog space that a checkpoint is needed. + * + * new_segno indicates a log file that has just been filled up (or read + * during recovery). We measure the distance from RedoRecPtr to new_segno + * and see if that exceeds CheckPointSegments. + * + * Note: it is caller's responsibility that RedoRecPtr is up-to-date. + */ +bool +XLogCheckpointNeeded(XLogSegNo new_segno) +{ + XLogSegNo old_segno; + + XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size); + + if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1)) + return true; + return false; +} + +/* + * Write and/or fsync the log at least as far as WriteRqst indicates. + * + * If flexible == true, we don't have to write as far as WriteRqst, but + * may stop at any convenient boundary (such as a cache or logfile boundary). 
+ * This option allows us to avoid uselessly issuing multiple writes when a + * single one would do. + * + * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst) + * must be called before grabbing the lock, to make sure the data is ready to + * write. + */ +static void +XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) +{ + bool ispartialpage; + bool last_iteration; + bool finishing_seg; + int curridx; + int npages; + int startidx; + uint32 startoffset; + + /* We should always be inside a critical section here */ + Assert(CritSectionCount > 0); + + /* + * Update local LogwrtResult (caller probably did this already, but...) + */ + LogwrtResult = XLogCtl->LogwrtResult; + + /* + * Since successive pages in the xlog cache are consecutively allocated, + * we can usually gather multiple pages together and issue just one + * write() call. npages is the number of pages we have determined can be + * written together; startidx is the cache block index of the first one, + * and startoffset is the file offset at which it should go. The latter + * two variables are only valid when npages > 0, but we must initialize + * all of them to keep the compiler quiet. + */ + npages = 0; + startidx = 0; + startoffset = 0; + + /* + * Within the loop, curridx is the cache block index of the page to + * consider writing. Begin at the buffer containing the next unwritten + * page, or last partially written page. + */ + curridx = XLogRecPtrToBufIdx(LogwrtResult.Write); + + while (LogwrtResult.Write < WriteRqst.Write) + { + /* + * Make sure we're not ahead of the insert process. This could happen + * if we're passed a bogus WriteRqst.Write that is past the end of the + * last page that's been initialized by AdvanceXLInsertBuffer. + */ + XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx]; + + if (LogwrtResult.Write >= EndPtr) + elog(PANIC, "xlog write request %X/%X is past end of log %X/%X", + LSN_FORMAT_ARGS(LogwrtResult.Write), + LSN_FORMAT_ARGS(EndPtr)); + + /* Advance LogwrtResult.Write to end of current buffer page */ + LogwrtResult.Write = EndPtr; + ispartialpage = WriteRqst.Write < LogwrtResult.Write; + + if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size)) + { + /* + * Switch to new logfile segment. We cannot have any pending + * pages here (since we dump what we have at segment end). + */ + Assert(npages == 0); + if (openLogFile >= 0) + XLogFileClose(); + XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size); + openLogTLI = tli; + + /* create/use new log file */ + openLogFile = XLogFileInit(openLogSegNo, tli); + ReserveExternalFD(); + } + + /* Make sure we have the current logfile open */ + if (openLogFile < 0) + { + XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size); + openLogTLI = tli; + openLogFile = XLogFileOpen(openLogSegNo, tli); + ReserveExternalFD(); + } + + /* Add current page to the set of pending pages-to-dump */ + if (npages == 0) + { + /* first of group */ + startidx = curridx; + startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ, + wal_segment_size); + } + npages++; + + /* + * Dump the set if this will be the last loop iteration, or if we are + * at the last page of the cache area (since the next page won't be + * contiguous in memory), or if we are at the end of the logfile + * segment. 
+ */ + last_iteration = WriteRqst.Write <= LogwrtResult.Write; + + finishing_seg = !ispartialpage && + (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size; + + if (last_iteration || + curridx == XLogCtl->XLogCacheBlck || + finishing_seg) + { + char *from; + Size nbytes; + Size nleft; + int written; + instr_time start; + + /* OK to write the page(s) */ + from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ; + nbytes = npages * (Size) XLOG_BLCKSZ; + nleft = nbytes; + do + { + errno = 0; + + /* Measure I/O timing to write WAL data */ + if (track_wal_io_timing) + INSTR_TIME_SET_CURRENT(start); + + pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); + written = pg_pwrite(openLogFile, from, nleft, startoffset); + pgstat_report_wait_end(); + + /* + * Increment the I/O timing and the number of times WAL data + * were written out to disk. + */ + if (track_wal_io_timing) + { + instr_time duration; + + INSTR_TIME_SET_CURRENT(duration); + INSTR_TIME_SUBTRACT(duration, start); + PendingWalStats.wal_write_time += INSTR_TIME_GET_MICROSEC(duration); + } + + PendingWalStats.wal_write++; + + if (written <= 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno; + + if (errno == EINTR) + continue; + + save_errno = errno; + XLogFileName(xlogfname, tli, openLogSegNo, + wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to log file %s " + "at offset %u, length %zu: %m", + xlogfname, startoffset, nleft))); + } + nleft -= written; + from += written; + startoffset += written; + } while (nleft > 0); + + npages = 0; + + /* + * If we just wrote the whole last page of a logfile segment, + * fsync the segment immediately. This avoids having to go back + * and re-open prior segments when an fsync request comes along + * later. Doing it here ensures that one and only one backend will + * perform this fsync. + * + * This is also the right place to notify the Archiver that the + * segment is ready to copy to archival storage, and to update the + * timer for archive_timeout, and to signal for a checkpoint if + * too many logfile segments have been used since the last + * checkpoint. + */ + if (finishing_seg) + { + issue_xlog_fsync(openLogFile, openLogSegNo, tli); + + /* signal that we need to wakeup walsenders later */ + WalSndWakeupRequest(); + + LogwrtResult.Flush = LogwrtResult.Write; /* end of page */ + + if (XLogArchivingActive()) + XLogArchiveNotifySeg(openLogSegNo, tli); + + XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL); + XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush; + + /* + * Request a checkpoint if we've consumed too much xlog since + * the last one. For speed, we first check using the local + * copy of RedoRecPtr, which might be out of date; if it looks + * like a checkpoint is needed, forcibly update RedoRecPtr and + * recheck. 
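+ *
+ * (Checkpoints triggered by WAL volume in this way are the "requested"
+ * checkpoints reported in pg_stat_bgwriter.checkpoints_req, as opposed
+ * to the "timed" ones driven by checkpoint_timeout.)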
+ */ + if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo)) + { + (void) GetRedoRecPtr(); + if (XLogCheckpointNeeded(openLogSegNo)) + RequestCheckpoint(CHECKPOINT_CAUSE_XLOG); + } + } + } + + if (ispartialpage) + { + /* Only asked to write a partial page */ + LogwrtResult.Write = WriteRqst.Write; + break; + } + curridx = NextBufIdx(curridx); + + /* If flexible, break out of loop as soon as we wrote something */ + if (flexible && npages == 0) + break; + } + + Assert(npages == 0); + + /* + * If asked to flush, do so + */ + if (LogwrtResult.Flush < WriteRqst.Flush && + LogwrtResult.Flush < LogwrtResult.Write) + { + /* + * Could get here without iterating above loop, in which case we might + * have no open file or the wrong one. However, we do not need to + * fsync more than one file. + */ + if (sync_method != SYNC_METHOD_OPEN && + sync_method != SYNC_METHOD_OPEN_DSYNC) + { + if (openLogFile >= 0 && + !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size)) + XLogFileClose(); + if (openLogFile < 0) + { + XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size); + openLogTLI = tli; + openLogFile = XLogFileOpen(openLogSegNo, tli); + ReserveExternalFD(); + } + + issue_xlog_fsync(openLogFile, openLogSegNo, tli); + } + + /* signal that we need to wakeup walsenders later */ + WalSndWakeupRequest(); + + LogwrtResult.Flush = LogwrtResult.Write; + } + + /* + * Update shared-memory status + * + * We make sure that the shared 'request' values do not fall behind the + * 'result' values. This is not absolutely essential, but it saves some + * code in a couple of places. + */ + { + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->LogwrtResult = LogwrtResult; + if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write) + XLogCtl->LogwrtRqst.Write = LogwrtResult.Write; + if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush) + XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush; + SpinLockRelease(&XLogCtl->info_lck); + } +} + +/* + * Record the LSN for an asynchronous transaction commit/abort + * and nudge the WALWriter if there is work for it to do. + * (This should not be called for synchronous commits.) + */ +void +XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN) +{ + XLogRecPtr WriteRqstPtr = asyncXactLSN; + bool sleeping; + + SpinLockAcquire(&XLogCtl->info_lck); + LogwrtResult = XLogCtl->LogwrtResult; + sleeping = XLogCtl->WalWriterSleeping; + if (XLogCtl->asyncXactLSN < asyncXactLSN) + XLogCtl->asyncXactLSN = asyncXactLSN; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * If the WALWriter is sleeping, we should kick it to make it come out of + * low-power mode. Otherwise, determine whether there's a full page of + * WAL available to write. + */ + if (!sleeping) + { + /* back off to last completed page boundary */ + WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ; + + /* if we have already flushed that far, we're done */ + if (WriteRqstPtr <= LogwrtResult.Flush) + return; + } + + /* + * Nudge the WALWriter: it has a full page of WAL to write, or we want it + * to come out of low-power mode so that this async commit will reach disk + * within the expected amount of time. + */ + if (ProcGlobal->walwriterLatch) + SetLatch(ProcGlobal->walwriterLatch); +} + +/* + * Record the LSN up to which we can remove WAL because it's not required by + * any replication slot. 
+ */ +void +XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn) +{ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->replicationSlotMinLSN = lsn; + SpinLockRelease(&XLogCtl->info_lck); +} + + +/* + * Return the oldest LSN we must retain to satisfy the needs of some + * replication slot. + */ +static XLogRecPtr +XLogGetReplicationSlotMinimumLSN(void) +{ + XLogRecPtr retval; + + SpinLockAcquire(&XLogCtl->info_lck); + retval = XLogCtl->replicationSlotMinLSN; + SpinLockRelease(&XLogCtl->info_lck); + + return retval; +} + +/* + * Advance minRecoveryPoint in control file. + * + * If we crash during recovery, we must reach this point again before the + * database is consistent. + * + * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint + * is only updated if it's not already greater than or equal to 'lsn'. + */ +static void +UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) +{ + /* Quick check using our local copy of the variable */ + if (!updateMinRecoveryPoint || (!force && lsn <= LocalMinRecoveryPoint)) + return; + + /* + * An invalid minRecoveryPoint means that we need to recover all the WAL, + * i.e., we're doing crash recovery. We never modify the control file's + * value in that case, so we can short-circuit future checks here too. The + * local values of minRecoveryPoint and minRecoveryPointTLI should not be + * updated until crash recovery finishes. We only do this for the startup + * process as it should not update its own reference of minRecoveryPoint + * until it has finished crash recovery to make sure that all WAL + * available is replayed in this case. This also saves from extra locks + * taken on the control file from the startup process. + */ + if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery) + { + updateMinRecoveryPoint = false; + return; + } + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + /* update local copy */ + LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; + LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + + if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint)) + updateMinRecoveryPoint = false; + else if (force || LocalMinRecoveryPoint < lsn) + { + XLogRecPtr newMinRecoveryPoint; + TimeLineID newMinRecoveryPointTLI; + + /* + * To avoid having to update the control file too often, we update it + * all the way to the last record being replayed, even though 'lsn' + * would suffice for correctness. This also allows the 'force' case + * to not need a valid 'lsn' value. + * + * Another important reason for doing it this way is that the passed + * 'lsn' value could be bogus, i.e., past the end of available WAL, if + * the caller got it from a corrupted heap page. Accepting such a + * value as the min recovery point would prevent us from coming up at + * all. Instead, we just log a warning and continue with recovery. + * (See also the comments about corrupt LSNs in XLogFlush.) 
+ */ + newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI); + if (!force && newMinRecoveryPoint < lsn) + elog(WARNING, + "xlog min recovery request %X/%X is past current point %X/%X", + LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint)); + + /* update control file */ + if (ControlFile->minRecoveryPoint < newMinRecoveryPoint) + { + ControlFile->minRecoveryPoint = newMinRecoveryPoint; + ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI; + UpdateControlFile(); + LocalMinRecoveryPoint = newMinRecoveryPoint; + LocalMinRecoveryPointTLI = newMinRecoveryPointTLI; + + ereport(DEBUG2, + (errmsg_internal("updated min recovery point to %X/%X on timeline %u", + LSN_FORMAT_ARGS(newMinRecoveryPoint), + newMinRecoveryPointTLI))); + } + } + LWLockRelease(ControlFileLock); +} + +/* + * Ensure that all XLOG data through the given position is flushed to disk. + * + * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not + * already held, and we try to avoid acquiring it if possible. + */ +void +XLogFlush(XLogRecPtr record) +{ + XLogRecPtr WriteRqstPtr; + XLogwrtRqst WriteRqst; + TimeLineID insertTLI = XLogCtl->InsertTimeLineID; + + /* + * During REDO, we are reading not writing WAL. Therefore, instead of + * trying to flush the WAL, we should update minRecoveryPoint instead. We + * test XLogInsertAllowed(), not InRecovery, because we need checkpointer + * to act this way too, and because when it tries to write the + * end-of-recovery checkpoint, it should indeed flush. + */ + if (!XLogInsertAllowed()) + { + UpdateMinRecoveryPoint(record, false); + return; + } + + /* Quick exit if already known flushed */ + if (record <= LogwrtResult.Flush) + return; + +#ifdef WAL_DEBUG + if (XLOG_DEBUG) + elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X", + LSN_FORMAT_ARGS(record), + LSN_FORMAT_ARGS(LogwrtResult.Write), + LSN_FORMAT_ARGS(LogwrtResult.Flush)); +#endif + + START_CRIT_SECTION(); + + /* + * Since fsync is usually a horribly expensive operation, we try to + * piggyback as much data as we can on each fsync: if we see any more data + * entered into the xlog buffer, we'll write and fsync that too, so that + * the final value of LogwrtResult.Flush is as large as possible. This + * gives us some chance of avoiding another fsync immediately after. + */ + + /* initialize to given target; may increase below */ + WriteRqstPtr = record; + + /* + * Now wait until we get the write lock, or someone else does the flush + * for us. + */ + for (;;) + { + XLogRecPtr insertpos; + + /* read LogwrtResult and update local state */ + SpinLockAcquire(&XLogCtl->info_lck); + if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write) + WriteRqstPtr = XLogCtl->LogwrtRqst.Write; + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + + /* done already? */ + if (record <= LogwrtResult.Flush) + break; + + /* + * Before actually performing the write, wait for all in-flight + * insertions to the pages we're about to write to finish. + */ + insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr); + + /* + * Try to get the write lock. If we can't get it immediately, wait + * until it's released, and recheck if we still need to do the flush + * or if the backend that held the lock did it for us already. This + * helps to maintain a good rate of group committing when the system + * is bottlenecked by the speed of fsyncing. + */ + if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE)) + { + /* + * The lock is now free, but we didn't acquire it yet. 
Before we + * do, loop back to check if someone else flushed the record for + * us already. + */ + continue; + } + + /* Got the lock; recheck whether request is satisfied */ + LogwrtResult = XLogCtl->LogwrtResult; + if (record <= LogwrtResult.Flush) + { + LWLockRelease(WALWriteLock); + break; + } + + /* + * Sleep before flush! By adding a delay here, we may give further + * backends the opportunity to join the backlog of group commit + * followers; this can significantly improve transaction throughput, + * at the risk of increasing transaction latency. + * + * We do not sleep if enableFsync is not turned on, nor if there are + * fewer than CommitSiblings other backends with active transactions. + */ + if (CommitDelay > 0 && enableFsync && + MinimumActiveBackends(CommitSiblings)) + { + pg_usleep(CommitDelay); + + /* + * Re-check how far we can now flush the WAL. It's generally not + * safe to call WaitXLogInsertionsToFinish while holding + * WALWriteLock, because an in-progress insertion might need to + * also grab WALWriteLock to make progress. But we know that all + * the insertions up to insertpos have already finished, because + * that's what the earlier WaitXLogInsertionsToFinish() returned. + * We're only calling it again to allow insertpos to be moved + * further forward, not to actually wait for anyone. + */ + insertpos = WaitXLogInsertionsToFinish(insertpos); + } + + /* try to write/flush later additions to XLOG as well */ + WriteRqst.Write = insertpos; + WriteRqst.Flush = insertpos; + + XLogWrite(WriteRqst, insertTLI, false); + + LWLockRelease(WALWriteLock); + /* done */ + break; + } + + END_CRIT_SECTION(); + + /* wake up walsenders now that we've released heavily contended locks */ + WalSndWakeupProcessRequests(); + + /* + * If we still haven't flushed to the request point then we have a + * problem; most likely, the requested flush point is past end of XLOG. + * This has been seen to occur when a disk page has a corrupted LSN. + * + * Formerly we treated this as a PANIC condition, but that hurts the + * system's robustness rather than helping it: we do not want to take down + * the whole system due to corruption on one data page. In particular, if + * the bad page is encountered again during recovery then we would be + * unable to restart the database at all! (This scenario actually + * happened in the field several times with 7.1 releases.) As of 8.4, bad + * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem; + * the only time we can reach here during recovery is while flushing the + * end-of-recovery checkpoint record, and we don't expect that to have a + * bad LSN. + * + * Note that for calls from xact.c, the ERROR will be promoted to PANIC + * since xact.c calls this routine inside a critical section. However, + * calls from bufmgr.c are not within critical sections and so we will not + * force a restart for a bad LSN on a data page. + */ + if (LogwrtResult.Flush < record) + elog(ERROR, + "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X", + LSN_FORMAT_ARGS(record), + LSN_FORMAT_ARGS(LogwrtResult.Flush)); +} + +/* + * Write & flush xlog, but without specifying exactly where to. + * + * We normally write only completed blocks; but if there is nothing to do on + * that basis, we check for unwritten async commits in the current incomplete + * block, and write through the latest one of those. Thus, if async commits + * are not being used, we will write complete blocks only. 
+ * + * If, based on the above, there's anything to write we do so immediately. But + * to avoid calling fsync, fdatasync et. al. at a rate that'd impact + * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's + * more than wal_writer_flush_after unflushed blocks. + * + * We can guarantee that async commits reach disk after at most three + * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite + * to write "flexibly", meaning it can stop at the end of the buffer ring; + * this makes a difference only with very high load or long wal_writer_delay, + * but imposes one extra cycle for the worst case for async commits.) + * + * This routine is invoked periodically by the background walwriter process. + * + * Returns true if there was any work to do, even if we skipped flushing due + * to wal_writer_delay/wal_writer_flush_after. + */ +bool +XLogBackgroundFlush(void) +{ + XLogwrtRqst WriteRqst; + bool flexible = true; + static TimestampTz lastflush; + TimestampTz now; + int flushbytes; + TimeLineID insertTLI; + + /* XLOG doesn't need flushing during recovery */ + if (RecoveryInProgress()) + return false; + + /* + * Since we're not in recovery, InsertTimeLineID is set and can't change, + * so we can read it without a lock. + */ + insertTLI = XLogCtl->InsertTimeLineID; + + /* read LogwrtResult and update local state */ + SpinLockAcquire(&XLogCtl->info_lck); + LogwrtResult = XLogCtl->LogwrtResult; + WriteRqst = XLogCtl->LogwrtRqst; + SpinLockRelease(&XLogCtl->info_lck); + + /* back off to last completed page boundary */ + WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ; + + /* if we have already flushed that far, consider async commit records */ + if (WriteRqst.Write <= LogwrtResult.Flush) + { + SpinLockAcquire(&XLogCtl->info_lck); + WriteRqst.Write = XLogCtl->asyncXactLSN; + SpinLockRelease(&XLogCtl->info_lck); + flexible = false; /* ensure it all gets written */ + } + + /* + * If already known flushed, we're done. Just need to check if we are + * holding an open file handle to a logfile that's no longer in use, + * preventing the file from being deleted. + */ + if (WriteRqst.Write <= LogwrtResult.Flush) + { + if (openLogFile >= 0) + { + if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size)) + { + XLogFileClose(); + } + } + return false; + } + + /* + * Determine how far to flush WAL, based on the wal_writer_delay and + * wal_writer_flush_after GUCs. + */ + now = GetCurrentTimestamp(); + flushbytes = + WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ; + + if (WalWriterFlushAfter == 0 || lastflush == 0) + { + /* first call, or block based limits disabled */ + WriteRqst.Flush = WriteRqst.Write; + lastflush = now; + } + else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay)) + { + /* + * Flush the writes at least every WalWriterDelay ms. This is + * important to bound the amount of time it takes for an asynchronous + * commit to hit disk. 
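+ *
+ * (With the stock settings of wal_writer_delay = 200ms and
+ * wal_writer_flush_after = 1MB, a slow trickle of async commits is
+ * flushed on the first walwriter cycle that falls more than 200ms after
+ * the previous flush, while a backlog of more than 1MB of unflushed WAL
+ * is flushed immediately by the branch below.)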
+ */ + WriteRqst.Flush = WriteRqst.Write; + lastflush = now; + } + else if (flushbytes >= WalWriterFlushAfter) + { + /* exceeded wal_writer_flush_after blocks, flush */ + WriteRqst.Flush = WriteRqst.Write; + lastflush = now; + } + else + { + /* no flushing, this time round */ + WriteRqst.Flush = 0; + } + +#ifdef WAL_DEBUG + if (XLOG_DEBUG) + elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X", + LSN_FORMAT_ARGS(WriteRqst.Write), + LSN_FORMAT_ARGS(WriteRqst.Flush), + LSN_FORMAT_ARGS(LogwrtResult.Write), + LSN_FORMAT_ARGS(LogwrtResult.Flush)); +#endif + + START_CRIT_SECTION(); + + /* now wait for any in-progress insertions to finish and get write lock */ + WaitXLogInsertionsToFinish(WriteRqst.Write); + LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); + LogwrtResult = XLogCtl->LogwrtResult; + if (WriteRqst.Write > LogwrtResult.Write || + WriteRqst.Flush > LogwrtResult.Flush) + { + XLogWrite(WriteRqst, insertTLI, flexible); + } + LWLockRelease(WALWriteLock); + + END_CRIT_SECTION(); + + /* wake up walsenders now that we've released heavily contended locks */ + WalSndWakeupProcessRequests(); + + /* + * Great, done. To take some work off the critical path, try to initialize + * as many of the no-longer-needed WAL buffers for future use as we can. + */ + AdvanceXLInsertBuffer(InvalidXLogRecPtr, insertTLI, true); + + /* + * If we determined that we need to write data, but somebody else + * wrote/flushed already, it should be considered as being active, to + * avoid hibernating too early. + */ + return true; +} + +/* + * Test whether XLOG data has been flushed up to (at least) the given position. + * + * Returns true if a flush is still needed. (It may be that someone else + * is already in process of flushing that far, however.) + */ +bool +XLogNeedsFlush(XLogRecPtr record) +{ + /* + * During recovery, we don't flush WAL but update minRecoveryPoint + * instead. So "needs flush" is taken to mean whether minRecoveryPoint + * would need to be updated. + */ + if (RecoveryInProgress()) + { + /* + * An invalid minRecoveryPoint means that we need to recover all the + * WAL, i.e., we're doing crash recovery. We never modify the control + * file's value in that case, so we can short-circuit future checks + * here too. This triggers a quick exit path for the startup process, + * which cannot update its local copy of minRecoveryPoint as long as + * it has not replayed all WAL available when doing crash recovery. + */ + if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery) + updateMinRecoveryPoint = false; + + /* Quick exit if already known to be updated or cannot be updated */ + if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint) + return false; + + /* + * Update local copy of minRecoveryPoint. But if the lock is busy, + * just return a conservative guess. + */ + if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED)) + return true; + LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; + LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + LWLockRelease(ControlFileLock); + + /* + * Check minRecoveryPoint for any other process than the startup + * process doing crash recovery, which should not update the control + * file value if crash recovery is still running. 
+ */ + if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint)) + updateMinRecoveryPoint = false; + + /* check again */ + if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint) + return false; + else + return true; + } + + /* Quick exit if already known flushed */ + if (record <= LogwrtResult.Flush) + return false; + + /* read LogwrtResult and update local state */ + SpinLockAcquire(&XLogCtl->info_lck); + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + + /* check again */ + if (record <= LogwrtResult.Flush) + return false; + + return true; +} + +/* + * Try to make a given XLOG file segment exist. + * + * logsegno: identify segment. + * + * *added: on return, true if this call raised the number of extant segments. + * + * path: on return, this char[MAXPGPATH] has the path to the logsegno file. + * + * Returns -1 or FD of opened file. A -1 here is not an error; a caller + * wanting an open segment should attempt to open "path", which usually will + * succeed. (This is weird, but it's efficient for the callers.) + */ +static int +XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, + bool *added, char *path) +{ + char tmppath[MAXPGPATH]; + PGAlignedXLogBlock zbuffer; + XLogSegNo installed_segno; + XLogSegNo max_segno; + int fd; + int save_errno; + + Assert(logtli != 0); + + XLogFilePath(path, logtli, logsegno, wal_segment_size); + + /* + * Try to use existent file (checkpoint maker may have created it already) + */ + *added = false; + fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method)); + if (fd < 0) + { + if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + } + else + return fd; + + /* + * Initialize an empty (all zeroes) segment. NOTE: it is possible that + * another process is doing the same thing. If so, we will end up + * pre-creating an extra log segment. That seems OK, and better than + * holding the lock throughout this lengthy process. + */ + elog(DEBUG2, "creating and filling new WAL file"); + + snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); + + unlink(tmppath); + + /* do not use get_sync_bit() here --- want to fsync only at end of fill */ + fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", tmppath))); + + memset(zbuffer.data, 0, XLOG_BLCKSZ); + + pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE); + save_errno = 0; + if (wal_init_zero) + { + struct iovec iov[PG_IOV_MAX]; + int blocks; + + /* + * Zero-fill the file. With this setting, we do this the hard way to + * ensure that all the file space has really been allocated. On + * platforms that allow "holes" in files, just seeking to the end + * doesn't allocate intermediate space. This way, we know that we + * have all the space and (after the fsync below) that all the + * indirect blocks are down on disk. Therefore, fdatasync(2) or + * O_DSYNC will be sufficient to sync future writes to the log file. + */ + + /* Prepare to write out a lot of copies of our zero buffer at once. */ + for (int i = 0; i < lengthof(iov); ++i) + { + iov[i].iov_base = zbuffer.data; + iov[i].iov_len = XLOG_BLCKSZ; + } + + /* Loop, writing as many blocks as we can for each system call. 
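+ *
+ * (Each pg_pwritev_with_retry() call covers up to PG_IOV_MAX iovecs of
+ * XLOG_BLCKSZ bytes apiece; with the usual limits of 32 iovecs and 8kB
+ * blocks, a 16MB segment is zero-filled in 64 system calls.)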
*/ + blocks = wal_segment_size / XLOG_BLCKSZ; + for (int i = 0; i < blocks;) + { + int iovcnt = Min(blocks - i, lengthof(iov)); + off_t offset = i * XLOG_BLCKSZ; + + if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0) + { + save_errno = errno; + break; + } + + i += iovcnt; + } + } + else + { + /* + * Otherwise, seeking to the end and writing a solitary byte is + * enough. + */ + errno = 0; + if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1) + { + /* if write didn't set errno, assume no disk space */ + save_errno = errno ? errno : ENOSPC; + } + } + pgstat_report_wait_end(); + + if (save_errno) + { + /* + * If we fail to make the file, delete it to release disk space + */ + unlink(tmppath); + + close(fd); + + errno = save_errno; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } + + pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC); + if (pg_fsync(fd) != 0) + { + int save_errno = errno; + + close(fd); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + + if (close(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", tmppath))); + + /* + * Now move the segment into place with its final name. Cope with + * possibility that someone else has created the file while we were + * filling ours: if so, use ours to pre-create a future log segment. + */ + installed_segno = logsegno; + + /* + * XXX: What should we use as max_segno? We used to use XLOGfileslop when + * that was a constant, but that was always a bit dubious: normally, at a + * checkpoint, XLOGfileslop was the offset from the checkpoint record, but + * here, it was the offset from the insert location. We can't do the + * normal XLOGfileslop calculation here because we don't have access to + * the prior checkpoint's redo location. So somewhat arbitrarily, just use + * CheckPointSegments. + */ + max_segno = logsegno + CheckPointSegments; + if (InstallXLogFileSegment(&installed_segno, tmppath, true, max_segno, + logtli)) + { + *added = true; + elog(DEBUG2, "done creating and filling new WAL file"); + } + else + { + /* + * No need for any more future segments, or InstallXLogFileSegment() + * failed to rename the file into place. If the rename failed, a + * caller opening the file may fail. + */ + unlink(tmppath); + elog(DEBUG2, "abandoned new WAL file"); + } + + return -1; +} + +/* + * Create a new XLOG file segment, or open a pre-existing one. + * + * logsegno: identify segment to be created/opened. + * + * Returns FD of opened file. + * + * Note: errors here are ERROR not PANIC because we might or might not be + * inside a critical section (eg, during checkpoint there is no reason to + * take down the system on failure). They will promote to PANIC if we are + * in a critical section. + */ +int +XLogFileInit(XLogSegNo logsegno, TimeLineID logtli) +{ + bool ignore_added; + char path[MAXPGPATH]; + int fd; + + Assert(logtli != 0); + + fd = XLogFileInitInternal(logsegno, logtli, &ignore_added, path); + if (fd >= 0) + return fd; + + /* Now open original target segment (might not be file I just made) */ + fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method)); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + return fd; +} + +/* + * Create a new XLOG file segment by copying a pre-existing one. 
+ * + * destsegno: identify segment to be created. + * + * srcTLI, srcsegno: identify segment to be copied (could be from + * a different timeline) + * + * upto: how much of the source file to copy (the rest is filled with + * zeros) + * + * Currently this is only used during recovery, and so there are no locking + * considerations. But we should be just as tense as XLogFileInit to avoid + * emplacing a bogus file. + */ +static void +XLogFileCopy(TimeLineID destTLI, XLogSegNo destsegno, + TimeLineID srcTLI, XLogSegNo srcsegno, + int upto) +{ + char path[MAXPGPATH]; + char tmppath[MAXPGPATH]; + PGAlignedXLogBlock buffer; + int srcfd; + int fd; + int nbytes; + + /* + * Open the source file + */ + XLogFilePath(path, srcTLI, srcsegno, wal_segment_size); + srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (srcfd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + + /* + * Copy into a temp file name. + */ + snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); + + unlink(tmppath); + + /* do not use get_sync_bit() here --- want to fsync only at end of fill */ + fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", tmppath))); + + /* + * Do the data copying. + */ + for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer)) + { + int nread; + + nread = upto - nbytes; + + /* + * The part that is not read from the source file is filled with + * zeros. + */ + if (nread < sizeof(buffer)) + memset(buffer.data, 0, sizeof(buffer)); + + if (nread > 0) + { + int r; + + if (nread > sizeof(buffer)) + nread = sizeof(buffer); + pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ); + r = read(srcfd, buffer.data, nread); + if (r != nread) + { + if (r < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + path))); + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %zu", + path, r, (Size) nread))); + } + pgstat_report_wait_end(); + } + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE); + if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer)) + { + int save_errno = errno; + + /* + * If we fail to make the file, delete it to release disk space + */ + unlink(tmppath); + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? save_errno : ENOSPC; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + } + + pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC); + if (pg_fsync(fd) != 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", tmppath))); + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", tmppath))); + + if (CloseTransientFile(srcfd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + + /* + * Now move the segment into place with its final name. + */ + if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, destTLI)) + elog(ERROR, "InstallXLogFileSegment should not have failed"); +} + +/* + * Install a new XLOG segment file as a current or future log segment. 
+ * + * This is used both to install a newly-created segment (which has a temp + * filename while it's being created) and to recycle an old segment. + * + * *segno: identify segment to install as (or first possible target). + * When find_free is true, this is modified on return to indicate the + * actual installation location or last segment searched. + * + * tmppath: initial name of file to install. It will be renamed into place. + * + * find_free: if true, install the new segment at the first empty segno + * number at or after the passed numbers. If false, install the new segment + * exactly where specified, deleting any existing segment file there. + * + * max_segno: maximum segment number to install the new file as. Fail if no + * free slot is found between *segno and max_segno. (Ignored when find_free + * is false.) + * + * tli: The timeline on which the new segment should be installed. + * + * Returns true if the file was installed successfully. false indicates that + * max_segno limit was exceeded, the startup process has disabled this + * function for now, or an error occurred while renaming the file into place. + */ +static bool +InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, + bool find_free, XLogSegNo max_segno, TimeLineID tli) +{ + char path[MAXPGPATH]; + struct stat stat_buf; + + Assert(tli != 0); + + XLogFilePath(path, tli, *segno, wal_segment_size); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + if (!XLogCtl->InstallXLogFileSegmentActive) + { + LWLockRelease(ControlFileLock); + return false; + } + + if (!find_free) + { + /* Force installation: get rid of any pre-existing segment file */ + durable_unlink(path, DEBUG1); + } + else + { + /* Find a free slot to put it in */ + while (stat(path, &stat_buf) == 0) + { + if ((*segno) >= max_segno) + { + /* Failed to find a free slot within specified range */ + LWLockRelease(ControlFileLock); + return false; + } + (*segno)++; + XLogFilePath(path, tli, *segno, wal_segment_size); + } + } + + /* + * Perform the rename using link if available, paranoidly trying to avoid + * overwriting an existing file (there shouldn't be one). + */ + if (durable_rename_excl(tmppath, path, LOG) != 0) + { + LWLockRelease(ControlFileLock); + /* durable_rename_excl already emitted log message */ + return false; + } + + LWLockRelease(ControlFileLock); + + return true; +} + +/* + * Open a pre-existing logfile segment for writing. + */ +int +XLogFileOpen(XLogSegNo segno, TimeLineID tli) +{ + char path[MAXPGPATH]; + int fd; + + XLogFilePath(path, tli, segno, wal_segment_size); + + fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method)); + if (fd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + + return fd; +} + +/* + * Close the current logfile segment for writing. + */ +static void +XLogFileClose(void) +{ + Assert(openLogFile >= 0); + + /* + * WAL segment files will not be re-read in normal operation, so we advise + * the OS to release any cached pages. But do not do so if WAL archiving + * or streaming is active, because archiver and walsender process could + * use the cache to read the WAL segment. 
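+ *
+ * (XLogIsNeeded() is true once wal_level is replica or higher, the level
+ * required for archiving and streaming, so the DONTNEED advice below is
+ * only issued on wal_level = minimal systems.)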
+ */ +#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) + if (!XLogIsNeeded()) + (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED); +#endif + + if (close(openLogFile) != 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno = errno; + + XLogFileName(xlogfname, openLogTLI, openLogSegNo, wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", xlogfname))); + } + + openLogFile = -1; + ReleaseExternalFD(); +} + +/* + * Preallocate log files beyond the specified log endpoint. + * + * XXX this is currently extremely conservative, since it forces only one + * future log segment to exist, and even that only if we are 75% done with + * the current one. This is only appropriate for very low-WAL-volume systems. + * High-volume systems will be OK once they've built up a sufficient set of + * recycled log segments, but the startup transient is likely to include + * a lot of segment creations by foreground processes, which is not so good. + * + * XLogFileInitInternal() can ereport(ERROR). All known causes indicate big + * trouble; for example, a full filesystem is one cause. The checkpoint WAL + * and/or ControlFile updates already completed. If a RequestCheckpoint() + * initiated the present checkpoint and an ERROR ends this function, the + * command that called RequestCheckpoint() fails. That's not ideal, but it's + * not worth contorting more functions to use caller-specified elevel values. + * (With or without RequestCheckpoint(), an ERROR forestalls some inessential + * reporting and resource reclamation.) + */ +static void +PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli) +{ + XLogSegNo _logSegNo; + int lf; + bool added; + char path[MAXPGPATH]; + uint64 offset; + + if (!XLogCtl->InstallXLogFileSegmentActive) + return; /* unlocked check says no */ + + XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size); + offset = XLogSegmentOffset(endptr - 1, wal_segment_size); + if (offset >= (uint32) (0.75 * wal_segment_size)) + { + _logSegNo++; + lf = XLogFileInitInternal(_logSegNo, tli, &added, path); + if (lf >= 0) + close(lf); + if (added) + CheckpointStats.ckpt_segs_added++; + } +} + +/* + * Throws an error if the given log segment has already been removed or + * recycled. The caller should only pass a segment that it knows to have + * existed while the server has been running, as this function always + * succeeds if no WAL segments have been removed since startup. + * 'tli' is only used in the error message. + * + * Note: this function guarantees to keep errno unchanged on return. + * This supports callers that use this to possibly deliver a better + * error message about a missing file, while still being able to throw + * a normal file-access error afterwards, if this does return. + */ +void +CheckXLogRemoved(XLogSegNo segno, TimeLineID tli) +{ + int save_errno = errno; + XLogSegNo lastRemovedSegNo; + + SpinLockAcquire(&XLogCtl->info_lck); + lastRemovedSegNo = XLogCtl->lastRemovedSegNo; + SpinLockRelease(&XLogCtl->info_lck); + + if (segno <= lastRemovedSegNo) + { + char filename[MAXFNAMELEN]; + + XLogFileName(filename, tli, segno, wal_segment_size); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("requested WAL segment %s has already been removed", + filename))); + } + errno = save_errno; +} + +/* + * Return the last WAL segment removed, or 0 if no segment has been removed + * since startup. 
+ * + * NB: the result can be out of date arbitrarily fast, the caller has to deal + * with that. + */ +XLogSegNo +XLogGetLastRemovedSegno(void) +{ + XLogSegNo lastRemovedSegNo; + + SpinLockAcquire(&XLogCtl->info_lck); + lastRemovedSegNo = XLogCtl->lastRemovedSegNo; + SpinLockRelease(&XLogCtl->info_lck); + + return lastRemovedSegNo; +} + + +/* + * Update the last removed segno pointer in shared memory, to reflect that the + * given XLOG file has been removed. + */ +static void +UpdateLastRemovedPtr(char *filename) +{ + uint32 tli; + XLogSegNo segno; + + XLogFromFileName(filename, &tli, &segno, wal_segment_size); + + SpinLockAcquire(&XLogCtl->info_lck); + if (segno > XLogCtl->lastRemovedSegNo) + XLogCtl->lastRemovedSegNo = segno; + SpinLockRelease(&XLogCtl->info_lck); +} + +/* + * Remove all temporary log files in pg_wal + * + * This is called at the beginning of recovery after a previous crash, + * at a point where no other processes write fresh WAL data. + */ +static void +RemoveTempXlogFiles(void) +{ + DIR *xldir; + struct dirent *xlde; + + elog(DEBUG2, "removing all temporary WAL segments"); + + xldir = AllocateDir(XLOGDIR); + while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) + { + char path[MAXPGPATH]; + + if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0) + continue; + + snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name); + unlink(path); + elog(DEBUG2, "removed temporary WAL segment \"%s\"", path); + } + FreeDir(xldir); +} + +/* + * Recycle or remove all log files older or equal to passed segno. + * + * endptr is current (or recent) end of xlog, and lastredoptr is the + * redo pointer of the last checkpoint. These are used to determine + * whether we want to recycle rather than delete no-longer-wanted log files. + * + * insertTLI is the current timeline for XLOG insertion. Any recycled + * segments should be reused for this timeline. + */ +static void +RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr, + TimeLineID insertTLI) +{ + DIR *xldir; + struct dirent *xlde; + char lastoff[MAXFNAMELEN]; + XLogSegNo endlogSegNo; + XLogSegNo recycleSegNo; + + /* Initialize info about where to try to recycle to */ + XLByteToSeg(endptr, endlogSegNo, wal_segment_size); + recycleSegNo = XLOGfileslop(lastredoptr); + + /* + * Construct a filename of the last segment to be kept. The timeline ID + * doesn't matter, we ignore that in the comparison. (During recovery, + * InsertTimeLineID isn't set, so we can't use that.) + */ + XLogFileName(lastoff, 0, segno, wal_segment_size); + + elog(DEBUG2, "attempting to remove WAL segments older than log file %s", + lastoff); + + xldir = AllocateDir(XLOGDIR); + + while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) + { + /* Ignore files that are not XLOG segments */ + if (!IsXLogFileName(xlde->d_name) && + !IsPartialXLogFileName(xlde->d_name)) + continue; + + /* + * We ignore the timeline part of the XLOG segment identifiers in + * deciding whether a segment is still needed. This ensures that we + * won't prematurely remove a segment from a parent timeline. We could + * probably be a little more proactive about removing segments of + * non-parent timelines, but that would be a whole lot more + * complicated. + * + * We use the alphanumeric sorting property of the filenames to decide + * which ones are earlier than the lastoff segment. 
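+ *
+ * (A WAL segment file name is 24 hexadecimal digits, of which the first
+ * 8 are the timeline ID and the remaining 16 the segment number, so the
+ * comparison below starts at offset 8 to skip the timeline part.)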
+ */ + if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0) + { + if (XLogArchiveCheckDone(xlde->d_name)) + { + /* Update the last removed location in shared memory first */ + UpdateLastRemovedPtr(xlde->d_name); + + RemoveXlogFile(xlde->d_name, recycleSegNo, &endlogSegNo, + insertTLI); + } + } + } + + FreeDir(xldir); +} + +/* + * Remove WAL files that are not part of the given timeline's history. + * + * This is called during recovery, whenever we switch to follow a new + * timeline, and at the end of recovery when we create a new timeline. We + * wouldn't otherwise care about extra WAL files lying in pg_wal, but they + * might be leftover pre-allocated or recycled WAL segments on the old timeline + * that we haven't used yet, and contain garbage. If we just leave them in + * pg_wal, they will eventually be archived, and we can't let that happen. + * Files that belong to our timeline history are valid, because we have + * successfully replayed them, but from others we can't be sure. + * + * 'switchpoint' is the current point in WAL where we switch to new timeline, + * and 'newTLI' is the new timeline we switch to. + */ +void +RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI) +{ + DIR *xldir; + struct dirent *xlde; + char switchseg[MAXFNAMELEN]; + XLogSegNo endLogSegNo; + XLogSegNo switchLogSegNo; + XLogSegNo recycleSegNo; + + /* + * Initialize info about where to begin the work. This will recycle, + * somewhat arbitrarily, 10 future segments. + */ + XLByteToPrevSeg(switchpoint, switchLogSegNo, wal_segment_size); + XLByteToSeg(switchpoint, endLogSegNo, wal_segment_size); + recycleSegNo = endLogSegNo + 10; + + /* + * Construct a filename of the last segment to be kept. + */ + XLogFileName(switchseg, newTLI, switchLogSegNo, wal_segment_size); + + elog(DEBUG2, "attempting to remove WAL segments newer than log file %s", + switchseg); + + xldir = AllocateDir(XLOGDIR); + + while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) + { + /* Ignore files that are not XLOG segments */ + if (!IsXLogFileName(xlde->d_name)) + continue; + + /* + * Remove files that are on a timeline older than the new one we're + * switching to, but with a segment number >= the first segment on the + * new timeline. + */ + if (strncmp(xlde->d_name, switchseg, 8) < 0 && + strcmp(xlde->d_name + 8, switchseg + 8) > 0) + { + /* + * If the file has already been marked as .ready, however, don't + * remove it yet. It should be OK to remove it - files that are + * not part of our timeline history are not required for recovery + * - but seems safer to let them be archived and removed later. + */ + if (!XLogArchiveIsReady(xlde->d_name)) + RemoveXlogFile(xlde->d_name, recycleSegNo, &endLogSegNo, + newTLI); + } + } + + FreeDir(xldir); +} + +/* + * Recycle or remove a log file that's no longer needed. + * + * segname is the name of the segment to recycle or remove. recycleSegNo + * is the segment number to recycle up to. endlogSegNo is the segment + * number of the current (or recent) end of WAL. + * + * endlogSegNo gets incremented if the segment is recycled so as it is not + * checked again with future callers of this function. + * + * insertTLI is the current timeline for XLOG insertion. Any recycled segments + * should be used for this timeline. 
+ */ +static void +RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo, + XLogSegNo *endlogSegNo, TimeLineID insertTLI) +{ + char path[MAXPGPATH]; +#ifdef WIN32 + char newpath[MAXPGPATH]; +#endif + struct stat statbuf; + + snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname); + + /* + * Before deleting the file, see if it can be recycled as a future log + * segment. Only recycle normal files, because we don't want to recycle + * symbolic links pointing to a separate archive directory. + */ + if (wal_recycle && + *endlogSegNo <= recycleSegNo && + XLogCtl->InstallXLogFileSegmentActive && /* callee rechecks this */ + lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) && + InstallXLogFileSegment(endlogSegNo, path, + true, recycleSegNo, insertTLI)) + { + ereport(DEBUG2, + (errmsg_internal("recycled write-ahead log file \"%s\"", + segname))); + CheckpointStats.ckpt_segs_recycled++; + /* Needn't recheck that slot on future iterations */ + (*endlogSegNo)++; + } + else + { + /* No need for any more future segments, or recycling failed ... */ + int rc; + + ereport(DEBUG2, + (errmsg_internal("removing write-ahead log file \"%s\"", + segname))); + +#ifdef WIN32 + + /* + * On Windows, if another process (e.g another backend) holds the file + * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file + * will still show up in directory listing until the last handle is + * closed. To avoid confusing the lingering deleted file for a live + * WAL file that needs to be archived, rename it before deleting it. + * + * If another process holds the file open without FILE_SHARE_DELETE + * flag, rename will fail. We'll try again at the next checkpoint. + */ + snprintf(newpath, MAXPGPATH, "%s.deleted", path); + if (rename(path, newpath) != 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\": %m", + path))); + return; + } + rc = durable_unlink(newpath, LOG); +#else + rc = durable_unlink(path, LOG); +#endif + if (rc != 0) + { + /* Message already logged by durable_unlink() */ + return; + } + CheckpointStats.ckpt_segs_removed++; + } + + XLogArchiveCleanup(segname); +} + +/* + * Verify whether pg_wal and pg_wal/archive_status exist. + * If the latter does not exist, recreate it. + * + * It is not the goal of this function to verify the contents of these + * directories, but to help in cases where someone has performed a cluster + * copy for PITR purposes but omitted pg_wal from the copy. + * + * We could also recreate pg_wal if it doesn't exist, but a deliberate + * policy decision was made not to. It is fairly common for pg_wal to be + * a symlink, and if that was the DBA's intent then automatically making a + * plain directory would result in degraded performance with no notice. 
+ */ +static void +ValidateXLOGDirectoryStructure(void) +{ + char path[MAXPGPATH]; + struct stat stat_buf; + + /* Check for pg_wal; if it doesn't exist, error out */ + if (stat(XLOGDIR, &stat_buf) != 0 || + !S_ISDIR(stat_buf.st_mode)) + ereport(FATAL, + (errmsg("required WAL directory \"%s\" does not exist", + XLOGDIR))); + + /* Check for archive_status */ + snprintf(path, MAXPGPATH, XLOGDIR "/archive_status"); + if (stat(path, &stat_buf) == 0) + { + /* Check for weird cases where it exists but isn't a directory */ + if (!S_ISDIR(stat_buf.st_mode)) + ereport(FATAL, + (errmsg("required WAL directory \"%s\" does not exist", + path))); + } + else + { + ereport(LOG, + (errmsg("creating missing WAL directory \"%s\"", path))); + if (MakePGDirectory(path) < 0) + ereport(FATAL, + (errmsg("could not create missing directory \"%s\": %m", + path))); + } +} + +/* + * Remove previous backup history files. This also retries creation of + * .ready files for any backup history files for which XLogArchiveNotify + * failed earlier. + */ +static void +CleanupBackupHistory(void) +{ + DIR *xldir; + struct dirent *xlde; + char path[MAXPGPATH + sizeof(XLOGDIR)]; + + xldir = AllocateDir(XLOGDIR); + + while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) + { + if (IsBackupHistoryFileName(xlde->d_name)) + { + if (XLogArchiveCheckDone(xlde->d_name)) + { + elog(DEBUG2, "removing WAL backup history file \"%s\"", + xlde->d_name); + snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name); + unlink(path); + XLogArchiveCleanup(xlde->d_name); + } + } + } + + FreeDir(xldir); +} + +/* + * I/O routines for pg_control + * + * *ControlFile is a buffer in shared memory that holds an image of the + * contents of pg_control. WriteControlFile() initializes pg_control + * given a preloaded buffer, ReadControlFile() loads the buffer from + * the pg_control file (during postmaster or standalone-backend startup), + * and UpdateControlFile() rewrites pg_control after we modify xlog state. + * InitControlFile() fills the buffer with initial values. + * + * For simplicity, WriteControlFile() initializes the fields of pg_control + * that are related to checking backend/database compatibility, and + * ReadControlFile() verifies they are correct. We could split out the + * I/O and compatibility-check functions, but there seems no need currently. + */ + +static void +InitControlFile(uint64 sysidentifier) +{ + char mock_auth_nonce[MOCK_AUTH_NONCE_LEN]; + + /* + * Generate a random nonce. This is used for authentication requests that + * will fail because the user does not exist. The nonce is used to create + * a genuine-looking password challenge for the non-existent user, in lieu + * of an actual stored password. 
+ */ + if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN)) + ereport(PANIC, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not generate secret authorization token"))); + + memset(ControlFile, 0, sizeof(ControlFileData)); + /* Initialize pg_control status fields */ + ControlFile->system_identifier = sysidentifier; + memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN); + ControlFile->state = DB_SHUTDOWNED; + ControlFile->unloggedLSN = FirstNormalUnloggedLSN; + + /* Set important parameter values for use when replaying WAL */ + ControlFile->MaxConnections = MaxConnections; + ControlFile->max_worker_processes = max_worker_processes; + ControlFile->max_wal_senders = max_wal_senders; + ControlFile->max_prepared_xacts = max_prepared_xacts; + ControlFile->max_locks_per_xact = max_locks_per_xact; + ControlFile->wal_level = wal_level; + ControlFile->wal_log_hints = wal_log_hints; + ControlFile->track_commit_timestamp = track_commit_timestamp; + ControlFile->data_checksum_version = bootstrap_data_checksum_version; +} + +static void +WriteControlFile(void) +{ + int fd; + char buffer[PG_CONTROL_FILE_SIZE]; /* need not be aligned */ + + /* + * Ensure that the size of the pg_control data structure is sane. See the + * comments for these symbols in pg_control.h. + */ + StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE, + "pg_control is too large for atomic disk writes"); + StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE, + "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE"); + + /* + * Initialize version and compatibility-check fields + */ + ControlFile->pg_control_version = PG_CONTROL_VERSION; + ControlFile->catalog_version_no = CATALOG_VERSION_NO; + + ControlFile->maxAlign = MAXIMUM_ALIGNOF; + ControlFile->floatFormat = FLOATFORMAT_VALUE; + + ControlFile->blcksz = BLCKSZ; + ControlFile->relseg_size = RELSEG_SIZE; + ControlFile->xlog_blcksz = XLOG_BLCKSZ; + ControlFile->xlog_seg_size = wal_segment_size; + + ControlFile->nameDataLen = NAMEDATALEN; + ControlFile->indexMaxKeys = INDEX_MAX_KEYS; + + ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE; + ControlFile->loblksize = LOBLKSIZE; + + ControlFile->float8ByVal = FLOAT8PASSBYVAL; + + /* Contents are protected with a CRC */ + INIT_CRC32C(ControlFile->crc); + COMP_CRC32C(ControlFile->crc, + (char *) ControlFile, + offsetof(ControlFileData, crc)); + FIN_CRC32C(ControlFile->crc); + + /* + * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding + * the excess over sizeof(ControlFileData). This reduces the odds of + * premature-EOF errors when reading pg_control. We'll still fail when we + * check the contents of the file, but hopefully with a more specific + * error than "couldn't read pg_control". 
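+ *
+ * (PG_CONTROL_FILE_SIZE is 8192 bytes, while the static assertions above
+ * ensure that ControlFileData itself fits within PG_CONTROL_MAX_SAFE_SIZE,
+ * 512 bytes, so the meaningful payload still fits in one disk sector.)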
+ */ + memset(buffer, 0, PG_CONTROL_FILE_SIZE); + memcpy(buffer, ControlFile, sizeof(ControlFileData)); + + fd = BasicOpenFile(XLOG_CONTROL_FILE, + O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + if (fd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + XLOG_CONTROL_FILE))); + + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE); + if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", + XLOG_CONTROL_FILE))); + } + pgstat_report_wait_end(); + + pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC); + if (pg_fsync(fd) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + XLOG_CONTROL_FILE))); + pgstat_report_wait_end(); + + if (close(fd) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", + XLOG_CONTROL_FILE))); +} + +static void +ReadControlFile(void) +{ + pg_crc32c crc; + int fd; + static char wal_segsz_str[20]; + int r; + + /* + * Read data... + */ + fd = BasicOpenFile(XLOG_CONTROL_FILE, + O_RDWR | PG_BINARY); + if (fd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + XLOG_CONTROL_FILE))); + + pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ); + r = read(fd, ControlFile, sizeof(ControlFileData)); + if (r != sizeof(ControlFileData)) + { + if (r < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + XLOG_CONTROL_FILE))); + else + ereport(PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %zu", + XLOG_CONTROL_FILE, r, sizeof(ControlFileData)))); + } + pgstat_report_wait_end(); + + close(fd); + + /* + * Check for expected pg_control format version. If this is wrong, the + * CRC check will likely fail because we'll be checking the wrong number + * of bytes. Complaining about wrong version will probably be more + * enlightening than complaining about wrong CRC. + */ + + if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x)," + " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).", + ControlFile->pg_control_version, ControlFile->pg_control_version, + PG_CONTROL_VERSION, PG_CONTROL_VERSION), + errhint("This could be a problem of mismatched byte ordering. It looks like you need to initdb."))); + + if (ControlFile->pg_control_version != PG_CONTROL_VERSION) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d," + " but the server was compiled with PG_CONTROL_VERSION %d.", + ControlFile->pg_control_version, PG_CONTROL_VERSION), + errhint("It looks like you need to initdb."))); + + /* Now check the CRC. */ + INIT_CRC32C(crc); + COMP_CRC32C(crc, + (char *) ControlFile, + offsetof(ControlFileData, crc)); + FIN_CRC32C(crc); + + if (!EQ_CRC32C(crc, ControlFile->crc)) + ereport(FATAL, + (errmsg("incorrect checksum in control file"))); + + /* + * Do compatibility checking immediately. 
If the database isn't + * compatible with the backend executable, we want to abort before we can + * possibly do any damage. + */ + if (ControlFile->catalog_version_no != CATALOG_VERSION_NO) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d," + " but the server was compiled with CATALOG_VERSION_NO %d.", + ControlFile->catalog_version_no, CATALOG_VERSION_NO), + errhint("It looks like you need to initdb."))); + if (ControlFile->maxAlign != MAXIMUM_ALIGNOF) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with MAXALIGN %d," + " but the server was compiled with MAXALIGN %d.", + ControlFile->maxAlign, MAXIMUM_ALIGNOF), + errhint("It looks like you need to initdb."))); + if (ControlFile->floatFormat != FLOATFORMAT_VALUE) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster appears to use a different floating-point number format than the server executable."), + errhint("It looks like you need to initdb."))); + if (ControlFile->blcksz != BLCKSZ) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with BLCKSZ %d," + " but the server was compiled with BLCKSZ %d.", + ControlFile->blcksz, BLCKSZ), + errhint("It looks like you need to recompile or initdb."))); + if (ControlFile->relseg_size != RELSEG_SIZE) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with RELSEG_SIZE %d," + " but the server was compiled with RELSEG_SIZE %d.", + ControlFile->relseg_size, RELSEG_SIZE), + errhint("It looks like you need to recompile or initdb."))); + if (ControlFile->xlog_blcksz != XLOG_BLCKSZ) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with XLOG_BLCKSZ %d," + " but the server was compiled with XLOG_BLCKSZ %d.", + ControlFile->xlog_blcksz, XLOG_BLCKSZ), + errhint("It looks like you need to recompile or initdb."))); + if (ControlFile->nameDataLen != NAMEDATALEN) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with NAMEDATALEN %d," + " but the server was compiled with NAMEDATALEN %d.", + ControlFile->nameDataLen, NAMEDATALEN), + errhint("It looks like you need to recompile or initdb."))); + if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d," + " but the server was compiled with INDEX_MAX_KEYS %d.", + ControlFile->indexMaxKeys, INDEX_MAX_KEYS), + errhint("It looks like you need to recompile or initdb."))); + if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d," + " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.", + ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE), + errhint("It looks like you need to recompile or initdb."))); + if (ControlFile->loblksize != LOBLKSIZE) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with LOBLKSIZE %d," + " but the server was compiled with LOBLKSIZE 
%d.", + ControlFile->loblksize, (int) LOBLKSIZE), + errhint("It looks like you need to recompile or initdb."))); + +#ifdef USE_FLOAT8_BYVAL + if (ControlFile->float8ByVal != true) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL" + " but the server was compiled with USE_FLOAT8_BYVAL."), + errhint("It looks like you need to recompile or initdb."))); +#else + if (ControlFile->float8ByVal != false) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL" + " but the server was compiled without USE_FLOAT8_BYVAL."), + errhint("It looks like you need to recompile or initdb."))); +#endif + + wal_segment_size = ControlFile->xlog_seg_size; + + if (!IsValidWalSegSize(wal_segment_size)) + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg_plural("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte", + "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes", + wal_segment_size, + wal_segment_size))); + + snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size); + SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL, + PGC_S_DYNAMIC_DEFAULT); + + /* check and update variables dependent on wal_segment_size */ + if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2) + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("\"min_wal_size\" must be at least twice \"wal_segment_size\""))); + + if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2) + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\""))); + + UsableBytesInSegment = + (wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) - + (SizeOfXLogLongPHD - SizeOfXLogShortPHD); + + CalculateCheckpointSegments(); + + /* Make the initdb settings visible as GUC variables, too */ + SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no", + PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT); +} + +/* + * Utility wrapper to update the control file. Note that the control + * file gets flushed. + */ +static void +UpdateControlFile(void) +{ + update_controlfile(DataDir, ControlFile, true); +} + +/* + * Returns the unique system identifier from control file. + */ +uint64 +GetSystemIdentifier(void) +{ + Assert(ControlFile != NULL); + return ControlFile->system_identifier; +} + +/* + * Returns the random nonce from control file. + */ +char * +GetMockAuthenticationNonce(void) +{ + Assert(ControlFile != NULL); + return ControlFile->mock_authentication_nonce; +} + +/* + * Are checksums enabled for data pages? + */ +bool +DataChecksumsEnabled(void) +{ + Assert(ControlFile != NULL); + return (ControlFile->data_checksum_version > 0); +} + +/* + * Returns a fake LSN for unlogged relations. + * + * Each call generates an LSN that is greater than any previous value + * returned. The current counter value is saved and restored across clean + * shutdowns, but like unlogged relations, does not survive a crash. This can + * be used in lieu of real LSN values returned by XLogInsert, if you need an + * LSN-like increasing sequence of numbers without writing any WAL. 
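+ *
+ * (One such caller is GiST: gistGetFakeLSN() uses this for unlogged
+ * indexes, so the page LSN/NSN comparisons used to detect concurrent
+ * page splits keep working without writing any WAL.)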
+ */ +XLogRecPtr +GetFakeLSNForUnloggedRel(void) +{ + XLogRecPtr nextUnloggedLSN; + + /* increment the unloggedLSN counter, need SpinLock */ + SpinLockAcquire(&XLogCtl->ulsn_lck); + nextUnloggedLSN = XLogCtl->unloggedLSN++; + SpinLockRelease(&XLogCtl->ulsn_lck); + + return nextUnloggedLSN; +} + +/* + * Auto-tune the number of XLOG buffers. + * + * The preferred setting for wal_buffers is about 3% of shared_buffers, with + * a maximum of one XLOG segment (there is little reason to think that more + * is helpful, at least so long as we force an fsync when switching log files) + * and a minimum of 8 blocks (which was the default value prior to PostgreSQL + * 9.1, when auto-tuning was added). + * + * This should not be called until NBuffers has received its final value. + */ +static int +XLOGChooseNumBuffers(void) +{ + int xbuffers; + + xbuffers = NBuffers / 32; + if (xbuffers > (wal_segment_size / XLOG_BLCKSZ)) + xbuffers = (wal_segment_size / XLOG_BLCKSZ); + if (xbuffers < 8) + xbuffers = 8; + return xbuffers; +} + +/* + * GUC check_hook for wal_buffers + */ +bool +check_wal_buffers(int *newval, void **extra, GucSource source) +{ + /* + * -1 indicates a request for auto-tune. + */ + if (*newval == -1) + { + /* + * If we haven't yet changed the boot_val default of -1, just let it + * be. We'll fix it when XLOGShmemSize is called. + */ + if (XLOGbuffers == -1) + return true; + + /* Otherwise, substitute the auto-tune value */ + *newval = XLOGChooseNumBuffers(); + } + + /* + * We clamp manually-set values to at least 4 blocks. Prior to PostgreSQL + * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer + * the case, we just silently treat such values as a request for the + * minimum. (We could throw an error instead, but that doesn't seem very + * helpful.) + */ + if (*newval < 4) + *newval = 4; + + return true; +} + +/* + * Read the control file, set respective GUCs. + * + * This is to be called during startup, including a crash recovery cycle, + * unless in bootstrap mode, where no control file yet exists. As there's no + * usable shared memory yet (its sizing can depend on the contents of the + * control file!), first store the contents in local memory. XLOGShmemInit() + * will then copy it to shared memory later. + * + * reset just controls whether previous contents are to be expected (in the + * reset case, there's a dangling pointer into old shared memory), or not. + */ +void +LocalProcessControlFile(bool reset) +{ + Assert(reset || ControlFile == NULL); + ControlFile = palloc(sizeof(ControlFileData)); + ReadControlFile(); +} + +/* + * Initialization of shared memory for XLOG + */ +Size +XLOGShmemSize(void) +{ + Size size; + + /* + * If the value of wal_buffers is -1, use the preferred auto-tune value. + * This isn't an amazingly clean place to do this, but we must wait till + * NBuffers has received its final value, and must do it before using the + * value of XLOGbuffers to do anything important. + * + * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT. + * However, if the DBA explicitly set wal_buffers = -1 in the config file, + * then PGC_S_DYNAMIC_DEFAULT will fail to override that and we must force + * the matter with PGC_S_OVERRIDE. + */ + if (XLOGbuffers == -1) + { + char buf[32]; + + snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers()); + SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, + PGC_S_DYNAMIC_DEFAULT); + if (XLOGbuffers == -1) /* failed to apply it? 
*/ + SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, + PGC_S_OVERRIDE); + } + Assert(XLOGbuffers > 0); + + /* XLogCtl */ + size = sizeof(XLogCtlData); + + /* WAL insertion locks, plus alignment */ + size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1)); + /* xlblocks array */ + size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers)); + /* extra alignment padding for XLOG I/O buffers */ + size = add_size(size, XLOG_BLCKSZ); + /* and the buffers themselves */ + size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers)); + + /* + * Note: we don't count ControlFileData, it comes out of the "slop factor" + * added by CreateSharedMemoryAndSemaphores. This lets us use this + * routine again below to compute the actual allocation size. + */ + + return size; +} + +void +XLOGShmemInit(void) +{ + bool foundCFile, + foundXLog; + char *allocptr; + int i; + ControlFileData *localControlFile; + +#ifdef WAL_DEBUG + + /* + * Create a memory context for WAL debugging that's exempt from the normal + * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if + * an allocation fails, but wal_debug is not for production use anyway. + */ + if (walDebugCxt == NULL) + { + walDebugCxt = AllocSetContextCreate(TopMemoryContext, + "WAL Debug", + ALLOCSET_DEFAULT_SIZES); + MemoryContextAllowInCriticalSection(walDebugCxt, true); + } +#endif + + + XLogCtl = (XLogCtlData *) + ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog); + + localControlFile = ControlFile; + ControlFile = (ControlFileData *) + ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile); + + if (foundCFile || foundXLog) + { + /* both should be present or neither */ + Assert(foundCFile && foundXLog); + + /* Initialize local copy of WALInsertLocks */ + WALInsertLocks = XLogCtl->Insert.WALInsertLocks; + + if (localControlFile) + pfree(localControlFile); + return; + } + memset(XLogCtl, 0, sizeof(XLogCtlData)); + + /* + * Already have read control file locally, unless in bootstrap mode. Move + * contents into shared memory. + */ + if (localControlFile) + { + memcpy(ControlFile, localControlFile, sizeof(ControlFileData)); + pfree(localControlFile); + } + + /* + * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a + * multiple of the alignment for same, so no extra alignment padding is + * needed here. + */ + allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData); + XLogCtl->xlblocks = (XLogRecPtr *) allocptr; + memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers); + allocptr += sizeof(XLogRecPtr) * XLOGbuffers; + + + /* WAL insertion locks. Ensure they're aligned to the full padded size */ + allocptr += sizeof(WALInsertLockPadded) - + ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded); + WALInsertLocks = XLogCtl->Insert.WALInsertLocks = + (WALInsertLockPadded *) allocptr; + allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS; + + for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) + { + LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT); + WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr; + WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr; + } + + /* + * Align the start of the page buffers to a full xlog block size boundary. + * This simplifies some calculations in XLOG insertion. It is also + * required for O_DIRECT. + */ + allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr); + XLogCtl->pages = allocptr; + memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers); + + /* + * Do basic initialization of XLogCtl shared data. 
(StartupXLOG will fill + * in additional info.) + */ + XLogCtl->XLogCacheBlck = XLOGbuffers - 1; + XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH; + XLogCtl->InstallXLogFileSegmentActive = false; + XLogCtl->WalWriterSleeping = false; + + SpinLockInit(&XLogCtl->Insert.insertpos_lck); + SpinLockInit(&XLogCtl->info_lck); + SpinLockInit(&XLogCtl->ulsn_lck); +} + +/* + * This func must be called ONCE on system install. It creates pg_control + * and the initial XLOG segment. + */ +void +BootStrapXLOG(void) +{ + CheckPoint checkPoint; + char *buffer; + XLogPageHeader page; + XLogLongPageHeader longpage; + XLogRecord *record; + char *recptr; + uint64 sysidentifier; + struct timeval tv; + pg_crc32c crc; + + /* allow ordinary WAL segment creation, like StartupXLOG() would */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + XLogCtl->InstallXLogFileSegmentActive = true; + LWLockRelease(ControlFileLock); + + /* + * Select a hopefully-unique system identifier code for this installation. + * We use the result of gettimeofday(), including the fractional seconds + * field, as being about as unique as we can easily get. (Think not to + * use random(), since it hasn't been seeded and there's no portable way + * to seed it other than the system clock value...) The upper half of the + * uint64 value is just the tv_sec part, while the lower half contains the + * tv_usec part (which must fit in 20 bits), plus 12 bits from our current + * PID for a little extra uniqueness. A person knowing this encoding can + * determine the initialization time of the installation, which could + * perhaps be useful sometimes. + */ + gettimeofday(&tv, NULL); + sysidentifier = ((uint64) tv.tv_sec) << 32; + sysidentifier |= ((uint64) tv.tv_usec) << 12; + sysidentifier |= getpid() & 0xFFF; + + /* page buffer must be aligned suitably for O_DIRECT */ + buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ); + page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer); + memset(page, 0, XLOG_BLCKSZ); + + /* + * Set up information for the initial checkpoint record + * + * The initial checkpoint record is written to the beginning of the WAL + * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not + * used, so that we can use 0/0 to mean "before any valid WAL segment". 
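+ *
+ * (For illustration, assuming the default 16 MB wal_segment_size and an
+ * 8-byte MAXALIGN, SizeOfXLogLongPHD is 40 bytes and the redo pointer
+ * computed just below is 0x1000000 + 0x28 = 0x1000028, i.e. the very
+ * first record in segment 000000010000000000000001.)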
+ */ + checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD; + checkPoint.ThisTimeLineID = BootstrapTimeLineID; + checkPoint.PrevTimeLineID = BootstrapTimeLineID; + checkPoint.fullPageWrites = fullPageWrites; + checkPoint.nextXid = + FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId); + checkPoint.nextOid = FirstGenbkiObjectId; + checkPoint.nextMulti = FirstMultiXactId; + checkPoint.nextMultiOffset = 0; + checkPoint.oldestXid = FirstNormalTransactionId; + checkPoint.oldestXidDB = Template1DbOid; + checkPoint.oldestMulti = FirstMultiXactId; + checkPoint.oldestMultiDB = Template1DbOid; + checkPoint.oldestCommitTsXid = InvalidTransactionId; + checkPoint.newestCommitTsXid = InvalidTransactionId; + checkPoint.time = (pg_time_t) time(NULL); + checkPoint.oldestActiveXid = InvalidTransactionId; + + ShmemVariableCache->nextXid = checkPoint.nextXid; + ShmemVariableCache->nextOid = checkPoint.nextOid; + ShmemVariableCache->oidCount = 0; + MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); + AdvanceOldestClogXid(checkPoint.oldestXid); + SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); + SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); + SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId); + + /* Set up the XLOG page header */ + page->xlp_magic = XLOG_PAGE_MAGIC; + page->xlp_info = XLP_LONG_HEADER; + page->xlp_tli = BootstrapTimeLineID; + page->xlp_pageaddr = wal_segment_size; + longpage = (XLogLongPageHeader) page; + longpage->xlp_sysid = sysidentifier; + longpage->xlp_seg_size = wal_segment_size; + longpage->xlp_xlog_blcksz = XLOG_BLCKSZ; + + /* Insert the initial checkpoint record */ + recptr = ((char *) page + SizeOfXLogLongPHD); + record = (XLogRecord *) recptr; + record->xl_prev = 0; + record->xl_xid = InvalidTransactionId; + record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint); + record->xl_info = XLOG_CHECKPOINT_SHUTDOWN; + record->xl_rmid = RM_XLOG_ID; + recptr += SizeOfXLogRecord; + /* fill the XLogRecordDataHeaderShort struct */ + *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT; + *(recptr++) = sizeof(checkPoint); + memcpy(recptr, &checkPoint, sizeof(checkPoint)); + recptr += sizeof(checkPoint); + Assert(recptr - (char *) record == record->xl_tot_len); + + INIT_CRC32C(crc); + COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord); + COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc)); + FIN_CRC32C(crc); + record->xl_crc = crc; + + /* Create first XLOG segment file */ + openLogTLI = BootstrapTimeLineID; + openLogFile = XLogFileInit(1, BootstrapTimeLineID); + + /* + * We needn't bother with Reserve/ReleaseExternalFD here, since we'll + * close the file again in a moment. 
+ */ + + /* Write the first page with the initial record */ + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE); + if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write bootstrap write-ahead log file: %m"))); + } + pgstat_report_wait_end(); + + pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC); + if (pg_fsync(openLogFile) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not fsync bootstrap write-ahead log file: %m"))); + pgstat_report_wait_end(); + + if (close(openLogFile) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close bootstrap write-ahead log file: %m"))); + + openLogFile = -1; + + /* Now create pg_control */ + InitControlFile(sysidentifier); + ControlFile->time = checkPoint.time; + ControlFile->checkPoint = checkPoint.redo; + ControlFile->checkPointCopy = checkPoint; + + /* some additional ControlFile fields are set in WriteControlFile() */ + WriteControlFile(); + + /* Bootstrap the commit log, too */ + BootStrapCLOG(); + BootStrapCommitTs(); + BootStrapSUBTRANS(); + BootStrapMultiXact(); + + pfree(buffer); + + /* + * Force control file to be read - in contrast to normal processing we'd + * otherwise never run the checks and GUC related initializations therein. + */ + ReadControlFile(); +} + +static char * +str_time(pg_time_t tnow) +{ + static char buf[128]; + + pg_strftime(buf, sizeof(buf), + "%Y-%m-%d %H:%M:%S %Z", + pg_localtime(&tnow, log_timezone)); + + return buf; +} + +/* + * Initialize the first WAL segment on new timeline. + */ +static void +XLogInitNewTimeline(TimeLineID endTLI, XLogRecPtr endOfLog, TimeLineID newTLI) +{ + char xlogfname[MAXFNAMELEN]; + XLogSegNo endLogSegNo; + XLogSegNo startLogSegNo; + + /* we always switch to a new timeline after archive recovery */ + Assert(endTLI != newTLI); + + /* + * Update min recovery point one last time. + */ + UpdateMinRecoveryPoint(InvalidXLogRecPtr, true); + + /* + * Calculate the last segment on the old timeline, and the first segment + * on the new timeline. If the switch happens in the middle of a segment, + * they are the same, but if the switch happens exactly at a segment + * boundary, startLogSegNo will be endLogSegNo + 1. + */ + XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size); + XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size); + + /* + * Initialize the starting WAL segment for the new timeline. If the switch + * happens in the middle of a segment, copy data from the last WAL segment + * of the old timeline up to the switch point, to the starting WAL segment + * on the new timeline. + */ + if (endLogSegNo == startLogSegNo) + { + /* + * Make a copy of the file on the new timeline. + * + * Writing WAL isn't allowed yet, so there are no locking + * considerations. But we should be just as tense as XLogFileInit to + * avoid emplacing a bogus file. + */ + XLogFileCopy(newTLI, endLogSegNo, endTLI, endLogSegNo, + XLogSegmentOffset(endOfLog, wal_segment_size)); + } + else + { + /* + * The switch happened at a segment boundary, so just create the next + * segment on the new timeline. 
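+ *
+ * (For illustration, with 16 MB segments: if endOfLog is exactly
+ * 0/3000000, XLByteToPrevSeg above gave endLogSegNo = 2 while
+ * XLByteToSeg gave startLogSegNo = 3, so this branch simply creates
+ * segment 3 on the new timeline instead of copying anything.)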
+ */ + int fd; + + fd = XLogFileInit(startLogSegNo, newTLI); + + if (close(fd) != 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno = errno; + + XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", xlogfname))); + } + } + + /* + * Let's just make real sure there are not .ready or .done flags posted + * for the new segment. + */ + XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size); + XLogArchiveCleanup(xlogfname); +} + +/* + * Perform cleanup actions at the conclusion of archive recovery. + */ +static void +CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog, + TimeLineID newTLI) +{ + /* + * Execute the recovery_end_command, if any. + */ + if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0) + ExecuteRecoveryCommand(recoveryEndCommand, + "recovery_end_command", + true, + WAIT_EVENT_RECOVERY_END_COMMAND); + + /* + * We switched to a new timeline. Clean up segments on the old timeline. + * + * If there are any higher-numbered segments on the old timeline, remove + * them. They might contain valid WAL, but they might also be + * pre-allocated files containing garbage. In any case, they are not part + * of the new timeline's history so we don't need them. + */ + RemoveNonParentXlogFiles(EndOfLog, newTLI); + + /* + * If the switch happened in the middle of a segment, what to do with the + * last, partial segment on the old timeline? If we don't archive it, and + * the server that created the WAL never archives it either (e.g. because + * it was hit by a meteor), it will never make it to the archive. That's + * OK from our point of view, because the new segment that we created with + * the new TLI contains all the WAL from the old timeline up to the switch + * point. But if you later try to do PITR to the "missing" WAL on the old + * timeline, recovery won't find it in the archive. It's physically + * present in the new file with new TLI, but recovery won't look there + * when it's recovering to the older timeline. On the other hand, if we + * archive the partial segment, and the original server on that timeline + * is still running and archives the completed version of the same segment + * later, it will fail. (We used to do that in 9.4 and below, and it + * caused such problems). + * + * As a compromise, we rename the last segment with the .partial suffix, + * and archive it. Archive recovery will never try to read .partial + * segments, so they will normally go unused. But in the odd PITR case, + * the administrator can copy them manually to the pg_wal directory + * (removing the suffix). They can be useful in debugging, too. + * + * If a .done or .ready file already exists for the old timeline, however, + * we had already determined that the segment is complete, so we can let + * it be archived normally. (In particular, if it was restored from the + * archive to begin with, it's expected to have a .done file). 
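+ *
+ * (For illustration, with 16 MB segments: if recovery on timeline 1
+ * ended mid-segment at 0/15038A0 and we switched to timeline 2, new WAL
+ * goes to 000000020000000000000001, while the old
+ * 000000010000000000000001 is renamed below to
+ * 000000010000000000000001.partial and handed to the archiver.)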
+ */ + if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 && + XLogArchivingActive()) + { + char origfname[MAXFNAMELEN]; + XLogSegNo endLogSegNo; + + XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size); + XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size); + + if (!XLogArchiveIsReadyOrDone(origfname)) + { + char origpath[MAXPGPATH]; + char partialfname[MAXFNAMELEN]; + char partialpath[MAXPGPATH]; + + XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size); + snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname); + snprintf(partialpath, MAXPGPATH, "%s.partial", origpath); + + /* + * Make sure there's no .done or .ready file for the .partial + * file. + */ + XLogArchiveCleanup(partialfname); + + durable_rename(origpath, partialpath, ERROR); + XLogArchiveNotify(partialfname); + } + } +} + +/* + * Check to see if required parameters are set high enough on this server + * for various aspects of recovery operation. + * + * Note that all the parameters which this function tests need to be + * listed in Administrator's Overview section in high-availability.sgml. + * If you change them, don't forget to update the list. + */ +static void +CheckRequiredParameterValues(void) +{ + /* + * For archive recovery, the WAL must be generated with at least 'replica' + * wal_level. + */ + if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL) + { + ereport(FATAL, + (errmsg("WAL was generated with wal_level=minimal, cannot continue recovering"), + errdetail("This happens if you temporarily set wal_level=minimal on the server."), + errhint("Use a backup taken after setting wal_level to higher than minimal."))); + } + + /* + * For Hot Standby, the WAL must be generated with 'replica' mode, and we + * must have at least as many backend slots as the primary. + */ + if (ArchiveRecoveryRequested && EnableHotStandby) + { + /* We ignore autovacuum_max_workers when we make this test. */ + RecoveryRequiresIntParameter("max_connections", + MaxConnections, + ControlFile->MaxConnections); + RecoveryRequiresIntParameter("max_worker_processes", + max_worker_processes, + ControlFile->max_worker_processes); + RecoveryRequiresIntParameter("max_wal_senders", + max_wal_senders, + ControlFile->max_wal_senders); + RecoveryRequiresIntParameter("max_prepared_transactions", + max_prepared_xacts, + ControlFile->max_prepared_xacts); + RecoveryRequiresIntParameter("max_locks_per_transaction", + max_locks_per_xact, + ControlFile->max_locks_per_xact); + } +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup + */ +void +StartupXLOG(void) +{ + XLogCtlInsert *Insert; + CheckPoint checkPoint; + bool wasShutdown; + bool didCrash; + bool haveTblspcMap; + bool haveBackupLabel; + XLogRecPtr EndOfLog; + TimeLineID EndOfLogTLI; + TimeLineID newTLI; + bool performedWalRecovery; + EndOfWalRecoveryInfo *endOfRecoveryInfo; + XLogRecPtr abortedRecPtr; + XLogRecPtr missingContrecPtr; + TransactionId oldestActiveXID; + bool promoted = false; + + /* + * We should have an aux process resource owner to use, and we should not + * be in a transaction that's installed some other resowner. + */ + Assert(AuxProcessResourceOwner != NULL); + Assert(CurrentResourceOwner == NULL || + CurrentResourceOwner == AuxProcessResourceOwner); + CurrentResourceOwner = AuxProcessResourceOwner; + + /* + * Check that contents look valid. 
+ */ + if (!XRecOffIsValid(ControlFile->checkPoint)) + ereport(FATAL, + (errmsg("control file contains invalid checkpoint location"))); + + switch (ControlFile->state) + { + case DB_SHUTDOWNED: + + /* + * This is the expected case, so don't be chatty in standalone + * mode + */ + ereport(IsPostmasterEnvironment ? LOG : NOTICE, + (errmsg("database system was shut down at %s", + str_time(ControlFile->time)))); + break; + + case DB_SHUTDOWNED_IN_RECOVERY: + ereport(LOG, + (errmsg("database system was shut down in recovery at %s", + str_time(ControlFile->time)))); + break; + + case DB_SHUTDOWNING: + ereport(LOG, + (errmsg("database system shutdown was interrupted; last known up at %s", + str_time(ControlFile->time)))); + break; + + case DB_IN_CRASH_RECOVERY: + ereport(LOG, + (errmsg("database system was interrupted while in recovery at %s", + str_time(ControlFile->time)), + errhint("This probably means that some data is corrupted and" + " you will have to use the last backup for recovery."))); + break; + + case DB_IN_ARCHIVE_RECOVERY: + ereport(LOG, + (errmsg("database system was interrupted while in recovery at log time %s", + str_time(ControlFile->checkPointCopy.time)), + errhint("If this has occurred more than once some data might be corrupted" + " and you might need to choose an earlier recovery target."))); + break; + + case DB_IN_PRODUCTION: + ereport(LOG, + (errmsg("database system was interrupted; last known up at %s", + str_time(ControlFile->time)))); + break; + + default: + ereport(FATAL, + (errmsg("control file contains invalid database cluster state"))); + } + + /* This is just to allow attaching to startup process with a debugger */ +#ifdef XLOG_REPLAY_DELAY + if (ControlFile->state != DB_SHUTDOWNED) + pg_usleep(60000000L); +#endif + + /* + * Verify that pg_wal and pg_wal/archive_status exist. In cases where + * someone has performed a copy for PITR, these directories may have been + * excluded and need to be re-created. + */ + ValidateXLOGDirectoryStructure(); + + /* Set up timeout handler needed to report startup progress. */ + if (!IsBootstrapProcessingMode()) + RegisterTimeout(STARTUP_PROGRESS_TIMEOUT, + startup_progress_timeout_handler); + + /*---------- + * If we previously crashed, perform a couple of actions: + * + * - The pg_wal directory may still include some temporary WAL segments + * used when creating a new segment, so perform some clean up to not + * bloat this path. This is done first as there is no point to sync + * this temporary data. + * + * - There might be data which we had written, intending to fsync it, but + * which we had not actually fsync'd yet. Therefore, a power failure in + * the near future might cause earlier unflushed writes to be lost, even + * though more recent data written to disk from here on would be + * persisted. To avoid that, fsync the entire data directory. + */ + if (ControlFile->state != DB_SHUTDOWNED && + ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY) + { + RemoveTempXlogFiles(); + SyncDataDirectory(); + didCrash = true; + } + else + didCrash = false; + + /* + * Prepare for WAL recovery if needed. + * + * InitWalRecovery analyzes the control file and the backup label file, if + * any. It updates the in-memory ControlFile buffer according to the + * starting checkpoint, and sets InRecovery and ArchiveRecoveryRequested. + * It also applies the tablespace map file, if any. 
+ */ + InitWalRecovery(ControlFile, &wasShutdown, + &haveBackupLabel, &haveTblspcMap); + checkPoint = ControlFile->checkPointCopy; + + /* initialize shared memory variables from the checkpoint record */ + ShmemVariableCache->nextXid = checkPoint.nextXid; + ShmemVariableCache->nextOid = checkPoint.nextOid; + ShmemVariableCache->oidCount = 0; + MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); + AdvanceOldestClogXid(checkPoint.oldestXid); + SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); + SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); + SetCommitTsLimit(checkPoint.oldestCommitTsXid, + checkPoint.newestCommitTsXid); + XLogCtl->ckptFullXid = checkPoint.nextXid; + + /* + * Clear out any old relcache cache files. This is *necessary* if we do + * any WAL replay, since that would probably result in the cache files + * being out of sync with database reality. In theory we could leave them + * in place if the database had been cleanly shut down, but it seems + * safest to just remove them always and let them be rebuilt during the + * first backend startup. These files needs to be removed from all + * directories including pg_tblspc, however the symlinks are created only + * after reading tablespace_map file in case of archive recovery from + * backup, so needs to clear old relcache files here after creating + * symlinks. + */ + RelationCacheInitFileRemove(); + + /* + * Initialize replication slots, before there's a chance to remove + * required resources. + */ + StartupReplicationSlots(); + + /* + * Startup logical state, needs to be setup now so we have proper data + * during crash recovery. + */ + StartupReorderBuffer(); + + /* + * Startup CLOG. This must be done after ShmemVariableCache->nextXid has + * been initialized and before we accept connections or begin WAL replay. + */ + StartupCLOG(); + + /* + * Startup MultiXact. We need to do this early to be able to replay + * truncations. + */ + StartupMultiXact(); + + /* + * Ditto for commit timestamps. Activate the facility if the setting is + * enabled in the control file, as there should be no tracking of commit + * timestamps done when the setting was disabled. This facility can be + * started or stopped when replaying a XLOG_PARAMETER_CHANGE record. + */ + if (ControlFile->track_commit_timestamp) + StartupCommitTs(); + + /* + * Recover knowledge about replay progress of known replication partners. + */ + StartupReplicationOrigin(); + + /* + * Initialize unlogged LSN. On a clean shutdown, it's restored from the + * control file. On recovery, all unlogged relations are blown away, so + * the unlogged LSN counter can be reset too. + */ + if (ControlFile->state == DB_SHUTDOWNED) + XLogCtl->unloggedLSN = ControlFile->unloggedLSN; + else + XLogCtl->unloggedLSN = FirstNormalUnloggedLSN; + + /* + * Copy any missing timeline history files between 'now' and the recovery + * target timeline from archive to pg_wal. While we don't need those files + * ourselves - the history file of the recovery target timeline covers all + * the previous timelines in the history too - a cascading standby server + * might be interested in them. Or, if you archive the WAL from this + * server to a different archive than the primary, it'd be good for all + * the history files to get archived there after failover, so that you can + * use one of the old timelines as a PITR target. Timeline history files + * are small, so it's better to copy them unnecessarily than not copy them + * and regret later. 
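+ *
+ * (Timeline history files are small text files named like
+ * 00000002.history, so copying a few of them unnecessarily costs almost
+ * nothing.)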
+ */ + restoreTimeLineHistoryFiles(checkPoint.ThisTimeLineID, recoveryTargetTLI); + + /* + * Before running in recovery, scan pg_twophase and fill in its status to + * be able to work on entries generated by redo. Doing a scan before + * taking any recovery action has the merit to discard any 2PC files that + * are newer than the first record to replay, saving from any conflicts at + * replay. This avoids as well any subsequent scans when doing recovery + * of the on-disk two-phase data. + */ + restoreTwoPhaseData(); + + /* + * When starting with crash recovery, reset pgstat data - it might not be + * valid. Otherwise restore pgstat data. It's safe to do this here, + * because postmaster will not yet have started any other processes. + * + * NB: Restoring replication slot stats relies on slot state to have + * already been restored from disk. + * + * TODO: With a bit of extra work we could just start with a pgstat file + * associated with the checkpoint redo location we're starting from. + */ + if (didCrash) + pgstat_discard_stats(); + else + pgstat_restore_stats(); + + lastFullPageWrites = checkPoint.fullPageWrites; + + RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; + doPageWrites = lastFullPageWrites; + + /* REDO */ + if (InRecovery) + { + /* Initialize state for RecoveryInProgress() */ + SpinLockAcquire(&XLogCtl->info_lck); + if (InArchiveRecovery) + XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE; + else + XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * Update pg_control to show that we are recovering and to show the + * selected checkpoint as the place we are starting from. We also mark + * pg_control with any minimum recovery stop point obtained from a + * backup history file. + * + * No need to hold ControlFileLock yet, we aren't up far enough. + */ + UpdateControlFile(); + + /* + * If there was a backup label file, it's done its job and the info + * has now been propagated into pg_control. We must get rid of the + * label file so that if we crash during recovery, we'll pick up at + * the latest recovery restartpoint instead of going all the way back + * to the backup start point. It seems prudent though to just rename + * the file out of the way rather than delete it completely. + */ + if (haveBackupLabel) + { + unlink(BACKUP_LABEL_OLD); + durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL); + } + + /* + * If there was a tablespace_map file, it's done its job and the + * symlinks have been created. We must get rid of the map file so + * that if we crash during recovery, we don't create symlinks again. + * It seems prudent though to just rename the file out of the way + * rather than delete it completely. + */ + if (haveTblspcMap) + { + unlink(TABLESPACE_MAP_OLD); + durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL); + } + + /* + * Initialize our local copy of minRecoveryPoint. When doing crash + * recovery we want to replay up to the end of WAL. Particularly, in + * the case of a promoted standby minRecoveryPoint value in the + * control file is only updated after the first checkpoint. However, + * if the instance crashes before the first post-recovery checkpoint + * is completed then recovery will use a stale location causing the + * startup process to think that there are still invalid page + * references when checking for data consistency. 
+ */ + if (InArchiveRecovery) + { + LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; + LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + } + else + { + LocalMinRecoveryPoint = InvalidXLogRecPtr; + LocalMinRecoveryPointTLI = 0; + } + + /* Check that the GUCs used to generate the WAL allow recovery */ + CheckRequiredParameterValues(); + + /* + * We're in recovery, so unlogged relations may be trashed and must be + * reset. This should be done BEFORE allowing Hot Standby + * connections, so that read-only backends don't try to read whatever + * garbage is left over from before. + */ + ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP); + + /* + * Likewise, delete any saved transaction snapshot files that got left + * behind by crashed backends. + */ + DeleteAllExportedSnapshotFiles(); + + /* + * Initialize for Hot Standby, if enabled. We won't let backends in + * yet, not until we've reached the min recovery point specified in + * control file and we've established a recovery snapshot from a + * running-xacts WAL record. + */ + if (ArchiveRecoveryRequested && EnableHotStandby) + { + TransactionId *xids; + int nxids; + + ereport(DEBUG1, + (errmsg_internal("initializing for hot standby"))); + + InitRecoveryTransactionEnvironment(); + + if (wasShutdown) + oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids); + else + oldestActiveXID = checkPoint.oldestActiveXid; + Assert(TransactionIdIsValid(oldestActiveXID)); + + /* Tell procarray about the range of xids it has to deal with */ + ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextXid)); + + /* + * Startup subtrans only. CLOG, MultiXact and commit timestamp + * have already been started up and other SLRUs are not maintained + * during recovery and need not be started yet. + */ + StartupSUBTRANS(oldestActiveXID); + + /* + * If we're beginning at a shutdown checkpoint, we know that + * nothing was running on the primary at this point. So fake-up an + * empty running-xacts record and use that here and now. Recover + * additional standby state for prepared transactions. + */ + if (wasShutdown) + { + RunningTransactionsData running; + TransactionId latestCompletedXid; + + /* + * Construct a RunningTransactions snapshot representing a + * shut down server, with only prepared transactions still + * alive. We're never overflowed at this point because all + * subxids are listed with their parent prepared transactions. + */ + running.xcnt = nxids; + running.subxcnt = 0; + running.subxid_overflow = false; + running.nextXid = XidFromFullTransactionId(checkPoint.nextXid); + running.oldestRunningXid = oldestActiveXID; + latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid); + TransactionIdRetreat(latestCompletedXid); + Assert(TransactionIdIsNormal(latestCompletedXid)); + running.latestCompletedXid = latestCompletedXid; + running.xids = xids; + + ProcArrayApplyRecoveryInfo(&running); + + StandbyRecoverPreparedTransactions(); + } + } + + /* + * We're all set for replaying the WAL now. Do it. + */ + PerformWalRecovery(); + performedWalRecovery = true; + } + else + performedWalRecovery = false; + + /* + * Finish WAL recovery. + */ + endOfRecoveryInfo = FinishWalRecovery(); + EndOfLog = endOfRecoveryInfo->endOfLog; + EndOfLogTLI = endOfRecoveryInfo->endOfLogTLI; + abortedRecPtr = endOfRecoveryInfo->abortedRecPtr; + missingContrecPtr = endOfRecoveryInfo->missingContrecPtr; + + /* + * Reset ps status display, so as no information related to recovery + * shows up. 
+ */ + set_ps_display(""); + + /* + * When recovering from a backup (we are in recovery, and archive recovery + * was requested), complain if we did not roll forward far enough to reach + * the point where the database is consistent. For regular online + * backup-from-primary, that means reaching the end-of-backup WAL record + * (at which point we reset backupStartPoint to be Invalid), for + * backup-from-replica (which can't inject records into the WAL stream), + * that point is when we reach the minRecoveryPoint in pg_control (which + * we purposefully copy last when backing up from a replica). For + * pg_rewind (which creates a backup_label with a method of "pg_rewind") + * or snapshot-style backups (which don't), backupEndRequired will be set + * to false. + * + * Note: it is indeed okay to look at the local variable + * LocalMinRecoveryPoint here, even though ControlFile->minRecoveryPoint + * might be further ahead --- ControlFile->minRecoveryPoint cannot have + * been advanced beyond the WAL we processed. + */ + if (InRecovery && + (EndOfLog < LocalMinRecoveryPoint || + !XLogRecPtrIsInvalid(ControlFile->backupStartPoint))) + { + /* + * Ran off end of WAL before reaching end-of-backup WAL record, or + * minRecoveryPoint. That's a bad sign, indicating that you tried to + * recover from an online backup but never called pg_backup_stop(), or + * you didn't archive all the WAL needed. + */ + if (ArchiveRecoveryRequested || ControlFile->backupEndRequired) + { + if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) || ControlFile->backupEndRequired) + ereport(FATAL, + (errmsg("WAL ends before end of online backup"), + errhint("All WAL generated while online backup was taken must be available at recovery."))); + else + ereport(FATAL, + (errmsg("WAL ends before consistent recovery point"))); + } + } + + /* + * Reset unlogged relations to the contents of their INIT fork. This is + * done AFTER recovery is complete so as to include any unlogged relations + * created during recovery, but BEFORE recovery is marked as having + * completed successfully. Otherwise we'd not retry if any of the post + * end-of-recovery steps fail. + */ + if (InRecovery) + ResetUnloggedRelations(UNLOGGED_RELATION_INIT); + + /* + * Pre-scan prepared transactions to find out the range of XIDs present. + * This information is not quite needed yet, but it is positioned here so + * as potential problems are detected before any on-disk change is done. + */ + oldestActiveXID = PrescanPreparedTransactions(NULL, NULL); + + /* + * Allow ordinary WAL segment creation before possibly switching to a new + * timeline, which creates a new segment, and after the last ReadRecord(). + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + XLogCtl->InstallXLogFileSegmentActive = true; + LWLockRelease(ControlFileLock); + + /* + * Consider whether we need to assign a new timeline ID. + * + * If we did archive recovery, we always assign a new ID. This handles a + * couple of issues. If we stopped short of the end of WAL during + * recovery, then we are clearly generating a new timeline and must assign + * it a unique new ID. Even if we ran to the end, modifying the current + * last segment is problematic because it may result in trying to + * overwrite an already-archived copy of that segment, and we encourage + * DBAs to make their archive_commands reject that. We can dodge the + * problem by making the new active segment have a new timeline ID. + * + * In a normal crash recovery, we can just extend the timeline we were in. 
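+ *
+ * (For illustration: if recovery ran on timeline 3 but the archive
+ * already contains 00000004.history, findNewestTimeLine() below returns
+ * 4 and the new timeline becomes 5, an ID that no reachable history
+ * file has used before.)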
+ */ + newTLI = endOfRecoveryInfo->lastRecTLI; + if (ArchiveRecoveryRequested) + { + newTLI = findNewestTimeLine(recoveryTargetTLI) + 1; + ereport(LOG, + (errmsg("selected new timeline ID: %u", newTLI))); + + /* + * Make a writable copy of the last WAL segment. (Note that we also + * have a copy of the last block of the old WAL in + * endOfRecovery->lastPage; we will use that below.) + */ + XLogInitNewTimeline(EndOfLogTLI, EndOfLog, newTLI); + + /* + * Remove the signal files out of the way, so that we don't + * accidentally re-enter archive recovery mode in a subsequent crash. + */ + if (endOfRecoveryInfo->standby_signal_file_found) + durable_unlink(STANDBY_SIGNAL_FILE, FATAL); + + if (endOfRecoveryInfo->recovery_signal_file_found) + durable_unlink(RECOVERY_SIGNAL_FILE, FATAL); + + /* + * Write the timeline history file, and have it archived. After this + * point (or rather, as soon as the file is archived), the timeline + * will appear as "taken" in the WAL archive and to any standby + * servers. If we crash before actually switching to the new + * timeline, standby servers will nevertheless think that we switched + * to the new timeline, and will try to connect to the new timeline. + * To minimize the window for that, try to do as little as possible + * between here and writing the end-of-recovery record. + */ + writeTimeLineHistory(newTLI, recoveryTargetTLI, + EndOfLog, endOfRecoveryInfo->recoveryStopReason); + + ereport(LOG, + (errmsg("archive recovery complete"))); + } + + /* Save the selected TimeLineID in shared memory, too */ + XLogCtl->InsertTimeLineID = newTLI; + XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI; + + /* + * Actually, if WAL ended in an incomplete record, skip the parts that + * made it through and start writing after the portion that persisted. + * (It's critical to first write an OVERWRITE_CONTRECORD message, which + * we'll do as soon as we're open for writing new WAL.) + */ + if (!XLogRecPtrIsInvalid(missingContrecPtr)) + { + /* + * We should only have a missingContrecPtr if we're not switching to + * a new timeline. When a timeline switch occurs, WAL is copied from + * the old timeline to the new only up to the end of the last complete + * record, so there can't be an incomplete WAL record that we need to + * disregard. + */ + Assert(newTLI == endOfRecoveryInfo->lastRecTLI); + Assert(!XLogRecPtrIsInvalid(abortedRecPtr)); + EndOfLog = missingContrecPtr; + } + + /* + * Prepare to write WAL starting at EndOfLog location, and init xlog + * buffer cache using the block containing the last record from the + * previous incarnation. + */ + Insert = &XLogCtl->Insert; + Insert->PrevBytePos = XLogRecPtrToBytePos(endOfRecoveryInfo->lastRec); + Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog); + + /* + * Tricky point here: lastPage contains the *last* block that the LastRec + * record spans, not the one it starts in. The last block is indeed the + * one we want to use. 
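+ *
+ * (For illustration, with 8 kB WAL blocks: if EndOfLog is 0/3000120,
+ * lastPageBeginPtr is 0/3000000, so the first 0x120 bytes of that block
+ * are copied into the shared WAL buffer below and the remaining
+ * 0x2000 - 0x120 bytes are zeroed.)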
+ */ + if (EndOfLog % XLOG_BLCKSZ != 0) + { + char *page; + int len; + int firstIdx; + + firstIdx = XLogRecPtrToBufIdx(EndOfLog); + len = EndOfLog - endOfRecoveryInfo->lastPageBeginPtr; + Assert(len < XLOG_BLCKSZ); + + /* Copy the valid part of the last block, and zero the rest */ + page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ]; + memcpy(page, endOfRecoveryInfo->lastPage, len); + memset(page + len, 0, XLOG_BLCKSZ - len); + + XLogCtl->xlblocks[firstIdx] = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ; + XLogCtl->InitializedUpTo = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ; + } + else + { + /* + * There is no partial block to copy. Just set InitializedUpTo, and + * let the first attempt to insert a log record to initialize the next + * buffer. + */ + XLogCtl->InitializedUpTo = EndOfLog; + } + + LogwrtResult.Write = LogwrtResult.Flush = EndOfLog; + + XLogCtl->LogwrtResult = LogwrtResult; + + XLogCtl->LogwrtRqst.Write = EndOfLog; + XLogCtl->LogwrtRqst.Flush = EndOfLog; + + /* + * Preallocate additional log files, if wanted. + */ + PreallocXlogFiles(EndOfLog, newTLI); + + /* + * Okay, we're officially UP. + */ + InRecovery = false; + + /* start the archive_timeout timer and LSN running */ + XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL); + XLogCtl->lastSegSwitchLSN = EndOfLog; + + /* also initialize latestCompletedXid, to nextXid - 1 */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid; + FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid); + LWLockRelease(ProcArrayLock); + + /* + * Start up subtrans, if not already done for hot standby. (commit + * timestamps are started below, if necessary.) + */ + if (standbyState == STANDBY_DISABLED) + StartupSUBTRANS(oldestActiveXID); + + /* + * Perform end of recovery actions for any SLRUs that need it. + */ + TrimCLOG(); + TrimMultiXact(); + + /* + * Reload shared-memory state for prepared transactions. This needs to + * happen before renaming the last partial segment of the old timeline as + * it may be possible that we have to recovery some transactions from it. + */ + RecoverPreparedTransactions(); + + /* Shut down xlogreader */ + ShutdownWalRecovery(); + + /* Enable WAL writes for this backend only. */ + LocalSetXLogInsertAllowed(); + + /* If necessary, write overwrite-contrecord before doing anything else */ + if (!XLogRecPtrIsInvalid(abortedRecPtr)) + { + Assert(!XLogRecPtrIsInvalid(missingContrecPtr)); + CreateOverwriteContrecordRecord(abortedRecPtr, missingContrecPtr, newTLI); + } + + /* + * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE + * record before resource manager writes cleanup WAL records or checkpoint + * record is written. + */ + Insert->fullPageWrites = lastFullPageWrites; + UpdateFullPageWrites(); + + /* + * Emit checkpoint or end-of-recovery record in XLOG, if required. + */ + if (performedWalRecovery) + promoted = PerformRecoveryXLogAction(); + + /* + * If any of the critical GUCs have changed, log them before we allow + * backends to write WAL. + */ + XLogReportParameters(); + + /* If this is archive recovery, perform post-recovery cleanup actions. */ + if (ArchiveRecoveryRequested) + CleanupAfterArchiveRecovery(EndOfLogTLI, EndOfLog, newTLI); + + /* + * Local WAL inserts enabled, so it's time to finish initialization of + * commit timestamp. + */ + CompleteCommitTsInitialization(); + + /* + * All done with end-of-recovery actions. 
+ * + * Now allow backends to write WAL and update the control file status in + * consequence. SharedRecoveryState, that controls if backends can write + * WAL, is updated while holding ControlFileLock to prevent other backends + * to look at an inconsistent state of the control file in shared memory. + * There is still a small window during which backends can write WAL and + * the control file is still referring to a system not in DB_IN_PRODUCTION + * state while looking at the on-disk control file. + * + * Also, we use info_lck to update SharedRecoveryState to ensure that + * there are no race conditions concerning visibility of other recent + * updates to shared memory. + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->state = DB_IN_PRODUCTION; + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE; + SpinLockRelease(&XLogCtl->info_lck); + + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + /* + * Shutdown the recovery environment. This must occur after + * RecoverPreparedTransactions() (see notes in lock_twophase_recover()) + * and after switching SharedRecoveryState to RECOVERY_STATE_DONE so as + * any session building a snapshot will not rely on KnownAssignedXids as + * RecoveryInProgress() would return false at this stage. This is + * particularly critical for prepared 2PC transactions, that would still + * need to be included in snapshots once recovery has ended. + */ + if (standbyState != STANDBY_DISABLED) + ShutdownRecoveryTransactionEnvironment(); + + /* + * If there were cascading standby servers connected to us, nudge any wal + * sender processes to notice that we've been promoted. + */ + WalSndWakeup(); + + /* + * If this was a promotion, request an (online) checkpoint now. This isn't + * required for consistency, but the last restartpoint might be far back, + * and in case of a crash, recovering from it might take a longer than is + * appropriate now that we're not in standby mode anymore. + */ + if (promoted) + RequestCheckpoint(CHECKPOINT_FORCE); +} + +/* + * Callback from PerformWalRecovery(), called when we switch from crash + * recovery to archive recovery mode. Updates the control file accordingly. + */ +void +SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI) +{ + /* initialize minRecoveryPoint to this record */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->state = DB_IN_ARCHIVE_RECOVERY; + if (ControlFile->minRecoveryPoint < EndRecPtr) + { + ControlFile->minRecoveryPoint = EndRecPtr; + ControlFile->minRecoveryPointTLI = replayTLI; + } + /* update local copy */ + LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; + LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + + /* + * The startup process can update its local copy of minRecoveryPoint from + * this point. + */ + updateMinRecoveryPoint = true; + + UpdateControlFile(); + + /* + * We update SharedRecoveryState while holding the lock on ControlFileLock + * so both states are consistent in shared memory. + */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE; + SpinLockRelease(&XLogCtl->info_lck); + + LWLockRelease(ControlFileLock); +} + +/* + * Callback from PerformWalRecovery(), called when we reach the end of backup. + * Updates the control file accordingly. + */ +void +ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli) +{ + /* + * We have reached the end of base backup, as indicated by pg_control. 
The + * data on disk is now consistent (unless minRecovery point is further + * ahead, which can happen if we crashed during previous recovery). Reset + * backupStartPoint and backupEndPoint, and update minRecoveryPoint to + * make sure we don't allow starting up at an earlier point even if + * recovery is stopped and restarted soon after this. + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + if (ControlFile->minRecoveryPoint < EndRecPtr) + { + ControlFile->minRecoveryPoint = EndRecPtr; + ControlFile->minRecoveryPointTLI = tli; + } + + ControlFile->backupStartPoint = InvalidXLogRecPtr; + ControlFile->backupEndPoint = InvalidXLogRecPtr; + ControlFile->backupEndRequired = false; + UpdateControlFile(); + + LWLockRelease(ControlFileLock); +} + +/* + * Perform whatever XLOG actions are necessary at end of REDO. + * + * The goal here is to make sure that we'll be able to recover properly if + * we crash again. If we choose to write a checkpoint, we'll write a shutdown + * checkpoint rather than an on-line one. This is not particularly critical, + * but since we may be assigning a new TLI, using a shutdown checkpoint allows + * us to have the rule that TLI only changes in shutdown checkpoints, which + * allows some extra error checking in xlog_redo. + */ +static bool +PerformRecoveryXLogAction(void) +{ + bool promoted = false; + + /* + * Perform a checkpoint to update all our recovery activity to disk. + * + * Note that we write a shutdown checkpoint rather than an on-line one. + * This is not particularly critical, but since we may be assigning a new + * TLI, using a shutdown checkpoint allows us to have the rule that TLI + * only changes in shutdown checkpoints, which allows some extra error + * checking in xlog_redo. + * + * In promotion, only create a lightweight end-of-recovery record instead + * of a full checkpoint. A checkpoint is requested later, after we're + * fully out of recovery mode and already accepting queries. + */ + if (ArchiveRecoveryRequested && IsUnderPostmaster && + PromoteIsTriggered()) + { + promoted = true; + + /* + * Insert a special WAL record to mark the end of recovery, since we + * aren't doing a checkpoint. That means that the checkpointer process + * may likely be in the middle of a time-smoothed restartpoint and + * could continue to be for minutes after this. That sounds strange, + * but the effect is roughly the same and it would be stranger to try + * to come out of the restartpoint and then checkpoint. We request a + * checkpoint later anyway, just for safety. + */ + CreateEndOfRecoveryRecord(); + } + else + { + RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY | + CHECKPOINT_IMMEDIATE | + CHECKPOINT_WAIT); + } + + return promoted; +} + +/* + * Is the system still in recovery? + * + * Unlike testing InRecovery, this works in any process that's connected to + * shared memory. + */ +bool +RecoveryInProgress(void) +{ + /* + * We check shared state each time only until we leave recovery mode. We + * can't re-enter recovery, so there's no need to keep checking after the + * shared variable has once been seen false. + */ + if (!LocalRecoveryInProgress) + return false; + else + { + /* + * use volatile pointer to make sure we make a fresh read of the + * shared variable. + */ + volatile XLogCtlData *xlogctl = XLogCtl; + + LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE); + + /* + * Note: We don't need a memory barrier when we're still in recovery. 
+ * We might exit recovery immediately after return, so the caller + * can't rely on 'true' meaning that we're still in recovery anyway. + */ + + return LocalRecoveryInProgress; + } +} + +/* + * Returns current recovery state from shared memory. + * + * This returned state is kept consistent with the contents of the control + * file. See details about the possible values of RecoveryState in xlog.h. + */ +RecoveryState +GetRecoveryState(void) +{ + RecoveryState retval; + + SpinLockAcquire(&XLogCtl->info_lck); + retval = XLogCtl->SharedRecoveryState; + SpinLockRelease(&XLogCtl->info_lck); + + return retval; +} + +/* + * Is this process allowed to insert new WAL records? + * + * Ordinarily this is essentially equivalent to !RecoveryInProgress(). + * But we also have provisions for forcing the result "true" or "false" + * within specific processes regardless of the global state. + */ +bool +XLogInsertAllowed(void) +{ + /* + * If value is "unconditionally true" or "unconditionally false", just + * return it. This provides the normal fast path once recovery is known + * done. + */ + if (LocalXLogInsertAllowed >= 0) + return (bool) LocalXLogInsertAllowed; + + /* + * Else, must check to see if we're still in recovery. + */ + if (RecoveryInProgress()) + return false; + + /* + * On exit from recovery, reset to "unconditionally true", since there is + * no need to keep checking. + */ + LocalXLogInsertAllowed = 1; + return true; +} + +/* + * Make XLogInsertAllowed() return true in the current process only. + * + * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later, + * and even call LocalSetXLogInsertAllowed() again after that. + * + * Returns the previous value of LocalXLogInsertAllowed. + */ +static int +LocalSetXLogInsertAllowed(void) +{ + int oldXLogAllowed = LocalXLogInsertAllowed; + + LocalXLogInsertAllowed = 1; + + return oldXLogAllowed; +} + +/* + * Return the current Redo pointer from shared memory. + * + * As a side-effect, the local RedoRecPtr copy is updated. + */ +XLogRecPtr +GetRedoRecPtr(void) +{ + XLogRecPtr ptr; + + /* + * The possibly not up-to-date copy in XlogCtl is enough. Even if we + * grabbed a WAL insertion lock to read the authoritative value in + * Insert->RedoRecPtr, someone might update it just after we've released + * the lock. + */ + SpinLockAcquire(&XLogCtl->info_lck); + ptr = XLogCtl->RedoRecPtr; + SpinLockRelease(&XLogCtl->info_lck); + + if (RedoRecPtr < ptr) + RedoRecPtr = ptr; + + return RedoRecPtr; +} + +/* + * Return information needed to decide whether a modified block needs a + * full-page image to be included in the WAL record. + * + * The returned values are cached copies from backend-private memory, and + * possibly out-of-date or, indeed, uninitialized, in which case they will + * be InvalidXLogRecPtr and false, respectively. XLogInsertRecord will + * re-check them against up-to-date values, while holding the WAL insert lock. + */ +void +GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p) +{ + *RedoRecPtr_p = RedoRecPtr; + *doPageWrites_p = doPageWrites; +} + +/* + * GetInsertRecPtr -- Returns the current insert position. + * + * NOTE: The value *actually* returned is the position of the last full + * xlog page. It lags behind the real insert position by at most 1 page. + * For that, we don't need to scan through WAL insertion locks, and an + * approximation is enough for the current usage of this function. 
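+ *
+ * A minimal sketch of that kind of use (hypothetical caller and variable
+ * names):
+ *
+ *		XLogRecPtr	insertpos = GetInsertRecPtr();
+ *
+ *		if (insertpos - prev_redo_ptr >= checkpoint_trigger_distance)
+ *			... ask for a checkpoint ...
+ *
+ * At that granularity, lagging the true insert position by up to one page
+ * does not matter.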
+ */ +XLogRecPtr +GetInsertRecPtr(void) +{ + XLogRecPtr recptr; + + SpinLockAcquire(&XLogCtl->info_lck); + recptr = XLogCtl->LogwrtRqst.Write; + SpinLockRelease(&XLogCtl->info_lck); + + return recptr; +} + +/* + * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL + * position known to be fsync'd to disk. This should only be used on a + * system that is known not to be in recovery. + */ +XLogRecPtr +GetFlushRecPtr(TimeLineID *insertTLI) +{ + Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE); + + SpinLockAcquire(&XLogCtl->info_lck); + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * If we're writing and flushing WAL, the time line can't be changing, so + * no lock is required. + */ + if (insertTLI) + *insertTLI = XLogCtl->InsertTimeLineID; + + return LogwrtResult.Flush; +} + +/* + * GetWALInsertionTimeLine -- Returns the current timeline of a system that + * is not in recovery. + */ +TimeLineID +GetWALInsertionTimeLine(void) +{ + Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE); + + /* Since the value can't be changing, no lock is required. */ + return XLogCtl->InsertTimeLineID; +} + +/* + * GetLastImportantRecPtr -- Returns the LSN of the last important record + * inserted. All records not explicitly marked as unimportant are considered + * important. + * + * The LSN is determined by computing the maximum of + * WALInsertLocks[i].lastImportantAt. + */ +XLogRecPtr +GetLastImportantRecPtr(void) +{ + XLogRecPtr res = InvalidXLogRecPtr; + int i; + + for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) + { + XLogRecPtr last_important; + + /* + * Need to take a lock to prevent torn reads of the LSN, which are + * possible on some of the supported platforms. WAL insert locks only + * support exclusive mode, so we have to use that. + */ + LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE); + last_important = WALInsertLocks[i].l.lastImportantAt; + LWLockRelease(&WALInsertLocks[i].l.lock); + + if (res < last_important) + res = last_important; + } + + return res; +} + +/* + * Get the time and LSN of the last xlog segment switch + */ +pg_time_t +GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN) +{ + pg_time_t result; + + /* Need WALWriteLock, but shared lock is sufficient */ + LWLockAcquire(WALWriteLock, LW_SHARED); + result = XLogCtl->lastSegSwitchTime; + *lastSwitchLSN = XLogCtl->lastSegSwitchLSN; + LWLockRelease(WALWriteLock); + + return result; +} + +/* + * This must be called ONCE during postmaster or standalone-backend shutdown + */ +void +ShutdownXLOG(int code, Datum arg) +{ + /* + * We should have an aux process resource owner to use, and we should not + * be in a transaction that's installed some other resowner. + */ + Assert(AuxProcessResourceOwner != NULL); + Assert(CurrentResourceOwner == NULL || + CurrentResourceOwner == AuxProcessResourceOwner); + CurrentResourceOwner = AuxProcessResourceOwner; + + /* Don't be chatty in standalone mode */ + ereport(IsPostmasterEnvironment ? LOG : NOTICE, + (errmsg("shutting down"))); + + /* + * Signal walsenders to move to stopping state. + */ + WalSndInitStopping(); + + /* + * Wait for WAL senders to be in stopping state. This prevents commands + * from writing new WAL. 
+ */ + WalSndWaitStopping(); + + if (RecoveryInProgress()) + CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + else + { + /* + * If archiving is enabled, rotate the last XLOG file so that all the + * remaining records are archived (postmaster wakes up the archiver + * process one more time at the end of shutdown). The checkpoint + * record will go to the next XLOG file and won't be archived (yet). + */ + if (XLogArchivingActive()) + RequestXLogSwitch(false); + + CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + } +} + +/* + * Log start of a checkpoint. + */ +static void +LogCheckpointStart(int flags, bool restartpoint) +{ + if (restartpoint) + ereport(LOG, + /* translator: the placeholders show checkpoint options */ + (errmsg("restartpoint starting:%s%s%s%s%s%s%s%s", + (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", + (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "", + (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", + (flags & CHECKPOINT_FORCE) ? " force" : "", + (flags & CHECKPOINT_WAIT) ? " wait" : "", + (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "", + (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "", + (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : ""))); + else + ereport(LOG, + /* translator: the placeholders show checkpoint options */ + (errmsg("checkpoint starting:%s%s%s%s%s%s%s%s", + (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", + (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "", + (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", + (flags & CHECKPOINT_FORCE) ? " force" : "", + (flags & CHECKPOINT_WAIT) ? " wait" : "", + (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "", + (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "", + (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : ""))); +} + +/* + * Log end of a checkpoint. + */ +static void +LogCheckpointEnd(bool restartpoint) +{ + long write_msecs, + sync_msecs, + total_msecs, + longest_msecs, + average_msecs; + uint64 average_sync_time; + + CheckpointStats.ckpt_end_t = GetCurrentTimestamp(); + + write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t, + CheckpointStats.ckpt_sync_t); + + sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t, + CheckpointStats.ckpt_sync_end_t); + + /* Accumulate checkpoint timing summary data, in milliseconds. */ + PendingCheckpointerStats.checkpoint_write_time += write_msecs; + PendingCheckpointerStats.checkpoint_sync_time += sync_msecs; + + /* + * All of the published timing statistics are accounted for. Only + * continue if a log message is to be written. + */ + if (!log_checkpoints) + return; + + total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t, + CheckpointStats.ckpt_end_t); + + /* + * Timing values returned from CheckpointStats are in microseconds. + * Convert to milliseconds for consistent printing. 
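+ *
+ * The "+ 999" below rounds up to the next whole millisecond; for example,
+ * a longest sync of 1500 us becomes (1500 + 999) / 1000 = 2 ms, and any
+ * nonzero sub-millisecond value is reported as 1 ms rather than 0.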
+ */ + longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000); + + average_sync_time = 0; + if (CheckpointStats.ckpt_sync_rels > 0) + average_sync_time = CheckpointStats.ckpt_agg_sync_time / + CheckpointStats.ckpt_sync_rels; + average_msecs = (long) ((average_sync_time + 999) / 1000); + + if (restartpoint) + ereport(LOG, + (errmsg("restartpoint complete: wrote %d buffers (%.1f%%); " + "%d WAL file(s) added, %d removed, %d recycled; " + "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; " + "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; " + "distance=%d kB, estimate=%d kB", + CheckpointStats.ckpt_bufs_written, + (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, + CheckpointStats.ckpt_segs_added, + CheckpointStats.ckpt_segs_removed, + CheckpointStats.ckpt_segs_recycled, + write_msecs / 1000, (int) (write_msecs % 1000), + sync_msecs / 1000, (int) (sync_msecs % 1000), + total_msecs / 1000, (int) (total_msecs % 1000), + CheckpointStats.ckpt_sync_rels, + longest_msecs / 1000, (int) (longest_msecs % 1000), + average_msecs / 1000, (int) (average_msecs % 1000), + (int) (PrevCheckPointDistance / 1024.0), + (int) (CheckPointDistanceEstimate / 1024.0)))); + else + ereport(LOG, + (errmsg("checkpoint complete: wrote %d buffers (%.1f%%); " + "%d WAL file(s) added, %d removed, %d recycled; " + "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; " + "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; " + "distance=%d kB, estimate=%d kB", + CheckpointStats.ckpt_bufs_written, + (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, + CheckpointStats.ckpt_segs_added, + CheckpointStats.ckpt_segs_removed, + CheckpointStats.ckpt_segs_recycled, + write_msecs / 1000, (int) (write_msecs % 1000), + sync_msecs / 1000, (int) (sync_msecs % 1000), + total_msecs / 1000, (int) (total_msecs % 1000), + CheckpointStats.ckpt_sync_rels, + longest_msecs / 1000, (int) (longest_msecs % 1000), + average_msecs / 1000, (int) (average_msecs % 1000), + (int) (PrevCheckPointDistance / 1024.0), + (int) (CheckPointDistanceEstimate / 1024.0)))); +} + +/* + * Update the estimate of distance between checkpoints. + * + * The estimate is used to calculate the number of WAL segments to keep + * preallocated, see XLOGfileslop(). + */ +static void +UpdateCheckPointDistanceEstimate(uint64 nbytes) +{ + /* + * To estimate the number of segments consumed between checkpoints, keep a + * moving average of the amount of WAL generated in previous checkpoint + * cycles. However, if the load is bursty, with quiet periods and busy + * periods, we want to cater for the peak load. So instead of a plain + * moving average, let the average decline slowly if the previous cycle + * used less WAL than estimated, but bump it up immediately if it used + * more. + * + * When checkpoints are triggered by max_wal_size, this should converge to + * CheckpointSegments * wal_segment_size, + * + * Note: This doesn't pay any attention to what caused the checkpoint. + * Checkpoints triggered manually with CHECKPOINT command, or by e.g. + * starting a base backup, are counted the same as those created + * automatically. The slow-decline will largely mask them out, if they are + * not frequent. If they are frequent, it seems reasonable to count them + * in as any others; if you issue a manual checkpoint every 5 minutes and + * never let a timed checkpoint happen, it makes sense to base the + * preallocation on that 5 minute interval rather than whatever + * checkpoint_timeout is set to. 
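+ *
+ * As a worked example with made-up numbers: if the estimate currently
+ * stands at 100 MB and a cycle generates 160 MB of WAL, the estimate jumps
+ * straight to 160 MB; if the next cycle generates only 100 MB again, the
+ * estimate declines gently to 0.90 * 160 + 0.10 * 100 = 154 MB.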
+ */ + PrevCheckPointDistance = nbytes; + if (CheckPointDistanceEstimate < nbytes) + CheckPointDistanceEstimate = nbytes; + else + CheckPointDistanceEstimate = + (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes); +} + +/* + * Update the ps display for a process running a checkpoint. Note that + * this routine should not do any allocations so as it can be called + * from a critical section. + */ +static void +update_checkpoint_display(int flags, bool restartpoint, bool reset) +{ + /* + * The status is reported only for end-of-recovery and shutdown + * checkpoints or shutdown restartpoints. Updating the ps display is + * useful in those situations as it may not be possible to rely on + * pg_stat_activity to see the status of the checkpointer or the startup + * process. + */ + if ((flags & (CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IS_SHUTDOWN)) == 0) + return; + + if (reset) + set_ps_display(""); + else + { + char activitymsg[128]; + + snprintf(activitymsg, sizeof(activitymsg), "performing %s%s%s", + (flags & CHECKPOINT_END_OF_RECOVERY) ? "end-of-recovery " : "", + (flags & CHECKPOINT_IS_SHUTDOWN) ? "shutdown " : "", + restartpoint ? "restartpoint" : "checkpoint"); + set_ps_display(activitymsg); + } +} + + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + * + * flags is a bitwise OR of the following: + * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. + * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. + * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, + * ignoring checkpoint_completion_target parameter. + * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred + * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or + * CHECKPOINT_END_OF_RECOVERY). + * CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables. + * + * Note: flags contains other bits, of interest here only for logging purposes. + * In particular note that this routine is synchronous and does not pay + * attention to CHECKPOINT_WAIT. + * + * If !shutdown then we are writing an online checkpoint. This is a very special + * kind of operation and WAL record because the checkpoint action occurs over + * a period of time yet logically occurs at just a single LSN. The logical + * position of the WAL record (redo ptr) is the same or earlier than the + * physical position. When we replay WAL we locate the checkpoint via its + * physical position then read the redo ptr and actually start replay at the + * earlier logical position. Note that we don't write *anything* to WAL at + * the logical position, so that location could be any other kind of WAL record. + * All of this mechanism allows us to continue working while we checkpoint. + * As a result, timing of actions is critical here and be careful to note that + * this function will likely take minutes to execute on a busy system. + */ +void +CreateCheckPoint(int flags) +{ + bool shutdown; + CheckPoint checkPoint; + XLogRecPtr recptr; + XLogSegNo _logSegNo; + XLogCtlInsert *Insert = &XLogCtl->Insert; + uint32 freespace; + XLogRecPtr PriorRedoPtr; + XLogRecPtr curInsert; + XLogRecPtr last_important_lsn; + VirtualTransactionId *vxids; + int nvxids; + int oldXLogAllowed = 0; + + /* + * An end-of-recovery checkpoint is really a shutdown checkpoint, just + * issued at a different time. 
+ */ + if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY)) + shutdown = true; + else + shutdown = false; + + /* sanity check */ + if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0) + elog(ERROR, "can't create a checkpoint during recovery"); + + /* + * Prepare to accumulate statistics. + * + * Note: because it is possible for log_checkpoints to change while a + * checkpoint proceeds, we always accumulate stats, even if + * log_checkpoints is currently off. + */ + MemSet(&CheckpointStats, 0, sizeof(CheckpointStats)); + CheckpointStats.ckpt_start_t = GetCurrentTimestamp(); + + /* + * Let smgr prepare for checkpoint; this has to happen outside the + * critical section and before we determine the REDO pointer. Note that + * smgr must not do anything that'd have to be undone if we decide no + * checkpoint is needed. + */ + SyncPreCheckpoint(); + + /* + * Use a critical section to force system panic if we have trouble. + */ + START_CRIT_SECTION(); + + if (shutdown) + { + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->state = DB_SHUTDOWNING; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + } + + /* Begin filling in the checkpoint WAL record */ + MemSet(&checkPoint, 0, sizeof(checkPoint)); + checkPoint.time = (pg_time_t) time(NULL); + + /* + * For Hot Standby, derive the oldestActiveXid before we fix the redo + * pointer. This allows us to begin accumulating changes to assemble our + * starting snapshot of locks and transactions. + */ + if (!shutdown && XLogStandbyInfoActive()) + checkPoint.oldestActiveXid = GetOldestActiveTransactionId(); + else + checkPoint.oldestActiveXid = InvalidTransactionId; + + /* + * Get location of last important record before acquiring insert locks (as + * GetLastImportantRecPtr() also locks WAL locks). + */ + last_important_lsn = GetLastImportantRecPtr(); + + /* + * We must block concurrent insertions while examining insert state to + * determine the checkpoint REDO pointer. + */ + WALInsertLockAcquireExclusive(); + curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos); + + /* + * If this isn't a shutdown or forced checkpoint, and if there has been no + * WAL activity requiring a checkpoint, skip it. The idea here is to + * avoid inserting duplicate checkpoints when the system is idle. + */ + if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY | + CHECKPOINT_FORCE)) == 0) + { + if (last_important_lsn == ControlFile->checkPoint) + { + WALInsertLockRelease(); + END_CRIT_SECTION(); + ereport(DEBUG1, + (errmsg_internal("checkpoint skipped because system is idle"))); + return; + } + } + + /* + * An end-of-recovery checkpoint is created before anyone is allowed to + * write WAL. To allow us to write the checkpoint record, temporarily + * enable XLogInsertAllowed. + */ + if (flags & CHECKPOINT_END_OF_RECOVERY) + oldXLogAllowed = LocalSetXLogInsertAllowed(); + + checkPoint.ThisTimeLineID = XLogCtl->InsertTimeLineID; + if (flags & CHECKPOINT_END_OF_RECOVERY) + checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID; + else + checkPoint.PrevTimeLineID = checkPoint.ThisTimeLineID; + + checkPoint.fullPageWrites = Insert->fullPageWrites; + + /* + * Compute new REDO record ptr = location of next XLOG record. + * + * NB: this is NOT necessarily where the checkpoint record itself will be, + * since other backends may insert more XLOG records while we're off doing + * the buffer flush work. Those XLOG records are logically after the + * checkpoint, even though physically before it. Got that? 
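+ *
+ * As a purely illustrative sketch (the LSNs are made up), the WAL can end
+ * up laid out like this:
+ *
+ *		redo ptr        records from other backends      checkpoint record
+ *		0/3000060  ...  0/3000098, 0/30000D0, ...   ...  0/3000158
+ *
+ * Replay later starts at the redo ptr, so those intervening records are
+ * replayed even though they were inserted before the checkpoint record
+ * itself.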
+ */ + freespace = INSERT_FREESPACE(curInsert); + if (freespace == 0) + { + if (XLogSegmentOffset(curInsert, wal_segment_size) == 0) + curInsert += SizeOfXLogLongPHD; + else + curInsert += SizeOfXLogShortPHD; + } + checkPoint.redo = curInsert; + + /* + * Here we update the shared RedoRecPtr for future XLogInsert calls; this + * must be done while holding all the insertion locks. + * + * Note: if we fail to complete the checkpoint, RedoRecPtr will be left + * pointing past where it really needs to point. This is okay; the only + * consequence is that XLogInsert might back up whole buffers that it + * didn't really need to. We can't postpone advancing RedoRecPtr because + * XLogInserts that happen while we are dumping buffers must assume that + * their buffer changes are not included in the checkpoint. + */ + RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; + + /* + * Now we can release the WAL insertion locks, allowing other xacts to + * proceed while we are flushing disk buffers. + */ + WALInsertLockRelease(); + + /* Update the info_lck-protected copy of RedoRecPtr as well */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->RedoRecPtr = checkPoint.redo; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * If enabled, log checkpoint start. We postpone this until now so as not + * to log anything if we decided to skip the checkpoint. + */ + if (log_checkpoints) + LogCheckpointStart(flags, false); + + /* Update the process title */ + update_checkpoint_display(flags, false, false); + + TRACE_POSTGRESQL_CHECKPOINT_START(flags); + + /* + * Get the other info we need for the checkpoint record. + * + * We don't need to save oldestClogXid in the checkpoint, it only matters + * for the short period in which clog is being truncated, and if we crash + * during that we'll redo the clog truncation and fix up oldestClogXid + * there. + */ + LWLockAcquire(XidGenLock, LW_SHARED); + checkPoint.nextXid = ShmemVariableCache->nextXid; + checkPoint.oldestXid = ShmemVariableCache->oldestXid; + checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB; + LWLockRelease(XidGenLock); + + LWLockAcquire(CommitTsLock, LW_SHARED); + checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid; + checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid; + LWLockRelease(CommitTsLock); + + LWLockAcquire(OidGenLock, LW_SHARED); + checkPoint.nextOid = ShmemVariableCache->nextOid; + if (!shutdown) + checkPoint.nextOid += ShmemVariableCache->oidCount; + LWLockRelease(OidGenLock); + + MultiXactGetCheckptMulti(shutdown, + &checkPoint.nextMulti, + &checkPoint.nextMultiOffset, + &checkPoint.oldestMulti, + &checkPoint.oldestMultiDB); + + /* + * Having constructed the checkpoint record, ensure all shmem disk buffers + * and commit-log buffers are flushed to disk. + * + * This I/O could fail for various reasons. If so, we will fail to + * complete the checkpoint, but there is no reason to force a system + * panic. Accordingly, exit critical section while doing it. + */ + END_CRIT_SECTION(); + + /* + * In some cases there are groups of actions that must all occur on one + * side or the other of a checkpoint record. Before flushing the + * checkpoint record we must explicitly wait for any backend currently + * performing those groups of actions. + * + * One example is end of transaction, so we must wait for any transactions + * that are currently in commit critical sections. 
If an xact inserted + * its commit record into XLOG just before the REDO point, then a crash + * restart from the REDO point would not replay that record, which means + * that our flushing had better include the xact's update of pg_xact. So + * we wait till he's out of his commit critical section before proceeding. + * See notes in RecordTransactionCommit(). + * + * Because we've already released the insertion locks, this test is a bit + * fuzzy: it is possible that we will wait for xacts we didn't really need + * to wait for. But the delay should be short and it seems better to make + * checkpoint take a bit longer than to hold off insertions longer than + * necessary. (In fact, the whole reason we have this issue is that xact.c + * does commit record XLOG insertion and clog update as two separate steps + * protected by different locks, but again that seems best on grounds of + * minimizing lock contention.) + * + * A transaction that has not yet set delayChkptFlags when we look cannot + * be at risk, since it has not inserted its commit record yet; and one + * that's already cleared it is not at risk either, since it's done fixing + * clog and we will correctly flush the update below. So we cannot miss + * any xacts we need to wait for. + */ + vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START); + if (nvxids > 0) + { + do + { + pg_usleep(10000L); /* wait for 10 msec */ + } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids, + DELAY_CHKPT_START)); + } + pfree(vxids); + + CheckPointGuts(checkPoint.redo, flags); + + vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE); + if (nvxids > 0) + { + do + { + pg_usleep(10000L); /* wait for 10 msec */ + } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids, + DELAY_CHKPT_COMPLETE)); + } + pfree(vxids); + + /* + * Take a snapshot of running transactions and write this to WAL. This + * allows us to reconstruct the state of running transactions during + * archive recovery, if required. Skip, if this info disabled. + * + * If we are shutting down, or Startup process is completing crash + * recovery we don't need to write running xact data. + */ + if (!shutdown && XLogStandbyInfoActive()) + LogStandbySnapshot(); + + START_CRIT_SECTION(); + + /* + * Now insert the checkpoint record into XLOG. + */ + XLogBeginInsert(); + XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint)); + recptr = XLogInsert(RM_XLOG_ID, + shutdown ? XLOG_CHECKPOINT_SHUTDOWN : + XLOG_CHECKPOINT_ONLINE); + + XLogFlush(recptr); + + /* + * We mustn't write any new WAL after a shutdown checkpoint, or it will be + * overwritten at next startup. No-one should even try, this just allows + * sanity-checking. In the case of an end-of-recovery checkpoint, we want + * to just temporarily disable writing until the system has exited + * recovery. + */ + if (shutdown) + { + if (flags & CHECKPOINT_END_OF_RECOVERY) + LocalXLogInsertAllowed = oldXLogAllowed; + else + LocalXLogInsertAllowed = 0; /* never again write WAL */ + } + + /* + * We now have ProcLastRecPtr = start of actual checkpoint record, recptr + * = end of actual checkpoint record. + */ + if (shutdown && checkPoint.redo != ProcLastRecPtr) + ereport(PANIC, + (errmsg("concurrent write-ahead log activity while database system is shutting down"))); + + /* + * Remember the prior checkpoint's redo ptr for + * UpdateCheckPointDistanceEstimate() + */ + PriorRedoPtr = ControlFile->checkPointCopy.redo; + + /* + * Update the control file. 
+ */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + if (shutdown) + ControlFile->state = DB_SHUTDOWNED; + ControlFile->checkPoint = ProcLastRecPtr; + ControlFile->checkPointCopy = checkPoint; + /* crash recovery should always recover to the end of WAL */ + ControlFile->minRecoveryPoint = InvalidXLogRecPtr; + ControlFile->minRecoveryPointTLI = 0; + + /* + * Persist unloggedLSN value. It's reset on crash recovery, so this goes + * unused on non-shutdown checkpoints, but seems useful to store it always + * for debugging purposes. + */ + SpinLockAcquire(&XLogCtl->ulsn_lck); + ControlFile->unloggedLSN = XLogCtl->unloggedLSN; + SpinLockRelease(&XLogCtl->ulsn_lck); + + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + /* Update shared-memory copy of checkpoint XID/epoch */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->ckptFullXid = checkPoint.nextXid; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * We are now done with critical updates; no need for system panic if we + * have trouble while fooling with old log segments. + */ + END_CRIT_SECTION(); + + /* + * Let smgr do post-checkpoint cleanup (eg, deleting old files). + */ + SyncPostCheckpoint(); + + /* + * Update the average distance between checkpoints if the prior checkpoint + * exists. + */ + if (PriorRedoPtr != InvalidXLogRecPtr) + UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr); + + /* + * Delete old log files, those no longer needed for last checkpoint to + * prevent the disk holding the xlog from growing full. + */ + XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); + KeepLogSeg(recptr, &_logSegNo); + if (InvalidateObsoleteReplicationSlots(_logSegNo)) + { + /* + * Some slots have been invalidated; recalculate the old-segment + * horizon, starting again from RedoRecPtr. + */ + XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); + KeepLogSeg(recptr, &_logSegNo); + } + _logSegNo--; + RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr, + checkPoint.ThisTimeLineID); + + /* + * Make more log segments if needed. (Do this after recycling old log + * segments, since that may supply some of the needed files.) + */ + if (!shutdown) + PreallocXlogFiles(recptr, checkPoint.ThisTimeLineID); + + /* + * Truncate pg_subtrans if possible. We can throw away all data before + * the oldest XMIN of any running transaction. No future transaction will + * attempt to reference any pg_subtrans entry older than that (see Asserts + * in subtrans.c). During recovery, though, we mustn't do this because + * StartupSUBTRANS hasn't been called yet. + */ + if (!RecoveryInProgress()) + TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning()); + + /* Real work is done; log and update stats. */ + LogCheckpointEnd(false); + + /* Reset the process title */ + update_checkpoint_display(flags, false, true); + + TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written, + NBuffers, + CheckpointStats.ckpt_segs_added, + CheckpointStats.ckpt_segs_removed, + CheckpointStats.ckpt_segs_recycled); +} + +/* + * Mark the end of recovery in WAL though without running a full checkpoint. + * We can expect that a restartpoint is likely to be in progress as we + * do this, though we are unwilling to wait for it to complete. + * + * CreateRestartPoint() allows for the case where recovery may end before + * the restartpoint completes so there is no concern of concurrent behaviour. 
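+ *
+ * The record itself carries only the end-of-recovery timestamp and the old
+ * and new timeline IDs (the fields filled in below), so writing it is
+ * cheap; the expensive part of a real checkpoint, flushing all dirty
+ * buffers, is deferred to the checkpoint requested afterwards.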
+ */ +static void +CreateEndOfRecoveryRecord(void) +{ + xl_end_of_recovery xlrec; + XLogRecPtr recptr; + + /* sanity check */ + if (!RecoveryInProgress()) + elog(ERROR, "can only be used to end recovery"); + + xlrec.end_time = GetCurrentTimestamp(); + + WALInsertLockAcquireExclusive(); + xlrec.ThisTimeLineID = XLogCtl->InsertTimeLineID; + xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID; + WALInsertLockRelease(); + + START_CRIT_SECTION(); + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery)); + recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY); + + XLogFlush(recptr); + + /* + * Update the control file so that crash recovery can follow the timeline + * changes to this point. + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->minRecoveryPoint = recptr; + ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + END_CRIT_SECTION(); +} + +/* + * Write an OVERWRITE_CONTRECORD message. + * + * When on WAL replay we expect a continuation record at the start of a page + * that is not there, recovery ends and WAL writing resumes at that point. + * But it's wrong to resume writing new WAL back at the start of the record + * that was broken, because downstream consumers of that WAL (physical + * replicas) are not prepared to "rewind". So the first action after + * finishing replay of all valid WAL must be to write a record of this type + * at the point where the contrecord was missing; to support xlogreader + * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added + * to the page header where the record occurs. xlogreader has an ad-hoc + * mechanism to report metadata about the broken record, which is what we + * use here. + * + * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to + * skip the record it was reading, and pass back the LSN of the skipped + * record, so that its caller can verify (on "replay" of that record) that the + * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten. + * + * 'aborted_lsn' is the beginning position of the record that was incomplete. + * It is included in the WAL record. 'pagePtr' and 'newTLI' point to the + * beginning of the XLOG page where the record is to be inserted. They must + * match the current WAL insert position, they're passed here just so that we + * can verify that. + */ +static XLogRecPtr +CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, + TimeLineID newTLI) +{ + xl_overwrite_contrecord xlrec; + XLogRecPtr recptr; + XLogPageHeader pagehdr; + XLogRecPtr startPos; + + /* sanity checks */ + if (!RecoveryInProgress()) + elog(ERROR, "can only be used at end of recovery"); + if (pagePtr % XLOG_BLCKSZ != 0) + elog(ERROR, "invalid position for missing continuation record %X/%X", + LSN_FORMAT_ARGS(pagePtr)); + + /* The current WAL insert position should be right after the page header */ + startPos = pagePtr; + if (XLogSegmentOffset(startPos, wal_segment_size) == 0) + startPos += SizeOfXLogLongPHD; + else + startPos += SizeOfXLogShortPHD; + recptr = GetXLogInsertRecPtr(); + if (recptr != startPos) + elog(ERROR, "invalid WAL insert position %X/%X for OVERWRITE_CONTRECORD", + LSN_FORMAT_ARGS(recptr)); + + START_CRIT_SECTION(); + + /* + * Initialize the XLOG page header (by GetXLogBuffer), and set the + * XLP_FIRST_IS_OVERWRITE_CONTRECORD flag. + * + * No other backend is allowed to write WAL yet, so acquiring the WAL + * insertion lock is just pro forma. 
+ */ + WALInsertLockAcquire(); + pagehdr = (XLogPageHeader) GetXLogBuffer(pagePtr, newTLI); + pagehdr->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD; + WALInsertLockRelease(); + + /* + * Insert the XLOG_OVERWRITE_CONTRECORD record as the first record on the + * page. We know it becomes the first record, because no other backend is + * allowed to write WAL yet. + */ + XLogBeginInsert(); + xlrec.overwritten_lsn = aborted_lsn; + xlrec.overwrite_time = GetCurrentTimestamp(); + XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord)); + recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD); + + /* check that the record was inserted to the right place */ + if (ProcLastRecPtr != startPos) + elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%X", + LSN_FORMAT_ARGS(ProcLastRecPtr)); + + XLogFlush(recptr); + + END_CRIT_SECTION(); + + return recptr; +} + +/* + * Flush all data in shared memory to disk, and fsync + * + * This is the common code shared between regular checkpoints and + * recovery restartpoints. + */ +static void +CheckPointGuts(XLogRecPtr checkPointRedo, int flags) +{ + CheckPointRelationMap(); + CheckPointReplicationSlots(); + CheckPointSnapBuild(); + CheckPointLogicalRewriteHeap(); + CheckPointReplicationOrigin(); + + /* Write out all dirty data in SLRUs and the main buffer pool */ + TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags); + CheckpointStats.ckpt_write_t = GetCurrentTimestamp(); + CheckPointCLOG(); + CheckPointCommitTs(); + CheckPointSUBTRANS(); + CheckPointMultiXact(); + CheckPointPredicate(); + CheckPointBuffers(flags); + + /* Perform all queued up fsyncs */ + TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START(); + CheckpointStats.ckpt_sync_t = GetCurrentTimestamp(); + ProcessSyncRequests(); + CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp(); + TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE(); + + /* We deliberately delay 2PC checkpointing as long as possible */ + CheckPointTwoPhase(checkPointRedo); +} + +/* + * Save a checkpoint for recovery restart if appropriate + * + * This function is called each time a checkpoint record is read from XLOG. + * It must determine whether the checkpoint represents a safe restartpoint or + * not. If so, the checkpoint record is stashed in shared memory so that + * CreateRestartPoint can consult it. (Note that the latter function is + * executed by the checkpointer, while this one will be executed by the + * startup process.) + */ +static void +RecoveryRestartPoint(const CheckPoint *checkPoint, XLogReaderState *record) +{ + /* + * Also refrain from creating a restartpoint if we have seen any + * references to non-existent pages. Restarting recovery from the + * restartpoint would not see the references, so we would lose the + * cross-check that the pages belonged to a relation that was dropped + * later. + */ + if (XLogHaveInvalidPages()) + { + elog(trace_recovery(DEBUG2), + "could not record restart point at %X/%X because there " + "are unresolved references to invalid pages", + LSN_FORMAT_ARGS(checkPoint->redo)); + return; + } + + /* + * Copy the checkpoint record to shared memory, so that checkpointer can + * work out the next time it wants to perform a restartpoint. + */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->lastCheckPointRecPtr = record->ReadRecPtr; + XLogCtl->lastCheckPointEndPtr = record->EndRecPtr; + XLogCtl->lastCheckPoint = *checkPoint; + SpinLockRelease(&XLogCtl->info_lck); +} + +/* + * Establish a restartpoint if possible. 
+ * + * This is similar to CreateCheckPoint, but is used during WAL recovery + * to establish a point from which recovery can roll forward without + * replaying the entire recovery log. + * + * Returns true if a new restartpoint was established. We can only establish + * a restartpoint if we have replayed a safe checkpoint record since last + * restartpoint. + */ +bool +CreateRestartPoint(int flags) +{ + XLogRecPtr lastCheckPointRecPtr; + XLogRecPtr lastCheckPointEndPtr; + CheckPoint lastCheckPoint; + XLogRecPtr PriorRedoPtr; + XLogRecPtr receivePtr; + XLogRecPtr replayPtr; + TimeLineID replayTLI; + XLogRecPtr endptr; + XLogSegNo _logSegNo; + TimestampTz xtime; + + /* Concurrent checkpoint/restartpoint cannot happen */ + Assert(!IsUnderPostmaster || MyBackendType == B_CHECKPOINTER); + + /* Get a local copy of the last safe checkpoint record. */ + SpinLockAcquire(&XLogCtl->info_lck); + lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr; + lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr; + lastCheckPoint = XLogCtl->lastCheckPoint; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * Check that we're still in recovery mode. It's ok if we exit recovery + * mode after this check, the restart point is valid anyway. + */ + if (!RecoveryInProgress()) + { + ereport(DEBUG2, + (errmsg_internal("skipping restartpoint, recovery has already ended"))); + return false; + } + + /* + * If the last checkpoint record we've replayed is already our last + * restartpoint, we can't perform a new restart point. We still update + * minRecoveryPoint in that case, so that if this is a shutdown restart + * point, we won't start up earlier than before. That's not strictly + * necessary, but when hot standby is enabled, it would be rather weird if + * the database opened up for read-only connections at a point-in-time + * before the last shutdown. Such time travel is still possible in case of + * immediate shutdown, though. + * + * We don't explicitly advance minRecoveryPoint when we do create a + * restartpoint. It's assumed that flushing the buffers will do that as a + * side-effect. + */ + if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) || + lastCheckPoint.redo <= ControlFile->checkPointCopy.redo) + { + ereport(DEBUG2, + (errmsg_internal("skipping restartpoint, already performed at %X/%X", + LSN_FORMAT_ARGS(lastCheckPoint.redo)))); + + UpdateMinRecoveryPoint(InvalidXLogRecPtr, true); + if (flags & CHECKPOINT_IS_SHUTDOWN) + { + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + } + return false; + } + + /* + * Update the shared RedoRecPtr so that the startup process can calculate + * the number of segments replayed since last restartpoint, and request a + * restartpoint if it exceeds CheckPointSegments. + * + * Like in CreateCheckPoint(), hold off insertions to update it, although + * during recovery this is just pro forma, because no WAL insertions are + * happening. + */ + WALInsertLockAcquireExclusive(); + RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo; + WALInsertLockRelease(); + + /* Also update the info_lck-protected copy */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->RedoRecPtr = lastCheckPoint.redo; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * Prepare to accumulate statistics. + * + * Note: because it is possible for log_checkpoints to change while a + * checkpoint proceeds, we always accumulate stats, even if + * log_checkpoints is currently off. 
+ */ + MemSet(&CheckpointStats, 0, sizeof(CheckpointStats)); + CheckpointStats.ckpt_start_t = GetCurrentTimestamp(); + + if (log_checkpoints) + LogCheckpointStart(flags, true); + + /* Update the process title */ + update_checkpoint_display(flags, true, false); + + CheckPointGuts(lastCheckPoint.redo, flags); + + /* + * Remember the prior checkpoint's redo ptr for + * UpdateCheckPointDistanceEstimate() + */ + PriorRedoPtr = ControlFile->checkPointCopy.redo; + + /* + * Update pg_control, using current time. Check that it still shows an + * older checkpoint, else do nothing; this is a quick hack to make sure + * nothing really bad happens if somehow we get here after the + * end-of-recovery checkpoint. + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + if (ControlFile->checkPointCopy.redo < lastCheckPoint.redo) + { + /* + * Update the checkpoint information. We do this even if the cluster + * does not show DB_IN_ARCHIVE_RECOVERY to match with the set of WAL + * segments recycled below. + */ + ControlFile->checkPoint = lastCheckPointRecPtr; + ControlFile->checkPointCopy = lastCheckPoint; + + /* + * Ensure minRecoveryPoint is past the checkpoint record and update it + * if the control file still shows DB_IN_ARCHIVE_RECOVERY. Normally, + * this will have happened already while writing out dirty buffers, + * but not necessarily - e.g. because no buffers were dirtied. We do + * this because a backup performed in recovery uses minRecoveryPoint + * to determine which WAL files must be included in the backup, and + * the file (or files) containing the checkpoint record must be + * included, at a minimum. Note that for an ordinary restart of + * recovery there's no value in having the minimum recovery point any + * earlier than this anyway, because redo will begin just after the + * checkpoint record. + */ + if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY) + { + if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr) + { + ControlFile->minRecoveryPoint = lastCheckPointEndPtr; + ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID; + + /* update local copy */ + LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; + LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + } + if (flags & CHECKPOINT_IS_SHUTDOWN) + ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY; + } + UpdateControlFile(); + } + LWLockRelease(ControlFileLock); + + /* + * Update the average distance between checkpoints/restartpoints if the + * prior checkpoint exists. + */ + if (PriorRedoPtr != InvalidXLogRecPtr) + UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr); + + /* + * Delete old log files, those no longer needed for last restartpoint to + * prevent the disk holding the xlog from growing full. + */ + XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); + + /* + * Retreat _logSegNo using the current end of xlog replayed or received, + * whichever is later. + */ + receivePtr = GetWalRcvFlushRecPtr(NULL, NULL); + replayPtr = GetXLogReplayRecPtr(&replayTLI); + endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr; + KeepLogSeg(endptr, &_logSegNo); + if (InvalidateObsoleteReplicationSlots(_logSegNo)) + { + /* + * Some slots have been invalidated; recalculate the old-segment + * horizon, starting again from RedoRecPtr. + */ + XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); + KeepLogSeg(endptr, &_logSegNo); + } + _logSegNo--; + + /* + * Try to recycle segments on a useful timeline. 
If we've been promoted + * since the beginning of this restartpoint, use the new timeline chosen + * at end of recovery. If we're still in recovery, use the timeline we're + * currently replaying. + * + * There is no guarantee that the WAL segments will be useful on the + * current timeline; if recovery proceeds to a new timeline right after + * this, the pre-allocated WAL segments on this timeline will not be used, + * and will go wasted until recycled on the next restartpoint. We'll live + * with that. + */ + if (!RecoveryInProgress()) + replayTLI = XLogCtl->InsertTimeLineID; + + RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr, replayTLI); + + /* + * Make more log segments if needed. (Do this after recycling old log + * segments, since that may supply some of the needed files.) + */ + PreallocXlogFiles(endptr, replayTLI); + + /* + * Truncate pg_subtrans if possible. We can throw away all data before + * the oldest XMIN of any running transaction. No future transaction will + * attempt to reference any pg_subtrans entry older than that (see Asserts + * in subtrans.c). When hot standby is disabled, though, we mustn't do + * this because StartupSUBTRANS hasn't been called yet. + */ + if (EnableHotStandby) + TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning()); + + /* Real work is done; log and update stats. */ + LogCheckpointEnd(true); + + /* Reset the process title */ + update_checkpoint_display(flags, true, true); + + xtime = GetLatestXTime(); + ereport((log_checkpoints ? LOG : DEBUG2), + (errmsg("recovery restart point at %X/%X", + LSN_FORMAT_ARGS(lastCheckPoint.redo)), + xtime ? errdetail("Last completed transaction was at log time %s.", + timestamptz_to_str(xtime)) : 0)); + + /* + * Finally, execute archive_cleanup_command, if any. + */ + if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0) + ExecuteRecoveryCommand(archiveCleanupCommand, + "archive_cleanup_command", + false, + WAIT_EVENT_ARCHIVE_CLEANUP_COMMAND); + + return true; +} + +/* + * Report availability of WAL for the given target LSN + * (typically a slot's restart_lsn) + * + * Returns one of the following enum values: + * + * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of + * max_wal_size. + * + * * WALAVAIL_EXTENDED means it is still available by preserving extra + * segments beyond max_wal_size. If max_slot_wal_keep_size is smaller + * than max_wal_size, this state is not returned. + * + * * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will + * remove reserved segments. The walsender using this slot may return to the + * above. + * + * * WALAVAIL_REMOVED means it has been removed. A replication stream on + * a slot with this LSN cannot continue after a restart. + * + * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL. + */ +WALAvailability +GetWALAvailability(XLogRecPtr targetLSN) +{ + XLogRecPtr currpos; /* current write LSN */ + XLogSegNo currSeg; /* segid of currpos */ + XLogSegNo targetSeg; /* segid of targetLSN */ + XLogSegNo oldestSeg; /* actual oldest segid */ + XLogSegNo oldestSegMaxWalSize; /* oldest segid kept by max_wal_size */ + XLogSegNo oldestSlotSeg; /* oldest segid kept by slot */ + uint64 keepSegs; + + /* + * slot does not reserve WAL. Either deactivated, or has never been active + */ + if (XLogRecPtrIsInvalid(targetLSN)) + return WALAVAIL_INVALID_LSN; + + /* + * Calculate the oldest segment currently reserved by all slots, + * considering wal_keep_size and max_slot_wal_keep_size. 
Initialize + * oldestSlotSeg to the current segment. + */ + currpos = GetXLogWriteRecPtr(); + XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size); + KeepLogSeg(currpos, &oldestSlotSeg); + + /* + * Find the oldest extant segment file. We get 1 until checkpoint removes + * the first WAL segment file since startup, which causes the status being + * wrong under certain abnormal conditions but that doesn't actually harm. + */ + oldestSeg = XLogGetLastRemovedSegno() + 1; + + /* calculate oldest segment by max_wal_size */ + XLByteToSeg(currpos, currSeg, wal_segment_size); + keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1; + + if (currSeg > keepSegs) + oldestSegMaxWalSize = currSeg - keepSegs; + else + oldestSegMaxWalSize = 1; + + /* the segment we care about */ + XLByteToSeg(targetLSN, targetSeg, wal_segment_size); + + /* + * No point in returning reserved or extended status values if the + * targetSeg is known to be lost. + */ + if (targetSeg >= oldestSlotSeg) + { + /* show "reserved" when targetSeg is within max_wal_size */ + if (targetSeg >= oldestSegMaxWalSize) + return WALAVAIL_RESERVED; + + /* being retained by slots exceeding max_wal_size */ + return WALAVAIL_EXTENDED; + } + + /* WAL segments are no longer retained but haven't been removed yet */ + if (targetSeg >= oldestSeg) + return WALAVAIL_UNRESERVED; + + /* Definitely lost */ + return WALAVAIL_REMOVED; +} + + +/* + * Retreat *logSegNo to the last segment that we need to retain because of + * either wal_keep_size or replication slots. + * + * This is calculated by subtracting wal_keep_size from the given xlog + * location, recptr and by making sure that that result is below the + * requirement of replication slots. For the latter criterion we do consider + * the effects of max_slot_wal_keep_size: reserve at most that much space back + * from recptr. + * + * Note about replication slots: if this function calculates a value + * that's further ahead than what slots need reserved, then affected + * slots need to be invalidated and this function invoked again. + * XXX it might be a good idea to rewrite this function so that + * invalidation is optionally done here, instead. + */ +static void +KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo) +{ + XLogSegNo currSegNo; + XLogSegNo segno; + XLogRecPtr keep; + + XLByteToSeg(recptr, currSegNo, wal_segment_size); + segno = currSegNo; + + /* + * Calculate how many segments are kept by slots first, adjusting for + * max_slot_wal_keep_size. + */ + keep = XLogGetReplicationSlotMinimumLSN(); + if (keep != InvalidXLogRecPtr && keep < recptr) + { + XLByteToSeg(keep, segno, wal_segment_size); + + /* Cap by max_slot_wal_keep_size ... 
*/ + if (max_slot_wal_keep_size_mb >= 0) + { + uint64 slot_keep_segs; + + slot_keep_segs = + ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size); + + if (currSegNo - segno > slot_keep_segs) + segno = currSegNo - slot_keep_segs; + } + } + + /* but, keep at least wal_keep_size if that's set */ + if (wal_keep_size_mb > 0) + { + uint64 keep_segs; + + keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size); + if (currSegNo - segno < keep_segs) + { + /* avoid underflow, don't go below 1 */ + if (currSegNo <= keep_segs) + segno = 1; + else + segno = currSegNo - keep_segs; + } + } + + /* don't delete WAL segments newer than the calculated segment */ + if (segno < *logSegNo) + *logSegNo = segno; +} + +/* + * Write a NEXTOID log record + */ +void +XLogPutNextOid(Oid nextOid) +{ + XLogBeginInsert(); + XLogRegisterData((char *) (&nextOid), sizeof(Oid)); + (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID); + + /* + * We need not flush the NEXTOID record immediately, because any of the + * just-allocated OIDs could only reach disk as part of a tuple insert or + * update that would have its own XLOG record that must follow the NEXTOID + * record. Therefore, the standard buffer LSN interlock applied to those + * records will ensure no such OID reaches disk before the NEXTOID record + * does. + * + * Note, however, that the above statement only covers state "within" the + * database. When we use a generated OID as a file or directory name, we + * are in a sense violating the basic WAL rule, because that filesystem + * change may reach disk before the NEXTOID WAL record does. The impact + * of this is that if a database crash occurs immediately afterward, we + * might after restart re-generate the same OID and find that it conflicts + * with the leftover file or directory. But since for safety's sake we + * always loop until finding a nonconflicting filename, this poses no real + * problem in practice. See pgsql-hackers discussion 27-Sep-2006. + */ +} + +/* + * Write an XLOG SWITCH record. + * + * Here we just blindly issue an XLogInsert request for the record. + * All the magic happens inside XLogInsert. + * + * The return value is either the end+1 address of the switch record, + * or the end+1 address of the prior segment if we did not need to + * write a switch record because we are already at segment start. + */ +XLogRecPtr +RequestXLogSwitch(bool mark_unimportant) +{ + XLogRecPtr RecPtr; + + /* XLOG SWITCH has no data */ + XLogBeginInsert(); + + if (mark_unimportant) + XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); + RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH); + + return RecPtr; +} + +/* + * Write a RESTORE POINT record + */ +XLogRecPtr +XLogRestorePoint(const char *rpName) +{ + XLogRecPtr RecPtr; + xl_restore_point xlrec; + + xlrec.rp_time = GetCurrentTimestamp(); + strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN); + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point)); + + RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT); + + ereport(LOG, + (errmsg("restore point \"%s\" created at %X/%X", + rpName, LSN_FORMAT_ARGS(RecPtr)))); + + return RecPtr; +} + +/* + * Check if any of the GUC parameters that are critical for hot standby + * have changed, and update the value in pg_control file if necessary. 
+ */ +static void +XLogReportParameters(void) +{ + if (wal_level != ControlFile->wal_level || + wal_log_hints != ControlFile->wal_log_hints || + MaxConnections != ControlFile->MaxConnections || + max_worker_processes != ControlFile->max_worker_processes || + max_wal_senders != ControlFile->max_wal_senders || + max_prepared_xacts != ControlFile->max_prepared_xacts || + max_locks_per_xact != ControlFile->max_locks_per_xact || + track_commit_timestamp != ControlFile->track_commit_timestamp) + { + /* + * The change in number of backend slots doesn't need to be WAL-logged + * if archiving is not enabled, as you can't start archive recovery + * with wal_level=minimal anyway. We don't really care about the + * values in pg_control either if wal_level=minimal, but seems better + * to keep them up-to-date to avoid confusion. + */ + if (wal_level != ControlFile->wal_level || XLogIsNeeded()) + { + xl_parameter_change xlrec; + XLogRecPtr recptr; + + xlrec.MaxConnections = MaxConnections; + xlrec.max_worker_processes = max_worker_processes; + xlrec.max_wal_senders = max_wal_senders; + xlrec.max_prepared_xacts = max_prepared_xacts; + xlrec.max_locks_per_xact = max_locks_per_xact; + xlrec.wal_level = wal_level; + xlrec.wal_log_hints = wal_log_hints; + xlrec.track_commit_timestamp = track_commit_timestamp; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + + recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE); + XLogFlush(recptr); + } + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + ControlFile->MaxConnections = MaxConnections; + ControlFile->max_worker_processes = max_worker_processes; + ControlFile->max_wal_senders = max_wal_senders; + ControlFile->max_prepared_xacts = max_prepared_xacts; + ControlFile->max_locks_per_xact = max_locks_per_xact; + ControlFile->wal_level = wal_level; + ControlFile->wal_log_hints = wal_log_hints; + ControlFile->track_commit_timestamp = track_commit_timestamp; + UpdateControlFile(); + + LWLockRelease(ControlFileLock); + } +} + +/* + * Update full_page_writes in shared memory, and write an + * XLOG_FPW_CHANGE record if necessary. + * + * Note: this function assumes there is no other process running + * concurrently that could update it. + */ +void +UpdateFullPageWrites(void) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + bool recoveryInProgress; + + /* + * Do nothing if full_page_writes has not been changed. + * + * It's safe to check the shared full_page_writes without the lock, + * because we assume that there is no concurrently running process which + * can update it. + */ + if (fullPageWrites == Insert->fullPageWrites) + return; + + /* + * Perform this outside critical section so that the WAL insert + * initialization done by RecoveryInProgress() doesn't trigger an + * assertion failure. + */ + recoveryInProgress = RecoveryInProgress(); + + START_CRIT_SECTION(); + + /* + * It's always safe to take full page images, even when not strictly + * required, but not the other round. So if we're setting full_page_writes + * to true, first set it true and then write the WAL record. If we're + * setting it to false, first write the WAL record and then set the global + * flag. + */ + if (fullPageWrites) + { + WALInsertLockAcquireExclusive(); + Insert->fullPageWrites = true; + WALInsertLockRelease(); + } + + /* + * Write an XLOG_FPW_CHANGE record. This allows us to keep track of + * full_page_writes during archive recovery, if required. 
+ */ + if (XLogStandbyInfoActive() && !recoveryInProgress) + { + XLogBeginInsert(); + XLogRegisterData((char *) (&fullPageWrites), sizeof(bool)); + + XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE); + } + + if (!fullPageWrites) + { + WALInsertLockAcquireExclusive(); + Insert->fullPageWrites = false; + WALInsertLockRelease(); + } + END_CRIT_SECTION(); +} + +/* + * XLOG resource manager's routines + * + * Definitions of info values are in include/catalog/pg_control.h, though + * not all record types are related to control file updates. + * + * NOTE: Some XLOG record types that are directly related to WAL recovery + * are handled in xlogrecovery_redo(). + */ +void +xlog_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + XLogRecPtr lsn = record->EndRecPtr; + + /* + * In XLOG rmgr, backup blocks are only used by XLOG_FPI and + * XLOG_FPI_FOR_HINT records. + */ + Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT || + !XLogRecHasAnyBlockRefs(record)); + + if (info == XLOG_NEXTOID) + { + Oid nextOid; + + /* + * We used to try to take the maximum of ShmemVariableCache->nextOid + * and the recorded nextOid, but that fails if the OID counter wraps + * around. Since no OID allocation should be happening during replay + * anyway, better to just believe the record exactly. We still take + * OidGenLock while setting the variable, just in case. + */ + memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid)); + LWLockAcquire(OidGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextOid = nextOid; + ShmemVariableCache->oidCount = 0; + LWLockRelease(OidGenLock); + } + else if (info == XLOG_CHECKPOINT_SHUTDOWN) + { + CheckPoint checkPoint; + TimeLineID replayTLI; + + memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); + /* In a SHUTDOWN checkpoint, believe the counters exactly */ + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextXid = checkPoint.nextXid; + LWLockRelease(XidGenLock); + LWLockAcquire(OidGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextOid = checkPoint.nextOid; + ShmemVariableCache->oidCount = 0; + LWLockRelease(OidGenLock); + MultiXactSetNextMXact(checkPoint.nextMulti, + checkPoint.nextMultiOffset); + + MultiXactAdvanceOldest(checkPoint.oldestMulti, + checkPoint.oldestMultiDB); + + /* + * No need to set oldestClogXid here as well; it'll be set when we + * redo an xl_clog_truncate if it changed since initialization. + */ + SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); + + /* + * If we see a shutdown checkpoint while waiting for an end-of-backup + * record, the backup was canceled and the end-of-backup record will + * never arrive. + */ + if (ArchiveRecoveryRequested && + !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) && + XLogRecPtrIsInvalid(ControlFile->backupEndPoint)) + ereport(PANIC, + (errmsg("online backup was canceled, recovery cannot continue"))); + + /* + * If we see a shutdown checkpoint, we know that nothing was running + * on the primary at this point. So fake-up an empty running-xacts + * record and use that here and now. Recover additional standby state + * for prepared transactions. + */ + if (standbyState >= STANDBY_INITIALIZED) + { + TransactionId *xids; + int nxids; + TransactionId oldestActiveXID; + TransactionId latestCompletedXid; + RunningTransactionsData running; + + oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids); + + /* + * Construct a RunningTransactions snapshot representing a shut + * down server, with only prepared transactions still alive. 
We're + * never overflowed at this point because all subxids are listed + * with their parent prepared transactions. + */ + running.xcnt = nxids; + running.subxcnt = 0; + running.subxid_overflow = false; + running.nextXid = XidFromFullTransactionId(checkPoint.nextXid); + running.oldestRunningXid = oldestActiveXID; + latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid); + TransactionIdRetreat(latestCompletedXid); + Assert(TransactionIdIsNormal(latestCompletedXid)); + running.latestCompletedXid = latestCompletedXid; + running.xids = xids; + + ProcArrayApplyRecoveryInfo(&running); + + StandbyRecoverPreparedTransactions(); + } + + /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; + LWLockRelease(ControlFileLock); + + /* Update shared-memory copy of checkpoint XID/epoch */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->ckptFullXid = checkPoint.nextXid; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * We should've already switched to the new TLI before replaying this + * record. + */ + (void) GetCurrentReplayRecPtr(&replayTLI); + if (checkPoint.ThisTimeLineID != replayTLI) + ereport(PANIC, + (errmsg("unexpected timeline ID %u (should be %u) in shutdown checkpoint record", + checkPoint.ThisTimeLineID, replayTLI))); + + RecoveryRestartPoint(&checkPoint, record); + } + else if (info == XLOG_CHECKPOINT_ONLINE) + { + CheckPoint checkPoint; + TimeLineID replayTLI; + + memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); + /* In an ONLINE checkpoint, treat the XID counter as a minimum */ + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + if (FullTransactionIdPrecedes(ShmemVariableCache->nextXid, + checkPoint.nextXid)) + ShmemVariableCache->nextXid = checkPoint.nextXid; + LWLockRelease(XidGenLock); + + /* + * We ignore the nextOid counter in an ONLINE checkpoint, preferring + * to track OID assignment through XLOG_NEXTOID records. The nextOid + * counter is from the start of the checkpoint and might well be stale + * compared to later XLOG_NEXTOID records. We could try to take the + * maximum of the nextOid counter and our latest value, but since + * there's no particular guarantee about the speed with which the OID + * counter wraps around, that's a risky thing to do. In any case, + * users of the nextOid counter are required to avoid assignment of + * duplicates, so that a somewhat out-of-date value should be safe. + */ + + /* Handle multixact */ + MultiXactAdvanceNextMXact(checkPoint.nextMulti, + checkPoint.nextMultiOffset); + + /* + * NB: This may perform multixact truncation when replaying WAL + * generated by an older primary. 
+ */ + MultiXactAdvanceOldest(checkPoint.oldestMulti, + checkPoint.oldestMultiDB); + if (TransactionIdPrecedes(ShmemVariableCache->oldestXid, + checkPoint.oldestXid)) + SetTransactionIdLimit(checkPoint.oldestXid, + checkPoint.oldestXidDB); + /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; + LWLockRelease(ControlFileLock); + + /* Update shared-memory copy of checkpoint XID/epoch */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->ckptFullXid = checkPoint.nextXid; + SpinLockRelease(&XLogCtl->info_lck); + + /* TLI should not change in an on-line checkpoint */ + (void) GetCurrentReplayRecPtr(&replayTLI); + if (checkPoint.ThisTimeLineID != replayTLI) + ereport(PANIC, + (errmsg("unexpected timeline ID %u (should be %u) in online checkpoint record", + checkPoint.ThisTimeLineID, replayTLI))); + + RecoveryRestartPoint(&checkPoint, record); + } + else if (info == XLOG_OVERWRITE_CONTRECORD) + { + /* nothing to do here, handled in xlogrecovery_redo() */ + } + else if (info == XLOG_END_OF_RECOVERY) + { + xl_end_of_recovery xlrec; + TimeLineID replayTLI; + + memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery)); + + /* + * For Hot Standby, we could treat this like a Shutdown Checkpoint, + * but this case is rarer and harder to test, so the benefit doesn't + * outweigh the potential extra cost of maintenance. + */ + + /* + * We should've already switched to the new TLI before replaying this + * record. + */ + (void) GetCurrentReplayRecPtr(&replayTLI); + if (xlrec.ThisTimeLineID != replayTLI) + ereport(PANIC, + (errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record", + xlrec.ThisTimeLineID, replayTLI))); + } + else if (info == XLOG_NOOP) + { + /* nothing to do here */ + } + else if (info == XLOG_SWITCH) + { + /* nothing to do here */ + } + else if (info == XLOG_RESTORE_POINT) + { + /* nothing to do here, handled in xlogrecovery.c */ + } + else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT) + { + /* + * XLOG_FPI records contain nothing else but one or more block + * references. Every block reference must include a full-page image + * even if full_page_writes was disabled when the record was generated + * - otherwise there would be no point in this record. + * + * XLOG_FPI_FOR_HINT records are generated when a page needs to be + * WAL-logged because of a hint bit update. They are only generated + * when checksums and/or wal_log_hints are enabled. They may include + * no full-page images if full_page_writes was disabled when they were + * generated. In this case there is nothing to do here. + * + * No recovery conflicts are generated by these generic records - if a + * resource manager needs to generate conflicts, it has to define a + * separate WAL record type and redo routine. 
+ */ + for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + { + Buffer buffer; + + if (!XLogRecHasBlockImage(record, block_id)) + { + if (info == XLOG_FPI) + elog(ERROR, "XLOG_FPI record did not contain a full-page image"); + continue; + } + + if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED) + elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block"); + UnlockReleaseBuffer(buffer); + } + } + else if (info == XLOG_BACKUP_END) + { + /* nothing to do here, handled in xlogrecovery_redo() */ + } + else if (info == XLOG_PARAMETER_CHANGE) + { + xl_parameter_change xlrec; + + /* Update our copy of the parameters in pg_control */ + memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change)); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->MaxConnections = xlrec.MaxConnections; + ControlFile->max_worker_processes = xlrec.max_worker_processes; + ControlFile->max_wal_senders = xlrec.max_wal_senders; + ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts; + ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact; + ControlFile->wal_level = xlrec.wal_level; + ControlFile->wal_log_hints = xlrec.wal_log_hints; + + /* + * Update minRecoveryPoint to ensure that if recovery is aborted, we + * recover back up to this point before allowing hot standby again. + * This is important if the max_* settings are decreased, to ensure + * you don't run queries against the WAL preceding the change. The + * local copies cannot be updated as long as crash recovery is + * happening and we expect all the WAL to be replayed. + */ + if (InArchiveRecovery) + { + LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; + LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + } + if (LocalMinRecoveryPoint != InvalidXLogRecPtr && LocalMinRecoveryPoint < lsn) + { + TimeLineID replayTLI; + + (void) GetCurrentReplayRecPtr(&replayTLI); + ControlFile->minRecoveryPoint = lsn; + ControlFile->minRecoveryPointTLI = replayTLI; + } + + CommitTsParameterChange(xlrec.track_commit_timestamp, + ControlFile->track_commit_timestamp); + ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp; + + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + /* Check to see if any parameter change gives a problem on recovery */ + CheckRequiredParameterValues(); + } + else if (info == XLOG_FPW_CHANGE) + { + bool fpw; + + memcpy(&fpw, XLogRecGetData(record), sizeof(bool)); + + /* + * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that + * do_pg_backup_start() and do_pg_backup_stop() can check whether + * full_page_writes has been disabled during online backup. + */ + if (!fpw) + { + SpinLockAcquire(&XLogCtl->info_lck); + if (XLogCtl->lastFpwDisableRecPtr < record->ReadRecPtr) + XLogCtl->lastFpwDisableRecPtr = record->ReadRecPtr; + SpinLockRelease(&XLogCtl->info_lck); + } + + /* Keep track of full_page_writes */ + lastFullPageWrites = fpw; + } +} + +/* + * Return the (possible) sync flag used for opening a file, depending on the + * value of the GUC wal_sync_method. + */ +static int +get_sync_bit(int method) +{ + int o_direct_flag = 0; + + /* If fsync is disabled, never open in sync mode */ + if (!enableFsync) + return 0; + + /* + * Optimize writes by bypassing kernel cache with O_DIRECT when using + * O_SYNC/O_FSYNC and O_DSYNC. 
But only if archiving and streaming are + * disabled, otherwise the archive command or walsender process will read + * the WAL soon after writing it, which is guaranteed to cause a physical + * read if we bypassed the kernel cache. We also skip the + * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same + * reason. + * + * Never use O_DIRECT in walreceiver process for similar reasons; the WAL + * written by walreceiver is normally read by the startup process soon + * after it's written. Also, walreceiver performs unaligned writes, which + * don't work with O_DIRECT, so it is required for correctness too. + */ + if (!XLogIsNeeded() && !AmWalReceiverProcess()) + o_direct_flag = PG_O_DIRECT; + + switch (method) + { + /* + * enum values for all sync options are defined even if they are + * not supported on the current platform. But if not, they are + * not included in the enum option array, and therefore will never + * be seen here. + */ + case SYNC_METHOD_FSYNC: + case SYNC_METHOD_FSYNC_WRITETHROUGH: + case SYNC_METHOD_FDATASYNC: + return 0; +#ifdef OPEN_SYNC_FLAG + case SYNC_METHOD_OPEN: + return OPEN_SYNC_FLAG | o_direct_flag; +#endif +#ifdef OPEN_DATASYNC_FLAG + case SYNC_METHOD_OPEN_DSYNC: + return OPEN_DATASYNC_FLAG | o_direct_flag; +#endif + default: + /* can't happen (unless we are out of sync with option array) */ + elog(ERROR, "unrecognized wal_sync_method: %d", method); + return 0; /* silence warning */ + } +} + +/* + * GUC support + */ +void +assign_xlog_sync_method(int new_sync_method, void *extra) +{ + if (sync_method != new_sync_method) + { + /* + * To ensure that no blocks escape unsynced, force an fsync on the + * currently open log segment (if any). Also, if the open flag is + * changing, close the log file so it will be reopened (with new flag + * bit) at next use. + */ + if (openLogFile >= 0) + { + pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN); + if (pg_fsync(openLogFile) != 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno; + + save_errno = errno; + XLogFileName(xlogfname, openLogTLI, openLogSegNo, + wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", xlogfname))); + } + + pgstat_report_wait_end(); + if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method)) + XLogFileClose(); + } + } +} + + +/* + * Issue appropriate kind of fsync (if any) for an XLOG output file. + * + * 'fd' is a file descriptor for the XLOG file to be fsync'd. + * 'segno' is for error reporting purposes. + */ +void +issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) +{ + char *msg = NULL; + instr_time start; + + Assert(tli != 0); + + /* + * Quick exit if fsync is disabled or write() has already synced the WAL + * file. 
+ */ + if (!enableFsync || + sync_method == SYNC_METHOD_OPEN || + sync_method == SYNC_METHOD_OPEN_DSYNC) + return; + + /* Measure I/O timing to sync the WAL file */ + if (track_wal_io_timing) + INSTR_TIME_SET_CURRENT(start); + + pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC); + switch (sync_method) + { + case SYNC_METHOD_FSYNC: + if (pg_fsync_no_writethrough(fd) != 0) + msg = _("could not fsync file \"%s\": %m"); + break; +#ifdef HAVE_FSYNC_WRITETHROUGH + case SYNC_METHOD_FSYNC_WRITETHROUGH: + if (pg_fsync_writethrough(fd) != 0) + msg = _("could not fsync write-through file \"%s\": %m"); + break; +#endif +#ifdef HAVE_FDATASYNC + case SYNC_METHOD_FDATASYNC: + if (pg_fdatasync(fd) != 0) + msg = _("could not fdatasync file \"%s\": %m"); + break; +#endif + case SYNC_METHOD_OPEN: + case SYNC_METHOD_OPEN_DSYNC: + /* not reachable */ + Assert(false); + break; + default: + elog(PANIC, "unrecognized wal_sync_method: %d", sync_method); + break; + } + + /* PANIC if failed to fsync */ + if (msg) + { + char xlogfname[MAXFNAMELEN]; + int save_errno = errno; + + XLogFileName(xlogfname, tli, segno, wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg(msg, xlogfname))); + } + + pgstat_report_wait_end(); + + /* + * Increment the I/O timing and the number of times WAL files were synced. + */ + if (track_wal_io_timing) + { + instr_time duration; + + INSTR_TIME_SET_CURRENT(duration); + INSTR_TIME_SUBTRACT(duration, start); + PendingWalStats.wal_sync_time += INSTR_TIME_GET_MICROSEC(duration); + } + + PendingWalStats.wal_sync++; +} + +/* + * do_pg_backup_start is the workhorse of the user-visible pg_backup_start() + * function. It creates the necessary starting checkpoint and constructs the + * backup label and tablespace map. + * + * Input parameters are "backupidstr" (the backup label string) and "fast" + * (if true, we do the checkpoint in immediate mode to make it faster). + * + * The backup label and tablespace map contents are appended to *labelfile and + * *tblspcmapfile, and the caller is responsible for including them in the + * backup archive as 'backup_label' and 'tablespace_map'. + * tblspcmapfile is required mainly for tar format in windows as native windows + * utilities are not able to create symlinks while extracting files from tar. + * However for consistency and platform-independence, we do it the same way + * everywhere. + * + * If "tablespaces" isn't NULL, it receives a list of tablespaceinfo structs + * describing the cluster's tablespaces. + * + * Returns the minimum WAL location that must be present to restore from this + * backup, and the corresponding timeline ID in *starttli_p. + * + * Every successfully started backup must be stopped by calling + * do_pg_backup_stop() or do_pg_abort_backup(). There can be many + * backups active at the same time. + * + * It is the responsibility of the caller of this function to verify the + * permissions of the calling user! + */ +XLogRecPtr +do_pg_backup_start(const char *backupidstr, bool fast, TimeLineID *starttli_p, + StringInfo labelfile, List **tablespaces, + StringInfo tblspcmapfile) +{ + bool backup_started_in_recovery = false; + XLogRecPtr checkpointloc; + XLogRecPtr startpoint; + TimeLineID starttli; + pg_time_t stamp_time; + char strfbuf[128]; + char xlogfilename[MAXFNAMELEN]; + XLogSegNo _logSegNo; + + backup_started_in_recovery = RecoveryInProgress(); + + /* + * During recovery, we don't need to check WAL level. 
Because, if WAL + * level is not sufficient, it's impossible to get here during recovery. + */ + if (!backup_started_in_recovery && !XLogIsNeeded()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL level not sufficient for making an online backup"), + errhint("wal_level must be set to \"replica\" or \"logical\" at server start."))); + + if (strlen(backupidstr) > MAXPGPATH) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("backup label too long (max %d bytes)", + MAXPGPATH))); + + /* + * Mark backup active in shared memory. We must do full-page WAL writes + * during an on-line backup even if not doing so at other times, because + * it's quite possible for the backup dump to obtain a "torn" (partially + * written) copy of a database page if it reads the page concurrently with + * our write to the same page. This can be fixed as long as the first + * write to the page in the WAL sequence is a full-page write. Hence, we + * turn on forcePageWrites and then force a CHECKPOINT, to ensure there + * are no dirty pages in shared memory that might get dumped while the + * backup is in progress without having a corresponding WAL record. (Once + * the backup is complete, we need not force full-page writes anymore, + * since we expect that any pages not modified during the backup interval + * must have been correctly captured by the backup.) + * + * Note that forcePageWrites has no effect during an online backup from + * the standby. + * + * We must hold all the insertion locks to change the value of + * forcePageWrites, to ensure adequate interlocking against + * XLogInsertRecord(). + */ + WALInsertLockAcquireExclusive(); + XLogCtl->Insert.runningBackups++; + XLogCtl->Insert.forcePageWrites = true; + WALInsertLockRelease(); + + /* Ensure we release forcePageWrites if fail below */ + PG_ENSURE_ERROR_CLEANUP(pg_backup_start_callback, (Datum) 0); + { + bool gotUniqueStartpoint = false; + DIR *tblspcdir; + struct dirent *de; + tablespaceinfo *ti; + int datadirpathlen; + + /* + * Force an XLOG file switch before the checkpoint, to ensure that the + * WAL segment the checkpoint is written to doesn't contain pages with + * old timeline IDs. That would otherwise happen if you called + * pg_backup_start() right after restoring from a PITR archive: the + * first WAL segment containing the startup checkpoint has pages in + * the beginning with the old timeline ID. That can cause trouble at + * recovery: we won't have a history file covering the old timeline if + * pg_wal directory was not included in the base backup and the WAL + * archive was cleared too before starting the backup. + * + * This also ensures that we have emitted a WAL page header that has + * XLP_BKP_REMOVABLE off before we emit the checkpoint record. + * Therefore, if a WAL archiver (such as pglesslog) is trying to + * compress out removable backup blocks, it won't remove any that + * occur after this point. + * + * During recovery, we skip forcing XLOG file switch, which means that + * the backup taken during recovery is not available for the special + * recovery case described above. + */ + if (!backup_started_in_recovery) + RequestXLogSwitch(false); + + do + { + bool checkpointfpw; + + /* + * Force a CHECKPOINT. Aside from being necessary to prevent torn + * page problems, this guarantees that two successive backup runs + * will have different checkpoint positions and hence different + * history file names, even if nothing happened in between. 
+ * + * During recovery, establish a restartpoint if possible. We use + * the last restartpoint as the backup starting checkpoint. This + * means that two successive backup runs can have same checkpoint + * positions. + * + * Since the fact that we are executing do_pg_backup_start() + * during recovery means that checkpointer is running, we can use + * RequestCheckpoint() to establish a restartpoint. + * + * We use CHECKPOINT_IMMEDIATE only if requested by user (via + * passing fast = true). Otherwise this can take awhile. + */ + RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | + (fast ? CHECKPOINT_IMMEDIATE : 0)); + + /* + * Now we need to fetch the checkpoint record location, and also + * its REDO pointer. The oldest point in WAL that would be needed + * to restore starting from the checkpoint is precisely the REDO + * pointer. + */ + LWLockAcquire(ControlFileLock, LW_SHARED); + checkpointloc = ControlFile->checkPoint; + startpoint = ControlFile->checkPointCopy.redo; + starttli = ControlFile->checkPointCopy.ThisTimeLineID; + checkpointfpw = ControlFile->checkPointCopy.fullPageWrites; + LWLockRelease(ControlFileLock); + + if (backup_started_in_recovery) + { + XLogRecPtr recptr; + + /* + * Check to see if all WAL replayed during online backup + * (i.e., since last restartpoint used as backup starting + * checkpoint) contain full-page writes. + */ + SpinLockAcquire(&XLogCtl->info_lck); + recptr = XLogCtl->lastFpwDisableRecPtr; + SpinLockRelease(&XLogCtl->info_lck); + + if (!checkpointfpw || startpoint <= recptr) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL generated with full_page_writes=off was replayed " + "since last restartpoint"), + errhint("This means that the backup being taken on the standby " + "is corrupt and should not be used. " + "Enable full_page_writes and run CHECKPOINT on the primary, " + "and then try an online backup again."))); + + /* + * During recovery, since we don't use the end-of-backup WAL + * record and don't write the backup history file, the + * starting WAL location doesn't need to be unique. This means + * that two base backups started at the same time might use + * the same checkpoint as starting locations. + */ + gotUniqueStartpoint = true; + } + + /* + * If two base backups are started at the same time (in WAL sender + * processes), we need to make sure that they use different + * checkpoints as starting locations, because we use the starting + * WAL location as a unique identifier for the base backup in the + * end-of-backup WAL record and when we write the backup history + * file. Perhaps it would be better generate a separate unique ID + * for each backup instead of forcing another checkpoint, but + * taking a checkpoint right after another is not that expensive + * either because only few buffers have been dirtied yet. + */ + WALInsertLockAcquireExclusive(); + if (XLogCtl->Insert.lastBackupStart < startpoint) + { + XLogCtl->Insert.lastBackupStart = startpoint; + gotUniqueStartpoint = true; + } + WALInsertLockRelease(); + } while (!gotUniqueStartpoint); + + XLByteToSeg(startpoint, _logSegNo, wal_segment_size); + XLogFileName(xlogfilename, starttli, _logSegNo, wal_segment_size); + + /* + * Construct tablespace_map file. 
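+		 * Each line of the map has the form "<tablespace OID> <symlink target>"
+		 * (newlines, carriage returns and backslashes in the target are
+		 * backslash-escaped), for example "16391 /mnt/ssd/space1".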
+ */ + datadirpathlen = strlen(DataDir); + + /* Collect information about all tablespaces */ + tblspcdir = AllocateDir("pg_tblspc"); + while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL) + { + char fullpath[MAXPGPATH + 10]; + char linkpath[MAXPGPATH]; + char *relpath = NULL; + int rllen; + StringInfoData escapedpath; + char *s; + + /* Skip anything that doesn't look like a tablespace */ + if (strspn(de->d_name, "0123456789") != strlen(de->d_name)) + continue; + + snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name); + + /* + * Skip anything that isn't a symlink/junction. For testing only, + * we sometimes use allow_in_place_tablespaces to create + * directories directly under pg_tblspc, which would fail below. + */ + if (get_dirent_type(fullpath, de, false, ERROR) != PGFILETYPE_LNK) + continue; + +#if defined(HAVE_READLINK) || defined(WIN32) + rllen = readlink(fullpath, linkpath, sizeof(linkpath)); + if (rllen < 0) + { + ereport(WARNING, + (errmsg("could not read symbolic link \"%s\": %m", + fullpath))); + continue; + } + else if (rllen >= sizeof(linkpath)) + { + ereport(WARNING, + (errmsg("symbolic link \"%s\" target is too long", + fullpath))); + continue; + } + linkpath[rllen] = '\0'; + + /* + * Build a backslash-escaped version of the link path to include + * in the tablespace map file. + */ + initStringInfo(&escapedpath); + for (s = linkpath; *s; s++) + { + if (*s == '\n' || *s == '\r' || *s == '\\') + appendStringInfoChar(&escapedpath, '\\'); + appendStringInfoChar(&escapedpath, *s); + } + + /* + * Relpath holds the relative path of the tablespace directory + * when it's located within PGDATA, or NULL if it's located + * elsewhere. + */ + if (rllen > datadirpathlen && + strncmp(linkpath, DataDir, datadirpathlen) == 0 && + IS_DIR_SEP(linkpath[datadirpathlen])) + relpath = linkpath + datadirpathlen + 1; + + ti = palloc(sizeof(tablespaceinfo)); + ti->oid = pstrdup(de->d_name); + ti->path = pstrdup(linkpath); + ti->rpath = relpath ? pstrdup(relpath) : NULL; + ti->size = -1; + + if (tablespaces) + *tablespaces = lappend(*tablespaces, ti); + + appendStringInfo(tblspcmapfile, "%s %s\n", + ti->oid, escapedpath.data); + + pfree(escapedpath.data); +#else + + /* + * If the platform does not have symbolic links, it should not be + * possible to have tablespaces - clearly somebody else created + * them. Warn about it and ignore. + */ + ereport(WARNING, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("tablespaces are not supported on this platform"))); +#endif + } + FreeDir(tblspcdir); + + /* + * Construct backup label file. + */ + + /* Use the log timezone here, not the session timezone */ + stamp_time = (pg_time_t) time(NULL); + pg_strftime(strfbuf, sizeof(strfbuf), + "%Y-%m-%d %H:%M:%S %Z", + pg_localtime(&stamp_time, log_timezone)); + appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n", + LSN_FORMAT_ARGS(startpoint), xlogfilename); + appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n", + LSN_FORMAT_ARGS(checkpointloc)); + appendStringInfo(labelfile, "BACKUP METHOD: streamed\n"); + appendStringInfo(labelfile, "BACKUP FROM: %s\n", + backup_started_in_recovery ? "standby" : "primary"); + appendStringInfo(labelfile, "START TIME: %s\n", strfbuf); + appendStringInfo(labelfile, "LABEL: %s\n", backupidstr); + appendStringInfo(labelfile, "START TIMELINE: %u\n", starttli); + } + PG_END_ENSURE_ERROR_CLEANUP(pg_backup_start_callback, (Datum) 0); + + /* + * Mark that the start phase has correctly finished for the backup. 
+ */ + sessionBackupState = SESSION_BACKUP_RUNNING; + + /* + * We're done. As a convenience, return the starting WAL location. + */ + if (starttli_p) + *starttli_p = starttli; + return startpoint; +} + +/* Error cleanup callback for pg_backup_start */ +static void +pg_backup_start_callback(int code, Datum arg) +{ + /* Update backup counters and forcePageWrites on failure */ + WALInsertLockAcquireExclusive(); + + Assert(XLogCtl->Insert.runningBackups > 0); + XLogCtl->Insert.runningBackups--; + + if (XLogCtl->Insert.runningBackups == 0) + { + XLogCtl->Insert.forcePageWrites = false; + } + WALInsertLockRelease(); +} + +/* + * Utility routine to fetch the session-level status of a backup running. + */ +SessionBackupState +get_backup_status(void) +{ + return sessionBackupState; +} + +/* + * do_pg_backup_stop + * + * Utility function called at the end of an online backup. It cleans up the + * backup state and can optionally wait for WAL segments to be archived. + * + * Returns the last WAL location that must be present to restore from this + * backup, and the corresponding timeline ID in *stoptli_p. + * + * It is the responsibility of the caller of this function to verify the + * permissions of the calling user! + */ +XLogRecPtr +do_pg_backup_stop(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) +{ + bool backup_started_in_recovery = false; + XLogRecPtr startpoint; + XLogRecPtr stoppoint; + TimeLineID stoptli; + pg_time_t stamp_time; + char strfbuf[128]; + char histfilepath[MAXPGPATH]; + char startxlogfilename[MAXFNAMELEN]; + char stopxlogfilename[MAXFNAMELEN]; + char lastxlogfilename[MAXFNAMELEN]; + char histfilename[MAXFNAMELEN]; + char backupfrom[20]; + XLogSegNo _logSegNo; + FILE *fp; + char ch; + int seconds_before_warning; + int waits = 0; + bool reported_waiting = false; + char *remaining; + char *ptr; + uint32 hi, + lo; + + backup_started_in_recovery = RecoveryInProgress(); + + /* + * During recovery, we don't need to check WAL level. Because, if WAL + * level is not sufficient, it's impossible to get here during recovery. + */ + if (!backup_started_in_recovery && !XLogIsNeeded()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL level not sufficient for making an online backup"), + errhint("wal_level must be set to \"replica\" or \"logical\" at server start."))); + + /* + * OK to update backup counters, forcePageWrites, and session-level lock. + * + * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them. + * Otherwise they can be updated inconsistently, and which might cause + * do_pg_abort_backup() to fail. + */ + WALInsertLockAcquireExclusive(); + + /* + * It is expected that each do_pg_backup_start() call is matched by + * exactly one do_pg_backup_stop() call. + */ + Assert(XLogCtl->Insert.runningBackups > 0); + XLogCtl->Insert.runningBackups--; + + if (XLogCtl->Insert.runningBackups == 0) + { + XLogCtl->Insert.forcePageWrites = false; + } + + /* + * Clean up session-level lock. + * + * You might think that WALInsertLockRelease() can be called before + * cleaning up session-level lock because session-level lock doesn't need + * to be protected with WAL insertion lock. But since + * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be + * cleaned up before it. + */ + sessionBackupState = SESSION_BACKUP_NONE; + + WALInsertLockRelease(); + + /* + * Read and parse the START WAL LOCATION line (this code is pretty crude, + * but we are not expecting any variability in the file format). 
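+	 * A label file typically begins with a line such as
+	 *   START WAL LOCATION: 0/2000028 (file 000000010000000000000002)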
+ */ + if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c", + &hi, &lo, startxlogfilename, + &ch) != 4 || ch != '\n') + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); + startpoint = ((uint64) hi) << 32 | lo; + remaining = strchr(labelfile, '\n') + 1; /* %n is not portable enough */ + + /* + * Parse the BACKUP FROM line. If we are taking an online backup from the + * standby, we confirm that the standby has not been promoted during the + * backup. + */ + ptr = strstr(remaining, "BACKUP FROM:"); + if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); + if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("the standby was promoted during online backup"), + errhint("This means that the backup being taken is corrupt " + "and should not be used. " + "Try taking another online backup."))); + + /* + * During recovery, we don't write an end-of-backup record. We assume that + * pg_control was backed up last and its minimum recovery point can be + * available as the backup end location. Since we don't have an + * end-of-backup record, we use the pg_control value to check whether + * we've reached the end of backup when starting recovery from this + * backup. We have no way of checking if pg_control wasn't backed up last + * however. + * + * We don't force a switch to new WAL file but it is still possible to + * wait for all the required files to be archived if waitforarchive is + * true. This is okay if we use the backup to start a standby and fetch + * the missing WAL using streaming replication. But in the case of an + * archive recovery, a user should set waitforarchive to true and wait for + * them to be archived to ensure that all the required files are + * available. + * + * We return the current minimum recovery point as the backup end + * location. Note that it can be greater than the exact backup end + * location if the minimum recovery point is updated after the backup of + * pg_control. This is harmless for current uses. + * + * XXX currently a backup history file is for informational and debug + * purposes only. It's not essential for an online backup. Furthermore, + * even if it's created, it will not be archived during recovery because + * an archiver is not invoked. So it doesn't seem worthwhile to write a + * backup history file during recovery. + */ + if (backup_started_in_recovery) + { + XLogRecPtr recptr; + + /* + * Check to see if all WAL replayed during online backup contain + * full-page writes. + */ + SpinLockAcquire(&XLogCtl->info_lck); + recptr = XLogCtl->lastFpwDisableRecPtr; + SpinLockRelease(&XLogCtl->info_lck); + + if (startpoint <= recptr) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL generated with full_page_writes=off was replayed " + "during online backup"), + errhint("This means that the backup being taken on the standby " + "is corrupt and should not be used. 
" + "Enable full_page_writes and run CHECKPOINT on the primary, " + "and then try an online backup again."))); + + + LWLockAcquire(ControlFileLock, LW_SHARED); + stoppoint = ControlFile->minRecoveryPoint; + stoptli = ControlFile->minRecoveryPointTLI; + LWLockRelease(ControlFileLock); + } + else + { + /* + * Write the backup-end xlog record + */ + XLogBeginInsert(); + XLogRegisterData((char *) (&startpoint), sizeof(startpoint)); + stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END); + + /* + * Given that we're not in recovery, InsertTimeLineID is set and can't + * change, so we can read it without a lock. + */ + stoptli = XLogCtl->InsertTimeLineID; + + /* + * Force a switch to a new xlog segment file, so that the backup is + * valid as soon as archiver moves out the current segment file. + */ + RequestXLogSwitch(false); + + XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size); + XLogFileName(stopxlogfilename, stoptli, _logSegNo, wal_segment_size); + + /* Use the log timezone here, not the session timezone */ + stamp_time = (pg_time_t) time(NULL); + pg_strftime(strfbuf, sizeof(strfbuf), + "%Y-%m-%d %H:%M:%S %Z", + pg_localtime(&stamp_time, log_timezone)); + + /* + * Write the backup history file + */ + XLByteToSeg(startpoint, _logSegNo, wal_segment_size); + BackupHistoryFilePath(histfilepath, stoptli, _logSegNo, + startpoint, wal_segment_size); + fp = AllocateFile(histfilepath, "w"); + if (!fp) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + histfilepath))); + fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n", + LSN_FORMAT_ARGS(startpoint), startxlogfilename); + fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n", + LSN_FORMAT_ARGS(stoppoint), stopxlogfilename); + + /* + * Transfer remaining lines including label and start timeline to + * history file. + */ + fprintf(fp, "%s", remaining); + fprintf(fp, "STOP TIME: %s\n", strfbuf); + fprintf(fp, "STOP TIMELINE: %u\n", stoptli); + if (fflush(fp) || ferror(fp) || FreeFile(fp)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + histfilepath))); + + /* + * Clean out any no-longer-needed history files. As a side effect, + * this will post a .ready file for the newly created history file, + * notifying the archiver that history file may be archived + * immediately. + */ + CleanupBackupHistory(); + } + + /* + * If archiving is enabled, wait for all the required WAL files to be + * archived before returning. If archiving isn't enabled, the required WAL + * needs to be transported via streaming replication (hopefully with + * wal_keep_size set high enough), or some more exotic mechanism like + * polling and copying files from pg_wal with script. We have no knowledge + * of those mechanisms, so it's up to the user to ensure that he gets all + * the required WAL. + * + * We wait until both the last WAL file filled during backup and the + * history file have been archived, and assume that the alphabetic sorting + * property of the WAL files ensures any earlier WAL files are safely + * archived as well. + * + * We wait forever, since archive_command is supposed to work and we + * assume the admin wanted his backup to work completely. If you don't + * wish to wait, then either waitforarchive should be passed in as false, + * or you can set statement_timeout. Also, some notices are issued to + * clue in anyone who might be doing this interactively. 
+ */ + + if (waitforarchive && + ((!backup_started_in_recovery && XLogArchivingActive()) || + (backup_started_in_recovery && XLogArchivingAlways()))) + { + XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size); + XLogFileName(lastxlogfilename, stoptli, _logSegNo, wal_segment_size); + + XLByteToSeg(startpoint, _logSegNo, wal_segment_size); + BackupHistoryFileName(histfilename, stoptli, _logSegNo, + startpoint, wal_segment_size); + + seconds_before_warning = 60; + waits = 0; + + while (XLogArchiveIsBusy(lastxlogfilename) || + XLogArchiveIsBusy(histfilename)) + { + CHECK_FOR_INTERRUPTS(); + + if (!reported_waiting && waits > 5) + { + ereport(NOTICE, + (errmsg("base backup done, waiting for required WAL segments to be archived"))); + reported_waiting = true; + } + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 1000L, + WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE); + ResetLatch(MyLatch); + + if (++waits >= seconds_before_warning) + { + seconds_before_warning *= 2; /* This wraps in >10 years... */ + ereport(WARNING, + (errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)", + waits), + errhint("Check that your archive_command is executing properly. " + "You can safely cancel this backup, " + "but the database backup will not be usable without all the WAL segments."))); + } + } + + ereport(NOTICE, + (errmsg("all required WAL segments have been archived"))); + } + else if (waitforarchive) + ereport(NOTICE, + (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup"))); + + /* + * We're done. As a convenience, return the ending WAL location. + */ + if (stoptli_p) + *stoptli_p = stoptli; + return stoppoint; +} + + +/* + * do_pg_abort_backup: abort a running backup + * + * This does just the most basic steps of do_pg_backup_stop(), by taking the + * system out of backup mode, thus making it a lot more safe to call from + * an error handler. + * + * The caller can pass 'arg' as 'true' or 'false' to control whether a warning + * is emitted. + * + * NB: This gets used as a before_shmem_exit handler, hence the odd-looking + * signature. + */ +void +do_pg_abort_backup(int code, Datum arg) +{ + bool emit_warning = DatumGetBool(arg); + + /* + * Quick exit if session does not have a running backup. + */ + if (sessionBackupState != SESSION_BACKUP_RUNNING) + return; + + WALInsertLockAcquireExclusive(); + Assert(XLogCtl->Insert.runningBackups > 0); + XLogCtl->Insert.runningBackups--; + + if (XLogCtl->Insert.runningBackups == 0) + { + XLogCtl->Insert.forcePageWrites = false; + } + + sessionBackupState = SESSION_BACKUP_NONE; + WALInsertLockRelease(); + + if (emit_warning) + ereport(WARNING, + (errmsg("aborting backup due to backend exiting before pg_backup_stop was called"))); +} + +/* + * Register a handler that will warn about unterminated backups at end of + * session, unless this has already been done. 
+ */ +void +register_persistent_abort_backup_handler(void) +{ + static bool already_done = false; + + if (already_done) + return; + before_shmem_exit(do_pg_abort_backup, DatumGetBool(true)); + already_done = true; +} + +/* + * Get latest WAL insert pointer + */ +XLogRecPtr +GetXLogInsertRecPtr(void) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + uint64 current_bytepos; + + SpinLockAcquire(&Insert->insertpos_lck); + current_bytepos = Insert->CurrBytePos; + SpinLockRelease(&Insert->insertpos_lck); + + return XLogBytePosToRecPtr(current_bytepos); +} + +/* + * Get latest WAL write pointer + */ +XLogRecPtr +GetXLogWriteRecPtr(void) +{ + SpinLockAcquire(&XLogCtl->info_lck); + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + + return LogwrtResult.Write; +} + +/* + * Returns the redo pointer of the last checkpoint or restartpoint. This is + * the oldest point in WAL that we still need, if we have to restart recovery. + */ +void +GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli) +{ + LWLockAcquire(ControlFileLock, LW_SHARED); + *oldrecptr = ControlFile->checkPointCopy.redo; + *oldtli = ControlFile->checkPointCopy.ThisTimeLineID; + LWLockRelease(ControlFileLock); +} + +/* Thin wrapper around ShutdownWalRcv(). */ +void +XLogShutdownWalRcv(void) +{ + ShutdownWalRcv(); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + XLogCtl->InstallXLogFileSegmentActive = false; + LWLockRelease(ControlFileLock); +} + +/* Enable WAL file recycling and preallocation. */ +void +SetInstallXLogFileSegmentActive(void) +{ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + XLogCtl->InstallXLogFileSegmentActive = true; + LWLockRelease(ControlFileLock); +} + +bool +IsInstallXLogFileSegmentActive(void) +{ + bool result; + + LWLockAcquire(ControlFileLock, LW_SHARED); + result = XLogCtl->InstallXLogFileSegmentActive; + LWLockRelease(ControlFileLock); + + return result; +} + +/* + * Update the WalWriterSleeping flag. + */ +void +SetWalWriterSleeping(bool sleeping) +{ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->WalWriterSleeping = sleeping; + SpinLockRelease(&XLogCtl->info_lck); +} diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c new file mode 100644 index 0000000..6516a74 --- /dev/null +++ b/src/backend/access/transam/xlogarchive.c @@ -0,0 +1,762 @@ +/*------------------------------------------------------------------------- + * + * xlogarchive.c + * Functions for archiving WAL files and restoring from the archive. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/xlogarchive.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <sys/stat.h> +#include <sys/wait.h> +#include <signal.h> +#include <unistd.h> + +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogarchive.h" +#include "common/archive.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/startup.h" +#include "postmaster/pgarch.h" +#include "replication/walsender.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" + +/* + * Attempt to retrieve the specified file from off-line archival storage. + * If successful, fill "path" with its complete path (note that this will be + * a temp file name that doesn't follow the normal naming convention), and + * return true. 
+ * + * If not successful, fill "path" with the name of the normal on-line file + * (which may or may not actually exist, but we'll try to use it), and return + * false. + * + * For fixed-size files, the caller may pass the expected size as an + * additional crosscheck on successful recovery. If the file size is not + * known, set expectedSize = 0. + * + * When 'cleanupEnabled' is false, refrain from deleting any old WAL segments + * in the archive. This is used when fetching the initial checkpoint record, + * when we are not yet sure how far back we need the WAL. + */ +bool +RestoreArchivedFile(char *path, const char *xlogfname, + const char *recovername, off_t expectedSize, + bool cleanupEnabled) +{ + char xlogpath[MAXPGPATH]; + char *xlogRestoreCmd; + char lastRestartPointFname[MAXPGPATH]; + int rc; + struct stat stat_buf; + XLogSegNo restartSegNo; + XLogRecPtr restartRedoPtr; + TimeLineID restartTli; + + /* + * Ignore restore_command when not in archive recovery (meaning we are in + * crash recovery). + */ + if (!ArchiveRecoveryRequested) + goto not_available; + + /* In standby mode, restore_command might not be supplied */ + if (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0) + goto not_available; + + /* + * When doing archive recovery, we always prefer an archived log file even + * if a file of the same name exists in XLOGDIR. The reason is that the + * file in XLOGDIR could be an old, un-filled or partly-filled version + * that was copied and restored as part of backing up $PGDATA. + * + * We could try to optimize this slightly by checking the local copy + * lastchange timestamp against the archived copy, but we have no API to + * do this, nor can we guarantee that the lastchange timestamp was + * preserved correctly when we copied to archive. Our aim is robustness, + * so we elect not to do this. + * + * If we cannot obtain the log file from the archive, however, we will try + * to use the XLOGDIR file if it exists. This is so that we can make use + * of log segments that weren't yet transferred to the archive. + * + * Notice that we don't actually overwrite any files when we copy back + * from archive because the restore_command may inadvertently restore + * inappropriate xlogs, or they may be corrupt, so we may wish to fallback + * to the segments remaining in current XLOGDIR later. The + * copy-from-archive filename is always the same, ensuring that we don't + * run out of disk space on long recoveries. + */ + snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername); + + /* + * Make sure there is no existing file named recovername. + */ + if (stat(xlogpath, &stat_buf) != 0) + { + if (errno != ENOENT) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + xlogpath))); + } + else + { + if (unlink(xlogpath) != 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + xlogpath))); + } + + /* + * Calculate the archive file cutoff point for use during log shipping + * replication. All files earlier than this point can be deleted from the + * archive, though there is no requirement to do so. + * + * If cleanup is not enabled, initialise this with the filename of + * InvalidXLogRecPtr, which will prevent the deletion of any WAL files + * from the archive because of the alphabetic sorting property of WAL + * filenames. 
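+	 * (XLogFileName() called with timeline 0 and segment 0 below produces
+	 * "000000000000000000000000", which sorts before any real WAL segment
+	 * file name.)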
+ * + * Once we have successfully located the redo pointer of the checkpoint + * from which we start recovery we never request a file prior to the redo + * pointer of the last restartpoint. When redo begins we know that we have + * successfully located it, so there is no need for additional status + * flags to signify the point when we can begin deleting WAL files from + * the archive. + */ + if (cleanupEnabled) + { + GetOldestRestartPoint(&restartRedoPtr, &restartTli); + XLByteToSeg(restartRedoPtr, restartSegNo, wal_segment_size); + XLogFileName(lastRestartPointFname, restartTli, restartSegNo, + wal_segment_size); + /* we shouldn't need anything earlier than last restart point */ + Assert(strcmp(lastRestartPointFname, xlogfname) <= 0); + } + else + XLogFileName(lastRestartPointFname, 0, 0L, wal_segment_size); + + /* Build the restore command to execute */ + xlogRestoreCmd = BuildRestoreCommand(recoveryRestoreCommand, + xlogpath, xlogfname, + lastRestartPointFname); + if (xlogRestoreCmd == NULL) + elog(ERROR, "could not build restore command \"%s\"", + recoveryRestoreCommand); + + ereport(DEBUG3, + (errmsg_internal("executing restore command \"%s\"", + xlogRestoreCmd))); + + pgstat_report_wait_start(WAIT_EVENT_RESTORE_COMMAND); + + /* + * PreRestoreCommand() informs the SIGTERM handler for the startup process + * that it should proc_exit() right away. This is done for the duration + * of the system() call because there isn't a good way to break out while + * it is executing. Since we might call proc_exit() in a signal handler, + * it is best to put any additional logic before or after the + * PreRestoreCommand()/PostRestoreCommand() section. + */ + PreRestoreCommand(); + + /* + * Copy xlog from archival storage to XLOGDIR + */ + rc = system(xlogRestoreCmd); + + PostRestoreCommand(); + + pgstat_report_wait_end(); + pfree(xlogRestoreCmd); + + if (rc == 0) + { + /* + * command apparently succeeded, but let's make sure the file is + * really there now and has the correct size. + */ + if (stat(xlogpath, &stat_buf) == 0) + { + if (expectedSize > 0 && stat_buf.st_size != expectedSize) + { + int elevel; + + /* + * If we find a partial file in standby mode, we assume it's + * because it's just being copied to the archive, and keep + * trying. + * + * Otherwise treat a wrong-sized file as FATAL to ensure the + * DBA would notice it, but is that too strong? We could try + * to plow ahead with a local copy of the file ... but the + * problem is that there probably isn't one, and we'd + * incorrectly conclude we've reached the end of WAL and we're + * done recovering ... + */ + if (StandbyMode && stat_buf.st_size < expectedSize) + elevel = DEBUG1; + else + elevel = FATAL; + ereport(elevel, + (errmsg("archive file \"%s\" has wrong size: %lld instead of %lld", + xlogfname, + (long long int) stat_buf.st_size, + (long long int) expectedSize))); + return false; + } + else + { + ereport(LOG, + (errmsg("restored log file \"%s\" from archive", + xlogfname))); + strcpy(path, xlogpath); + return true; + } + } + else + { + /* stat failed */ + int elevel = (errno == ENOENT) ? LOG : FATAL; + + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", xlogpath), + errdetail("restore_command returned a zero exit status, but stat() failed."))); + } + } + + /* + * Remember, we rollforward UNTIL the restore fails so failure here is + * just part of the process... 
that makes it difficult to determine + * whether the restore failed because there isn't an archive to restore, + * or because the administrator has specified the restore program + * incorrectly. We have to assume the former. + * + * However, if the failure was due to any sort of signal, it's best to + * punt and abort recovery. (If we "return false" here, upper levels will + * assume that recovery is complete and start up the database!) It's + * essential to abort on child SIGINT and SIGQUIT, because per spec + * system() ignores SIGINT and SIGQUIT while waiting; if we see one of + * those it's a good bet we should have gotten it too. + * + * On SIGTERM, assume we have received a fast shutdown request, and exit + * cleanly. It's pure chance whether we receive the SIGTERM first, or the + * child process. If we receive it first, the signal handler will call + * proc_exit, otherwise we do it here. If we or the child process received + * SIGTERM for any other reason than a fast shutdown request, postmaster + * will perform an immediate shutdown when it sees us exiting + * unexpectedly. + * + * We treat hard shell errors such as "command not found" as fatal, too. + */ + if (wait_result_is_signal(rc, SIGTERM)) + proc_exit(1); + + ereport(wait_result_is_any_signal(rc, true) ? FATAL : DEBUG2, + (errmsg("could not restore file \"%s\" from archive: %s", + xlogfname, wait_result_to_str(rc)))); + +not_available: + + /* + * if an archived file is not available, there might still be a version of + * this file in XLOGDIR, so return that as the filename to open. + * + * In many recovery scenarios we expect this to fail also, but if so that + * just means we've reached the end of WAL. + */ + snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname); + return false; +} + +/* + * Attempt to execute an external shell command during recovery. + * + * 'command' is the shell command to be executed, 'commandName' is a + * human-readable name describing the command emitted in the logs. If + * 'failOnSignal' is true and the command is killed by a signal, a FATAL + * error is thrown. Otherwise a WARNING is emitted. + * + * This is currently used for recovery_end_command and archive_cleanup_command. + */ +void +ExecuteRecoveryCommand(const char *command, const char *commandName, + bool failOnSignal, uint32 wait_event_info) +{ + char xlogRecoveryCmd[MAXPGPATH]; + char lastRestartPointFname[MAXPGPATH]; + char *dp; + char *endp; + const char *sp; + int rc; + XLogSegNo restartSegNo; + XLogRecPtr restartRedoPtr; + TimeLineID restartTli; + + Assert(command && commandName); + + /* + * Calculate the archive file cutoff point for use during log shipping + * replication. All files earlier than this point can be deleted from the + * archive, though there is no requirement to do so. 
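+	 * For example, with archive_cleanup_command set to
+	 * 'pg_archivecleanup /mnt/server/archive %r', the %r below is replaced
+	 * with the file name of that cutoff segment.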
+ */ + GetOldestRestartPoint(&restartRedoPtr, &restartTli); + XLByteToSeg(restartRedoPtr, restartSegNo, wal_segment_size); + XLogFileName(lastRestartPointFname, restartTli, restartSegNo, + wal_segment_size); + + /* + * construct the command to be executed + */ + dp = xlogRecoveryCmd; + endp = xlogRecoveryCmd + MAXPGPATH - 1; + *endp = '\0'; + + for (sp = command; *sp; sp++) + { + if (*sp == '%') + { + switch (sp[1]) + { + case 'r': + /* %r: filename of last restartpoint */ + sp++; + strlcpy(dp, lastRestartPointFname, endp - dp); + dp += strlen(dp); + break; + case '%': + /* convert %% to a single % */ + sp++; + if (dp < endp) + *dp++ = *sp; + break; + default: + /* otherwise treat the % as not special */ + if (dp < endp) + *dp++ = *sp; + break; + } + } + else + { + if (dp < endp) + *dp++ = *sp; + } + } + *dp = '\0'; + + ereport(DEBUG3, + (errmsg_internal("executing %s \"%s\"", commandName, command))); + + /* + * execute the constructed command + */ + pgstat_report_wait_start(wait_event_info); + rc = system(xlogRecoveryCmd); + pgstat_report_wait_end(); + + if (rc != 0) + { + /* + * If the failure was due to any sort of signal, it's best to punt and + * abort recovery. See comments in RestoreArchivedFile(). + */ + ereport((failOnSignal && wait_result_is_any_signal(rc, true)) ? FATAL : WARNING, + /*------ + translator: First %s represents a postgresql.conf parameter name like + "recovery_end_command", the 2nd is the value of that parameter, the + third an already translated error message. */ + (errmsg("%s \"%s\": %s", commandName, + command, wait_result_to_str(rc)))); + } +} + + +/* + * A file was restored from the archive under a temporary filename (path), + * and now we want to keep it. Rename it under the permanent filename in + * pg_wal (xlogfname), replacing any existing file with the same name. + */ +void +KeepFileRestoredFromArchive(const char *path, const char *xlogfname) +{ + char xlogfpath[MAXPGPATH]; + bool reload = false; + struct stat statbuf; + + snprintf(xlogfpath, MAXPGPATH, XLOGDIR "/%s", xlogfname); + + if (stat(xlogfpath, &statbuf) == 0) + { + char oldpath[MAXPGPATH]; + +#ifdef WIN32 + static unsigned int deletedcounter = 1; + + /* + * On Windows, if another process (e.g a walsender process) holds the + * file open in FILE_SHARE_DELETE mode, unlink will succeed, but the + * file will still show up in directory listing until the last handle + * is closed, and we cannot rename the new file in its place until + * that. To avoid that problem, rename the old file to a temporary + * name first. Use a counter to create a unique filename, because the + * same file might be restored from the archive multiple times, and a + * walsender could still be holding onto an old deleted version of it. + */ + snprintf(oldpath, MAXPGPATH, "%s.deleted%u", + xlogfpath, deletedcounter++); + if (rename(xlogfpath, oldpath) != 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + xlogfpath, oldpath))); + } +#else + /* same-size buffers, so this never truncates */ + strlcpy(oldpath, xlogfpath, MAXPGPATH); +#endif + if (unlink(oldpath) != 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + xlogfpath))); + reload = true; + } + + durable_rename(path, xlogfpath, ERROR); + + /* + * Create .done file forcibly to prevent the restored segment from being + * archived again later. 
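The ".ready" and ".done" markers referred to here live under pg_wal/archive_status and simply mirror the segment's own name; the archiver renames ".ready" to ".done" once archive_command has succeeded for that file. A standalone sketch of the path construction, mirroring StatusFilePath(); the segment name is illustrative:

#include <stdio.h>

int
main(void)
{
	const char *xlog = "0000000100000001000000C6";	/* illustrative segment name */
	char		ready[256];
	char		done[256];

	/* XLOGDIR is "pg_wal" in a live cluster; archive_status sits inside it */
	snprintf(ready, sizeof(ready), "pg_wal/archive_status/%s.ready", xlog);
	snprintf(done, sizeof(done), "pg_wal/archive_status/%s.done", xlog);

	printf("%s\n%s\n", ready, done);
	return 0;
}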
+ */ + if (XLogArchiveMode != ARCHIVE_MODE_ALWAYS) + XLogArchiveForceDone(xlogfname); + else + XLogArchiveNotify(xlogfname); + + /* + * If the existing file was replaced, since walsenders might have it open, + * request them to reload a currently-open segment. This is only required + * for WAL segments, walsenders don't hold other files open, but there's + * no harm in doing this too often, and we don't know what kind of a file + * we're dealing with here. + */ + if (reload) + WalSndRqstFileReload(); + + /* + * Signal walsender that new WAL has arrived. Again, this isn't necessary + * if we restored something other than a WAL segment, but it does no harm + * either. + */ + WalSndWakeup(); +} + +/* + * XLogArchiveNotify + * + * Create an archive notification file + * + * The name of the notification file is the message that will be picked up + * by the archiver, e.g. we write 0000000100000001000000C6.ready + * and the archiver then knows to archive XLOGDIR/0000000100000001000000C6, + * then when complete, rename it to 0000000100000001000000C6.done + */ +void +XLogArchiveNotify(const char *xlog) +{ + char archiveStatusPath[MAXPGPATH]; + FILE *fd; + + /* insert an otherwise empty file called <XLOG>.ready */ + StatusFilePath(archiveStatusPath, xlog, ".ready"); + fd = AllocateFile(archiveStatusPath, "w"); + if (fd == NULL) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not create archive status file \"%s\": %m", + archiveStatusPath))); + return; + } + if (FreeFile(fd)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write archive status file \"%s\": %m", + archiveStatusPath))); + return; + } + + /* + * Timeline history files are given the highest archival priority to lower + * the chance that a promoted standby will choose a timeline that is + * already in use. However, the archiver ordinarily tries to gather + * multiple files to archive from each scan of the archive_status + * directory, which means that newly created timeline history files could + * be left unarchived for a while. To ensure that the archiver picks up + * timeline history files as soon as possible, we force the archiver to + * scan the archive_status directory the next time it looks for a file to + * archive. + */ + if (IsTLHistoryFileName(xlog)) + PgArchForceDirScan(); + + /* Notify archiver that it's got something to do */ + if (IsUnderPostmaster) + PgArchWakeup(); +} + +/* + * Convenience routine to notify using segment number representation of filename + */ +void +XLogArchiveNotifySeg(XLogSegNo segno, TimeLineID tli) +{ + char xlog[MAXFNAMELEN]; + + Assert(tli != 0); + + XLogFileName(xlog, tli, segno, wal_segment_size); + XLogArchiveNotify(xlog); +} + +/* + * XLogArchiveForceDone + * + * Emit notification forcibly that an XLOG segment file has been successfully + * archived, by creating <XLOG>.done regardless of whether <XLOG>.ready + * exists or not. 
+ */ +void +XLogArchiveForceDone(const char *xlog) +{ + char archiveReady[MAXPGPATH]; + char archiveDone[MAXPGPATH]; + struct stat stat_buf; + FILE *fd; + + /* Exit if already known done */ + StatusFilePath(archiveDone, xlog, ".done"); + if (stat(archiveDone, &stat_buf) == 0) + return; + + /* If .ready exists, rename it to .done */ + StatusFilePath(archiveReady, xlog, ".ready"); + if (stat(archiveReady, &stat_buf) == 0) + { + (void) durable_rename(archiveReady, archiveDone, WARNING); + return; + } + + /* insert an otherwise empty file called <XLOG>.done */ + fd = AllocateFile(archiveDone, "w"); + if (fd == NULL) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not create archive status file \"%s\": %m", + archiveDone))); + return; + } + if (FreeFile(fd)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write archive status file \"%s\": %m", + archiveDone))); + return; + } +} + +/* + * XLogArchiveCheckDone + * + * This is called when we are ready to delete or recycle an old XLOG segment + * file or backup history file. If it is okay to delete it then return true. + * If it is not time to delete it, make sure a .ready file exists, and return + * false. + * + * If <XLOG>.done exists, then return true; else if <XLOG>.ready exists, + * then return false; else create <XLOG>.ready and return false. + * + * The reason we do things this way is so that if the original attempt to + * create <XLOG>.ready fails, we'll retry during subsequent checkpoints. + */ +bool +XLogArchiveCheckDone(const char *xlog) +{ + char archiveStatusPath[MAXPGPATH]; + struct stat stat_buf; + + /* The file is always deletable if archive_mode is "off". */ + if (!XLogArchivingActive()) + return true; + + /* + * During archive recovery, the file is deletable if archive_mode is not + * "always". + */ + if (!XLogArchivingAlways() && + GetRecoveryState() == RECOVERY_STATE_ARCHIVE) + return true; + + /* + * At this point of the logic, note that we are either a primary with + * archive_mode set to "on" or "always", or a standby with archive_mode + * set to "always". + */ + + /* First check for .done --- this means archiver is done with it */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + /* check for .ready --- this means archiver is still busy with it */ + StatusFilePath(archiveStatusPath, xlog, ".ready"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return false; + + /* Race condition --- maybe archiver just finished, so recheck */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + /* Retry creation of the .ready file */ + XLogArchiveNotify(xlog); + return false; +} + +/* + * XLogArchiveIsBusy + * + * Check to see if an XLOG segment file is still unarchived. + * This is almost but not quite the inverse of XLogArchiveCheckDone: in + * the first place we aren't chartered to recreate the .ready file, and + * in the second place we should consider that if the file is already gone + * then it's not busy. (This check is needed to handle the race condition + * that a checkpoint already deleted the no-longer-needed file.) 
+ */ +bool +XLogArchiveIsBusy(const char *xlog) +{ + char archiveStatusPath[MAXPGPATH]; + struct stat stat_buf; + + /* First check for .done --- this means archiver is done with it */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return false; + + /* check for .ready --- this means archiver is still busy with it */ + StatusFilePath(archiveStatusPath, xlog, ".ready"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + /* Race condition --- maybe archiver just finished, so recheck */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return false; + + /* + * Check to see if the WAL file has been removed by checkpoint, which + * implies it has already been archived, and explains why we can't see a + * status file for it. + */ + snprintf(archiveStatusPath, MAXPGPATH, XLOGDIR "/%s", xlog); + if (stat(archiveStatusPath, &stat_buf) != 0 && + errno == ENOENT) + return false; + + return true; +} + +/* + * XLogArchiveIsReadyOrDone + * + * Check to see if an XLOG segment file has a .ready or .done file. + * This is similar to XLogArchiveIsBusy(), but returns true if the file + * is already archived or is about to be archived. + * + * This is currently only used at recovery. During normal operation this + * would be racy: the file might get removed or marked with .ready as we're + * checking it, or immediately after we return. + */ +bool +XLogArchiveIsReadyOrDone(const char *xlog) +{ + char archiveStatusPath[MAXPGPATH]; + struct stat stat_buf; + + /* First check for .done --- this means archiver is done with it */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + /* check for .ready --- this means archiver is still busy with it */ + StatusFilePath(archiveStatusPath, xlog, ".ready"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + /* Race condition --- maybe archiver just finished, so recheck */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + return false; +} + +/* + * XLogArchiveIsReady + * + * Check to see if an XLOG segment file has an archive notification (.ready) + * file. + */ +bool +XLogArchiveIsReady(const char *xlog) +{ + char archiveStatusPath[MAXPGPATH]; + struct stat stat_buf; + + StatusFilePath(archiveStatusPath, xlog, ".ready"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + return false; +} + +/* + * XLogArchiveCleanup + * + * Cleanup archive notification file(s) for a particular xlog segment + */ +void +XLogArchiveCleanup(const char *xlog) +{ + char archiveStatusPath[MAXPGPATH]; + + /* Remove the .done file */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + unlink(archiveStatusPath); + /* should we complain about failure? */ + + /* Remove the .ready file if present --- normally it shouldn't be */ + StatusFilePath(archiveStatusPath, xlog, ".ready"); + unlink(archiveStatusPath); + /* should we complain about failure? */ +} diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c new file mode 100644 index 0000000..02bd919 --- /dev/null +++ b/src/backend/access/transam/xlogfuncs.c @@ -0,0 +1,648 @@ +/*------------------------------------------------------------------------- + * + * xlogfuncs.c + * + * PostgreSQL write-ahead log manager user interface functions + * + * This file contains WAL control and information functions. 
+ * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/xlogfuncs.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <unistd.h> + +#include "access/htup_details.h" +#include "access/xlog_internal.h" +#include "access/xlogrecovery.h" +#include "access/xlogutils.h" +#include "catalog/pg_type.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "replication/walreceiver.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/smgr.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/numeric.h" +#include "utils/pg_lsn.h" +#include "utils/timestamp.h" +#include "utils/tuplestore.h" + +/* + * Store label file and tablespace map during backups. + */ +static StringInfo label_file; +static StringInfo tblspc_map_file; + +/* + * pg_backup_start: set up for taking an on-line backup dump + * + * Essentially what this does is to create a backup label file in $PGDATA, + * where it will be archived as part of the backup dump. The label file + * contains the user-supplied label string (typically this would be used + * to tell where the backup dump will be stored) and the starting time and + * starting WAL location for the dump. + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +Datum +pg_backup_start(PG_FUNCTION_ARGS) +{ + text *backupid = PG_GETARG_TEXT_PP(0); + bool fast = PG_GETARG_BOOL(1); + char *backupidstr; + XLogRecPtr startpoint; + SessionBackupState status = get_backup_status(); + MemoryContext oldcontext; + + backupidstr = text_to_cstring(backupid); + + if (status == SESSION_BACKUP_RUNNING) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("a backup is already in progress in this session"))); + + /* + * Label file and tablespace map file need to be long-lived, since they + * are read in pg_backup_stop. + */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + label_file = makeStringInfo(); + tblspc_map_file = makeStringInfo(); + MemoryContextSwitchTo(oldcontext); + + register_persistent_abort_backup_handler(); + + startpoint = do_pg_backup_start(backupidstr, fast, NULL, label_file, + NULL, tblspc_map_file); + + PG_RETURN_LSN(startpoint); +} + + +/* + * pg_backup_stop: finish taking an on-line backup. + * + * The first parameter (variable 'waitforarchive'), which is optional, + * allows the user to choose if they want to wait for the WAL to be archived + * or if we should just return as soon as the WAL record is written. + * + * Permission checking for this function is managed through the normal + * GRANT system. 
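All of the SQL-callable routines in this file, pg_backup_start() above included, follow the fmgr "version 1" calling convention: arguments arrive through PG_GETARG_* macros and results leave through PG_RETURN_*. The in-core functions are wired up through pg_proc, so they do not need PG_FUNCTION_INFO_V1 or PG_MODULE_MAGIC; a hypothetical loadable-module function written in the same style, invented purely for illustration, would look like this:

#include "postgres.h"

#include "fmgr.h"
#include "utils/pg_lsn.h"

PG_MODULE_MAGIC;

PG_FUNCTION_INFO_V1(demo_lsn_add);

/* demo_lsn_add(pg_lsn, bigint) returns pg_lsn: advance an LSN by a byte count */
Datum
demo_lsn_add(PG_FUNCTION_ARGS)
{
	XLogRecPtr	lsn = PG_GETARG_LSN(0);
	int64		delta = PG_GETARG_INT64(1);

	if (delta < 0)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("delta must not be negative")));

	PG_RETURN_LSN(lsn + delta);
}

It would be exposed with something like CREATE FUNCTION demo_lsn_add(pg_lsn, bigint) RETURNS pg_lsn AS 'MODULE_PATHNAME' LANGUAGE C STRICT; marking it STRICT matters because the sketch never checks PG_ARGISNULL(). The "normal GRANT system" note above means there is no hard-coded superuser test in these functions; access is controlled with GRANT/REVOKE EXECUTE on the function itself.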
+ */ +Datum +pg_backup_stop(PG_FUNCTION_ARGS) +{ +#define PG_STOP_BACKUP_V2_COLS 3 + TupleDesc tupdesc; + Datum values[PG_STOP_BACKUP_V2_COLS]; + bool nulls[PG_STOP_BACKUP_V2_COLS]; + + bool waitforarchive = PG_GETARG_BOOL(0); + XLogRecPtr stoppoint; + SessionBackupState status = get_backup_status(); + + /* Initialize attributes information in the tuple descriptor */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + if (status != SESSION_BACKUP_RUNNING) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("backup is not in progress"), + errhint("Did you call pg_backup_start()?"))); + + /* + * Stop the backup. Return a copy of the backup label and tablespace map + * so they can be written to disk by the caller. + */ + stoppoint = do_pg_backup_stop(label_file->data, waitforarchive, NULL); + + values[0] = LSNGetDatum(stoppoint); + values[1] = CStringGetTextDatum(label_file->data); + values[2] = CStringGetTextDatum(tblspc_map_file->data); + + /* Free structures allocated in TopMemoryContext */ + pfree(label_file->data); + pfree(label_file); + label_file = NULL; + pfree(tblspc_map_file->data); + pfree(tblspc_map_file); + tblspc_map_file = NULL; + + /* Returns the record as Datum */ + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} + +/* + * pg_switch_wal: switch to next xlog file + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +Datum +pg_switch_wal(PG_FUNCTION_ARGS) +{ + XLogRecPtr switchpoint; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + + switchpoint = RequestXLogSwitch(false); + + /* + * As a convenience, return the WAL location of the switch record + */ + PG_RETURN_LSN(switchpoint); +} + +/* + * pg_create_restore_point: a named point for restore + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +Datum +pg_create_restore_point(PG_FUNCTION_ARGS) +{ + text *restore_name = PG_GETARG_TEXT_PP(0); + char *restore_name_str; + XLogRecPtr restorepoint; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + + if (!XLogIsNeeded()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL level not sufficient for creating a restore point"), + errhint("wal_level must be set to \"replica\" or \"logical\" at server start."))); + + restore_name_str = text_to_cstring(restore_name); + + if (strlen(restore_name_str) >= MAXFNAMELEN) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("value too long for restore point (maximum %d characters)", MAXFNAMELEN - 1))); + + restorepoint = XLogRestorePoint(restore_name_str); + + /* + * As a convenience, return the WAL location of the restore point record + */ + PG_RETURN_LSN(restorepoint); +} + +/* + * Report the current WAL write location (same format as pg_backup_start etc) + * + * This is useful for determining how much of WAL is visible to an external + * archiving process. Note that the data before this point is written out + * to the kernel, but is not necessarily synced to disk. 
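The distinction matters when comparing the different positions reported by the functions below: the insert location (reserved in WAL buffers) is always at or ahead of the write location (handed to the kernel), which is in turn at or ahead of the flush location (known durable). A hedged, backend-context sketch outside of recovery, using the same accessor functions the functions below call; the helper itself is hypothetical:

#include "postgres.h"

#include "access/xlog.h"

/* log the three WAL positions; read in this order, the asserted ordering holds */
static void
demo_show_wal_positions(void)
{
	XLogRecPtr	flush = GetFlushRecPtr(NULL);	/* durably synced to disk */
	XLogRecPtr	write = GetXLogWriteRecPtr();	/* written out to the kernel */
	XLogRecPtr	insert = GetXLogInsertRecPtr(); /* reserved in WAL buffers */

	Assert(flush <= write && write <= insert);

	elog(LOG, "flush %X/%X, write %X/%X, insert %X/%X",
		 LSN_FORMAT_ARGS(flush),
		 LSN_FORMAT_ARGS(write),
		 LSN_FORMAT_ARGS(insert));
}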
+ */ +Datum +pg_current_wal_lsn(PG_FUNCTION_ARGS) +{ + XLogRecPtr current_recptr; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + + current_recptr = GetXLogWriteRecPtr(); + + PG_RETURN_LSN(current_recptr); +} + +/* + * Report the current WAL insert location (same format as pg_backup_start etc) + * + * This function is mostly for debugging purposes. + */ +Datum +pg_current_wal_insert_lsn(PG_FUNCTION_ARGS) +{ + XLogRecPtr current_recptr; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + + current_recptr = GetXLogInsertRecPtr(); + + PG_RETURN_LSN(current_recptr); +} + +/* + * Report the current WAL flush location (same format as pg_backup_start etc) + * + * This function is mostly for debugging purposes. + */ +Datum +pg_current_wal_flush_lsn(PG_FUNCTION_ARGS) +{ + XLogRecPtr current_recptr; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + + current_recptr = GetFlushRecPtr(NULL); + + PG_RETURN_LSN(current_recptr); +} + +/* + * Report the last WAL receive location (same format as pg_backup_start etc) + * + * This is useful for determining how much of WAL is guaranteed to be received + * and synced to disk by walreceiver. + */ +Datum +pg_last_wal_receive_lsn(PG_FUNCTION_ARGS) +{ + XLogRecPtr recptr; + + recptr = GetWalRcvFlushRecPtr(NULL, NULL); + + if (recptr == 0) + PG_RETURN_NULL(); + + PG_RETURN_LSN(recptr); +} + +/* + * Report the last WAL replay location (same format as pg_backup_start etc) + * + * This is useful for determining how much of WAL is visible to read-only + * connections during recovery. + */ +Datum +pg_last_wal_replay_lsn(PG_FUNCTION_ARGS) +{ + XLogRecPtr recptr; + + recptr = GetXLogReplayRecPtr(NULL); + + if (recptr == 0) + PG_RETURN_NULL(); + + PG_RETURN_LSN(recptr); +} + +/* + * Compute an xlog file name and decimal byte offset given a WAL location, + * such as is returned by pg_backup_stop() or pg_switch_wal(). + * + * Note that a location exactly at a segment boundary is taken to be in + * the previous segment. This is usually the right thing, since the + * expected usage is to determine which xlog file(s) are ready to archive. + */ +Datum +pg_walfile_name_offset(PG_FUNCTION_ARGS) +{ + XLogSegNo xlogsegno; + uint32 xrecoff; + XLogRecPtr locationpoint = PG_GETARG_LSN(0); + char xlogfilename[MAXFNAMELEN]; + Datum values[2]; + bool isnull[2]; + TupleDesc resultTupleDesc; + HeapTuple resultHeapTuple; + Datum result; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("%s cannot be executed during recovery.", + "pg_walfile_name_offset()"))); + + /* + * Construct a tuple descriptor for the result row. This must match this + * function's pg_proc entry! 
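The "previous segment" rule described in the opening comment is visible in the arithmetic: XLByteToPrevSeg() divides (LSN - 1) rather than the LSN itself. A standalone sketch, assuming the default 16MB segment size and an illustrative boundary LSN:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint64_t	segsz = 16 * 1024 * 1024;	/* default wal_segment_size */
	uint64_t		boundary = 2 * segsz;		/* LSN 0/2000000, first byte of a segment */

	/* XLByteToSeg(): the segment that contains the LSN */
	printf("XLByteToSeg     -> segno %llu\n",
		   (unsigned long long) (boundary / segsz));

	/* XLByteToPrevSeg(): what pg_walfile_name_offset()/pg_walfile_name() use */
	printf("XLByteToPrevSeg -> segno %llu\n",
		   (unsigned long long) ((boundary - 1) / segsz));

	return 0;
}

A boundary LSN therefore names the file that has just been filled, which is the one that is ready to archive.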
+ */ + resultTupleDesc = CreateTemplateTupleDesc(2); + TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name", + TEXTOID, -1, 0); + TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset", + INT4OID, -1, 0); + + resultTupleDesc = BlessTupleDesc(resultTupleDesc); + + /* + * xlogfilename + */ + XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size); + XLogFileName(xlogfilename, GetWALInsertionTimeLine(), xlogsegno, + wal_segment_size); + + values[0] = CStringGetTextDatum(xlogfilename); + isnull[0] = false; + + /* + * offset + */ + xrecoff = XLogSegmentOffset(locationpoint, wal_segment_size); + + values[1] = UInt32GetDatum(xrecoff); + isnull[1] = false; + + /* + * Tuple jam: Having first prepared your Datums, then squash together + */ + resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull); + + result = HeapTupleGetDatum(resultHeapTuple); + + PG_RETURN_DATUM(result); +} + +/* + * Compute an xlog file name given a WAL location, + * such as is returned by pg_backup_stop() or pg_switch_wal(). + */ +Datum +pg_walfile_name(PG_FUNCTION_ARGS) +{ + XLogSegNo xlogsegno; + XLogRecPtr locationpoint = PG_GETARG_LSN(0); + char xlogfilename[MAXFNAMELEN]; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("%s cannot be executed during recovery.", + "pg_walfile_name()"))); + + XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size); + XLogFileName(xlogfilename, GetWALInsertionTimeLine(), xlogsegno, + wal_segment_size); + + PG_RETURN_TEXT_P(cstring_to_text(xlogfilename)); +} + +/* + * pg_wal_replay_pause - Request to pause recovery + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +Datum +pg_wal_replay_pause(PG_FUNCTION_ARGS) +{ + if (!RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errhint("Recovery control functions can only be executed during recovery."))); + + if (PromoteIsTriggered()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("standby promotion is ongoing"), + errhint("%s cannot be executed after promotion is triggered.", + "pg_wal_replay_pause()"))); + + SetRecoveryPause(true); + + /* wake up the recovery process so that it can process the pause request */ + WakeupRecovery(); + + PG_RETURN_VOID(); +} + +/* + * pg_wal_replay_resume - resume recovery now + * + * Permission checking for this function is managed through the normal + * GRANT system. 
+ */ +Datum +pg_wal_replay_resume(PG_FUNCTION_ARGS) +{ + if (!RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errhint("Recovery control functions can only be executed during recovery."))); + + if (PromoteIsTriggered()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("standby promotion is ongoing"), + errhint("%s cannot be executed after promotion is triggered.", + "pg_wal_replay_resume()"))); + + SetRecoveryPause(false); + + PG_RETURN_VOID(); +} + +/* + * pg_is_wal_replay_paused + */ +Datum +pg_is_wal_replay_paused(PG_FUNCTION_ARGS) +{ + if (!RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errhint("Recovery control functions can only be executed during recovery."))); + + PG_RETURN_BOOL(GetRecoveryPauseState() != RECOVERY_NOT_PAUSED); +} + +/* + * pg_get_wal_replay_pause_state - Returns the recovery pause state. + * + * Returned values: + * + * 'not paused' - if pause is not requested + * 'pause requested' - if pause is requested but recovery is not yet paused + * 'paused' - if recovery is paused + */ +Datum +pg_get_wal_replay_pause_state(PG_FUNCTION_ARGS) +{ + char *statestr = NULL; + + if (!RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errhint("Recovery control functions can only be executed during recovery."))); + + /* get the recovery pause state */ + switch (GetRecoveryPauseState()) + { + case RECOVERY_NOT_PAUSED: + statestr = "not paused"; + break; + case RECOVERY_PAUSE_REQUESTED: + statestr = "pause requested"; + break; + case RECOVERY_PAUSED: + statestr = "paused"; + break; + } + + Assert(statestr != NULL); + PG_RETURN_TEXT_P(cstring_to_text(statestr)); +} + +/* + * Returns timestamp of latest processed commit/abort record. + * + * When the server has been started normally without recovery the function + * returns NULL. + */ +Datum +pg_last_xact_replay_timestamp(PG_FUNCTION_ARGS) +{ + TimestampTz xtime; + + xtime = GetLatestXTime(); + if (xtime == 0) + PG_RETURN_NULL(); + + PG_RETURN_TIMESTAMPTZ(xtime); +} + +/* + * Returns bool with current recovery mode, a global state. + */ +Datum +pg_is_in_recovery(PG_FUNCTION_ARGS) +{ + PG_RETURN_BOOL(RecoveryInProgress()); +} + +/* + * Compute the difference in bytes between two WAL locations. + */ +Datum +pg_wal_lsn_diff(PG_FUNCTION_ARGS) +{ + Datum result; + + result = DirectFunctionCall2(pg_lsn_mi, + PG_GETARG_DATUM(0), + PG_GETARG_DATUM(1)); + + PG_RETURN_NUMERIC(result); +} + +/* + * Promotes a standby server. + * + * A result of "true" means that promotion has been completed if "wait" is + * "true", or initiated if "wait" is false. 
+ */ +Datum +pg_promote(PG_FUNCTION_ARGS) +{ + bool wait = PG_GETARG_BOOL(0); + int wait_seconds = PG_GETARG_INT32(1); + FILE *promote_file; + int i; + + if (!RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errhint("Recovery control functions can only be executed during recovery."))); + + if (wait_seconds <= 0) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("\"wait_seconds\" must not be negative or zero"))); + + /* create the promote signal file */ + promote_file = AllocateFile(PROMOTE_SIGNAL_FILE, "w"); + if (!promote_file) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + PROMOTE_SIGNAL_FILE))); + + if (FreeFile(promote_file)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + PROMOTE_SIGNAL_FILE))); + + /* signal the postmaster */ + if (kill(PostmasterPid, SIGUSR1) != 0) + { + ereport(WARNING, + (errmsg("failed to send signal to postmaster: %m"))); + (void) unlink(PROMOTE_SIGNAL_FILE); + PG_RETURN_BOOL(false); + } + + /* return immediately if waiting was not requested */ + if (!wait) + PG_RETURN_BOOL(true); + + /* wait for the amount of time wanted until promotion */ +#define WAITS_PER_SECOND 10 + for (i = 0; i < WAITS_PER_SECOND * wait_seconds; i++) + { + int rc; + + ResetLatch(MyLatch); + + if (!RecoveryInProgress()) + PG_RETURN_BOOL(true); + + CHECK_FOR_INTERRUPTS(); + + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + 1000L / WAITS_PER_SECOND, + WAIT_EVENT_PROMOTE); + + /* + * Emergency bailout if postmaster has died. This is to avoid the + * necessity for manual cleanup of all postmaster children. + */ + if (rc & WL_POSTMASTER_DEATH) + PG_RETURN_BOOL(false); + } + + ereport(WARNING, + (errmsg_plural("server did not promote within %d second", + "server did not promote within %d seconds", + wait_seconds, + wait_seconds))); + PG_RETURN_BOOL(false); +} diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c new file mode 100644 index 0000000..35cc055 --- /dev/null +++ b/src/backend/access/transam/xloginsert.c @@ -0,0 +1,1318 @@ +/*------------------------------------------------------------------------- + * + * xloginsert.c + * Functions for constructing WAL records + * + * Constructing a WAL record begins with a call to XLogBeginInsert, + * followed by a number of XLogRegister* calls. The registered data is + * collected in private working memory, and finally assembled into a chain + * of XLogRecData structs by a call to XLogRecordAssemble(). See + * access/transam/README for details. 
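Putting that sequence in one place, here is a hedged sketch of how a caller typically builds a record for a single modified buffer. RM_DEMO_ID, XLOG_DEMO_UPDATE and xl_demo_update are hypothetical names invented for the example; the XLog* calls are the API described above, and real callers do this inside a critical section, after modifying the page and calling MarkBufferDirty().

#include "postgres.h"

#include "access/xloginsert.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"

/* hypothetical "main data" payload understood by the matching redo routine */
typedef struct xl_demo_update
{
	uint16		offset;
} xl_demo_update;

static XLogRecPtr
demo_log_update(Buffer buf, uint16 offset)
{
	xl_demo_update xlrec;
	XLogRecPtr	recptr;

	xlrec.offset = offset;

	XLogBeginInsert();

	/* main data, available at redo via XLogRecGetData() */
	XLogRegisterData((char *) &xlrec, sizeof(xlrec));

	/* block reference 0: the page we just modified, standard layout */
	XLogRegisterBuffer(0, buf, REGBUF_STANDARD);

	/* RM_DEMO_ID / XLOG_DEMO_UPDATE are hypothetical rmgr identifiers */
	recptr = XLogInsert(RM_DEMO_ID, XLOG_DEMO_UPDATE);

	/* stamp the page so it cannot be written out before this WAL record */
	PageSetLSN(BufferGetPage(buf), recptr);

	return recptr;
}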
+ * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/xloginsert.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#ifdef USE_LZ4 +#include <lz4.h> +#endif + +#ifdef USE_ZSTD +#include <zstd.h> +#endif + +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xloginsert.h" +#include "catalog/pg_control.h" +#include "common/pg_lzcompress.h" +#include "executor/instrument.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "replication/origin.h" +#include "storage/bufmgr.h" +#include "storage/proc.h" +#include "utils/memutils.h" + +/* + * Guess the maximum buffer size required to store a compressed version of + * backup block image. + */ +#ifdef USE_LZ4 +#define LZ4_MAX_BLCKSZ LZ4_COMPRESSBOUND(BLCKSZ) +#else +#define LZ4_MAX_BLCKSZ 0 +#endif + +#ifdef USE_ZSTD +#define ZSTD_MAX_BLCKSZ ZSTD_COMPRESSBOUND(BLCKSZ) +#else +#define ZSTD_MAX_BLCKSZ 0 +#endif + +#define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(BLCKSZ) + +/* Buffer size required to store a compressed version of backup block image */ +#define COMPRESS_BUFSIZE Max(Max(PGLZ_MAX_BLCKSZ, LZ4_MAX_BLCKSZ), ZSTD_MAX_BLCKSZ) + +/* + * For each block reference registered with XLogRegisterBuffer, we fill in + * a registered_buffer struct. + */ +typedef struct +{ + bool in_use; /* is this slot in use? */ + uint8 flags; /* REGBUF_* flags */ + RelFileNode rnode; /* identifies the relation and block */ + ForkNumber forkno; + BlockNumber block; + Page page; /* page content */ + uint32 rdata_len; /* total length of data in rdata chain */ + XLogRecData *rdata_head; /* head of the chain of data registered with + * this block */ + XLogRecData *rdata_tail; /* last entry in the chain, or &rdata_head if + * empty */ + + XLogRecData bkp_rdatas[2]; /* temporary rdatas used to hold references to + * backup block data in XLogRecordAssemble() */ + + /* buffer to store a compressed version of backup block image */ + char compressed_page[COMPRESS_BUFSIZE]; +} registered_buffer; + +static registered_buffer *registered_buffers; +static int max_registered_buffers; /* allocated size */ +static int max_registered_block_id = 0; /* highest block_id + 1 currently + * registered */ + +/* + * A chain of XLogRecDatas to hold the "main data" of a WAL record, registered + * with XLogRegisterData(...). + */ +static XLogRecData *mainrdata_head; +static XLogRecData *mainrdata_last = (XLogRecData *) &mainrdata_head; +static uint32 mainrdata_len; /* total # of bytes in chain */ + +/* flags for the in-progress insertion */ +static uint8 curinsert_flags = 0; + +/* + * These are used to hold the record header while constructing a record. + * 'hdr_scratch' is not a plain variable, but is palloc'd at initialization, + * because we want it to be MAXALIGNed and padding bytes zeroed. + * + * For simplicity, it's allocated large enough to hold the headers for any + * WAL record. + */ +static XLogRecData hdr_rdt; +static char *hdr_scratch = NULL; + +#define SizeOfXlogOrigin (sizeof(RepOriginId) + sizeof(char)) +#define SizeOfXLogTransactionId (sizeof(TransactionId) + sizeof(char)) + +#define HEADER_SCRATCH_SIZE \ + (SizeOfXLogRecord + \ + MaxSizeOfXLogRecordBlockHeader * (XLR_MAX_BLOCK_ID + 1) + \ + SizeOfXLogRecordDataHeaderLong + SizeOfXlogOrigin + \ + SizeOfXLogTransactionId) + +/* + * An array of XLogRecData structs, to hold registered data. 
+ */ +static XLogRecData *rdatas; +static int num_rdatas; /* entries currently used */ +static int max_rdatas; /* allocated size */ + +static bool begininsert_called = false; + +/* Memory context to hold the registered buffer and data references. */ +static MemoryContext xloginsert_cxt; + +static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, + XLogRecPtr RedoRecPtr, bool doPageWrites, + XLogRecPtr *fpw_lsn, int *num_fpi, + bool *topxid_included); +static bool XLogCompressBackupBlock(char *page, uint16 hole_offset, + uint16 hole_length, char *dest, uint16 *dlen); + +/* + * Begin constructing a WAL record. This must be called before the + * XLogRegister* functions and XLogInsert(). + */ +void +XLogBeginInsert(void) +{ + Assert(max_registered_block_id == 0); + Assert(mainrdata_last == (XLogRecData *) &mainrdata_head); + Assert(mainrdata_len == 0); + + /* cross-check on whether we should be here or not */ + if (!XLogInsertAllowed()) + elog(ERROR, "cannot make new WAL entries during recovery"); + + if (begininsert_called) + elog(ERROR, "XLogBeginInsert was already called"); + + begininsert_called = true; +} + +/* + * Ensure that there are enough buffer and data slots in the working area, + * for subsequent XLogRegisterBuffer, XLogRegisterData and XLogRegisterBufData + * calls. + * + * There is always space for a small number of buffers and data chunks, enough + * for most record types. This function is for the exceptional cases that need + * more. + */ +void +XLogEnsureRecordSpace(int max_block_id, int ndatas) +{ + int nbuffers; + + /* + * This must be called before entering a critical section, because + * allocating memory inside a critical section can fail. repalloc() will + * check the same, but better to check it here too so that we fail + * consistently even if the arrays happen to be large enough already. + */ + Assert(CritSectionCount == 0); + + /* the minimum values can't be decreased */ + if (max_block_id < XLR_NORMAL_MAX_BLOCK_ID) + max_block_id = XLR_NORMAL_MAX_BLOCK_ID; + if (ndatas < XLR_NORMAL_RDATAS) + ndatas = XLR_NORMAL_RDATAS; + + if (max_block_id > XLR_MAX_BLOCK_ID) + elog(ERROR, "maximum number of WAL record block references exceeded"); + nbuffers = max_block_id + 1; + + if (nbuffers > max_registered_buffers) + { + registered_buffers = (registered_buffer *) + repalloc(registered_buffers, sizeof(registered_buffer) * nbuffers); + + /* + * At least the padding bytes in the structs must be zeroed, because + * they are included in WAL data, but initialize it all for tidiness. + */ + MemSet(®istered_buffers[max_registered_buffers], 0, + (nbuffers - max_registered_buffers) * sizeof(registered_buffer)); + max_registered_buffers = nbuffers; + } + + if (ndatas > max_rdatas) + { + rdatas = (XLogRecData *) repalloc(rdatas, sizeof(XLogRecData) * ndatas); + max_rdatas = ndatas; + } +} + +/* + * Reset WAL record construction buffers. + */ +void +XLogResetInsertion(void) +{ + int i; + + for (i = 0; i < max_registered_block_id; i++) + registered_buffers[i].in_use = false; + + num_rdatas = 0; + max_registered_block_id = 0; + mainrdata_len = 0; + mainrdata_last = (XLogRecData *) &mainrdata_head; + curinsert_flags = 0; + begininsert_called = false; +} + +/* + * Register a reference to a buffer with the WAL record being constructed. + * This must be called for every page that the WAL-logged operation modifies. 
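The flags passed here steer the full-page-image decision made later in XLogRecordAssemble(). A hedged, backend-context sketch of typical choices, meant to run between XLogBeginInsert() and XLogInsert(); the buffer variables are hypothetical:

#include "postgres.h"

#include "access/xloginsert.h"
#include "storage/bufmgr.h"

static void
demo_register_buffers(Buffer rebuilt_buf, Buffer updated_buf, Buffer copied_buf)
{
	/* page is re-initialized from scratch at redo, so no image is needed */
	XLogRegisterBuffer(0, rebuilt_buf, REGBUF_WILL_INIT | REGBUF_STANDARD);

	/* ordinary incremental change to a standard-layout page */
	XLogRegisterBuffer(1, updated_buf, REGBUF_STANDARD);

	/* always log a full-page image, regardless of the LSN check */
	XLogRegisterBuffer(2, copied_buf, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
}

REGBUF_KEEP_DATA can be added when the registered per-block data must survive at redo even if a full-page image makes it redundant, and REGBUF_NO_IMAGE suppresses the image entirely.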
+ */ +void +XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags) +{ + registered_buffer *regbuf; + + /* NO_IMAGE doesn't make sense with FORCE_IMAGE */ + Assert(!((flags & REGBUF_FORCE_IMAGE) && (flags & (REGBUF_NO_IMAGE)))); + Assert(begininsert_called); + + if (block_id >= max_registered_block_id) + { + if (block_id >= max_registered_buffers) + elog(ERROR, "too many registered buffers"); + max_registered_block_id = block_id + 1; + } + + regbuf = ®istered_buffers[block_id]; + + BufferGetTag(buffer, ®buf->rnode, ®buf->forkno, ®buf->block); + regbuf->page = BufferGetPage(buffer); + regbuf->flags = flags; + regbuf->rdata_tail = (XLogRecData *) ®buf->rdata_head; + regbuf->rdata_len = 0; + + /* + * Check that this page hasn't already been registered with some other + * block_id. + */ +#ifdef USE_ASSERT_CHECKING + { + int i; + + for (i = 0; i < max_registered_block_id; i++) + { + registered_buffer *regbuf_old = ®istered_buffers[i]; + + if (i == block_id || !regbuf_old->in_use) + continue; + + Assert(!RelFileNodeEquals(regbuf_old->rnode, regbuf->rnode) || + regbuf_old->forkno != regbuf->forkno || + regbuf_old->block != regbuf->block); + } + } +#endif + + regbuf->in_use = true; +} + +/* + * Like XLogRegisterBuffer, but for registering a block that's not in the + * shared buffer pool (i.e. when you don't have a Buffer for it). + */ +void +XLogRegisterBlock(uint8 block_id, RelFileNode *rnode, ForkNumber forknum, + BlockNumber blknum, Page page, uint8 flags) +{ + registered_buffer *regbuf; + + Assert(begininsert_called); + + if (block_id >= max_registered_block_id) + max_registered_block_id = block_id + 1; + + if (block_id >= max_registered_buffers) + elog(ERROR, "too many registered buffers"); + + regbuf = ®istered_buffers[block_id]; + + regbuf->rnode = *rnode; + regbuf->forkno = forknum; + regbuf->block = blknum; + regbuf->page = page; + regbuf->flags = flags; + regbuf->rdata_tail = (XLogRecData *) ®buf->rdata_head; + regbuf->rdata_len = 0; + + /* + * Check that this page hasn't already been registered with some other + * block_id. + */ +#ifdef USE_ASSERT_CHECKING + { + int i; + + for (i = 0; i < max_registered_block_id; i++) + { + registered_buffer *regbuf_old = ®istered_buffers[i]; + + if (i == block_id || !regbuf_old->in_use) + continue; + + Assert(!RelFileNodeEquals(regbuf_old->rnode, regbuf->rnode) || + regbuf_old->forkno != regbuf->forkno || + regbuf_old->block != regbuf->block); + } + } +#endif + + regbuf->in_use = true; +} + +/* + * Add data to the WAL record that's being constructed. + * + * The data is appended to the "main chunk", available at replay with + * XLogRecGetData(). + */ +void +XLogRegisterData(char *data, int len) +{ + XLogRecData *rdata; + + Assert(begininsert_called); + + if (num_rdatas >= max_rdatas) + elog(ERROR, "too much WAL data"); + rdata = &rdatas[num_rdatas++]; + + rdata->data = data; + rdata->len = len; + + /* + * we use the mainrdata_last pointer to track the end of the chain, so no + * need to clear 'next' here. + */ + + mainrdata_last->next = rdata; + mainrdata_last = rdata; + + mainrdata_len += len; +} + +/* + * Add buffer-specific data to the WAL record that's being constructed. + * + * Block_id must reference a block previously registered with + * XLogRegisterBuffer(). If this is called more than once for the same + * block_id, the data is appended. + * + * The maximum amount of data that can be registered per block is 65535 + * bytes. 
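The per-block data registered here is what a redo routine later pulls back out with XLogRecGetBlockData(). A hedged sketch of that redo-side counterpart; the OffsetNumber payload is hypothetical, and the data can be absent when a full-page image made it unnecessary (unless REGBUF_KEEP_DATA was used at insert time):

#include "postgres.h"

#include "access/xlogreader.h"
#include "storage/off.h"

static OffsetNumber
demo_redo_get_offset(XLogReaderState *record)
{
	Size		len;
	char	   *payload;
	OffsetNumber off;

	/* fetch whatever was registered for block reference 0 at insert time */
	payload = XLogRecGetBlockData(record, 0, &len);
	Assert(payload != NULL && len == sizeof(OffsetNumber));

	/* copy out rather than cast, since the payload is not guaranteed aligned */
	memcpy(&off, payload, sizeof(OffsetNumber));
	return off;
}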
That should be plenty; if you need more than BLCKSZ bytes to + * reconstruct the changes to the page, you might as well just log a full + * copy of it. (the "main data" that's not associated with a block is not + * limited) + */ +void +XLogRegisterBufData(uint8 block_id, char *data, int len) +{ + registered_buffer *regbuf; + XLogRecData *rdata; + + Assert(begininsert_called); + + /* find the registered buffer struct */ + regbuf = ®istered_buffers[block_id]; + if (!regbuf->in_use) + elog(ERROR, "no block with id %d registered with WAL insertion", + block_id); + + if (num_rdatas >= max_rdatas) + elog(ERROR, "too much WAL data"); + rdata = &rdatas[num_rdatas++]; + + rdata->data = data; + rdata->len = len; + + regbuf->rdata_tail->next = rdata; + regbuf->rdata_tail = rdata; + regbuf->rdata_len += len; +} + +/* + * Set insert status flags for the upcoming WAL record. + * + * The flags that can be used here are: + * - XLOG_INCLUDE_ORIGIN, to determine if the replication origin should be + * included in the record. + * - XLOG_MARK_UNIMPORTANT, to signal that the record is not important for + * durability, which allows to avoid triggering WAL archiving and other + * background activity. + */ +void +XLogSetRecordFlags(uint8 flags) +{ + Assert(begininsert_called); + curinsert_flags |= flags; +} + +/* + * Insert an XLOG record having the specified RMID and info bytes, with the + * body of the record being the data and buffer references registered earlier + * with XLogRegister* calls. + * + * Returns XLOG pointer to end of record (beginning of next record). + * This can be used as LSN for data pages affected by the logged action. + * (LSN is the XLOG point up to which the XLOG must be flushed to disk + * before the data page can be written out. This implements the basic + * WAL rule "write the log before the data".) + */ +XLogRecPtr +XLogInsert(RmgrId rmid, uint8 info) +{ + XLogRecPtr EndPos; + + /* XLogBeginInsert() must have been called. */ + if (!begininsert_called) + elog(ERROR, "XLogBeginInsert was not called"); + + /* + * The caller can set rmgr bits, XLR_SPECIAL_REL_UPDATE and + * XLR_CHECK_CONSISTENCY; the rest are reserved for use by me. + */ + if ((info & ~(XLR_RMGR_INFO_MASK | + XLR_SPECIAL_REL_UPDATE | + XLR_CHECK_CONSISTENCY)) != 0) + elog(PANIC, "invalid xlog info mask %02X", info); + + TRACE_POSTGRESQL_WAL_INSERT(rmid, info); + + /* + * In bootstrap mode, we don't actually log anything but XLOG resources; + * return a phony record pointer. + */ + if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID) + { + XLogResetInsertion(); + EndPos = SizeOfXLogLongPHD; /* start of 1st chkpt record */ + return EndPos; + } + + do + { + XLogRecPtr RedoRecPtr; + bool doPageWrites; + bool topxid_included = false; + XLogRecPtr fpw_lsn; + XLogRecData *rdt; + int num_fpi = 0; + + /* + * Get values needed to decide whether to do full-page writes. Since + * we don't yet have an insertion lock, these could change under us, + * but XLogInsertRecord will recheck them once it has a lock. + */ + GetFullPageWriteInfo(&RedoRecPtr, &doPageWrites); + + rdt = XLogRecordAssemble(rmid, info, RedoRecPtr, doPageWrites, + &fpw_lsn, &num_fpi, &topxid_included); + + EndPos = XLogInsertRecord(rdt, fpw_lsn, curinsert_flags, num_fpi, + topxid_included); + } while (EndPos == InvalidXLogRecPtr); + + XLogResetInsertion(); + + return EndPos; +} + +/* + * Assemble a WAL record from the registered data and buffers into an + * XLogRecData chain, ready for insertion with XLogInsertRecord(). 
+ * + * The record header fields are filled in, except for the xl_prev field. The + * calculated CRC does not include the record header yet. + * + * If there are any registered buffers, and a full-page image was not taken + * of all of them, *fpw_lsn is set to the lowest LSN among such pages. This + * signals that the assembled record is only good for insertion on the + * assumption that the RedoRecPtr and doPageWrites values were up-to-date. + * + * *topxid_included is set if the topmost transaction ID is logged with the + * current subtransaction. + */ +static XLogRecData * +XLogRecordAssemble(RmgrId rmid, uint8 info, + XLogRecPtr RedoRecPtr, bool doPageWrites, + XLogRecPtr *fpw_lsn, int *num_fpi, bool *topxid_included) +{ + XLogRecData *rdt; + uint32 total_len = 0; + int block_id; + pg_crc32c rdata_crc; + registered_buffer *prev_regbuf = NULL; + XLogRecData *rdt_datas_last; + XLogRecord *rechdr; + char *scratch = hdr_scratch; + + /* + * Note: this function can be called multiple times for the same record. + * All the modifications we do to the rdata chains below must handle that. + */ + + /* The record begins with the fixed-size header */ + rechdr = (XLogRecord *) scratch; + scratch += SizeOfXLogRecord; + + hdr_rdt.next = NULL; + rdt_datas_last = &hdr_rdt; + hdr_rdt.data = hdr_scratch; + + /* + * Enforce consistency checks for this record if user is looking for it. + * Do this before at the beginning of this routine to give the possibility + * for callers of XLogInsert() to pass XLR_CHECK_CONSISTENCY directly for + * a record. + */ + if (wal_consistency_checking[rmid]) + info |= XLR_CHECK_CONSISTENCY; + + /* + * Make an rdata chain containing all the data portions of all block + * references. This includes the data for full-page images. Also append + * the headers for the block references in the scratch buffer. + */ + *fpw_lsn = InvalidXLogRecPtr; + for (block_id = 0; block_id < max_registered_block_id; block_id++) + { + registered_buffer *regbuf = ®istered_buffers[block_id]; + bool needs_backup; + bool needs_data; + XLogRecordBlockHeader bkpb; + XLogRecordBlockImageHeader bimg; + XLogRecordBlockCompressHeader cbimg = {0}; + bool samerel; + bool is_compressed = false; + bool include_image; + + if (!regbuf->in_use) + continue; + + /* Determine if this block needs to be backed up */ + if (regbuf->flags & REGBUF_FORCE_IMAGE) + needs_backup = true; + else if (regbuf->flags & REGBUF_NO_IMAGE) + needs_backup = false; + else if (!doPageWrites) + needs_backup = false; + else + { + /* + * We assume page LSN is first data on *every* page that can be + * passed to XLogInsert, whether it has the standard page layout + * or not. + */ + XLogRecPtr page_lsn = PageGetLSN(regbuf->page); + + needs_backup = (page_lsn <= RedoRecPtr); + if (!needs_backup) + { + if (*fpw_lsn == InvalidXLogRecPtr || page_lsn < *fpw_lsn) + *fpw_lsn = page_lsn; + } + } + + /* Determine if the buffer data needs to included */ + if (regbuf->rdata_len == 0) + needs_data = false; + else if ((regbuf->flags & REGBUF_KEEP_DATA) != 0) + needs_data = true; + else + needs_data = !needs_backup; + + bkpb.id = block_id; + bkpb.fork_flags = regbuf->forkno; + bkpb.data_length = 0; + + if ((regbuf->flags & REGBUF_WILL_INIT) == REGBUF_WILL_INIT) + bkpb.fork_flags |= BKPBLOCK_WILL_INIT; + + /* + * If needs_backup is true or WAL checking is enabled for current + * resource manager, log a full-page write for the current block. 
+ */ + include_image = needs_backup || (info & XLR_CHECK_CONSISTENCY) != 0; + + if (include_image) + { + Page page = regbuf->page; + uint16 compressed_len = 0; + + /* + * The page needs to be backed up, so calculate its hole length + * and offset. + */ + if (regbuf->flags & REGBUF_STANDARD) + { + /* Assume we can omit data between pd_lower and pd_upper */ + uint16 lower = ((PageHeader) page)->pd_lower; + uint16 upper = ((PageHeader) page)->pd_upper; + + if (lower >= SizeOfPageHeaderData && + upper > lower && + upper <= BLCKSZ) + { + bimg.hole_offset = lower; + cbimg.hole_length = upper - lower; + } + else + { + /* No "hole" to remove */ + bimg.hole_offset = 0; + cbimg.hole_length = 0; + } + } + else + { + /* Not a standard page header, don't try to eliminate "hole" */ + bimg.hole_offset = 0; + cbimg.hole_length = 0; + } + + /* + * Try to compress a block image if wal_compression is enabled + */ + if (wal_compression != WAL_COMPRESSION_NONE) + { + is_compressed = + XLogCompressBackupBlock(page, bimg.hole_offset, + cbimg.hole_length, + regbuf->compressed_page, + &compressed_len); + } + + /* + * Fill in the remaining fields in the XLogRecordBlockHeader + * struct + */ + bkpb.fork_flags |= BKPBLOCK_HAS_IMAGE; + + /* Report a full page image constructed for the WAL record */ + *num_fpi += 1; + + /* + * Construct XLogRecData entries for the page content. + */ + rdt_datas_last->next = ®buf->bkp_rdatas[0]; + rdt_datas_last = rdt_datas_last->next; + + bimg.bimg_info = (cbimg.hole_length == 0) ? 0 : BKPIMAGE_HAS_HOLE; + + /* + * If WAL consistency checking is enabled for the resource manager + * of this WAL record, a full-page image is included in the record + * for the block modified. During redo, the full-page is replayed + * only if BKPIMAGE_APPLY is set. + */ + if (needs_backup) + bimg.bimg_info |= BKPIMAGE_APPLY; + + if (is_compressed) + { + /* The current compression is stored in the WAL record */ + bimg.length = compressed_len; + + /* Set the compression method used for this block */ + switch ((WalCompression) wal_compression) + { + case WAL_COMPRESSION_PGLZ: + bimg.bimg_info |= BKPIMAGE_COMPRESS_PGLZ; + break; + + case WAL_COMPRESSION_LZ4: +#ifdef USE_LZ4 + bimg.bimg_info |= BKPIMAGE_COMPRESS_LZ4; +#else + elog(ERROR, "LZ4 is not supported by this build"); +#endif + break; + + case WAL_COMPRESSION_ZSTD: +#ifdef USE_ZSTD + bimg.bimg_info |= BKPIMAGE_COMPRESS_ZSTD; +#else + elog(ERROR, "zstd is not supported by this build"); +#endif + break; + + case WAL_COMPRESSION_NONE: + Assert(false); /* cannot happen */ + break; + /* no default case, so that compiler will warn */ + } + + rdt_datas_last->data = regbuf->compressed_page; + rdt_datas_last->len = compressed_len; + } + else + { + bimg.length = BLCKSZ - cbimg.hole_length; + + if (cbimg.hole_length == 0) + { + rdt_datas_last->data = page; + rdt_datas_last->len = BLCKSZ; + } + else + { + /* must skip the hole */ + rdt_datas_last->data = page; + rdt_datas_last->len = bimg.hole_offset; + + rdt_datas_last->next = ®buf->bkp_rdatas[1]; + rdt_datas_last = rdt_datas_last->next; + + rdt_datas_last->data = + page + (bimg.hole_offset + cbimg.hole_length); + rdt_datas_last->len = + BLCKSZ - (bimg.hole_offset + cbimg.hole_length); + } + } + + total_len += bimg.length; + } + + if (needs_data) + { + /* + * Link the caller-supplied rdata chain for this buffer to the + * overall list. 
+ */ + bkpb.fork_flags |= BKPBLOCK_HAS_DATA; + bkpb.data_length = regbuf->rdata_len; + total_len += regbuf->rdata_len; + + rdt_datas_last->next = regbuf->rdata_head; + rdt_datas_last = regbuf->rdata_tail; + } + + if (prev_regbuf && RelFileNodeEquals(regbuf->rnode, prev_regbuf->rnode)) + { + samerel = true; + bkpb.fork_flags |= BKPBLOCK_SAME_REL; + } + else + samerel = false; + prev_regbuf = regbuf; + + /* Ok, copy the header to the scratch buffer */ + memcpy(scratch, &bkpb, SizeOfXLogRecordBlockHeader); + scratch += SizeOfXLogRecordBlockHeader; + if (include_image) + { + memcpy(scratch, &bimg, SizeOfXLogRecordBlockImageHeader); + scratch += SizeOfXLogRecordBlockImageHeader; + if (cbimg.hole_length != 0 && is_compressed) + { + memcpy(scratch, &cbimg, + SizeOfXLogRecordBlockCompressHeader); + scratch += SizeOfXLogRecordBlockCompressHeader; + } + } + if (!samerel) + { + memcpy(scratch, ®buf->rnode, sizeof(RelFileNode)); + scratch += sizeof(RelFileNode); + } + memcpy(scratch, ®buf->block, sizeof(BlockNumber)); + scratch += sizeof(BlockNumber); + } + + /* followed by the record's origin, if any */ + if ((curinsert_flags & XLOG_INCLUDE_ORIGIN) && + replorigin_session_origin != InvalidRepOriginId) + { + *(scratch++) = (char) XLR_BLOCK_ID_ORIGIN; + memcpy(scratch, &replorigin_session_origin, sizeof(replorigin_session_origin)); + scratch += sizeof(replorigin_session_origin); + } + + /* followed by toplevel XID, if not already included in previous record */ + if (IsSubxactTopXidLogPending()) + { + TransactionId xid = GetTopTransactionIdIfAny(); + + /* Set the flag that the top xid is included in the WAL */ + *topxid_included = true; + + *(scratch++) = (char) XLR_BLOCK_ID_TOPLEVEL_XID; + memcpy(scratch, &xid, sizeof(TransactionId)); + scratch += sizeof(TransactionId); + } + + /* followed by main data, if any */ + if (mainrdata_len > 0) + { + if (mainrdata_len > 255) + { + *(scratch++) = (char) XLR_BLOCK_ID_DATA_LONG; + memcpy(scratch, &mainrdata_len, sizeof(uint32)); + scratch += sizeof(uint32); + } + else + { + *(scratch++) = (char) XLR_BLOCK_ID_DATA_SHORT; + *(scratch++) = (uint8) mainrdata_len; + } + rdt_datas_last->next = mainrdata_head; + rdt_datas_last = mainrdata_last; + total_len += mainrdata_len; + } + rdt_datas_last->next = NULL; + + hdr_rdt.len = (scratch - hdr_scratch); + total_len += hdr_rdt.len; + + /* + * Calculate CRC of the data + * + * Note that the record header isn't added into the CRC initially since we + * don't know the prev-link yet. Thus, the CRC will represent the CRC of + * the whole record in the order: rdata, then backup blocks, then record + * header. + */ + INIT_CRC32C(rdata_crc); + COMP_CRC32C(rdata_crc, hdr_scratch + SizeOfXLogRecord, hdr_rdt.len - SizeOfXLogRecord); + for (rdt = hdr_rdt.next; rdt != NULL; rdt = rdt->next) + COMP_CRC32C(rdata_crc, rdt->data, rdt->len); + + /* + * Fill in the fields in the record header. Prev-link is filled in later, + * once we know where in the WAL the record will be inserted. The CRC does + * not include the record header yet. + */ + rechdr->xl_xid = GetCurrentTransactionIdIfAny(); + rechdr->xl_tot_len = total_len; + rechdr->xl_info = info; + rechdr->xl_rmid = rmid; + rechdr->xl_prev = InvalidXLogRecPtr; + rechdr->xl_crc = rdata_crc; + + return &hdr_rdt; +} + +/* + * Create a compressed version of a backup block image. + * + * Returns false if compression fails (i.e., compressed result is actually + * bigger than original). Otherwise, returns true and sets 'dlen' to + * the length of compressed block image. 
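In other words, a compressed image is kept only when it saves more than the extra compression header it forces into the record. A standalone sketch of that acceptance test with illustrative numbers (an 8192-byte block with a 2000-byte hole, so orig_len is 6192, and a small extra header when a hole exists):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
compressed_image_is_worth_it(int32_t len, int32_t extra_bytes, int32_t orig_len)
{
	/* len < 0 means the compressor itself reported failure */
	return len >= 0 && len + extra_bytes < orig_len;
}

int
main(void)
{
	printf("%d\n", compressed_image_is_worth_it(5800, 2, 6192));	/* 1: image shrinks, keep it */
	printf("%d\n", compressed_image_is_worth_it(6191, 2, 6192));	/* 0: savings too small, use the raw image */
	printf("%d\n", compressed_image_is_worth_it(-1, 2, 6192));		/* 0: compression failed outright */
	return 0;
}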
+ */ +static bool +XLogCompressBackupBlock(char *page, uint16 hole_offset, uint16 hole_length, + char *dest, uint16 *dlen) +{ + int32 orig_len = BLCKSZ - hole_length; + int32 len = -1; + int32 extra_bytes = 0; + char *source; + PGAlignedBlock tmp; + + if (hole_length != 0) + { + /* must skip the hole */ + source = tmp.data; + memcpy(source, page, hole_offset); + memcpy(source + hole_offset, + page + (hole_offset + hole_length), + BLCKSZ - (hole_length + hole_offset)); + + /* + * Extra data needs to be stored in WAL record for the compressed + * version of block image if the hole exists. + */ + extra_bytes = SizeOfXLogRecordBlockCompressHeader; + } + else + source = page; + + switch ((WalCompression) wal_compression) + { + case WAL_COMPRESSION_PGLZ: + len = pglz_compress(source, orig_len, dest, PGLZ_strategy_default); + break; + + case WAL_COMPRESSION_LZ4: +#ifdef USE_LZ4 + len = LZ4_compress_default(source, dest, orig_len, + COMPRESS_BUFSIZE); + if (len <= 0) + len = -1; /* failure */ +#else + elog(ERROR, "LZ4 is not supported by this build"); +#endif + break; + + case WAL_COMPRESSION_ZSTD: +#ifdef USE_ZSTD + len = ZSTD_compress(dest, COMPRESS_BUFSIZE, source, orig_len, + ZSTD_CLEVEL_DEFAULT); + if (ZSTD_isError(len)) + len = -1; /* failure */ +#else + elog(ERROR, "zstd is not supported by this build"); +#endif + break; + + case WAL_COMPRESSION_NONE: + Assert(false); /* cannot happen */ + break; + /* no default case, so that compiler will warn */ + } + + /* + * We recheck the actual size even if compression reports success and see + * if the number of bytes saved by compression is larger than the length + * of extra data needed for the compressed version of block image. + */ + if (len >= 0 && + len + extra_bytes < orig_len) + { + *dlen = (uint16) len; /* successful compression */ + return true; + } + return false; +} + +/* + * Determine whether the buffer referenced has to be backed up. + * + * Since we don't yet have the insert lock, fullPageWrites and forcePageWrites + * could change later, so the result should be used for optimization purposes + * only. + */ +bool +XLogCheckBufferNeedsBackup(Buffer buffer) +{ + XLogRecPtr RedoRecPtr; + bool doPageWrites; + Page page; + + GetFullPageWriteInfo(&RedoRecPtr, &doPageWrites); + + page = BufferGetPage(buffer); + + if (doPageWrites && PageGetLSN(page) <= RedoRecPtr) + return true; /* buffer requires backup */ + + return false; /* buffer does not need to be backed up */ +} + +/* + * Write a backup block if needed when we are setting a hint. Note that + * this may be called for a variety of page types, not just heaps. + * + * Callable while holding just share lock on the buffer content. + * + * We can't use the plain backup block mechanism since that relies on the + * Buffer being exclusively locked. Since some modifications (setting LSN, hint + * bits) are allowed in a sharelocked buffer that can lead to wal checksum + * failures. So instead we copy the page and insert the copied data as normal + * record data. + * + * We only need to do something if page has not yet been full page written in + * this checkpoint round. The LSN of the inserted wal record is returned if we + * had to write, InvalidXLogRecPtr otherwise. + * + * It is possible that multiple concurrent backends could attempt to write WAL + * records. In that case, multiple copies of the same block would be recorded + * in separate WAL records by different backends, though that is still OK from + * a correctness perspective. 
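A hedged sketch of the caller side, loosely modelled on what MarkBufferDirtyHint() does when data checksums or wal_log_hints make hint-bit changes WAL-sensitive. The helper is hypothetical and simplified: the real caller also takes the buffer header lock before stamping the LSN, because only a share lock is held on the buffer content.

#include "postgres.h"

#include "access/xlog.h"
#include "access/xloginsert.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "storage/proc.h"

static void
demo_wal_log_hint(Buffer buf, bool buffer_std)
{
	if (XLogHintBitIsNeeded())
	{
		XLogRecPtr	lsn;

		/* satisfy the Assert below: keep the redo pointer stable meanwhile */
		MyProc->delayChkptFlags |= DELAY_CHKPT_START;

		lsn = XLogSaveBufferForHint(buf, buffer_std);
		if (!XLogRecPtrIsInvalid(lsn))
			PageSetLSN(BufferGetPage(buf), lsn);

		MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
	}
}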
+ */ +XLogRecPtr +XLogSaveBufferForHint(Buffer buffer, bool buffer_std) +{ + XLogRecPtr recptr = InvalidXLogRecPtr; + XLogRecPtr lsn; + XLogRecPtr RedoRecPtr; + + /* + * Ensure no checkpoint can change our view of RedoRecPtr. + */ + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) != 0); + + /* + * Update RedoRecPtr so that we can make the right decision + */ + RedoRecPtr = GetRedoRecPtr(); + + /* + * We assume page LSN is first data on *every* page that can be passed to + * XLogInsert, whether it has the standard page layout or not. Since we're + * only holding a share-lock on the page, we must take the buffer header + * lock when we look at the LSN. + */ + lsn = BufferGetLSNAtomic(buffer); + + if (lsn <= RedoRecPtr) + { + int flags = 0; + PGAlignedBlock copied_buffer; + char *origdata = (char *) BufferGetBlock(buffer); + RelFileNode rnode; + ForkNumber forkno; + BlockNumber blkno; + + /* + * Copy buffer so we don't have to worry about concurrent hint bit or + * lsn updates. We assume pd_lower/upper cannot be changed without an + * exclusive lock, so the contents bkp are not racy. + */ + if (buffer_std) + { + /* Assume we can omit data between pd_lower and pd_upper */ + Page page = BufferGetPage(buffer); + uint16 lower = ((PageHeader) page)->pd_lower; + uint16 upper = ((PageHeader) page)->pd_upper; + + memcpy(copied_buffer.data, origdata, lower); + memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper); + } + else + memcpy(copied_buffer.data, origdata, BLCKSZ); + + XLogBeginInsert(); + + if (buffer_std) + flags |= REGBUF_STANDARD; + + BufferGetTag(buffer, &rnode, &forkno, &blkno); + XLogRegisterBlock(0, &rnode, forkno, blkno, copied_buffer.data, flags); + + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI_FOR_HINT); + } + + return recptr; +} + +/* + * Write a WAL record containing a full image of a page. Caller is responsible + * for writing the page to disk after calling this routine. + * + * Note: If you're using this function, you should be building pages in private + * memory and writing them directly to smgr. If you're using buffers, call + * log_newpage_buffer instead. + * + * If the page follows the standard page layout, with a PageHeader and unused + * space between pd_lower and pd_upper, set 'page_std' to true. That allows + * the unused space to be left out from the WAL record, making it smaller. + */ +XLogRecPtr +log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, + Page page, bool page_std) +{ + int flags; + XLogRecPtr recptr; + + flags = REGBUF_FORCE_IMAGE; + if (page_std) + flags |= REGBUF_STANDARD; + + XLogBeginInsert(); + XLogRegisterBlock(0, rnode, forkNum, blkno, page, flags); + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); + + /* + * The page may be uninitialized. If so, we can't set the LSN because that + * would corrupt the page. + */ + if (!PageIsNew(page)) + { + PageSetLSN(page, recptr); + } + + return recptr; +} + +/* + * Like log_newpage(), but allows logging multiple pages in one operation. + * It is more efficient than calling log_newpage() for each page separately, + * because we can write multiple pages in a single WAL record. + */ +void +log_newpages(RelFileNode *rnode, ForkNumber forkNum, int num_pages, + BlockNumber *blknos, Page *pages, bool page_std) +{ + int flags; + XLogRecPtr recptr; + int i; + int j; + + flags = REGBUF_FORCE_IMAGE; + if (page_std) + flags |= REGBUF_STANDARD; + + /* + * Iterate over all the pages. 
They are collected into batches of + * XLR_MAX_BLOCK_ID pages, and a single WAL-record is written for each + * batch. + */ + XLogEnsureRecordSpace(XLR_MAX_BLOCK_ID - 1, 0); + + i = 0; + while (i < num_pages) + { + int batch_start = i; + int nbatch; + + XLogBeginInsert(); + + nbatch = 0; + while (nbatch < XLR_MAX_BLOCK_ID && i < num_pages) + { + XLogRegisterBlock(nbatch, rnode, forkNum, blknos[i], pages[i], flags); + i++; + nbatch++; + } + + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); + + for (j = batch_start; j < i; j++) + { + /* + * The page may be uninitialized. If so, we can't set the LSN + * because that would corrupt the page. + */ + if (!PageIsNew(pages[j])) + { + PageSetLSN(pages[j], recptr); + } + } + } +} + +/* + * Write a WAL record containing a full image of a page. + * + * Caller should initialize the buffer and mark it dirty before calling this + * function. This function will set the page LSN. + * + * If the page follows the standard page layout, with a PageHeader and unused + * space between pd_lower and pd_upper, set 'page_std' to true. That allows + * the unused space to be left out from the WAL record, making it smaller. + */ +XLogRecPtr +log_newpage_buffer(Buffer buffer, bool page_std) +{ + Page page = BufferGetPage(buffer); + RelFileNode rnode; + ForkNumber forkNum; + BlockNumber blkno; + + /* Shared buffers should be modified in a critical section. */ + Assert(CritSectionCount > 0); + + BufferGetTag(buffer, &rnode, &forkNum, &blkno); + + return log_newpage(&rnode, forkNum, blkno, page, page_std); +} + +/* + * WAL-log a range of blocks in a relation. + * + * An image of all pages with block numbers 'startblk' <= X < 'endblk' is + * written to the WAL. If the range is large, this is done in multiple WAL + * records. + * + * If all page follows the standard page layout, with a PageHeader and unused + * space between pd_lower and pd_upper, set 'page_std' to true. That allows + * the unused space to be left out from the WAL records, making them smaller. + * + * NOTE: This function acquires exclusive-locks on the pages. Typically, this + * is used on a newly-built relation, and the caller is holding a + * AccessExclusiveLock on it, so no other backend can be accessing it at the + * same time. If that's not the case, you must ensure that this does not + * cause a deadlock through some other means. + */ +void +log_newpage_range(Relation rel, ForkNumber forkNum, + BlockNumber startblk, BlockNumber endblk, + bool page_std) +{ + int flags; + BlockNumber blkno; + + flags = REGBUF_FORCE_IMAGE; + if (page_std) + flags |= REGBUF_STANDARD; + + /* + * Iterate over all the pages in the range. They are collected into + * batches of XLR_MAX_BLOCK_ID pages, and a single WAL-record is written + * for each batch. + */ + XLogEnsureRecordSpace(XLR_MAX_BLOCK_ID - 1, 0); + + blkno = startblk; + while (blkno < endblk) + { + Buffer bufpack[XLR_MAX_BLOCK_ID]; + XLogRecPtr recptr; + int nbufs; + int i; + + CHECK_FOR_INTERRUPTS(); + + /* Collect a batch of blocks. */ + nbufs = 0; + while (nbufs < XLR_MAX_BLOCK_ID && blkno < endblk) + { + Buffer buf = ReadBufferExtended(rel, forkNum, blkno, + RBM_NORMAL, NULL); + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * Completely empty pages are not WAL-logged. Writing a WAL record + * would change the LSN, and we don't want that. We want the page + * to stay empty. + */ + if (!PageIsNew(BufferGetPage(buf))) + bufpack[nbufs++] = buf; + else + UnlockReleaseBuffer(buf); + blkno++; + } + + /* Nothing more to do if all remaining blocks were empty. 
*/ + if (nbufs == 0) + break; + + /* Write WAL record for this batch. */ + XLogBeginInsert(); + + START_CRIT_SECTION(); + for (i = 0; i < nbufs; i++) + { + XLogRegisterBuffer(i, bufpack[i], flags); + MarkBufferDirty(bufpack[i]); + } + + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); + + for (i = 0; i < nbufs; i++) + { + PageSetLSN(BufferGetPage(bufpack[i]), recptr); + UnlockReleaseBuffer(bufpack[i]); + } + END_CRIT_SECTION(); + } +} + +/* + * Allocate working buffers needed for WAL record construction. + */ +void +InitXLogInsert(void) +{ + /* Initialize the working areas */ + if (xloginsert_cxt == NULL) + { + xloginsert_cxt = AllocSetContextCreate(TopMemoryContext, + "WAL record construction", + ALLOCSET_DEFAULT_SIZES); + } + + if (registered_buffers == NULL) + { + registered_buffers = (registered_buffer *) + MemoryContextAllocZero(xloginsert_cxt, + sizeof(registered_buffer) * (XLR_NORMAL_MAX_BLOCK_ID + 1)); + max_registered_buffers = XLR_NORMAL_MAX_BLOCK_ID + 1; + } + if (rdatas == NULL) + { + rdatas = MemoryContextAlloc(xloginsert_cxt, + sizeof(XLogRecData) * XLR_NORMAL_RDATAS); + max_rdatas = XLR_NORMAL_RDATAS; + } + + /* + * Allocate a buffer to hold the header information for a WAL record. + */ + if (hdr_scratch == NULL) + hdr_scratch = MemoryContextAllocZero(xloginsert_cxt, + HEADER_SCRATCH_SIZE); +} diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c new file mode 100644 index 0000000..b98b319 --- /dev/null +++ b/src/backend/access/transam/xlogprefetcher.c @@ -0,0 +1,1105 @@ +/*------------------------------------------------------------------------- + * + * xlogprefetcher.c + * Prefetching support for recovery. + * + * Portions Copyright (c) 2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/transam/xlogprefetcher.c + * + * This module provides a drop-in replacement for an XLogReader that tries to + * minimize I/O stalls by looking ahead in the WAL. If blocks that will be + * accessed in the near future are not already in the buffer pool, it initiates + * I/Os that might complete before the caller eventually needs the data. When + * referenced blocks are found in the buffer pool already, the buffer is + * recorded in the decoded record so that XLogReadBufferForRedo() can try to + * avoid a second buffer mapping table lookup. + * + * Currently, only the main fork is considered for prefetching. Currently, + * prefetching is only effective on systems where BufferPrefetch() does + * something useful (mainly Linux). + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/xlog.h" +#include "access/xlogprefetcher.h" +#include "access/xlogreader.h" +#include "access/xlogutils.h" +#include "catalog/pg_class.h" +#include "catalog/pg_control.h" +#include "catalog/storage_xlog.h" +#include "commands/dbcommands_xlog.h" +#include "utils/fmgrprotos.h" +#include "utils/timestamp.h" +#include "funcapi.h" +#include "pgstat.h" +#include "miscadmin.h" +#include "port/atomics.h" +#include "storage/bufmgr.h" +#include "storage/shmem.h" +#include "storage/smgr.h" +#include "utils/guc.h" +#include "utils/hsearch.h" + +/* + * Every time we process this much WAL, we'll update the values in + * pg_stat_recovery_prefetch. 
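+ * (The distance is defined as BLCKSZ below, so with the stock 8 kB block
+ * size the shared counters are refreshed roughly once per 8 kB of decoded
+ * WAL.)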
+ */ +#define XLOGPREFETCHER_STATS_DISTANCE BLCKSZ + +/* + * To detect repeated access to the same block and skip useless extra system + * calls, we remember a small window of recently prefetched blocks. + */ +#define XLOGPREFETCHER_SEQ_WINDOW_SIZE 4 + +/* + * When maintenance_io_concurrency is not saturated, we're prepared to look + * ahead up to N times that number of block references. + */ +#define XLOGPREFETCHER_DISTANCE_MULTIPLIER 4 + +/* Define to log internal debugging messages. */ +/* #define XLOGPREFETCHER_DEBUG_LEVEL LOG */ + +/* GUCs */ +int recovery_prefetch = RECOVERY_PREFETCH_TRY; + +#ifdef USE_PREFETCH +#define RecoveryPrefetchEnabled() \ + (recovery_prefetch != RECOVERY_PREFETCH_OFF && \ + maintenance_io_concurrency > 0) +#else +#define RecoveryPrefetchEnabled() false +#endif + +static int XLogPrefetchReconfigureCount = 0; + +/* + * Enum used to report whether an IO should be started. + */ +typedef enum +{ + LRQ_NEXT_NO_IO, + LRQ_NEXT_IO, + LRQ_NEXT_AGAIN +} LsnReadQueueNextStatus; + +/* + * Type of callback that can decide which block to prefetch next. For now + * there is only one. + */ +typedef LsnReadQueueNextStatus (*LsnReadQueueNextFun) (uintptr_t lrq_private, + XLogRecPtr *lsn); + +/* + * A simple circular queue of LSNs, using to control the number of + * (potentially) inflight IOs. This stands in for a later more general IO + * control mechanism, which is why it has the apparently unnecessary + * indirection through a function pointer. + */ +typedef struct LsnReadQueue +{ + LsnReadQueueNextFun next; + uintptr_t lrq_private; + uint32 max_inflight; + uint32 inflight; + uint32 completed; + uint32 head; + uint32 tail; + uint32 size; + struct + { + bool io; + XLogRecPtr lsn; + } queue[FLEXIBLE_ARRAY_MEMBER]; +} LsnReadQueue; + +/* + * A prefetcher. This is a mechanism that wraps an XLogReader, prefetching + * blocks that will be soon be referenced, to try to avoid IO stalls. + */ +struct XLogPrefetcher +{ + /* WAL reader and current reading state. */ + XLogReaderState *reader; + DecodedXLogRecord *record; + int next_block_id; + + /* When to publish stats. */ + XLogRecPtr next_stats_shm_lsn; + + /* Book-keeping to avoid accessing blocks that don't exist yet. */ + HTAB *filter_table; + dlist_head filter_queue; + + /* Book-keeping to avoid repeat prefetches. */ + RelFileNode recent_rnode[XLOGPREFETCHER_SEQ_WINDOW_SIZE]; + BlockNumber recent_block[XLOGPREFETCHER_SEQ_WINDOW_SIZE]; + int recent_idx; + + /* Book-keeping to disable prefetching temporarily. */ + XLogRecPtr no_readahead_until; + + /* IO depth manager. */ + LsnReadQueue *streaming_read; + + XLogRecPtr begin_ptr; + + int reconfigure_count; +}; + +/* + * A temporary filter used to track block ranges that haven't been created + * yet, whole relations that haven't been created yet, and whole relations + * that (we assume) have already been dropped, or will be created by bulk WAL + * operators. + */ +typedef struct XLogPrefetcherFilter +{ + RelFileNode rnode; + XLogRecPtr filter_until_replayed; + BlockNumber filter_from_block; + dlist_node link; +} XLogPrefetcherFilter; + +/* + * Counters exposed in shared memory for pg_stat_recovery_prefetch. + */ +typedef struct XLogPrefetchStats +{ + pg_atomic_uint64 reset_time; /* Time of last reset. */ + pg_atomic_uint64 prefetch; /* Prefetches initiated. */ + pg_atomic_uint64 hit; /* Blocks already in cache. */ + pg_atomic_uint64 skip_init; /* Zero-inited blocks skipped. */ + pg_atomic_uint64 skip_new; /* New/missing blocks filtered. 
*/ + pg_atomic_uint64 skip_fpw; /* FPWs skipped. */ + pg_atomic_uint64 skip_rep; /* Repeat accesses skipped. */ + + /* Dynamic values */ + int wal_distance; /* Number of WAL bytes ahead. */ + int block_distance; /* Number of block references ahead. */ + int io_depth; /* Number of I/Os in progress. */ +} XLogPrefetchStats; + +static inline void XLogPrefetcherAddFilter(XLogPrefetcher *prefetcher, + RelFileNode rnode, + BlockNumber blockno, + XLogRecPtr lsn); +static inline bool XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, + RelFileNode rnode, + BlockNumber blockno); +static inline void XLogPrefetcherCompleteFilters(XLogPrefetcher *prefetcher, + XLogRecPtr replaying_lsn); +static LsnReadQueueNextStatus XLogPrefetcherNextBlock(uintptr_t pgsr_private, + XLogRecPtr *lsn); + +static XLogPrefetchStats *SharedStats; + +static inline LsnReadQueue * +lrq_alloc(uint32 max_distance, + uint32 max_inflight, + uintptr_t lrq_private, + LsnReadQueueNextFun next) +{ + LsnReadQueue *lrq; + uint32 size; + + Assert(max_distance >= max_inflight); + + size = max_distance + 1; /* full ring buffer has a gap */ + lrq = palloc(offsetof(LsnReadQueue, queue) + sizeof(lrq->queue[0]) * size); + lrq->lrq_private = lrq_private; + lrq->max_inflight = max_inflight; + lrq->size = size; + lrq->next = next; + lrq->head = 0; + lrq->tail = 0; + lrq->inflight = 0; + lrq->completed = 0; + + return lrq; +} + +static inline void +lrq_free(LsnReadQueue *lrq) +{ + pfree(lrq); +} + +static inline uint32 +lrq_inflight(LsnReadQueue *lrq) +{ + return lrq->inflight; +} + +static inline uint32 +lrq_completed(LsnReadQueue *lrq) +{ + return lrq->completed; +} + +static inline void +lrq_prefetch(LsnReadQueue *lrq) +{ + /* Try to start as many IOs as we can within our limits. */ + while (lrq->inflight < lrq->max_inflight && + lrq->inflight + lrq->completed < lrq->size - 1) + { + Assert(((lrq->head + 1) % lrq->size) != lrq->tail); + switch (lrq->next(lrq->lrq_private, &lrq->queue[lrq->head].lsn)) + { + case LRQ_NEXT_AGAIN: + return; + case LRQ_NEXT_IO: + lrq->queue[lrq->head].io = true; + lrq->inflight++; + break; + case LRQ_NEXT_NO_IO: + lrq->queue[lrq->head].io = false; + lrq->completed++; + break; + } + lrq->head++; + if (lrq->head == lrq->size) + lrq->head = 0; + } +} + +static inline void +lrq_complete_lsn(LsnReadQueue *lrq, XLogRecPtr lsn) +{ + /* + * We know that LSNs before 'lsn' have been replayed, so we can now assume + * that any IOs that were started before then have finished. + */ + while (lrq->tail != lrq->head && + lrq->queue[lrq->tail].lsn < lsn) + { + if (lrq->queue[lrq->tail].io) + lrq->inflight--; + else + lrq->completed--; + lrq->tail++; + if (lrq->tail == lrq->size) + lrq->tail = 0; + } + if (RecoveryPrefetchEnabled()) + lrq_prefetch(lrq); +} + +size_t +XLogPrefetchShmemSize(void) +{ + return sizeof(XLogPrefetchStats); +} + +/* + * Reset all counters to zero. 
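+ * This is typically reached via pg_stat_reset_shared('recovery_prefetch').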
+ */ +void +XLogPrefetchResetStats(void) +{ + pg_atomic_write_u64(&SharedStats->reset_time, GetCurrentTimestamp()); + pg_atomic_write_u64(&SharedStats->prefetch, 0); + pg_atomic_write_u64(&SharedStats->hit, 0); + pg_atomic_write_u64(&SharedStats->skip_init, 0); + pg_atomic_write_u64(&SharedStats->skip_new, 0); + pg_atomic_write_u64(&SharedStats->skip_fpw, 0); + pg_atomic_write_u64(&SharedStats->skip_rep, 0); +} + +void +XLogPrefetchShmemInit(void) +{ + bool found; + + SharedStats = (XLogPrefetchStats *) + ShmemInitStruct("XLogPrefetchStats", + sizeof(XLogPrefetchStats), + &found); + + if (!found) + { + pg_atomic_init_u64(&SharedStats->reset_time, GetCurrentTimestamp()); + pg_atomic_init_u64(&SharedStats->prefetch, 0); + pg_atomic_init_u64(&SharedStats->hit, 0); + pg_atomic_init_u64(&SharedStats->skip_init, 0); + pg_atomic_init_u64(&SharedStats->skip_new, 0); + pg_atomic_init_u64(&SharedStats->skip_fpw, 0); + pg_atomic_init_u64(&SharedStats->skip_rep, 0); + } +} + +/* + * Called when any GUC is changed that affects prefetching. + */ +void +XLogPrefetchReconfigure(void) +{ + XLogPrefetchReconfigureCount++; +} + +/* + * Increment a counter in shared memory. This is equivalent to *counter++ on a + * plain uint64 without any memory barrier or locking, except on platforms + * where readers can't read uint64 without possibly observing a torn value. + */ +static inline void +XLogPrefetchIncrement(pg_atomic_uint64 *counter) +{ + Assert(AmStartupProcess() || !IsUnderPostmaster); + pg_atomic_write_u64(counter, pg_atomic_read_u64(counter) + 1); +} + +/* + * Create a prefetcher that is ready to begin prefetching blocks referenced by + * WAL records. + */ +XLogPrefetcher * +XLogPrefetcherAllocate(XLogReaderState *reader) +{ + XLogPrefetcher *prefetcher; + static HASHCTL hash_table_ctl = { + .keysize = sizeof(RelFileNode), + .entrysize = sizeof(XLogPrefetcherFilter) + }; + + prefetcher = palloc0(sizeof(XLogPrefetcher)); + + prefetcher->reader = reader; + prefetcher->filter_table = hash_create("XLogPrefetcherFilterTable", 1024, + &hash_table_ctl, + HASH_ELEM | HASH_BLOBS); + dlist_init(&prefetcher->filter_queue); + + SharedStats->wal_distance = 0; + SharedStats->block_distance = 0; + SharedStats->io_depth = 0; + + /* First usage will cause streaming_read to be allocated. */ + prefetcher->reconfigure_count = XLogPrefetchReconfigureCount - 1; + + return prefetcher; +} + +/* + * Destroy a prefetcher and release all resources. + */ +void +XLogPrefetcherFree(XLogPrefetcher *prefetcher) +{ + lrq_free(prefetcher->streaming_read); + hash_destroy(prefetcher->filter_table); + pfree(prefetcher); +} + +/* + * Provide access to the reader. + */ +XLogReaderState * +XLogPrefetcherGetReader(XLogPrefetcher *prefetcher) +{ + return prefetcher->reader; +} + +/* + * Update the statistics visible in the pg_stat_recovery_prefetch view. + */ +void +XLogPrefetcherComputeStats(XLogPrefetcher *prefetcher) +{ + uint32 io_depth; + uint32 completed; + int64 wal_distance; + + + /* How far ahead of replay are we now? */ + if (prefetcher->reader->decode_queue_tail) + { + wal_distance = + prefetcher->reader->decode_queue_tail->lsn - + prefetcher->reader->decode_queue_head->lsn; + } + else + { + wal_distance = 0; + } + + /* How many IOs are currently in flight and completed? */ + io_depth = lrq_inflight(prefetcher->streaming_read); + completed = lrq_completed(prefetcher->streaming_read); + + /* Update the instantaneous stats visible in pg_stat_recovery_prefetch. 
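+ * (block_distance below is io_depth + completed, i.e. the total number of
+ * block references currently tracked ahead of replay, whether or not they
+ * required starting an I/O.)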
*/ + SharedStats->io_depth = io_depth; + SharedStats->block_distance = io_depth + completed; + SharedStats->wal_distance = wal_distance; + + prefetcher->next_stats_shm_lsn = + prefetcher->reader->ReadRecPtr + XLOGPREFETCHER_STATS_DISTANCE; +} + +/* + * A callback that examines the next block reference in the WAL, and possibly + * starts an IO so that a later read will be fast. + * + * Returns LRQ_NEXT_AGAIN if no more WAL data is available yet. + * + * Returns LRQ_NEXT_IO if the next block reference is for a main fork block + * that isn't in the buffer pool, and the kernel has been asked to start + * reading it to make a future read system call faster. An LSN is written to + * *lsn, and the I/O will be considered to have completed once that LSN is + * replayed. + * + * Returns LRQ_NO_IO if we examined the next block reference and found that it + * was already in the buffer pool, or we decided for various reasons not to + * prefetch. + */ +static LsnReadQueueNextStatus +XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) +{ + XLogPrefetcher *prefetcher = (XLogPrefetcher *) pgsr_private; + XLogReaderState *reader = prefetcher->reader; + XLogRecPtr replaying_lsn = reader->ReadRecPtr; + + /* + * We keep track of the record and block we're up to between calls with + * prefetcher->record and prefetcher->next_block_id. + */ + for (;;) + { + DecodedXLogRecord *record; + + /* Try to read a new future record, if we don't already have one. */ + if (prefetcher->record == NULL) + { + bool nonblocking; + + /* + * If there are already records or an error queued up that could + * be replayed, we don't want to block here. Otherwise, it's OK + * to block waiting for more data: presumably the caller has + * nothing else to do. + */ + nonblocking = XLogReaderHasQueuedRecordOrError(reader); + + /* Readahead is disabled until we replay past a certain point. */ + if (nonblocking && replaying_lsn <= prefetcher->no_readahead_until) + return LRQ_NEXT_AGAIN; + + record = XLogReadAhead(prefetcher->reader, nonblocking); + if (record == NULL) + { + /* + * We can't read any more, due to an error or lack of data in + * nonblocking mode. Don't try to read ahead again until + * we've replayed everything already decoded. + */ + if (nonblocking && prefetcher->reader->decode_queue_tail) + prefetcher->no_readahead_until = + prefetcher->reader->decode_queue_tail->lsn; + + return LRQ_NEXT_AGAIN; + } + + /* + * If prefetching is disabled, we don't need to analyze the record + * or issue any prefetches. We just need to cause one record to + * be decoded. + */ + if (!RecoveryPrefetchEnabled()) + { + *lsn = InvalidXLogRecPtr; + return LRQ_NEXT_NO_IO; + } + + /* We have a new record to process. */ + prefetcher->record = record; + prefetcher->next_block_id = 0; + } + else + { + /* Continue to process from last call, or last loop. */ + record = prefetcher->record; + } + + /* + * Check for operations that require us to filter out block ranges, or + * pause readahead completely. + */ + if (replaying_lsn < record->lsn) + { + uint8 rmid = record->header.xl_rmid; + uint8 record_type = record->header.xl_info & ~XLR_INFO_MASK; + + if (rmid == RM_XLOG_ID) + { + if (record_type == XLOG_CHECKPOINT_SHUTDOWN || + record_type == XLOG_END_OF_RECOVERY) + { + /* + * These records might change the TLI. Avoid potential + * bugs if we were to allow "read TLI" and "replay TLI" to + * differ without more analysis. 
+ */ + prefetcher->no_readahead_until = record->lsn; + +#ifdef XLOGPREFETCHER_DEBUG_LEVEL + elog(XLOGPREFETCHER_DEBUG_LEVEL, + "suppressing all readahead until %X/%X is replayed due to possible TLI change", + LSN_FORMAT_ARGS(record->lsn)); +#endif + + /* Fall through so we move past this record. */ + } + } + else if (rmid == RM_DBASE_ID) + { + /* + * When databases are created with the file-copy strategy, + * there are no WAL records to tell us about the creation of + * individual relations. + */ + if (record_type == XLOG_DBASE_CREATE_FILE_COPY) + { + xl_dbase_create_file_copy_rec *xlrec = + (xl_dbase_create_file_copy_rec *) record->main_data; + RelFileNode rnode = {InvalidOid, xlrec->db_id, InvalidOid}; + + /* + * Don't try to prefetch anything in this database until + * it has been created, or we might confuse the blocks of + * different generations, if a database OID or relfilenode + * is reused. It's also more efficient than discovering + * that relations don't exist on disk yet with ENOENT + * errors. + */ + XLogPrefetcherAddFilter(prefetcher, rnode, 0, record->lsn); + +#ifdef XLOGPREFETCHER_DEBUG_LEVEL + elog(XLOGPREFETCHER_DEBUG_LEVEL, + "suppressing prefetch in database %u until %X/%X is replayed due to raw file copy", + rnode.dbNode, + LSN_FORMAT_ARGS(record->lsn)); +#endif + } + } + else if (rmid == RM_SMGR_ID) + { + if (record_type == XLOG_SMGR_CREATE) + { + xl_smgr_create *xlrec = (xl_smgr_create *) + record->main_data; + + if (xlrec->forkNum == MAIN_FORKNUM) + { + /* + * Don't prefetch anything for this whole relation + * until it has been created. Otherwise we might + * confuse the blocks of different generations, if a + * relfilenode is reused. This also avoids the need + * to discover the problem via extra syscalls that + * report ENOENT. + */ + XLogPrefetcherAddFilter(prefetcher, xlrec->rnode, 0, + record->lsn); + +#ifdef XLOGPREFETCHER_DEBUG_LEVEL + elog(XLOGPREFETCHER_DEBUG_LEVEL, + "suppressing prefetch in relation %u/%u/%u until %X/%X is replayed, which creates the relation", + xlrec->rnode.spcNode, + xlrec->rnode.dbNode, + xlrec->rnode.relNode, + LSN_FORMAT_ARGS(record->lsn)); +#endif + } + } + else if (record_type == XLOG_SMGR_TRUNCATE) + { + xl_smgr_truncate *xlrec = (xl_smgr_truncate *) + record->main_data; + + /* + * Don't consider prefetching anything in the truncated + * range until the truncation has been performed. + */ + XLogPrefetcherAddFilter(prefetcher, xlrec->rnode, + xlrec->blkno, + record->lsn); + +#ifdef XLOGPREFETCHER_DEBUG_LEVEL + elog(XLOGPREFETCHER_DEBUG_LEVEL, + "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, which truncates the relation", + xlrec->rnode.spcNode, + xlrec->rnode.dbNode, + xlrec->rnode.relNode, + xlrec->blkno, + LSN_FORMAT_ARGS(record->lsn)); +#endif + } + } + } + + /* Scan the block references, starting where we left off last time. */ + while (prefetcher->next_block_id <= record->max_block_id) + { + int block_id = prefetcher->next_block_id++; + DecodedBkpBlock *block = &record->blocks[block_id]; + SMgrRelation reln; + PrefetchBufferResult result; + + if (!block->in_use) + continue; + + Assert(!BufferIsValid(block->prefetch_buffer));; + + /* + * Record the LSN of this record. When it's replayed, + * LsnReadQueue will consider any IOs submitted for earlier LSNs + * to be finished. + */ + *lsn = record->lsn; + + /* We don't try to prefetch anything but the main fork for now. 
*/ + if (block->forknum != MAIN_FORKNUM) + { + return LRQ_NEXT_NO_IO; + } + + /* + * If there is a full page image attached, we won't be reading the + * page, so don't bother trying to prefetch. + */ + if (block->has_image) + { + XLogPrefetchIncrement(&SharedStats->skip_fpw); + return LRQ_NEXT_NO_IO; + } + + /* There is no point in reading a page that will be zeroed. */ + if (block->flags & BKPBLOCK_WILL_INIT) + { + XLogPrefetchIncrement(&SharedStats->skip_init); + return LRQ_NEXT_NO_IO; + } + + /* Should we skip prefetching this block due to a filter? */ + if (XLogPrefetcherIsFiltered(prefetcher, block->rnode, block->blkno)) + { + XLogPrefetchIncrement(&SharedStats->skip_new); + return LRQ_NEXT_NO_IO; + } + + /* There is no point in repeatedly prefetching the same block. */ + for (int i = 0; i < XLOGPREFETCHER_SEQ_WINDOW_SIZE; ++i) + { + if (block->blkno == prefetcher->recent_block[i] && + RelFileNodeEquals(block->rnode, prefetcher->recent_rnode[i])) + { + /* + * XXX If we also remembered where it was, we could set + * recent_buffer so that recovery could skip smgropen() + * and a buffer table lookup. + */ + XLogPrefetchIncrement(&SharedStats->skip_rep); + return LRQ_NEXT_NO_IO; + } + } + prefetcher->recent_rnode[prefetcher->recent_idx] = block->rnode; + prefetcher->recent_block[prefetcher->recent_idx] = block->blkno; + prefetcher->recent_idx = + (prefetcher->recent_idx + 1) % XLOGPREFETCHER_SEQ_WINDOW_SIZE; + + /* + * We could try to have a fast path for repeated references to the + * same relation (with some scheme to handle invalidations + * safely), but for now we'll call smgropen() every time. + */ + reln = smgropen(block->rnode, InvalidBackendId); + + /* + * If the relation file doesn't exist on disk, for example because + * we're replaying after a crash and the file will be created and + * then unlinked by WAL that hasn't been replayed yet, suppress + * further prefetching in the relation until this record is + * replayed. + */ + if (!smgrexists(reln, MAIN_FORKNUM)) + { +#ifdef XLOGPREFETCHER_DEBUG_LEVEL + elog(XLOGPREFETCHER_DEBUG_LEVEL, + "suppressing all prefetch in relation %u/%u/%u until %X/%X is replayed, because the relation does not exist on disk", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + LSN_FORMAT_ARGS(record->lsn)); +#endif + XLogPrefetcherAddFilter(prefetcher, block->rnode, 0, + record->lsn); + XLogPrefetchIncrement(&SharedStats->skip_new); + return LRQ_NEXT_NO_IO; + } + + /* + * If the relation isn't big enough to contain the referenced + * block yet, suppress prefetching of this block and higher until + * this record is replayed. + */ + if (block->blkno >= smgrnblocks(reln, block->forknum)) + { +#ifdef XLOGPREFETCHER_DEBUG_LEVEL + elog(XLOGPREFETCHER_DEBUG_LEVEL, + "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, because the relation is too small", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + block->blkno, + LSN_FORMAT_ARGS(record->lsn)); +#endif + XLogPrefetcherAddFilter(prefetcher, block->rnode, block->blkno, + record->lsn); + XLogPrefetchIncrement(&SharedStats->skip_new); + return LRQ_NEXT_NO_IO; + } + + /* Try to initiate prefetching. */ + result = PrefetchSharedBuffer(reln, block->forknum, block->blkno); + if (BufferIsValid(result.recent_buffer)) + { + /* Cache hit, nothing to do. 
*/ + XLogPrefetchIncrement(&SharedStats->hit); + block->prefetch_buffer = result.recent_buffer; + return LRQ_NEXT_NO_IO; + } + else if (result.initiated_io) + { + /* Cache miss, I/O (presumably) started. */ + XLogPrefetchIncrement(&SharedStats->prefetch); + block->prefetch_buffer = InvalidBuffer; + return LRQ_NEXT_IO; + } + else + { + /* + * This shouldn't be possible, because we already determined + * that the relation exists on disk and is big enough. + * Something is wrong with the cache invalidation for + * smgrexists(), smgrnblocks(), or the file was unlinked or + * truncated beneath our feet? + */ + elog(ERROR, + "could not prefetch relation %u/%u/%u block %u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + block->blkno); + } + } + + /* + * Several callsites need to be able to read exactly one record + * without any internal readahead. Examples: xlog.c reading + * checkpoint records with emode set to PANIC, which might otherwise + * cause XLogPageRead() to panic on some future page, and xlog.c + * determining where to start writing WAL next, which depends on the + * contents of the reader's internal buffer after reading one record. + * Therefore, don't even think about prefetching until the first + * record after XLogPrefetcherBeginRead() has been consumed. + */ + if (prefetcher->reader->decode_queue_tail && + prefetcher->reader->decode_queue_tail->lsn == prefetcher->begin_ptr) + return LRQ_NEXT_AGAIN; + + /* Advance to the next record. */ + prefetcher->record = NULL; + } + pg_unreachable(); +} + +/* + * Expose statistics about recovery prefetching. + */ +Datum +pg_stat_get_recovery_prefetch(PG_FUNCTION_ARGS) +{ +#define PG_STAT_GET_RECOVERY_PREFETCH_COLS 10 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + Datum values[PG_STAT_GET_RECOVERY_PREFETCH_COLS]; + bool nulls[PG_STAT_GET_RECOVERY_PREFETCH_COLS]; + + InitMaterializedSRF(fcinfo, 0); + + for (int i = 0; i < PG_STAT_GET_RECOVERY_PREFETCH_COLS; ++i) + nulls[i] = false; + + values[0] = TimestampTzGetDatum(pg_atomic_read_u64(&SharedStats->reset_time)); + values[1] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->prefetch)); + values[2] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->hit)); + values[3] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->skip_init)); + values[4] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->skip_new)); + values[5] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->skip_fpw)); + values[6] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->skip_rep)); + values[7] = Int32GetDatum(SharedStats->wal_distance); + values[8] = Int32GetDatum(SharedStats->block_distance); + values[9] = Int32GetDatum(SharedStats->io_depth); + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + + return (Datum) 0; +} + +/* + * Don't prefetch any blocks >= 'blockno' from a given 'rnode', until 'lsn' + * has been replayed. + */ +static inline void +XLogPrefetcherAddFilter(XLogPrefetcher *prefetcher, RelFileNode rnode, + BlockNumber blockno, XLogRecPtr lsn) +{ + XLogPrefetcherFilter *filter; + bool found; + + filter = hash_search(prefetcher->filter_table, &rnode, HASH_ENTER, &found); + if (!found) + { + /* + * Don't allow any prefetching of this block or higher until replayed. + */ + filter->filter_until_replayed = lsn; + filter->filter_from_block = blockno; + dlist_push_head(&prefetcher->filter_queue, &filter->link); + } + else + { + /* + * We were already filtering this rnode. 
Extend the filter's lifetime + * to cover this WAL record, but leave the lower of the block numbers + * there because we don't want to have to track individual blocks. + */ + filter->filter_until_replayed = lsn; + dlist_delete(&filter->link); + dlist_push_head(&prefetcher->filter_queue, &filter->link); + filter->filter_from_block = Min(filter->filter_from_block, blockno); + } +} + +/* + * Have we replayed any records that caused us to begin filtering a block + * range? That means that relations should have been created, extended or + * dropped as required, so we can stop filtering out accesses to a given + * relfilenode. + */ +static inline void +XLogPrefetcherCompleteFilters(XLogPrefetcher *prefetcher, XLogRecPtr replaying_lsn) +{ + while (unlikely(!dlist_is_empty(&prefetcher->filter_queue))) + { + XLogPrefetcherFilter *filter = dlist_tail_element(XLogPrefetcherFilter, + link, + &prefetcher->filter_queue); + + if (filter->filter_until_replayed >= replaying_lsn) + break; + + dlist_delete(&filter->link); + hash_search(prefetcher->filter_table, filter, HASH_REMOVE, NULL); + } +} + +/* + * Check if a given block should be skipped due to a filter. + */ +static inline bool +XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileNode rnode, + BlockNumber blockno) +{ + /* + * Test for empty queue first, because we expect it to be empty most of + * the time and we can avoid the hash table lookup in that case. + */ + if (unlikely(!dlist_is_empty(&prefetcher->filter_queue))) + { + XLogPrefetcherFilter *filter; + + /* See if the block range is filtered. */ + filter = hash_search(prefetcher->filter_table, &rnode, HASH_FIND, NULL); + if (filter && filter->filter_from_block <= blockno) + { +#ifdef XLOGPREFETCHER_DEBUG_LEVEL + elog(XLOGPREFETCHER_DEBUG_LEVEL, + "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (blocks >= %u filtered)", + rnode.spcNode, rnode.dbNode, rnode.relNode, blockno, + LSN_FORMAT_ARGS(filter->filter_until_replayed), + filter->filter_from_block); +#endif + return true; + } + + /* See if the whole database is filtered. */ + rnode.relNode = InvalidOid; + rnode.spcNode = InvalidOid; + filter = hash_search(prefetcher->filter_table, &rnode, HASH_FIND, NULL); + if (filter) + { +#ifdef XLOGPREFETCHER_DEBUG_LEVEL + elog(XLOGPREFETCHER_DEBUG_LEVEL, + "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (whole database)", + rnode.spcNode, rnode.dbNode, rnode.relNode, blockno, + LSN_FORMAT_ARGS(filter->filter_until_replayed)); +#endif + return true; + } + } + + return false; +} + +/* + * A wrapper for XLogBeginRead() that also resets the prefetcher. + */ +void +XLogPrefetcherBeginRead(XLogPrefetcher *prefetcher, XLogRecPtr recPtr) +{ + /* This will forget about any in-flight IO. */ + prefetcher->reconfigure_count--; + + /* Book-keeping to avoid readahead on first read. */ + prefetcher->begin_ptr = recPtr; + + prefetcher->no_readahead_until = 0; + + /* This will forget about any queued up records in the decoder. */ + XLogBeginRead(prefetcher->reader, recPtr); +} + +/* + * A wrapper for XLogReadRecord() that provides the same interface, but also + * tries to initiate I/O for blocks referenced in future WAL records. + */ +XLogRecord * +XLogPrefetcherReadRecord(XLogPrefetcher *prefetcher, char **errmsg) +{ + DecodedXLogRecord *record; + XLogRecPtr replayed_up_to; + + /* + * See if it's time to reset the prefetching machinery, because a relevant + * GUC was changed. 
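+ *
+ * As a concrete illustration (assuming the stock default of
+ * maintenance_io_concurrency = 10), the queue rebuilt below allows up to
+ * 10 prefetch I/Os in flight while looking ahead across as many as
+ * 10 * XLOGPREFETCHER_DISTANCE_MULTIPLIER = 40 decoded block references.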
+ */ + if (unlikely(XLogPrefetchReconfigureCount != prefetcher->reconfigure_count)) + { + uint32 max_distance; + uint32 max_inflight; + + if (prefetcher->streaming_read) + lrq_free(prefetcher->streaming_read); + + if (RecoveryPrefetchEnabled()) + { + Assert(maintenance_io_concurrency > 0); + max_inflight = maintenance_io_concurrency; + max_distance = max_inflight * XLOGPREFETCHER_DISTANCE_MULTIPLIER; + } + else + { + max_inflight = 1; + max_distance = 1; + } + + prefetcher->streaming_read = lrq_alloc(max_distance, + max_inflight, + (uintptr_t) prefetcher, + XLogPrefetcherNextBlock); + + prefetcher->reconfigure_count = XLogPrefetchReconfigureCount; + } + + /* + * Release last returned record, if there is one, as it's now been + * replayed. + */ + replayed_up_to = XLogReleasePreviousRecord(prefetcher->reader); + + /* + * Can we drop any filters yet? If we were waiting for a relation to be + * created or extended, it is now OK to access blocks in the covered + * range. + */ + XLogPrefetcherCompleteFilters(prefetcher, replayed_up_to); + + /* + * All IO initiated by earlier WAL is now completed. This might trigger + * further prefetching. + */ + lrq_complete_lsn(prefetcher->streaming_read, replayed_up_to); + + /* + * If there's nothing queued yet, then start prefetching to cause at least + * one record to be queued. + */ + if (!XLogReaderHasQueuedRecordOrError(prefetcher->reader)) + { + Assert(lrq_inflight(prefetcher->streaming_read) == 0); + Assert(lrq_completed(prefetcher->streaming_read) == 0); + lrq_prefetch(prefetcher->streaming_read); + } + + /* Read the next record. */ + record = XLogNextRecord(prefetcher->reader, errmsg); + if (!record) + return NULL; + + /* + * The record we just got is the "current" one, for the benefit of the + * XLogRecXXX() macros. + */ + Assert(record == prefetcher->reader->record); + + /* + * If maintenance_io_concurrency is set very low, we might have started + * prefetching some but not all of the blocks referenced in the record + * we're about to return. Forget about the rest of the blocks in this + * record by dropping the prefetcher's reference to it. + */ + if (record == prefetcher->record) + prefetcher->record = NULL; + + /* + * See if it's time to compute some statistics, because enough WAL has + * been processed. + */ + if (unlikely(record->lsn >= prefetcher->next_stats_shm_lsn)) + XLogPrefetcherComputeStats(prefetcher); + + Assert(record == prefetcher->reader->record); + + return &record->header; +} + +bool +check_recovery_prefetch(int *new_value, void **extra, GucSource source) +{ +#ifndef USE_PREFETCH + if (*new_value == RECOVERY_PREFETCH_ON) + { + GUC_check_errdetail("recovery_prefetch is not supported on platforms that lack posix_fadvise()."); + return false; + } +#endif + + return true; +} + +void +assign_recovery_prefetch(int new_value, void *extra) +{ + /* Reconfigure prefetching, because a setting it depends on changed. 
*/ + recovery_prefetch = new_value; + if (AmStartupProcess()) + XLogPrefetchReconfigure(); +} diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c new file mode 100644 index 0000000..c15da9d --- /dev/null +++ b/src/backend/access/transam/xlogreader.c @@ -0,0 +1,2165 @@ +/*------------------------------------------------------------------------- + * + * xlogreader.c + * Generic XLog reading facility + * + * Portions Copyright (c) 2013-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/transam/xlogreader.c + * + * NOTES + * See xlogreader.h for more notes on this facility. + * + * This file is compiled as both front-end and backend code, so it + * may not use ereport, server-defined static variables, etc. + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <unistd.h> +#ifdef USE_LZ4 +#include <lz4.h> +#endif +#ifdef USE_ZSTD +#include <zstd.h> +#endif + +#include "access/transam.h" +#include "access/xlog_internal.h" +#include "access/xlogreader.h" +#include "access/xlogrecord.h" +#include "catalog/pg_control.h" +#include "common/pg_lzcompress.h" +#include "replication/origin.h" + +#ifndef FRONTEND +#include "miscadmin.h" +#include "pgstat.h" +#include "utils/memutils.h" +#else +#include "common/logging.h" +#endif + +static void report_invalid_record(XLogReaderState *state, const char *fmt,...) + pg_attribute_printf(2, 3); +static void allocate_recordbuf(XLogReaderState *state, uint32 reclength); +static int ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, + int reqLen); +static void XLogReaderInvalReadState(XLogReaderState *state); +static XLogPageReadResult XLogDecodeNextRecord(XLogReaderState *state, bool non_blocking); +static bool ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, + XLogRecPtr PrevRecPtr, XLogRecord *record, bool randAccess); +static bool ValidXLogRecord(XLogReaderState *state, XLogRecord *record, + XLogRecPtr recptr); +static void ResetDecoder(XLogReaderState *state); +static void WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, + int segsize, const char *waldir); + +/* size of the buffer allocated for error message. */ +#define MAX_ERRORMSG_LEN 1000 + +/* + * Default size; large enough that typical users of XLogReader won't often need + * to use the 'oversized' memory allocation code path. + */ +#define DEFAULT_DECODE_BUFFER_SIZE (64 * 1024) + +/* + * Construct a string in state->errormsg_buf explaining what's wrong with + * the current record being read. + */ +static void +report_invalid_record(XLogReaderState *state, const char *fmt,...) +{ + va_list args; + + fmt = _(fmt); + + va_start(args, fmt); + vsnprintf(state->errormsg_buf, MAX_ERRORMSG_LEN, fmt, args); + va_end(args); + + state->errormsg_deferred = true; +} + +/* + * Set the size of the decoding buffer. A pointer to a caller supplied memory + * region may also be passed in, in which case non-oversized records will be + * decoded there. + */ +void +XLogReaderSetDecodeBuffer(XLogReaderState *state, void *buffer, size_t size) +{ + Assert(state->decode_buffer == NULL); + + state->decode_buffer = buffer; + state->decode_buffer_size = size; + state->decode_buffer_tail = buffer; + state->decode_buffer_head = buffer; +} + +/* + * Allocate and initialize a new XLogReader. + * + * Returns NULL if the xlogreader couldn't be allocated. 
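+ *
+ * A minimal usage sketch (the callback and variable names here are only
+ * placeholders):
+ *
+ *		reader = XLogReaderAllocate(wal_segment_size, NULL,
+ *									XL_ROUTINE(.page_read = &my_page_read,
+ *											   .segment_open = &my_segment_open,
+ *											   .segment_close = &my_segment_close),
+ *									my_private_data);
+ *		XLogBeginRead(reader, start_lsn);
+ *		while ((record = XLogReadRecord(reader, &errormsg)) != NULL)
+ *			... examine the record ...
+ *		XLogReaderFree(reader);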
+ */ +XLogReaderState * +XLogReaderAllocate(int wal_segment_size, const char *waldir, + XLogReaderRoutine *routine, void *private_data) +{ + XLogReaderState *state; + + state = (XLogReaderState *) + palloc_extended(sizeof(XLogReaderState), + MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO); + if (!state) + return NULL; + + /* initialize caller-provided support functions */ + state->routine = *routine; + + /* + * Permanently allocate readBuf. We do it this way, rather than just + * making a static array, for two reasons: (1) no need to waste the + * storage in most instantiations of the backend; (2) a static char array + * isn't guaranteed to have any particular alignment, whereas + * palloc_extended() will provide MAXALIGN'd storage. + */ + state->readBuf = (char *) palloc_extended(XLOG_BLCKSZ, + MCXT_ALLOC_NO_OOM); + if (!state->readBuf) + { + pfree(state); + return NULL; + } + + /* Initialize segment info. */ + WALOpenSegmentInit(&state->seg, &state->segcxt, wal_segment_size, + waldir); + + /* system_identifier initialized to zeroes above */ + state->private_data = private_data; + /* ReadRecPtr, EndRecPtr and readLen initialized to zeroes above */ + state->errormsg_buf = palloc_extended(MAX_ERRORMSG_LEN + 1, + MCXT_ALLOC_NO_OOM); + if (!state->errormsg_buf) + { + pfree(state->readBuf); + pfree(state); + return NULL; + } + state->errormsg_buf[0] = '\0'; + + /* + * Allocate an initial readRecordBuf of minimal size, which can later be + * enlarged if necessary. + */ + allocate_recordbuf(state, 0); + return state; +} + +void +XLogReaderFree(XLogReaderState *state) +{ + if (state->seg.ws_file != -1) + state->routine.segment_close(state); + + if (state->decode_buffer && state->free_decode_buffer) + pfree(state->decode_buffer); + + pfree(state->errormsg_buf); + if (state->readRecordBuf) + pfree(state->readRecordBuf); + pfree(state->readBuf); + pfree(state); +} + +/* + * Allocate readRecordBuf to fit a record of at least the given length. + * + * readRecordBufSize is set to the new buffer size. + * + * To avoid useless small increases, round its size to a multiple of + * XLOG_BLCKSZ, and make sure it's at least 5*Max(BLCKSZ, XLOG_BLCKSZ) to start + * with. (That is enough for all "normal" records, but very large commit or + * abort records might need more space.) + * + * Note: This routine should *never* be called for xl_tot_len until the header + * of the record has been fully validated. + */ +static void +allocate_recordbuf(XLogReaderState *state, uint32 reclength) +{ + uint32 newSize = reclength; + + newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ); + newSize = Max(newSize, 5 * Max(BLCKSZ, XLOG_BLCKSZ)); + + if (state->readRecordBuf) + pfree(state->readRecordBuf); + state->readRecordBuf = (char *) palloc(newSize); + state->readRecordBufSize = newSize; +} + +/* + * Initialize the passed segment structs. + */ +static void +WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, + int segsize, const char *waldir) +{ + seg->ws_file = -1; + seg->ws_segno = 0; + seg->ws_tli = 0; + + segcxt->ws_segsize = segsize; + if (waldir) + snprintf(segcxt->ws_dir, MAXPGPATH, "%s", waldir); +} + +/* + * Begin reading WAL at 'RecPtr'. + * + * 'RecPtr' should point to the beginning of a valid WAL record. Pointing at + * the beginning of a page is also OK, if there is a new record right after + * the page header, i.e. not a continuation. + * + * This does not make any attempt to read the WAL yet, and hence cannot fail. 
+ * If the starting address is not correct, the first call to XLogReadRecord() + * will error out. + */ +void +XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr) +{ + Assert(!XLogRecPtrIsInvalid(RecPtr)); + + ResetDecoder(state); + + /* Begin at the passed-in record pointer. */ + state->EndRecPtr = RecPtr; + state->NextRecPtr = RecPtr; + state->ReadRecPtr = InvalidXLogRecPtr; + state->DecodeRecPtr = InvalidXLogRecPtr; +} + +/* + * Release the last record that was returned by XLogNextRecord(), if any, to + * free up space. Returns the LSN past the end of the record. + */ +XLogRecPtr +XLogReleasePreviousRecord(XLogReaderState *state) +{ + DecodedXLogRecord *record; + XLogRecPtr next_lsn; + + if (!state->record) + return InvalidXLogRecPtr; + + /* + * Remove it from the decoded record queue. It must be the oldest item + * decoded, decode_queue_head. + */ + record = state->record; + next_lsn = record->next_lsn; + Assert(record == state->decode_queue_head); + state->record = NULL; + state->decode_queue_head = record->next; + + /* It might also be the newest item decoded, decode_queue_tail. */ + if (state->decode_queue_tail == record) + state->decode_queue_tail = NULL; + + /* Release the space. */ + if (unlikely(record->oversized)) + { + /* It's not in the decode buffer, so free it to release space. */ + pfree(record); + } + else + { + /* It must be the head (oldest) record in the decode buffer. */ + Assert(state->decode_buffer_head == (char *) record); + + /* + * We need to update head to point to the next record that is in the + * decode buffer, if any, being careful to skip oversized ones + * (they're not in the decode buffer). + */ + record = record->next; + while (unlikely(record && record->oversized)) + record = record->next; + + if (record) + { + /* Adjust head to release space up to the next record. */ + state->decode_buffer_head = (char *) record; + } + else + { + /* + * Otherwise we might as well just reset head and tail to the + * start of the buffer space, because we're empty. This means + * we'll keep overwriting the same piece of memory if we're not + * doing any prefetching. + */ + state->decode_buffer_head = state->decode_buffer; + state->decode_buffer_tail = state->decode_buffer; + } + } + + return next_lsn; +} + +/* + * Attempt to read an XLOG record. + * + * XLogBeginRead() or XLogFindNextRecord() and then XLogReadAhead() must be + * called before the first call to XLogNextRecord(). This functions returns + * records and errors that were put into an internal queue by XLogReadAhead(). + * + * On success, a record is returned. + * + * The returned record (or *errormsg) points to an internal buffer that's + * valid until the next call to XLogNextRecord. + */ +DecodedXLogRecord * +XLogNextRecord(XLogReaderState *state, char **errormsg) +{ + /* Release the last record returned by XLogNextRecord(). */ + XLogReleasePreviousRecord(state); + + if (state->decode_queue_head == NULL) + { + *errormsg = NULL; + if (state->errormsg_deferred) + { + if (state->errormsg_buf[0] != '\0') + *errormsg = state->errormsg_buf; + state->errormsg_deferred = false; + } + + /* + * state->EndRecPtr is expected to have been set by the last call to + * XLogBeginRead() or XLogNextRecord(), and is the location of the + * error. + */ + Assert(!XLogRecPtrIsInvalid(state->EndRecPtr)); + + return NULL; + } + + /* + * Record this as the most recent record returned, so that we'll release + * it next time. 
This also exposes it to the traditional + * XLogRecXXX(xlogreader) macros, which work with the decoder rather than + * the record for historical reasons. + */ + state->record = state->decode_queue_head; + + /* + * Update the pointers to the beginning and one-past-the-end of this + * record, again for the benefit of historical code that expected the + * decoder to track this rather than accessing these fields of the record + * itself. + */ + state->ReadRecPtr = state->record->lsn; + state->EndRecPtr = state->record->next_lsn; + + *errormsg = NULL; + + return state->record; +} + +/* + * Attempt to read an XLOG record. + * + * XLogBeginRead() or XLogFindNextRecord() must be called before the first call + * to XLogReadRecord(). + * + * If the page_read callback fails to read the requested data, NULL is + * returned. The callback is expected to have reported the error; errormsg + * is set to NULL. + * + * If the reading fails for some other reason, NULL is also returned, and + * *errormsg is set to a string with details of the failure. + * + * The returned pointer (or *errormsg) points to an internal buffer that's + * valid until the next call to XLogReadRecord. + */ +XLogRecord * +XLogReadRecord(XLogReaderState *state, char **errormsg) +{ + DecodedXLogRecord *decoded; + + /* + * Release last returned record, if there is one. We need to do this so + * that we can check for empty decode queue accurately. + */ + XLogReleasePreviousRecord(state); + + /* + * Call XLogReadAhead() in blocking mode to make sure there is something + * in the queue, though we don't use the result. + */ + if (!XLogReaderHasQueuedRecordOrError(state)) + XLogReadAhead(state, false /* nonblocking */ ); + + /* Consume the head record or error. */ + decoded = XLogNextRecord(state, errormsg); + if (decoded) + { + /* + * This function returns a pointer to the record's header, not the + * actual decoded record. The caller will access the decoded record + * through the XLogRecGetXXX() macros, which reach the decoded + * recorded as xlogreader->record. + */ + Assert(state->record == decoded); + return &decoded->header; + } + + return NULL; +} + +/* + * Allocate space for a decoded record. The only member of the returned + * object that is initialized is the 'oversized' flag, indicating that the + * decoded record wouldn't fit in the decode buffer and must eventually be + * freed explicitly. + * + * The caller is responsible for adjusting decode_buffer_tail with the real + * size after successfully decoding a record into this space. This way, if + * decoding fails, then there is nothing to undo unless the 'oversized' flag + * was set and pfree() must be called. + * + * Return NULL if there is no space in the decode buffer and allow_oversized + * is false, or if memory allocation fails for an oversized buffer. + */ +static DecodedXLogRecord * +XLogReadRecordAlloc(XLogReaderState *state, size_t xl_tot_len, bool allow_oversized) +{ + size_t required_space = DecodeXLogRecordRequiredSpace(xl_tot_len); + DecodedXLogRecord *decoded = NULL; + + /* Allocate a circular decode buffer if we don't have one already. */ + if (unlikely(state->decode_buffer == NULL)) + { + if (state->decode_buffer_size == 0) + state->decode_buffer_size = DEFAULT_DECODE_BUFFER_SIZE; + state->decode_buffer = palloc(state->decode_buffer_size); + state->decode_buffer_head = state->decode_buffer; + state->decode_buffer_tail = state->decode_buffer; + state->free_decode_buffer = true; + } + + /* Try to allocate space in the circular decode buffer. 
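+ * A rough picture of the ring handled by the cases below ('#' is space
+ * holding decoded records, '.' is free space):
+ *
+ *		tail >= head:	[....head####tail........]
+ *						try the free space after tail, else wrap around
+ *						and try the free space before head;
+ *		tail <  head:	[####tail........head####]
+ *						try the free space between tail and head.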
*/ + if (state->decode_buffer_tail >= state->decode_buffer_head) + { + /* Empty, or tail is to the right of head. */ + if (state->decode_buffer_tail + required_space <= + state->decode_buffer + state->decode_buffer_size) + { + /* There is space between tail and end. */ + decoded = (DecodedXLogRecord *) state->decode_buffer_tail; + decoded->oversized = false; + return decoded; + } + else if (state->decode_buffer + required_space < + state->decode_buffer_head) + { + /* There is space between start and head. */ + decoded = (DecodedXLogRecord *) state->decode_buffer; + decoded->oversized = false; + return decoded; + } + } + else + { + /* Tail is to the left of head. */ + if (state->decode_buffer_tail + required_space < + state->decode_buffer_head) + { + /* There is space between tail and head. */ + decoded = (DecodedXLogRecord *) state->decode_buffer_tail; + decoded->oversized = false; + return decoded; + } + } + + /* Not enough space in the decode buffer. Are we allowed to allocate? */ + if (allow_oversized) + { + decoded = palloc(required_space); + decoded->oversized = true; + return decoded; + } + + return NULL; +} + +static XLogPageReadResult +XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) +{ + XLogRecPtr RecPtr; + XLogRecord *record; + XLogRecPtr targetPagePtr; + bool randAccess; + uint32 len, + total_len; + uint32 targetRecOff; + uint32 pageHeaderSize; + bool assembled; + bool gotheader; + int readOff; + DecodedXLogRecord *decoded; + char *errormsg; /* not used */ + + /* + * randAccess indicates whether to verify the previous-record pointer of + * the record we're reading. We only do this if we're reading + * sequentially, which is what we initially assume. + */ + randAccess = false; + + /* reset error state */ + state->errormsg_buf[0] = '\0'; + decoded = NULL; + + state->abortedRecPtr = InvalidXLogRecPtr; + state->missingContrecPtr = InvalidXLogRecPtr; + + RecPtr = state->NextRecPtr; + + if (state->DecodeRecPtr != InvalidXLogRecPtr) + { + /* read the record after the one we just read */ + + /* + * NextRecPtr is pointing to end+1 of the previous WAL record. If + * we're at a page boundary, no more records can fit on the current + * page. We must skip over the page header, but we can't do that until + * we've read in the page, since the header size is variable. + */ + } + else + { + /* + * Caller supplied a position to start at. + * + * In this case, NextRecPtr should already be pointing to a valid + * record starting position. + */ + Assert(XRecOffIsValid(RecPtr)); + randAccess = true; + } + +restart: + state->nonblocking = nonblocking; + state->currRecPtr = RecPtr; + assembled = false; + + targetPagePtr = RecPtr - (RecPtr % XLOG_BLCKSZ); + targetRecOff = RecPtr % XLOG_BLCKSZ; + + /* + * Read the page containing the record into state->readBuf. Request enough + * byte to cover the whole record header, or at least the part of it that + * fits on the same page. + */ + readOff = ReadPageInternal(state, targetPagePtr, + Min(targetRecOff + SizeOfXLogRecord, XLOG_BLCKSZ)); + if (readOff == XLREAD_WOULDBLOCK) + return XLREAD_WOULDBLOCK; + else if (readOff < 0) + goto err; + + /* + * ReadPageInternal always returns at least the page header, so we can + * examine it now. + */ + pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf); + if (targetRecOff == 0) + { + /* + * At page start, so skip over page header. 
+ */ + RecPtr += pageHeaderSize; + targetRecOff = pageHeaderSize; + } + else if (targetRecOff < pageHeaderSize) + { + report_invalid_record(state, "invalid record offset at %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + if ((((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && + targetRecOff == pageHeaderSize) + { + report_invalid_record(state, "contrecord is requested by %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + /* ReadPageInternal has verified the page header */ + Assert(pageHeaderSize <= readOff); + + /* + * Read the record length. + * + * NB: Even though we use an XLogRecord pointer here, the whole record + * header might not fit on this page. xl_tot_len is the first field of the + * struct, so it must be on this page (the records are MAXALIGNed), but we + * cannot access any other fields until we've verified that we got the + * whole header. + */ + record = (XLogRecord *) (state->readBuf + RecPtr % XLOG_BLCKSZ); + total_len = record->xl_tot_len; + + /* + * If the whole record header is on this page, validate it immediately. + * Otherwise do just a basic sanity check on xl_tot_len, and validate the + * rest of the header after reading it from the next page. The xl_tot_len + * check is necessary here to ensure that we enter the "Need to reassemble + * record" code path below; otherwise we might fail to apply + * ValidXLogRecordHeader at all. + */ + if (targetRecOff <= XLOG_BLCKSZ - SizeOfXLogRecord) + { + if (!ValidXLogRecordHeader(state, RecPtr, state->DecodeRecPtr, record, + randAccess)) + goto err; + gotheader = true; + } + else + { + /* There may be no next page if it's too small. */ + if (total_len < SizeOfXLogRecord) + { + report_invalid_record(state, + "invalid record length at %X/%X: wanted %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + (uint32) SizeOfXLogRecord, total_len); + goto err; + } + /* We'll validate the header once we have the next page. */ + gotheader = false; + } + + /* + * Try to find space to decode this record, if we can do so without + * calling palloc. If we can't, we'll try again below after we've + * validated that total_len isn't garbage bytes from a recycled WAL page. + */ + decoded = XLogReadRecordAlloc(state, + total_len, + false /* allow_oversized */ ); + if (decoded == NULL && nonblocking) + { + /* + * There is no space in the circular decode buffer, and the caller is + * only reading ahead. The caller should consume existing records to + * make space. + */ + return XLREAD_WOULDBLOCK; + } + + len = XLOG_BLCKSZ - RecPtr % XLOG_BLCKSZ; + if (total_len > len) + { + /* Need to reassemble record */ + char *contdata; + XLogPageHeader pageHeader; + char *buffer; + uint32 gotlen; + + assembled = true; + + /* + * We always have space for a couple of pages, enough to validate a + * boundary-spanning record header. + */ + Assert(state->readRecordBufSize >= XLOG_BLCKSZ * 2); + Assert(state->readRecordBufSize >= len); + + /* Copy the first fragment of the record from the first page. 
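+ * The loop below then appends the record's continuation fragments from the
+ * following page(s) into readRecordBuf, growing that buffer if the total
+ * record length exceeds its current size, until the whole record has been
+ * reassembled and its header and CRC can be validated.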
*/ + memcpy(state->readRecordBuf, + state->readBuf + RecPtr % XLOG_BLCKSZ, len); + buffer = state->readRecordBuf + len; + gotlen = len; + + do + { + /* Calculate pointer to beginning of next page */ + targetPagePtr += XLOG_BLCKSZ; + + /* Wait for the next page to become available */ + readOff = ReadPageInternal(state, targetPagePtr, + Min(total_len - gotlen + SizeOfXLogShortPHD, + XLOG_BLCKSZ)); + + if (readOff == XLREAD_WOULDBLOCK) + return XLREAD_WOULDBLOCK; + else if (readOff < 0) + goto err; + + Assert(SizeOfXLogShortPHD <= readOff); + + pageHeader = (XLogPageHeader) state->readBuf; + + /* + * If we were expecting a continuation record and got an + * "overwrite contrecord" flag, that means the continuation record + * was overwritten with a different record. Restart the read by + * assuming the address to read is the location where we found + * this flag; but keep track of the LSN of the record we were + * reading, for later verification. + */ + if (pageHeader->xlp_info & XLP_FIRST_IS_OVERWRITE_CONTRECORD) + { + state->overwrittenRecPtr = RecPtr; + RecPtr = targetPagePtr; + goto restart; + } + + /* Check that the continuation on next page looks valid */ + if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) + { + report_invalid_record(state, + "there is no contrecord flag at %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + /* + * Cross-check that xlp_rem_len agrees with how much of the record + * we expect there to be left. + */ + if (pageHeader->xlp_rem_len == 0 || + total_len != (pageHeader->xlp_rem_len + gotlen)) + { + report_invalid_record(state, + "invalid contrecord length %u (expected %lld) at %X/%X", + pageHeader->xlp_rem_len, + ((long long) total_len) - gotlen, + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + /* Append the continuation from this page to the buffer */ + pageHeaderSize = XLogPageHeaderSize(pageHeader); + + if (readOff < pageHeaderSize) + readOff = ReadPageInternal(state, targetPagePtr, + pageHeaderSize); + + Assert(pageHeaderSize <= readOff); + + contdata = (char *) state->readBuf + pageHeaderSize; + len = XLOG_BLCKSZ - pageHeaderSize; + if (pageHeader->xlp_rem_len < len) + len = pageHeader->xlp_rem_len; + + if (readOff < pageHeaderSize + len) + readOff = ReadPageInternal(state, targetPagePtr, + pageHeaderSize + len); + + memcpy(buffer, (char *) contdata, len); + buffer += len; + gotlen += len; + + /* If we just reassembled the record header, validate it. */ + if (!gotheader) + { + record = (XLogRecord *) state->readRecordBuf; + if (!ValidXLogRecordHeader(state, RecPtr, state->DecodeRecPtr, + record, randAccess)) + goto err; + gotheader = true; + } + + /* + * We might need a bigger buffer. We have validated the record + * header, in the case that it split over a page boundary. We've + * also cross-checked total_len against xlp_rem_len on the second + * page, and verified xlp_pageaddr on both. + */ + if (total_len > state->readRecordBufSize) + { + char save_copy[XLOG_BLCKSZ * 2]; + + /* + * Save and restore the data we already had. It can't be more + * than two pages. 
+ */
+ Assert(gotlen <= lengthof(save_copy));
+ Assert(gotlen <= state->readRecordBufSize);
+ memcpy(save_copy, state->readRecordBuf, gotlen);
+ allocate_recordbuf(state, total_len);
+ memcpy(state->readRecordBuf, save_copy, gotlen);
+ buffer = state->readRecordBuf + gotlen;
+ }
+ } while (gotlen < total_len);
+ Assert(gotheader);
+
+ record = (XLogRecord *) state->readRecordBuf;
+ if (!ValidXLogRecord(state, record, RecPtr))
+ goto err;
+
+ pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf);
+ state->DecodeRecPtr = RecPtr;
+ state->NextRecPtr = targetPagePtr + pageHeaderSize +
+ MAXALIGN(pageHeader->xlp_rem_len);
+ }
+ else
+ {
+ /* Wait for the record data to become available */
+ readOff = ReadPageInternal(state, targetPagePtr,
+ Min(targetRecOff + total_len, XLOG_BLCKSZ));
+ if (readOff == XLREAD_WOULDBLOCK)
+ return XLREAD_WOULDBLOCK;
+ else if (readOff < 0)
+ goto err;
+
+ /* Record does not cross a page boundary */
+ if (!ValidXLogRecord(state, record, RecPtr))
+ goto err;
+
+ state->NextRecPtr = RecPtr + MAXALIGN(total_len);
+
+ state->DecodeRecPtr = RecPtr;
+ }
+
+ /*
+ * Special processing if it's an XLOG SWITCH record
+ */
+ if (record->xl_rmid == RM_XLOG_ID &&
+ (record->xl_info & ~XLR_INFO_MASK) == XLOG_SWITCH)
+ {
+ /* Pretend it extends to end of segment */
+ state->NextRecPtr += state->segcxt.ws_segsize - 1;
+ state->NextRecPtr -= XLogSegmentOffset(state->NextRecPtr, state->segcxt.ws_segsize);
+ }
+
+ /*
+ * If we got here without a DecodedXLogRecord, it means we needed to
+ * validate total_len before trusting it, but by now we've done that.
+ */
+ if (decoded == NULL)
+ {
+ Assert(!nonblocking);
+ decoded = XLogReadRecordAlloc(state,
+ total_len,
+ true /* allow_oversized */ );
+ /* allocation should always happen under allow_oversized */
+ Assert(decoded != NULL);
+ }
+
+ if (DecodeXLogRecord(state, decoded, record, RecPtr, &errormsg))
+ {
+ /* Record the location of the next record. */
+ decoded->next_lsn = state->NextRecPtr;
+
+ /*
+ * If it's in the decode buffer, mark the decode buffer space as
+ * occupied.
+ */
+ if (!decoded->oversized)
+ {
+ /* The new decode buffer head must be MAXALIGNed. */
+ Assert(decoded->size == MAXALIGN(decoded->size));
+ if ((char *) decoded == state->decode_buffer)
+ state->decode_buffer_tail = state->decode_buffer + decoded->size;
+ else
+ state->decode_buffer_tail += decoded->size;
+ }
+
+ /* Insert it into the queue of decoded records. */
+ Assert(state->decode_queue_tail != decoded);
+ if (state->decode_queue_tail)
+ state->decode_queue_tail->next = decoded;
+ state->decode_queue_tail = decoded;
+ if (!state->decode_queue_head)
+ state->decode_queue_head = decoded;
+ return XLREAD_SUCCESS;
+ }
+
+err:
+ if (assembled)
+ {
+ /*
+ * We get here when a record that spans multiple pages needs to be
+ * assembled, but something went wrong -- perhaps a contrecord piece
+ * was lost. If caller is WAL replay, it will know where the aborted
+ * record was and where to direct followup WAL to be written, marking
+ * the next piece with XLP_FIRST_IS_OVERWRITE_CONTRECORD, which will
+ * in turn signal downstream WAL consumers that the broken WAL record
+ * is to be ignored.
+ */
+ state->abortedRecPtr = RecPtr;
+ state->missingContrecPtr = targetPagePtr;
+
+ /*
+ * If we got here without reporting an error, make sure an error is
+ * queued so that XLogPrefetcherReadRecord() doesn't bring us back a
+ * second time and clobber the above state.
+ */ + state->errormsg_deferred = true; + } + + if (decoded && decoded->oversized) + pfree(decoded); + + /* + * Invalidate the read state. We might read from a different source after + * failure. + */ + XLogReaderInvalReadState(state); + + /* + * If an error was written to errmsg_buf, it'll be returned to the caller + * of XLogReadRecord() after all successfully decoded records from the + * read queue. + */ + + return XLREAD_FAIL; +} + +/* + * Try to decode the next available record, and return it. The record will + * also be returned to XLogNextRecord(), which must be called to 'consume' + * each record. + * + * If nonblocking is true, may return NULL due to lack of data or WAL decoding + * space. + */ +DecodedXLogRecord * +XLogReadAhead(XLogReaderState *state, bool nonblocking) +{ + XLogPageReadResult result; + + if (state->errormsg_deferred) + return NULL; + + result = XLogDecodeNextRecord(state, nonblocking); + if (result == XLREAD_SUCCESS) + { + Assert(state->decode_queue_tail != NULL); + return state->decode_queue_tail; + } + + return NULL; +} + +/* + * Read a single xlog page including at least [pageptr, reqLen] of valid data + * via the page_read() callback. + * + * Returns XLREAD_FAIL if the required page cannot be read for some + * reason; errormsg_buf is set in that case (unless the error occurs in the + * page_read callback). + * + * Returns XLREAD_WOULDBLOCK if the requested data can't be read without + * waiting. This can be returned only if the installed page_read callback + * respects the state->nonblocking flag, and cannot read the requested data + * immediately. + * + * We fetch the page from a reader-local cache if we know we have the required + * data and if there hasn't been any error since caching the data. + */ +static int +ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) +{ + int readLen; + uint32 targetPageOff; + XLogSegNo targetSegNo; + XLogPageHeader hdr; + + Assert((pageptr % XLOG_BLCKSZ) == 0); + + XLByteToSeg(pageptr, targetSegNo, state->segcxt.ws_segsize); + targetPageOff = XLogSegmentOffset(pageptr, state->segcxt.ws_segsize); + + /* check whether we have all the requested data already */ + if (targetSegNo == state->seg.ws_segno && + targetPageOff == state->segoff && reqLen <= state->readLen) + return state->readLen; + + /* + * Invalidate contents of internal buffer before read attempt. Just set + * the length to 0, rather than a full XLogReaderInvalReadState(), so we + * don't forget the segment we last successfully read. + */ + state->readLen = 0; + + /* + * Data is not in our buffer. + * + * Every time we actually read the segment, even if we looked at parts of + * it before, we need to do verification as the page_read callback might + * now be rereading data from a different source. + * + * Whenever switching to a new WAL segment, we read the first page of the + * file and validate its header, even if that's not where the target + * record is. This is so that we can check the additional identification + * info that is present in the first page's "long" header. 
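+ * (The long header carries the system identifier, the WAL segment size and
+ * XLOG_BLCKSZ, which XLogReaderValidatePageHeader() checks against this
+ * reader's expectations.)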
+ */ + if (targetSegNo != state->seg.ws_segno && targetPageOff != 0) + { + XLogRecPtr targetSegmentPtr = pageptr - targetPageOff; + + readLen = state->routine.page_read(state, targetSegmentPtr, XLOG_BLCKSZ, + state->currRecPtr, + state->readBuf); + if (readLen == XLREAD_WOULDBLOCK) + return XLREAD_WOULDBLOCK; + else if (readLen < 0) + goto err; + + /* we can be sure to have enough WAL available, we scrolled back */ + Assert(readLen == XLOG_BLCKSZ); + + if (!XLogReaderValidatePageHeader(state, targetSegmentPtr, + state->readBuf)) + goto err; + } + + /* + * First, read the requested data length, but at least a short page header + * so that we can validate it. + */ + readLen = state->routine.page_read(state, pageptr, Max(reqLen, SizeOfXLogShortPHD), + state->currRecPtr, + state->readBuf); + if (readLen == XLREAD_WOULDBLOCK) + return XLREAD_WOULDBLOCK; + else if (readLen < 0) + goto err; + + Assert(readLen <= XLOG_BLCKSZ); + + /* Do we have enough data to check the header length? */ + if (readLen <= SizeOfXLogShortPHD) + goto err; + + Assert(readLen >= reqLen); + + hdr = (XLogPageHeader) state->readBuf; + + /* still not enough */ + if (readLen < XLogPageHeaderSize(hdr)) + { + readLen = state->routine.page_read(state, pageptr, XLogPageHeaderSize(hdr), + state->currRecPtr, + state->readBuf); + if (readLen == XLREAD_WOULDBLOCK) + return XLREAD_WOULDBLOCK; + else if (readLen < 0) + goto err; + } + + /* + * Now that we know we have the full header, validate it. + */ + if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr)) + goto err; + + /* update read state information */ + state->seg.ws_segno = targetSegNo; + state->segoff = targetPageOff; + state->readLen = readLen; + + return readLen; + +err: + XLogReaderInvalReadState(state); + + return XLREAD_FAIL; +} + +/* + * Invalidate the xlogreader's read state to force a re-read. + */ +static void +XLogReaderInvalReadState(XLogReaderState *state) +{ + state->seg.ws_segno = 0; + state->segoff = 0; + state->readLen = 0; +} + +/* + * Validate an XLOG record header. + * + * This is just a convenience subroutine to avoid duplicated code in + * XLogReadRecord. It's not intended for use from anywhere else. + */ +static bool +ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, + XLogRecPtr PrevRecPtr, XLogRecord *record, + bool randAccess) +{ + if (record->xl_tot_len < SizeOfXLogRecord) + { + report_invalid_record(state, + "invalid record length at %X/%X: wanted %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + (uint32) SizeOfXLogRecord, record->xl_tot_len); + return false; + } + if (!RmgrIdIsValid(record->xl_rmid)) + { + report_invalid_record(state, + "invalid resource manager ID %u at %X/%X", + record->xl_rmid, LSN_FORMAT_ARGS(RecPtr)); + return false; + } + if (randAccess) + { + /* + * We can't exactly verify the prev-link, but surely it should be less + * than the record's own address. + */ + if (!(record->xl_prev < RecPtr)) + { + report_invalid_record(state, + "record with incorrect prev-link %X/%X at %X/%X", + LSN_FORMAT_ARGS(record->xl_prev), + LSN_FORMAT_ARGS(RecPtr)); + return false; + } + } + else + { + /* + * Record's prev-link should exactly match our previous location. This + * check guards against torn WAL pages where a stale but valid-looking + * WAL record starts on a sector boundary. 
+ */ + if (record->xl_prev != PrevRecPtr) + { + report_invalid_record(state, + "record with incorrect prev-link %X/%X at %X/%X", + LSN_FORMAT_ARGS(record->xl_prev), + LSN_FORMAT_ARGS(RecPtr)); + return false; + } + } + + return true; +} + + +/* + * CRC-check an XLOG record. We do not believe the contents of an XLOG + * record (other than to the minimal extent of computing the amount of + * data to read in) until we've checked the CRCs. + * + * We assume all of the record (that is, xl_tot_len bytes) has been read + * into memory at *record. Also, ValidXLogRecordHeader() has accepted the + * record's header, which means in particular that xl_tot_len is at least + * SizeOfXLogRecord. + */ +static bool +ValidXLogRecord(XLogReaderState *state, XLogRecord *record, XLogRecPtr recptr) +{ + pg_crc32c crc; + + Assert(record->xl_tot_len >= SizeOfXLogRecord); + + /* Calculate the CRC */ + INIT_CRC32C(crc); + COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord); + /* include the record header last */ + COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc)); + FIN_CRC32C(crc); + + if (!EQ_CRC32C(record->xl_crc, crc)) + { + report_invalid_record(state, + "incorrect resource manager data checksum in record at %X/%X", + LSN_FORMAT_ARGS(recptr)); + return false; + } + + return true; +} + +/* + * Validate a page header. + * + * Check if 'phdr' is valid as the header of the XLog page at position + * 'recptr'. + */ +bool +XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, + char *phdr) +{ + XLogRecPtr recaddr; + XLogSegNo segno; + int32 offset; + XLogPageHeader hdr = (XLogPageHeader) phdr; + + Assert((recptr % XLOG_BLCKSZ) == 0); + + XLByteToSeg(recptr, segno, state->segcxt.ws_segsize); + offset = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); + + XLogSegNoOffsetToRecPtr(segno, offset, state->segcxt.ws_segsize, recaddr); + + if (hdr->xlp_magic != XLOG_PAGE_MAGIC) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + + report_invalid_record(state, + "invalid magic number %04X in log segment %s, offset %u", + hdr->xlp_magic, + fname, + offset); + return false; + } + + if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + + report_invalid_record(state, + "invalid info bits %04X in log segment %s, offset %u", + hdr->xlp_info, + fname, + offset); + return false; + } + + if (hdr->xlp_info & XLP_LONG_HEADER) + { + XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr; + + if (state->system_identifier && + longhdr->xlp_sysid != state->system_identifier) + { + report_invalid_record(state, + "WAL file is from different database system: WAL file database system identifier is %llu, pg_control database system identifier is %llu", + (unsigned long long) longhdr->xlp_sysid, + (unsigned long long) state->system_identifier); + return false; + } + else if (longhdr->xlp_seg_size != state->segcxt.ws_segsize) + { + report_invalid_record(state, + "WAL file is from different database system: incorrect segment size in page header"); + return false; + } + else if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ) + { + report_invalid_record(state, + "WAL file is from different database system: incorrect XLOG_BLCKSZ in page header"); + return false; + } + } + else if (offset == 0) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + + /* hmm, first page of file 
doesn't have a long header? */ + report_invalid_record(state, + "invalid info bits %04X in log segment %s, offset %u", + hdr->xlp_info, + fname, + offset); + return false; + } + + /* + * Check that the address on the page agrees with what we expected. This + * check typically fails when an old WAL segment is recycled, and hasn't + * yet been overwritten with new data yet. + */ + if (hdr->xlp_pageaddr != recaddr) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + + report_invalid_record(state, + "unexpected pageaddr %X/%X in log segment %s, offset %u", + LSN_FORMAT_ARGS(hdr->xlp_pageaddr), + fname, + offset); + return false; + } + + /* + * Since child timelines are always assigned a TLI greater than their + * immediate parent's TLI, we should never see TLI go backwards across + * successive pages of a consistent WAL sequence. + * + * Sometimes we re-read a segment that's already been (partially) read. So + * we only verify TLIs for pages that are later than the last remembered + * LSN. + */ + if (recptr > state->latestPagePtr) + { + if (hdr->xlp_tli < state->latestPageTLI) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + + report_invalid_record(state, + "out-of-sequence timeline ID %u (after %u) in log segment %s, offset %u", + hdr->xlp_tli, + state->latestPageTLI, + fname, + offset); + return false; + } + } + state->latestPagePtr = recptr; + state->latestPageTLI = hdr->xlp_tli; + + return true; +} + +/* + * Forget about an error produced by XLogReaderValidatePageHeader(). + */ +void +XLogReaderResetError(XLogReaderState *state) +{ + state->errormsg_buf[0] = '\0'; + state->errormsg_deferred = false; +} + +/* + * Find the first record with an lsn >= RecPtr. + * + * This is different from XLogBeginRead() in that RecPtr doesn't need to point + * to a valid record boundary. Useful for checking whether RecPtr is a valid + * xlog address for reading, and to find the first valid address after some + * address when dumping records for debugging purposes. + * + * This positions the reader, like XLogBeginRead(), so that the next call to + * XLogReadRecord() will read the next valid record. + */ +XLogRecPtr +XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr) +{ + XLogRecPtr tmpRecPtr; + XLogRecPtr found = InvalidXLogRecPtr; + XLogPageHeader header; + char *errormsg; + + Assert(!XLogRecPtrIsInvalid(RecPtr)); + + /* Make sure ReadPageInternal() can't return XLREAD_WOULDBLOCK. */ + state->nonblocking = false; + + /* + * skip over potential continuation data, keeping in mind that it may span + * multiple pages + */ + tmpRecPtr = RecPtr; + while (true) + { + XLogRecPtr targetPagePtr; + int targetRecOff; + uint32 pageHeaderSize; + int readLen; + + /* + * Compute targetRecOff. It should typically be equal or greater than + * short page-header since a valid record can't start anywhere before + * that, except when caller has explicitly specified the offset that + * falls somewhere there or when we are skipping multi-page + * continuation record. 
It doesn't matter though because + * ReadPageInternal() is prepared to handle that and will read at + * least short page-header worth of data + */ + targetRecOff = tmpRecPtr % XLOG_BLCKSZ; + + /* scroll back to page boundary */ + targetPagePtr = tmpRecPtr - targetRecOff; + + /* Read the page containing the record */ + readLen = ReadPageInternal(state, targetPagePtr, targetRecOff); + if (readLen < 0) + goto err; + + header = (XLogPageHeader) state->readBuf; + + pageHeaderSize = XLogPageHeaderSize(header); + + /* make sure we have enough data for the page header */ + readLen = ReadPageInternal(state, targetPagePtr, pageHeaderSize); + if (readLen < 0) + goto err; + + /* skip over potential continuation data */ + if (header->xlp_info & XLP_FIRST_IS_CONTRECORD) + { + /* + * If the length of the remaining continuation data is more than + * what can fit in this page, the continuation record crosses over + * this page. Read the next page and try again. xlp_rem_len in the + * next page header will contain the remaining length of the + * continuation data + * + * Note that record headers are MAXALIGN'ed + */ + if (MAXALIGN(header->xlp_rem_len) >= (XLOG_BLCKSZ - pageHeaderSize)) + tmpRecPtr = targetPagePtr + XLOG_BLCKSZ; + else + { + /* + * The previous continuation record ends in this page. Set + * tmpRecPtr to point to the first valid record + */ + tmpRecPtr = targetPagePtr + pageHeaderSize + + MAXALIGN(header->xlp_rem_len); + break; + } + } + else + { + tmpRecPtr = targetPagePtr + pageHeaderSize; + break; + } + } + + /* + * we know now that tmpRecPtr is an address pointing to a valid XLogRecord + * because either we're at the first record after the beginning of a page + * or we just jumped over the remaining data of a continuation. + */ + XLogBeginRead(state, tmpRecPtr); + while (XLogReadRecord(state, &errormsg) != NULL) + { + /* past the record we've found, break out */ + if (RecPtr <= state->ReadRecPtr) + { + /* Rewind the reader to the beginning of the last record. */ + found = state->ReadRecPtr; + XLogBeginRead(state, found); + return found; + } + } + +err: + XLogReaderInvalReadState(state); + + return InvalidXLogRecPtr; +} + +/* + * Helper function to ease writing of XLogRoutine->page_read callbacks. + * If this function is used, caller must supply a segment_open callback in + * 'state', as that is used here. + * + * Read 'count' bytes into 'buf', starting at location 'startptr', from WAL + * fetched from timeline 'tli'. + * + * Returns true if succeeded, false if an error occurs, in which case + * 'errinfo' receives error details. + * + * XXX probably this should be improved to suck data directly from the + * WAL buffers when possible. + */ +bool +WALRead(XLogReaderState *state, + char *buf, XLogRecPtr startptr, Size count, TimeLineID tli, + WALReadError *errinfo) +{ + char *p; + XLogRecPtr recptr; + Size nbytes; + + p = buf; + recptr = startptr; + nbytes = count; + + while (nbytes > 0) + { + uint32 startoff; + int segbytes; + int readbytes; + + startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); + + /* + * If the data we want is not in a segment we have open, close what we + * have (if anything) and open the next one, using the caller's + * provided openSegment callback. 
+ */ + if (state->seg.ws_file < 0 || + !XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) || + tli != state->seg.ws_tli) + { + XLogSegNo nextSegNo; + + if (state->seg.ws_file >= 0) + state->routine.segment_close(state); + + XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize); + state->routine.segment_open(state, nextSegNo, &tli); + + /* This shouldn't happen -- indicates a bug in segment_open */ + Assert(state->seg.ws_file >= 0); + + /* Update the current segment info. */ + state->seg.ws_tli = tli; + state->seg.ws_segno = nextSegNo; + } + + /* How many bytes are within this segment? */ + if (nbytes > (state->segcxt.ws_segsize - startoff)) + segbytes = state->segcxt.ws_segsize - startoff; + else + segbytes = nbytes; + +#ifndef FRONTEND + pgstat_report_wait_start(WAIT_EVENT_WAL_READ); +#endif + + /* Reset errno first; eases reporting non-errno-affecting errors */ + errno = 0; + readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff); + +#ifndef FRONTEND + pgstat_report_wait_end(); +#endif + + if (readbytes <= 0) + { + errinfo->wre_errno = errno; + errinfo->wre_req = segbytes; + errinfo->wre_read = readbytes; + errinfo->wre_off = startoff; + errinfo->wre_seg = state->seg; + return false; + } + + /* Update state for read */ + recptr += readbytes; + nbytes -= readbytes; + p += readbytes; + } + + return true; +} + +/* ---------------------------------------- + * Functions for decoding the data and block references in a record. + * ---------------------------------------- + */ + +/* + * Private function to reset the state, forgetting all decoded records, if we + * are asked to move to a new read position. + */ +static void +ResetDecoder(XLogReaderState *state) +{ + DecodedXLogRecord *r; + + /* Reset the decoded record queue, freeing any oversized records. */ + while ((r = state->decode_queue_head) != NULL) + { + state->decode_queue_head = r->next; + if (r->oversized) + pfree(r); + } + state->decode_queue_tail = NULL; + state->decode_queue_head = NULL; + state->record = NULL; + + /* Reset the decode buffer to empty. */ + state->decode_buffer_tail = state->decode_buffer; + state->decode_buffer_head = state->decode_buffer; + + /* Clear error state. */ + state->errormsg_buf[0] = '\0'; + state->errormsg_deferred = false; +} + +/* + * Compute the maximum possible amount of padding that could be required to + * decode a record, given xl_tot_len from the record's header. This is the + * amount of output buffer space that we need to decode a record, though we + * might not finish up using it all. + * + * This computation is pessimistic and assumes the maximum possible number of + * blocks, due to lack of better information. + */ +size_t +DecodeXLogRecordRequiredSpace(size_t xl_tot_len) +{ + size_t size = 0; + + /* Account for the fixed size part of the decoded record struct. */ + size += offsetof(DecodedXLogRecord, blocks[0]); + /* Account for the flexible blocks array of maximum possible size. */ + size += sizeof(DecodedBkpBlock) * (XLR_MAX_BLOCK_ID + 1); + /* Account for all the raw main and block data. */ + size += xl_tot_len; + /* We might insert padding before main_data. */ + size += (MAXIMUM_ALIGNOF - 1); + /* We might insert padding before each block's data. */ + size += (MAXIMUM_ALIGNOF - 1) * (XLR_MAX_BLOCK_ID + 1); + /* We might insert padding at the end. */ + size += (MAXIMUM_ALIGNOF - 1); + + return size; +} + +/* + * Decode a record. 
"decoded" must point to a MAXALIGNed memory area that has + * space for at least DecodeXLogRecordRequiredSpace(record) bytes. On + * success, decoded->size contains the actual space occupied by the decoded + * record, which may turn out to be less. + * + * Only decoded->oversized member must be initialized already, and will not be + * modified. Other members will be initialized as required. + * + * On error, a human-readable error message is returned in *errormsg, and + * the return value is false. + */ +bool +DecodeXLogRecord(XLogReaderState *state, + DecodedXLogRecord *decoded, + XLogRecord *record, + XLogRecPtr lsn, + char **errormsg) +{ + /* + * read next _size bytes from record buffer, but check for overrun first. + */ +#define COPY_HEADER_FIELD(_dst, _size) \ + do { \ + if (remaining < _size) \ + goto shortdata_err; \ + memcpy(_dst, ptr, _size); \ + ptr += _size; \ + remaining -= _size; \ + } while(0) + + char *ptr; + char *out; + uint32 remaining; + uint32 datatotal; + RelFileNode *rnode = NULL; + uint8 block_id; + + decoded->header = *record; + decoded->lsn = lsn; + decoded->next = NULL; + decoded->record_origin = InvalidRepOriginId; + decoded->toplevel_xid = InvalidTransactionId; + decoded->main_data = NULL; + decoded->main_data_len = 0; + decoded->max_block_id = -1; + ptr = (char *) record; + ptr += SizeOfXLogRecord; + remaining = record->xl_tot_len - SizeOfXLogRecord; + + /* Decode the headers */ + datatotal = 0; + while (remaining > datatotal) + { + COPY_HEADER_FIELD(&block_id, sizeof(uint8)); + + if (block_id == XLR_BLOCK_ID_DATA_SHORT) + { + /* XLogRecordDataHeaderShort */ + uint8 main_data_len; + + COPY_HEADER_FIELD(&main_data_len, sizeof(uint8)); + + decoded->main_data_len = main_data_len; + datatotal += main_data_len; + break; /* by convention, the main data fragment is + * always last */ + } + else if (block_id == XLR_BLOCK_ID_DATA_LONG) + { + /* XLogRecordDataHeaderLong */ + uint32 main_data_len; + + COPY_HEADER_FIELD(&main_data_len, sizeof(uint32)); + decoded->main_data_len = main_data_len; + datatotal += main_data_len; + break; /* by convention, the main data fragment is + * always last */ + } + else if (block_id == XLR_BLOCK_ID_ORIGIN) + { + COPY_HEADER_FIELD(&decoded->record_origin, sizeof(RepOriginId)); + } + else if (block_id == XLR_BLOCK_ID_TOPLEVEL_XID) + { + COPY_HEADER_FIELD(&decoded->toplevel_xid, sizeof(TransactionId)); + } + else if (block_id <= XLR_MAX_BLOCK_ID) + { + /* XLogRecordBlockHeader */ + DecodedBkpBlock *blk; + uint8 fork_flags; + + /* mark any intervening block IDs as not in use */ + for (int i = decoded->max_block_id + 1; i < block_id; ++i) + decoded->blocks[i].in_use = false; + + if (block_id <= decoded->max_block_id) + { + report_invalid_record(state, + "out-of-order block_id %u at %X/%X", + block_id, + LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + decoded->max_block_id = block_id; + + blk = &decoded->blocks[block_id]; + blk->in_use = true; + blk->apply_image = false; + + COPY_HEADER_FIELD(&fork_flags, sizeof(uint8)); + blk->forknum = fork_flags & BKPBLOCK_FORK_MASK; + blk->flags = fork_flags; + blk->has_image = ((fork_flags & BKPBLOCK_HAS_IMAGE) != 0); + blk->has_data = ((fork_flags & BKPBLOCK_HAS_DATA) != 0); + + blk->prefetch_buffer = InvalidBuffer; + + COPY_HEADER_FIELD(&blk->data_len, sizeof(uint16)); + /* cross-check that the HAS_DATA flag is set iff data_length > 0 */ + if (blk->has_data && blk->data_len == 0) + { + report_invalid_record(state, + "BKPBLOCK_HAS_DATA set, but no data included at %X/%X", + 
LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+ if (!blk->has_data && blk->data_len != 0)
+ {
+ report_invalid_record(state,
+ "BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%X",
+ (unsigned int) blk->data_len,
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+ datatotal += blk->data_len;
+
+ if (blk->has_image)
+ {
+ COPY_HEADER_FIELD(&blk->bimg_len, sizeof(uint16));
+ COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16));
+ COPY_HEADER_FIELD(&blk->bimg_info, sizeof(uint8));
+
+ blk->apply_image = ((blk->bimg_info & BKPIMAGE_APPLY) != 0);
+
+ if (BKPIMAGE_COMPRESSED(blk->bimg_info))
+ {
+ if (blk->bimg_info & BKPIMAGE_HAS_HOLE)
+ COPY_HEADER_FIELD(&blk->hole_length, sizeof(uint16));
+ else
+ blk->hole_length = 0;
+ }
+ else
+ blk->hole_length = BLCKSZ - blk->bimg_len;
+ datatotal += blk->bimg_len;
+
+ /*
+ * cross-check that hole_offset > 0, hole_length > 0 and
+ * bimg_len < BLCKSZ if the HAS_HOLE flag is set.
+ */
+ if ((blk->bimg_info & BKPIMAGE_HAS_HOLE) &&
+ (blk->hole_offset == 0 ||
+ blk->hole_length == 0 ||
+ blk->bimg_len == BLCKSZ))
+ {
+ report_invalid_record(state,
+ "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X",
+ (unsigned int) blk->hole_offset,
+ (unsigned int) blk->hole_length,
+ (unsigned int) blk->bimg_len,
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+
+ /*
+ * cross-check that hole_offset == 0 and hole_length == 0 if
+ * the HAS_HOLE flag is not set.
+ */
+ if (!(blk->bimg_info & BKPIMAGE_HAS_HOLE) &&
+ (blk->hole_offset != 0 || blk->hole_length != 0))
+ {
+ report_invalid_record(state,
+ "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X",
+ (unsigned int) blk->hole_offset,
+ (unsigned int) blk->hole_length,
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+
+ /*
+ * Cross-check that bimg_len < BLCKSZ if it is compressed.
+ */
+ if (BKPIMAGE_COMPRESSED(blk->bimg_info) &&
+ blk->bimg_len == BLCKSZ)
+ {
+ report_invalid_record(state,
+ "BKPIMAGE_COMPRESSED set, but block image length %u at %X/%X",
+ (unsigned int) blk->bimg_len,
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+
+ /*
+ * cross-check that bimg_len = BLCKSZ if neither HAS_HOLE is
+ * set nor COMPRESSED().
+ */
+ if (!(blk->bimg_info & BKPIMAGE_HAS_HOLE) &&
+ !BKPIMAGE_COMPRESSED(blk->bimg_info) &&
+ blk->bimg_len != BLCKSZ)
+ {
+ report_invalid_record(state,
+ "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_COMPRESSED set, but block image length is %u at %X/%X",
+ (unsigned int) blk->bimg_len,
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+ }
+ if (!(fork_flags & BKPBLOCK_SAME_REL))
+ {
+ COPY_HEADER_FIELD(&blk->rnode, sizeof(RelFileNode));
+ rnode = &blk->rnode;
+ }
+ else
+ {
+ if (rnode == NULL)
+ {
+ report_invalid_record(state,
+ "BKPBLOCK_SAME_REL set but no previous rel at %X/%X",
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+
+ blk->rnode = *rnode;
+ }
+ COPY_HEADER_FIELD(&blk->blkno, sizeof(BlockNumber));
+ }
+ else
+ {
+ report_invalid_record(state,
+ "invalid block_id %u at %X/%X",
+ block_id, LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+ }
+
+ if (remaining != datatotal)
+ goto shortdata_err;
+
+ /*
+ * Ok, we've parsed the fragment headers, and verified that the total
+ * length of the payload in the fragments is equal to the amount of data
+ * left. Copy the data of each fragment to contiguous space after the
+ * blocks array, inserting alignment padding before the data fragments so
+ * they can be cast to struct pointers by REDO routines.
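+ * The resulting layout is: the DecodedXLogRecord struct with its blocks[]
+ * array, then for each in-use block its full-page image (if any) followed
+ * by its MAXALIGNed block data (if any), and finally the MAXALIGNed main
+ * data.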
+ */ + out = ((char *) decoded) + + offsetof(DecodedXLogRecord, blocks) + + sizeof(decoded->blocks[0]) * (decoded->max_block_id + 1); + + /* block data first */ + for (block_id = 0; block_id <= decoded->max_block_id; block_id++) + { + DecodedBkpBlock *blk = &decoded->blocks[block_id]; + + if (!blk->in_use) + continue; + + Assert(blk->has_image || !blk->apply_image); + + if (blk->has_image) + { + /* no need to align image */ + blk->bkp_image = out; + memcpy(out, ptr, blk->bimg_len); + ptr += blk->bimg_len; + out += blk->bimg_len; + } + if (blk->has_data) + { + out = (char *) MAXALIGN(out); + blk->data = out; + memcpy(blk->data, ptr, blk->data_len); + ptr += blk->data_len; + out += blk->data_len; + } + } + + /* and finally, the main data */ + if (decoded->main_data_len > 0) + { + out = (char *) MAXALIGN(out); + decoded->main_data = out; + memcpy(decoded->main_data, ptr, decoded->main_data_len); + ptr += decoded->main_data_len; + out += decoded->main_data_len; + } + + /* Report the actual size we used. */ + decoded->size = MAXALIGN(out - (char *) decoded); + Assert(DecodeXLogRecordRequiredSpace(record->xl_tot_len) >= + decoded->size); + + return true; + +shortdata_err: + report_invalid_record(state, + "record with invalid length at %X/%X", + LSN_FORMAT_ARGS(state->ReadRecPtr)); +err: + *errormsg = state->errormsg_buf; + + return false; +} + +/* + * Returns information about the block that a block reference refers to. + * + * This is like XLogRecGetBlockTagExtended, except that the block reference + * must exist and there's no access to prefetch_buffer. + */ +void +XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id, + RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum) +{ + if (!XLogRecGetBlockTagExtended(record, block_id, rnode, forknum, blknum, + NULL)) + { +#ifndef FRONTEND + elog(ERROR, "could not locate backup block with ID %d in WAL record", + block_id); +#else + pg_fatal("could not locate backup block with ID %d in WAL record", + block_id); +#endif + } +} + +/* + * Returns information about the block that a block reference refers to, + * optionally including the buffer that the block may already be in. + * + * If the WAL record contains a block reference with the given ID, *rnode, + * *forknum, *blknum and *prefetch_buffer are filled in (if not NULL), and + * returns true. Otherwise returns false. + */ +bool +XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id, + RelFileNode *rnode, ForkNumber *forknum, + BlockNumber *blknum, + Buffer *prefetch_buffer) +{ + DecodedBkpBlock *bkpb; + + if (!XLogRecHasBlockRef(record, block_id)) + return false; + + bkpb = &record->record->blocks[block_id]; + if (rnode) + *rnode = bkpb->rnode; + if (forknum) + *forknum = bkpb->forknum; + if (blknum) + *blknum = bkpb->blkno; + if (prefetch_buffer) + *prefetch_buffer = bkpb->prefetch_buffer; + return true; +} + +/* + * Returns the data associated with a block reference, or NULL if there is + * no data (e.g. because a full-page image was taken instead). The returned + * pointer points to a MAXALIGNed buffer. 
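+ *
+ * A redo-side caller typically uses it along these lines (an illustrative
+ * sketch with hypothetical variable names):
+ *
+ *		Size	len;
+ *		char   *data = XLogRecGetBlockData(record, 0, &len);
+ *
+ *		if (data != NULL)
+ *			... apply the 'len' bytes at 'data' to the target page ...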
+ */ +char * +XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len) +{ + DecodedBkpBlock *bkpb; + + if (block_id > record->record->max_block_id || + !record->record->blocks[block_id].in_use) + return NULL; + + bkpb = &record->record->blocks[block_id]; + + if (!bkpb->has_data) + { + if (len) + *len = 0; + return NULL; + } + else + { + if (len) + *len = bkpb->data_len; + return bkpb->data; + } +} + +/* + * Restore a full-page image from a backup block attached to an XLOG record. + * + * Returns true if a full-page image is restored, and false on failure with + * an error to be consumed by the caller. + */ +bool +RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) +{ + DecodedBkpBlock *bkpb; + char *ptr; + PGAlignedBlock tmp; + + if (block_id > record->record->max_block_id || + !record->record->blocks[block_id].in_use) + { + report_invalid_record(record, + "could not restore image at %X/%X with invalid block %d specified", + LSN_FORMAT_ARGS(record->ReadRecPtr), + block_id); + return false; + } + if (!record->record->blocks[block_id].has_image) + { + report_invalid_record(record, "could not restore image at %X/%X with invalid state, block %d", + LSN_FORMAT_ARGS(record->ReadRecPtr), + block_id); + return false; + } + + bkpb = &record->record->blocks[block_id]; + ptr = bkpb->bkp_image; + + if (BKPIMAGE_COMPRESSED(bkpb->bimg_info)) + { + /* If a backup block image is compressed, decompress it */ + bool decomp_success = true; + + if ((bkpb->bimg_info & BKPIMAGE_COMPRESS_PGLZ) != 0) + { + if (pglz_decompress(ptr, bkpb->bimg_len, tmp.data, + BLCKSZ - bkpb->hole_length, true) < 0) + decomp_success = false; + } + else if ((bkpb->bimg_info & BKPIMAGE_COMPRESS_LZ4) != 0) + { +#ifdef USE_LZ4 + if (LZ4_decompress_safe(ptr, tmp.data, + bkpb->bimg_len, BLCKSZ - bkpb->hole_length) <= 0) + decomp_success = false; +#else + report_invalid_record(record, "could not restore image at %X/%X compressed with %s not supported by build, block %d", + LSN_FORMAT_ARGS(record->ReadRecPtr), + "LZ4", + block_id); + return false; +#endif + } + else if ((bkpb->bimg_info & BKPIMAGE_COMPRESS_ZSTD) != 0) + { +#ifdef USE_ZSTD + size_t decomp_result = ZSTD_decompress(tmp.data, + BLCKSZ - bkpb->hole_length, + ptr, bkpb->bimg_len); + + if (ZSTD_isError(decomp_result)) + decomp_success = false; +#else + report_invalid_record(record, "could not restore image at %X/%X compressed with %s not supported by build, block %d", + LSN_FORMAT_ARGS(record->ReadRecPtr), + "zstd", + block_id); + return false; +#endif + } + else + { + report_invalid_record(record, "could not restore image at %X/%X compressed with unknown method, block %d", + LSN_FORMAT_ARGS(record->ReadRecPtr), + block_id); + return false; + } + + if (!decomp_success) + { + report_invalid_record(record, "could not decompress image at %X/%X, block %d", + LSN_FORMAT_ARGS(record->ReadRecPtr), + block_id); + return false; + } + + ptr = tmp.data; + } + + /* generate page, taking into account hole if necessary */ + if (bkpb->hole_length == 0) + { + memcpy(page, ptr, BLCKSZ); + } + else + { + memcpy(page, ptr, bkpb->hole_offset); + /* must zero-fill the hole */ + MemSet(page + bkpb->hole_offset, 0, bkpb->hole_length); + memcpy(page + (bkpb->hole_offset + bkpb->hole_length), + ptr + bkpb->hole_offset, + BLCKSZ - (bkpb->hole_offset + bkpb->hole_length)); + } + + return true; +} + +#ifndef FRONTEND + +/* + * Extract the FullTransactionId from a WAL record. 
+ */ +FullTransactionId +XLogRecGetFullXid(XLogReaderState *record) +{ + TransactionId xid, + next_xid; + uint32 epoch; + + /* + * This function is only safe during replay, because it depends on the + * replay state. See AdvanceNextFullTransactionIdPastXid() for more. + */ + Assert(AmStartupProcess() || !IsUnderPostmaster); + + xid = XLogRecGetXid(record); + next_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + epoch = EpochFromFullTransactionId(ShmemVariableCache->nextXid); + + /* + * If xid is numerically greater than next_xid, it has to be from the last + * epoch. + */ + if (unlikely(xid > next_xid)) + --epoch; + + return FullTransactionIdFromEpochAndXid(epoch, xid); +} + +#endif diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c new file mode 100644 index 0000000..166f7b7 --- /dev/null +++ b/src/backend/access/transam/xlogrecovery.c @@ -0,0 +1,4699 @@ +/*------------------------------------------------------------------------- + * + * xlogrecovery.c + * Functions for WAL recovery, standby mode + * + * This source file contains functions controlling WAL recovery. + * InitWalRecovery() initializes the system for crash or archive recovery, + * or standby mode, depending on configuration options and the state of + * the control file and possible backup label file. PerformWalRecovery() + * performs the actual WAL replay, calling the rmgr-specific redo routines. + * EndWalRecovery() performs end-of-recovery checks and cleanup actions, + * and prepares information needed to initialize the WAL for writes. In + * addition to these three main functions, there are a bunch of functions + * for interrogating recovery state and controlling the recovery process. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/xlogrecovery.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <ctype.h> +#include <math.h> +#include <time.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <unistd.h> + +#include "access/timeline.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "access/xlogarchive.h" +#include "access/xlogprefetcher.h" +#include "access/xlogreader.h" +#include "access/xlogrecovery.h" +#include "access/xlogutils.h" +#include "backup/basebackup.h" +#include "catalog/pg_control.h" +#include "commands/tablespace.h" +#include "common/file_utils.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgwriter.h" +#include "postmaster/startup.h" +#include "replication/walreceiver.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/spin.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/ps_status.h" +#include "utils/pg_rusage.h" + +/* Unsupported old recovery command file names (relative to $PGDATA) */ +#define RECOVERY_COMMAND_FILE "recovery.conf" +#define RECOVERY_COMMAND_DONE "recovery.done" + +/* + * GUC support + */ +const struct config_enum_entry recovery_target_action_options[] = { + {"pause", RECOVERY_TARGET_ACTION_PAUSE, false}, + {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false}, + {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false}, + {NULL, 0, false} +}; + +/* options formerly taken from 
recovery.conf for archive recovery */ +char *recoveryRestoreCommand = NULL; +char *recoveryEndCommand = NULL; +char *archiveCleanupCommand = NULL; +RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET; +bool recoveryTargetInclusive = true; +int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE; +TransactionId recoveryTargetXid; +char *recovery_target_time_string; +TimestampTz recoveryTargetTime; +const char *recoveryTargetName; +XLogRecPtr recoveryTargetLSN; +int recovery_min_apply_delay = 0; + +/* options formerly taken from recovery.conf for XLOG streaming */ +char *PrimaryConnInfo = NULL; +char *PrimarySlotName = NULL; +char *PromoteTriggerFile = NULL; +bool wal_receiver_create_temp_slot = false; + +/* + * recoveryTargetTimeLineGoal: what the user requested, if any + * + * recoveryTargetTLIRequested: numeric value of requested timeline, if constant + * + * recoveryTargetTLI: the currently understood target timeline; changes + * + * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and + * the timelines of its known parents, newest first (so recoveryTargetTLI is + * always the first list member). Only these TLIs are expected to be seen in + * the WAL segments we read, and indeed only these TLIs will be considered as + * candidate WAL files to open at all. + * + * curFileTLI: the TLI appearing in the name of the current input WAL file. + * (This is not necessarily the same as the timeline from which we are + * replaying WAL, which StartupXLOG calls replayTLI, because we could be + * scanning data that was copied from an ancestor timeline when the current + * file was created.) During a sequential scan we do not allow this value + * to decrease. + */ +RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST; +TimeLineID recoveryTargetTLIRequested = 0; +TimeLineID recoveryTargetTLI = 0; +static List *expectedTLEs; +static TimeLineID curFileTLI; + +/* + * When ArchiveRecoveryRequested is set, archive recovery was requested, + * ie. signal files were present. When InArchiveRecovery is set, we are + * currently recovering using offline XLOG archives. These variables are only + * valid in the startup process. + * + * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're + * currently performing crash recovery using only XLOG files in pg_wal, but + * will switch to using offline XLOG archives as soon as we reach the end of + * WAL in pg_wal. +*/ +bool ArchiveRecoveryRequested = false; +bool InArchiveRecovery = false; + +/* + * When StandbyModeRequested is set, standby mode was requested, i.e. + * standby.signal file was present. When StandbyMode is set, we are currently + * in standby mode. These variables are only valid in the startup process. + * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery. + */ +static bool StandbyModeRequested = false; +bool StandbyMode = false; + +/* was a signal file present at startup? */ +static bool standby_signal_file_found = false; +static bool recovery_signal_file_found = false; + +/* + * CheckPointLoc is the position of the checkpoint record that determines + * where to start the replay. It comes from the backup label file or the + * control file. + * + * RedoStartLSN is the checkpoint's REDO location, also from the backup label + * file or the control file. In standby mode, XLOG streaming usually starts + * from the position where an invalid record was found. 
But if we fail to + * read even the initial checkpoint record, we use the REDO location instead + * of the checkpoint location as the start position of XLOG streaming. + * Otherwise we would have to jump backwards to the REDO location after + * reading the checkpoint record, because the REDO record can precede the + * checkpoint record. + */ +static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr; +static TimeLineID CheckPointTLI = 0; +static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr; +static TimeLineID RedoStartTLI = 0; + +/* + * Local copy of SharedHotStandbyActive variable. False actually means "not + * known, need to check the shared state". + */ +static bool LocalHotStandbyActive = false; + +/* + * Local copy of SharedPromoteIsTriggered variable. False actually means "not + * known, need to check the shared state". + */ +static bool LocalPromoteIsTriggered = false; + +/* Has the recovery code requested a walreceiver wakeup? */ +static bool doRequestWalReceiverReply; + +/* XLogReader object used to parse the WAL records */ +static XLogReaderState *xlogreader = NULL; + +/* XLogPrefetcher object used to consume WAL records with read-ahead */ +static XLogPrefetcher *xlogprefetcher = NULL; + +/* Parameters passed down from ReadRecord to the XLogPageRead callback. */ +typedef struct XLogPageReadPrivate +{ + int emode; + bool fetching_ckpt; /* are we fetching a checkpoint record? */ + bool randAccess; + TimeLineID replayTLI; +} XLogPageReadPrivate; + +/* flag to tell XLogPageRead that we have started replaying */ +static bool InRedo = false; + +/* + * Codes indicating where we got a WAL file from during recovery, or where + * to attempt to get one. + */ +typedef enum +{ + XLOG_FROM_ANY = 0, /* request to read WAL from any source */ + XLOG_FROM_ARCHIVE, /* restored using restore_command */ + XLOG_FROM_PG_WAL, /* existing file in pg_wal */ + XLOG_FROM_STREAM /* streamed from primary */ +} XLogSource; + +/* human-readable names for XLogSources, for debugging output */ +static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"}; + +/* + * readFile is -1 or a kernel FD for the log file segment that's currently + * open for reading. readSegNo identifies the segment. readOff is the offset + * of the page just read, readLen indicates how much of it has been read into + * readBuf, and readSource indicates where we got the currently open file from. + * + * Note: we could use Reserve/ReleaseExternalFD to track consumption of this + * FD too (like for openLogFile in xlog.c); but it doesn't currently seem + * worthwhile, since the XLOG is not read by general-purpose sessions. + */ +static int readFile = -1; +static XLogSegNo readSegNo = 0; +static uint32 readOff = 0; +static uint32 readLen = 0; +static XLogSource readSource = XLOG_FROM_ANY; + +/* + * Keeps track of which source we're currently reading from. This is + * different from readSource in that this is always set, even when we don't + * currently have a WAL file open. If lastSourceFailed is set, our last + * attempt to read from currentSource failed, and we should try another source + * next. + * + * pendingWalRcvRestart is set when a config change occurs that requires a + * walreceiver restart. This is only valid in XLOG_FROM_STREAM state. + */ +static XLogSource currentSource = XLOG_FROM_ANY; +static bool lastSourceFailed = false; +static bool pendingWalRcvRestart = false; + +/* + * These variables track when we last obtained some WAL data to process, + * and where we got it from. 
(XLogReceiptSource is initially the same as + * readSource, but readSource gets reset to zero when we don't have data + * to process right now. It is also different from currentSource, which + * also changes when we try to read from a source and fail, while + * XLogReceiptSource tracks where we last successfully read some WAL.) + */ +static TimestampTz XLogReceiptTime = 0; +static XLogSource XLogReceiptSource = XLOG_FROM_ANY; + +/* Local copy of WalRcv->flushedUpto */ +static XLogRecPtr flushedUpto = 0; +static TimeLineID receiveTLI = 0; + +/* + * Copy of minRecoveryPoint and backupEndPoint from the control file. + * + * In order to reach consistency, we must replay the WAL up to + * minRecoveryPoint. If backupEndRequired is true, we must also reach + * backupEndPoint, or if it's invalid, an end-of-backup record corresponding + * to backupStartPoint. + * + * Note: In archive recovery, after consistency has been reached, the + * functions in xlog.c will start updating minRecoveryPoint in the control + * file. But this copy of minRecoveryPoint variable reflects the value at the + * beginning of recovery, and is *not* updated after consistency is reached. + */ +static XLogRecPtr minRecoveryPoint; +static TimeLineID minRecoveryPointTLI; + +static XLogRecPtr backupStartPoint; +static XLogRecPtr backupEndPoint; +static bool backupEndRequired = false; + +/* + * Have we reached a consistent database state? In crash recovery, we have + * to replay all the WAL, so reachedConsistency is never set. During archive + * recovery, the database is consistent once minRecoveryPoint is reached. + * + * Consistent state means that the system is internally consistent, all + * the WAL has been replayed up to a certain point, and importantly, there + * is no trace of later actions on disk. + */ +bool reachedConsistency = false; + +/* Buffers dedicated to consistency checks of size BLCKSZ */ +static char *replay_image_masked = NULL; +static char *primary_image_masked = NULL; + + +/* + * Shared-memory state for WAL recovery. + */ +typedef struct XLogRecoveryCtlData +{ + /* + * SharedHotStandbyActive indicates if we allow hot standby queries to be + * run. Protected by info_lck. + */ + bool SharedHotStandbyActive; + + /* + * SharedPromoteIsTriggered indicates if a standby promotion has been + * triggered. Protected by info_lck. + */ + bool SharedPromoteIsTriggered; + + /* + * recoveryWakeupLatch is used to wake up the startup process to continue + * WAL replay, if it is waiting for WAL to arrive or failover trigger file + * to appear. + * + * Note that the startup process also uses another latch, its procLatch, + * to wait for recovery conflict. If we get rid of recoveryWakeupLatch for + * signaling the startup process in favor of using its procLatch, which + * comports better with possible generic signal handlers using that latch. + * But we should not do that because the startup process doesn't assume + * that it's waken up by walreceiver process or SIGHUP signal handler + * while it's waiting for recovery conflict. The separate latches, + * recoveryWakeupLatch and procLatch, should be used for inter-process + * communication for WAL replay and recovery conflict, respectively. + */ + Latch recoveryWakeupLatch; + + /* + * Last record successfully replayed. + */ + XLogRecPtr lastReplayedReadRecPtr; /* start position */ + XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */ + TimeLineID lastReplayedTLI; /* timeline */ + + /* + * When we're currently replaying a record, ie. 
in a redo function, + * replayEndRecPtr points to the end+1 of the record being replayed, + * otherwise it's equal to lastReplayedEndRecPtr. + */ + XLogRecPtr replayEndRecPtr; + TimeLineID replayEndTLI; + /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */ + TimestampTz recoveryLastXTime; + + /* + * timestamp of when we started replaying the current chunk of WAL data, + * only relevant for replication or archive recovery + */ + TimestampTz currentChunkStartTime; + /* Recovery pause state */ + RecoveryPauseState recoveryPauseState; + ConditionVariable recoveryNotPausedCV; + + slock_t info_lck; /* locks shared variables shown above */ +} XLogRecoveryCtlData; + +static XLogRecoveryCtlData *XLogRecoveryCtl = NULL; + +/* + * abortedRecPtr is the start pointer of a broken record at end of WAL when + * recovery completes; missingContrecPtr is the location of the first + * contrecord that went missing. See CreateOverwriteContrecordRecord for + * details. + */ +static XLogRecPtr abortedRecPtr; +static XLogRecPtr missingContrecPtr; + +/* + * if recoveryStopsBefore/After returns true, it saves information of the stop + * point here + */ +static TransactionId recoveryStopXid; +static TimestampTz recoveryStopTime; +static XLogRecPtr recoveryStopLSN; +static char recoveryStopName[MAXFNAMELEN]; +static bool recoveryStopAfter; + +/* prototypes for local functions */ +static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI); + +static void EnableStandbyMode(void); +static void readRecoverySignalFile(void); +static void validateRecoveryParameters(void); +static bool read_backup_label(XLogRecPtr *checkPointLoc, + TimeLineID *backupLabelTLI, + bool *backupEndRequired, bool *backupFromStandby); +static bool read_tablespace_map(List **tablespaces); + +static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI); +static void CheckRecoveryConsistency(void); +static void rm_redo_error_callback(void *arg); +#ifdef WAL_DEBUG +static void xlog_outrec(StringInfo buf, XLogReaderState *record); +#endif +static void xlog_block_info(StringInfo buf, XLogReaderState *record); +static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, + TimeLineID prevTLI, TimeLineID replayTLI); +static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime); +static void verifyBackupPageConsistency(XLogReaderState *record); + +static bool recoveryStopsBefore(XLogReaderState *record); +static bool recoveryStopsAfter(XLogReaderState *record); +static char *getRecoveryStopReason(void); +static void recoveryPausesHere(bool endOfRecovery); +static bool recoveryApplyDelay(XLogReaderState *record); +static void ConfirmRecoveryPaused(void); + +static XLogRecord *ReadRecord(XLogPrefetcher *xlogprefetcher, + int emode, bool fetching_ckpt, + TimeLineID replayTLI); + +static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, char *readBuf); +static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, + bool randAccess, + bool fetching_ckpt, + XLogRecPtr tliRecPtr, + TimeLineID replayTLI, + XLogRecPtr replayLSN, + bool nonblocking); +static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr); +static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr, + int whichChkpt, bool report, TimeLineID replayTLI); +static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN); +static int XLogFileRead(XLogSegNo segno, int emode, 
TimeLineID tli, + XLogSource source, bool notfoundOk); +static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source); + +static bool CheckForStandbyTrigger(void); +static void SetPromoteIsTriggered(void); +static bool HotStandbyActiveInReplay(void); + +static void SetCurrentChunkStartTime(TimestampTz xtime); +static void SetLatestXTime(TimestampTz xtime); + +/* + * Initialization of shared memory for WAL recovery + */ +Size +XLogRecoveryShmemSize(void) +{ + Size size; + + /* XLogRecoveryCtl */ + size = sizeof(XLogRecoveryCtlData); + + return size; +} + +void +XLogRecoveryShmemInit(void) +{ + bool found; + + XLogRecoveryCtl = (XLogRecoveryCtlData *) + ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found); + if (found) + return; + memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData)); + + SpinLockInit(&XLogRecoveryCtl->info_lck); + InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch); + ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV); +} + +/* + * A thin wrapper to enable StandbyMode and do other preparatory work as + * needed. + */ +static void +EnableStandbyMode(void) +{ + StandbyMode = true; + + /* + * To avoid server log bloat, we don't report recovery progress in a + * standby as it will always be in recovery unless promoted. We disable + * startup progress timeout in standby mode to avoid calling + * startup_progress_timeout_handler() unnecessarily. + */ + disable_startup_progress_timeout(); +} + +/* + * Prepare the system for WAL recovery, if needed. + * + * This is called by StartupXLOG() which coordinates the server startup + * sequence. This function analyzes the control file and the backup label + * file, if any, and figures out whether we need to perform crash recovery or + * archive recovery, and how far we need to replay the WAL to reach a + * consistent state. + * + * This doesn't yet change the on-disk state, except for creating the symlinks + * from table space map file if any, and for fetching WAL files needed to find + * the checkpoint record. On entry, the caller has already read the control + * file into memory, and passes it as argument. This function updates it to + * reflect the recovery state, and the caller is expected to write it back to + * disk does after initializing other subsystems, but before calling + * PerformWalRecovery(). + * + * This initializes some global variables like ArchiveModeRequested, and + * StandbyModeRequested and InRecovery. + */ +void +InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, + bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr) +{ + XLogPageReadPrivate *private; + struct stat st; + bool wasShutdown; + XLogRecord *record; + DBState dbstate_at_startup; + bool haveTblspcMap = false; + bool haveBackupLabel = false; + CheckPoint checkPoint; + bool backupFromStandby = false; + + dbstate_at_startup = ControlFile->state; + + /* + * Initialize on the assumption we want to recover to the latest timeline + * that's active according to pg_control. 
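+	 *
+	 * (Hypothetical illustration: if the checkpoint in pg_control is on
+	 * timeline 2 but minRecoveryPointTLI is 3, earlier recovery already
+	 * advanced onto timeline 3, so we start out assuming timeline 3 as
+	 * the recovery target.)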
+ */ + if (ControlFile->minRecoveryPointTLI > + ControlFile->checkPointCopy.ThisTimeLineID) + recoveryTargetTLI = ControlFile->minRecoveryPointTLI; + else + recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID; + + /* + * Check for signal files, and if so set up state for offline recovery + */ + readRecoverySignalFile(); + validateRecoveryParameters(); + + if (ArchiveRecoveryRequested) + { + if (StandbyModeRequested) + ereport(LOG, + (errmsg("entering standby mode"))); + else if (recoveryTarget == RECOVERY_TARGET_XID) + ereport(LOG, + (errmsg("starting point-in-time recovery to XID %u", + recoveryTargetXid))); + else if (recoveryTarget == RECOVERY_TARGET_TIME) + ereport(LOG, + (errmsg("starting point-in-time recovery to %s", + timestamptz_to_str(recoveryTargetTime)))); + else if (recoveryTarget == RECOVERY_TARGET_NAME) + ereport(LOG, + (errmsg("starting point-in-time recovery to \"%s\"", + recoveryTargetName))); + else if (recoveryTarget == RECOVERY_TARGET_LSN) + ereport(LOG, + (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"", + LSN_FORMAT_ARGS(recoveryTargetLSN)))); + else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) + ereport(LOG, + (errmsg("starting point-in-time recovery to earliest consistent point"))); + else + ereport(LOG, + (errmsg("starting archive recovery"))); + } + + /* + * Take ownership of the wakeup latch if we're going to sleep during + * recovery. + */ + if (ArchiveRecoveryRequested) + OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch); + + private = palloc0(sizeof(XLogPageReadPrivate)); + xlogreader = + XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(.page_read = &XLogPageRead, + .segment_open = NULL, + .segment_close = wal_segment_close), + private); + if (!xlogreader) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed while allocating a WAL reading processor."))); + xlogreader->system_identifier = ControlFile->system_identifier; + + /* + * Set the WAL decode buffer size. This limits how far ahead we can read + * in the WAL. + */ + XLogReaderSetDecodeBuffer(xlogreader, NULL, wal_decode_buffer_size); + + /* Create a WAL prefetcher. */ + xlogprefetcher = XLogPrefetcherAllocate(xlogreader); + + /* + * Allocate two page buffers dedicated to WAL consistency checks. We do + * it this way, rather than just making static arrays, for two reasons: + * (1) no need to waste the storage in most instantiations of the backend; + * (2) a static char array isn't guaranteed to have any particular + * alignment, whereas palloc() will provide MAXALIGN'd storage. + */ + replay_image_masked = (char *) palloc(BLCKSZ); + primary_image_masked = (char *) palloc(BLCKSZ); + + if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired, + &backupFromStandby)) + { + List *tablespaces = NIL; + + /* + * Archive recovery was requested, and thanks to the backup label + * file, we know how far we need to replay to reach consistency. Enter + * archive recovery directly. + */ + InArchiveRecovery = true; + if (StandbyModeRequested) + EnableStandbyMode(); + + /* + * When a backup_label file is present, we want to roll forward from + * the checkpoint it identifies, rather than using pg_control. 
+ */ + record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, 0, true, + CheckPointTLI); + if (record != NULL) + { + memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); + wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); + ereport(DEBUG1, + (errmsg_internal("checkpoint record is at %X/%X", + LSN_FORMAT_ARGS(CheckPointLoc)))); + InRecovery = true; /* force recovery even if SHUTDOWNED */ + + /* + * Make sure that REDO location exists. This may not be the case + * if there was a crash during an online backup, which left a + * backup_label around that references a WAL segment that's + * already been archived. + */ + if (checkPoint.redo < CheckPointLoc) + { + XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo); + if (!ReadRecord(xlogprefetcher, LOG, false, + checkPoint.ThisTimeLineID)) + ereport(FATAL, + (errmsg("could not find redo location referenced by checkpoint record"), + errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n" + "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" + "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", + DataDir, DataDir, DataDir))); + } + } + else + { + ereport(FATAL, + (errmsg("could not locate required checkpoint record"), + errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n" + "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" + "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", + DataDir, DataDir, DataDir))); + wasShutdown = false; /* keep compiler quiet */ + } + + /* Read the tablespace_map file if present and create symlinks. */ + if (read_tablespace_map(&tablespaces)) + { + ListCell *lc; + + foreach(lc, tablespaces) + { + tablespaceinfo *ti = lfirst(lc); + char *linkloc; + + linkloc = psprintf("pg_tblspc/%s", ti->oid); + + /* + * Remove the existing symlink if any and Create the symlink + * under PGDATA. + */ + remove_tablespace_symlink(linkloc); + + if (symlink(ti->path, linkloc) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create symbolic link \"%s\": %m", + linkloc))); + + pfree(ti->oid); + pfree(ti->path); + pfree(ti); + } + + /* tell the caller to delete it later */ + haveTblspcMap = true; + } + + /* tell the caller to delete it later */ + haveBackupLabel = true; + } + else + { + /* + * If tablespace_map file is present without backup_label file, there + * is no use of such file. There is no harm in retaining it, but it + * is better to get rid of the map file so that we don't have any + * redundant file in data directory and it will avoid any sort of + * confusion. It seems prudent though to just rename the file out of + * the way rather than delete it completely, also we ignore any error + * that occurs in rename operation as even if map file is present + * without backup_label file, it is harmless. 
+ */ + if (stat(TABLESPACE_MAP, &st) == 0) + { + unlink(TABLESPACE_MAP_OLD); + if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0) + ereport(LOG, + (errmsg("ignoring file \"%s\" because no file \"%s\" exists", + TABLESPACE_MAP, BACKUP_LABEL_FILE), + errdetail("File \"%s\" was renamed to \"%s\".", + TABLESPACE_MAP, TABLESPACE_MAP_OLD))); + else + ereport(LOG, + (errmsg("ignoring file \"%s\" because no file \"%s\" exists", + TABLESPACE_MAP, BACKUP_LABEL_FILE), + errdetail("Could not rename file \"%s\" to \"%s\": %m.", + TABLESPACE_MAP, TABLESPACE_MAP_OLD))); + } + + /* + * It's possible that archive recovery was requested, but we don't + * know how far we need to replay the WAL before we reach consistency. + * This can happen for example if a base backup is taken from a + * running server using an atomic filesystem snapshot, without calling + * pg_backup_start/stop. Or if you just kill a running primary server + * and put it into archive recovery by creating a recovery signal + * file. + * + * Our strategy in that case is to perform crash recovery first, + * replaying all the WAL present in pg_wal, and only enter archive + * recovery after that. + * + * But usually we already know how far we need to replay the WAL (up + * to minRecoveryPoint, up to backupEndPoint, or until we see an + * end-of-backup record), and we can enter archive recovery directly. + */ + if (ArchiveRecoveryRequested && + (ControlFile->minRecoveryPoint != InvalidXLogRecPtr || + ControlFile->backupEndRequired || + ControlFile->backupEndPoint != InvalidXLogRecPtr || + ControlFile->state == DB_SHUTDOWNED)) + { + InArchiveRecovery = true; + if (StandbyModeRequested) + EnableStandbyMode(); + } + + /* Get the last valid checkpoint record. */ + CheckPointLoc = ControlFile->checkPoint; + CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID; + RedoStartLSN = ControlFile->checkPointCopy.redo; + RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID; + record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, 1, true, + CheckPointTLI); + if (record != NULL) + { + ereport(DEBUG1, + (errmsg_internal("checkpoint record is at %X/%X", + LSN_FORMAT_ARGS(CheckPointLoc)))); + } + else + { + /* + * We used to attempt to go back to a secondary checkpoint record + * here, but only when not in standby mode. We now just fail if we + * can't read the last checkpoint because this allows us to + * simplify processing around checkpoints. + */ + ereport(PANIC, + (errmsg("could not locate a valid checkpoint record"))); + } + memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); + wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); + } + + /* + * If the location of the checkpoint record is not on the expected + * timeline in the history of the requested timeline, we cannot proceed: + * the backup is not part of the history of the requested timeline. + */ + Assert(expectedTLEs); /* was initialized by reading checkpoint + * record */ + if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) != + CheckPointTLI) + { + XLogRecPtr switchpoint; + + /* + * tliSwitchPoint will throw an error if the checkpoint's timeline is + * not in expectedTLEs at all. 
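+		 *
+		 * (Hypothetical example: the requested timeline 3 forked off from
+		 * timeline 2 at 0/5000000, but the latest checkpoint sits at
+		 * 0/6000000 on timeline 2; that checkpoint is not part of
+		 * timeline 3's history, so recovery cannot proceed from it.)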
+ */ + switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL); + ereport(FATAL, + (errmsg("requested timeline %u is not a child of this server's history", + recoveryTargetTLI), + errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.", + LSN_FORMAT_ARGS(ControlFile->checkPoint), + ControlFile->checkPointCopy.ThisTimeLineID, + LSN_FORMAT_ARGS(switchpoint)))); + } + + /* + * The min recovery point should be part of the requested timeline's + * history, too. + */ + if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) && + tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) != + ControlFile->minRecoveryPointTLI) + ereport(FATAL, + (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u", + recoveryTargetTLI, + LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint), + ControlFile->minRecoveryPointTLI))); + + ereport(DEBUG1, + (errmsg_internal("redo record is at %X/%X; shutdown %s", + LSN_FORMAT_ARGS(checkPoint.redo), + wasShutdown ? "true" : "false"))); + ereport(DEBUG1, + (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u", + U64FromFullTransactionId(checkPoint.nextXid), + checkPoint.nextOid))); + ereport(DEBUG1, + (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u", + checkPoint.nextMulti, checkPoint.nextMultiOffset))); + ereport(DEBUG1, + (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u", + checkPoint.oldestXid, checkPoint.oldestXidDB))); + ereport(DEBUG1, + (errmsg_internal("oldest MultiXactId: %u, in database %u", + checkPoint.oldestMulti, checkPoint.oldestMultiDB))); + ereport(DEBUG1, + (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u", + checkPoint.oldestCommitTsXid, + checkPoint.newestCommitTsXid))); + if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid))) + ereport(PANIC, + (errmsg("invalid next transaction ID"))); + + /* sanity check */ + if (checkPoint.redo > CheckPointLoc) + ereport(PANIC, + (errmsg("invalid redo in checkpoint record"))); + + /* + * Check whether we need to force recovery from WAL. If it appears to + * have been a clean shutdown and we did not have a recovery signal file, + * then assume no recovery needed. + */ + if (checkPoint.redo < CheckPointLoc) + { + if (wasShutdown) + ereport(PANIC, + (errmsg("invalid redo record in shutdown checkpoint"))); + InRecovery = true; + } + else if (ControlFile->state != DB_SHUTDOWNED) + InRecovery = true; + else if (ArchiveRecoveryRequested) + { + /* force recovery due to presence of recovery signal file */ + InRecovery = true; + } + + /* + * If recovery is needed, update our in-memory copy of pg_control to show + * that we are recovering and to show the selected checkpoint as the place + * we are starting from. We also mark pg_control with any minimum recovery + * stop point obtained from a backup history file. + * + * We don't write the changes to disk yet, though. Only do that after + * initializing various subsystems. 
+ */ + if (InRecovery) + { + if (InArchiveRecovery) + { + ControlFile->state = DB_IN_ARCHIVE_RECOVERY; + } + else + { + ereport(LOG, + (errmsg("database system was not properly shut down; " + "automatic recovery in progress"))); + if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID) + ereport(LOG, + (errmsg("crash recovery starts in timeline %u " + "and has target timeline %u", + ControlFile->checkPointCopy.ThisTimeLineID, + recoveryTargetTLI))); + ControlFile->state = DB_IN_CRASH_RECOVERY; + } + ControlFile->checkPoint = CheckPointLoc; + ControlFile->checkPointCopy = checkPoint; + if (InArchiveRecovery) + { + /* initialize minRecoveryPoint if not set yet */ + if (ControlFile->minRecoveryPoint < checkPoint.redo) + { + ControlFile->minRecoveryPoint = checkPoint.redo; + ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID; + } + } + + /* + * Set backupStartPoint if we're starting recovery from a base backup. + * + * Also set backupEndPoint and use minRecoveryPoint as the backup end + * location if we're starting recovery from a base backup which was + * taken from a standby. In this case, the database system status in + * pg_control must indicate that the database was already in recovery. + * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be + * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted + * before reaching this point; e.g. because restore_command or + * primary_conninfo were faulty. + * + * Any other state indicates that the backup somehow became corrupted + * and we can't sensibly continue with recovery. + */ + if (haveBackupLabel) + { + ControlFile->backupStartPoint = checkPoint.redo; + ControlFile->backupEndRequired = backupEndRequired; + + if (backupFromStandby) + { + if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY && + dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY) + ereport(FATAL, + (errmsg("backup_label contains data inconsistent with control file"), + errhint("This means that the backup is corrupted and you will " + "have to use another backup for recovery."))); + ControlFile->backupEndPoint = ControlFile->minRecoveryPoint; + } + } + } + + /* remember these, so that we know when we have reached consistency */ + backupStartPoint = ControlFile->backupStartPoint; + backupEndRequired = ControlFile->backupEndRequired; + backupEndPoint = ControlFile->backupEndPoint; + if (InArchiveRecovery) + { + minRecoveryPoint = ControlFile->minRecoveryPoint; + minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + } + else + { + minRecoveryPoint = InvalidXLogRecPtr; + minRecoveryPointTLI = 0; + } + + /* + * Start recovery assuming that the final record isn't lost. + */ + abortedRecPtr = InvalidXLogRecPtr; + missingContrecPtr = InvalidXLogRecPtr; + + *wasShutdown_ptr = wasShutdown; + *haveBackupLabel_ptr = haveBackupLabel; + *haveTblspcMap_ptr = haveTblspcMap; +} + +/* + * See if there are any recovery signal files and if so, set state for + * recovery. + * + * See if there is a recovery command file (recovery.conf), and if so + * throw an ERROR since as of PG12 we no longer recognize that. + */ +static void +readRecoverySignalFile(void) +{ + struct stat stat_buf; + + if (IsBootstrapProcessingMode()) + return; + + /* + * Check for old recovery API file: recovery.conf + */ + if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("using recovery command file \"%s\" is not supported", + RECOVERY_COMMAND_FILE))); + + /* + * Remove unused .done file, if present. Ignore if absent. 
+ */ + unlink(RECOVERY_COMMAND_DONE); + + /* + * Check for recovery signal files and if found, fsync them since they + * represent server state information. We don't sweat too much about the + * possibility of fsync failure, however. + * + * If present, standby signal file takes precedence. If neither is present + * then we won't enter archive recovery. + */ + if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0) + { + int fd; + + fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY, + S_IRUSR | S_IWUSR); + if (fd >= 0) + { + (void) pg_fsync(fd); + close(fd); + } + standby_signal_file_found = true; + } + else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0) + { + int fd; + + fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY, + S_IRUSR | S_IWUSR); + if (fd >= 0) + { + (void) pg_fsync(fd); + close(fd); + } + recovery_signal_file_found = true; + } + + StandbyModeRequested = false; + ArchiveRecoveryRequested = false; + if (standby_signal_file_found) + { + StandbyModeRequested = true; + ArchiveRecoveryRequested = true; + } + else if (recovery_signal_file_found) + { + StandbyModeRequested = false; + ArchiveRecoveryRequested = true; + } + else + return; + + /* + * We don't support standby mode in standalone backends; that requires + * other processes such as the WAL receiver to be alive. + */ + if (StandbyModeRequested && !IsUnderPostmaster) + ereport(FATAL, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("standby mode is not supported by single-user servers"))); +} + +static void +validateRecoveryParameters(void) +{ + if (!ArchiveRecoveryRequested) + return; + + /* + * Check for compulsory parameters + */ + if (StandbyModeRequested) + { + if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) && + (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0)) + ereport(WARNING, + (errmsg("specified neither primary_conninfo nor restore_command"), + errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there."))); + } + else + { + if (recoveryRestoreCommand == NULL || + strcmp(recoveryRestoreCommand, "") == 0) + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("must specify restore_command when standby mode is not enabled"))); + } + + /* + * Override any inconsistent requests. Note that this is a change of + * behaviour in 9.5; prior to this we simply ignored a request to pause if + * hot_standby = off, which was surprising behaviour. + */ + if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE && + !EnableHotStandby) + recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN; + + /* + * Final parsing of recovery_target_time string; see also + * check_recovery_target_time(). + */ + if (recoveryTarget == RECOVERY_TARGET_TIME) + { + recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in, + CStringGetDatum(recovery_target_time_string), + ObjectIdGetDatum(InvalidOid), + Int32GetDatum(-1))); + } + + /* + * If user specified recovery_target_timeline, validate it or compute the + * "latest" value. We can't do this until after we've gotten the restore + * command and set InArchiveRecovery, because we need to fetch timeline + * history files from the archive. 
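+	 *
+	 * (For illustration: recovery_target_timeline = 'latest' triggers the
+	 * search for the newest timeline below, while a numeric setting such
+	 * as '3' requires that the history file 00000003.history can be
+	 * found; timeline 1 is special in that it never has a history file.)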
+ */ + if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC) + { + TimeLineID rtli = recoveryTargetTLIRequested; + + /* Timeline 1 does not have a history file, all else should */ + if (rtli != 1 && !existsTimeLineHistory(rtli)) + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("recovery target timeline %u does not exist", + rtli))); + recoveryTargetTLI = rtli; + } + else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST) + { + /* We start the "latest" search from pg_control's timeline */ + recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI); + } + else + { + /* + * else we just use the recoveryTargetTLI as already read from + * ControlFile + */ + Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE); + } +} + +/* + * read_backup_label: check to see if a backup_label file is present + * + * If we see a backup_label during recovery, we assume that we are recovering + * from a backup dump file, and we therefore roll forward from the checkpoint + * identified by the label file, NOT what pg_control says. This avoids the + * problem that pg_control might have been archived one or more checkpoints + * later than the start of the dump, and so if we rely on it as the start + * point, we will fail to restore a consistent database state. + * + * Returns true if a backup_label was found (and fills the checkpoint + * location and TLI into *checkPointLoc and *backupLabelTLI, respectively); + * returns false if not. If this backup_label came from a streamed backup, + * *backupEndRequired is set to true. If this backup_label was created during + * recovery, *backupFromStandby is set to true. + * + * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN + * and TLI read from the backup file. + */ +static bool +read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, + bool *backupEndRequired, bool *backupFromStandby) +{ + char startxlogfilename[MAXFNAMELEN]; + TimeLineID tli_from_walseg, + tli_from_file; + FILE *lfp; + char ch; + char backuptype[20]; + char backupfrom[20]; + char backuplabel[MAXPGPATH]; + char backuptime[128]; + uint32 hi, + lo; + + /* suppress possible uninitialized-variable warnings */ + *checkPointLoc = InvalidXLogRecPtr; + *backupLabelTLI = 0; + *backupEndRequired = false; + *backupFromStandby = false; + + /* + * See if label file is present + */ + lfp = AllocateFile(BACKUP_LABEL_FILE, "r"); + if (!lfp) + { + if (errno != ENOENT) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + BACKUP_LABEL_FILE))); + return false; /* it's not there, all is fine */ + } + + /* + * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code + * is pretty crude, but we are not expecting any variability in the file + * format). 
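+	 *
+	 * For illustration, a backup_label file looks roughly like this (all
+	 * values hypothetical):
+	 *
+	 *   START WAL LOCATION: 0/9000028 (file 000000010000000000000009)
+	 *   CHECKPOINT LOCATION: 0/9000060
+	 *   BACKUP METHOD: streamed
+	 *   BACKUP FROM: primary
+	 *   START TIME: 2023-01-01 00:00:00 UTC
+	 *   LABEL: pg_basebackup base backup
+	 *   START TIMELINE: 1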
+ */ + if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c", + &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n') + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); + RedoStartLSN = ((uint64) hi) << 32 | lo; + RedoStartTLI = tli_from_walseg; + if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c", + &hi, &lo, &ch) != 3 || ch != '\n') + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); + *checkPointLoc = ((uint64) hi) << 32 | lo; + *backupLabelTLI = tli_from_walseg; + + /* + * BACKUP METHOD lets us know if this was a typical backup ("streamed", + * which could mean either pg_basebackup or the pg_backup_start/stop + * method was used) or if this label came from somewhere else (the only + * other option today being from pg_rewind). If this was a streamed + * backup then we know that we need to play through until we get to the + * end of the WAL which was generated during the backup (at which point we + * will have reached consistency and backupEndRequired will be reset to be + * false). + */ + if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1) + { + if (strcmp(backuptype, "streamed") == 0) + *backupEndRequired = true; + } + + /* + * BACKUP FROM lets us know if this was from a primary or a standby. If + * it was from a standby, we'll double-check that the control file state + * matches that of a standby. + */ + if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1) + { + if (strcmp(backupfrom, "standby") == 0) + *backupFromStandby = true; + } + + /* + * Parse START TIME and LABEL. Those are not mandatory fields for recovery + * but checking for their presence is useful for debugging and the next + * sanity checks. Cope also with the fact that the result buffers have a + * pre-allocated size, hence if the backup_label file has been generated + * with strings longer than the maximum assumed here an incorrect parsing + * happens. That's fine as only minor consistency checks are done + * afterwards. + */ + if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1) + ereport(DEBUG1, + (errmsg_internal("backup time %s in file \"%s\"", + backuptime, BACKUP_LABEL_FILE))); + + if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1) + ereport(DEBUG1, + (errmsg_internal("backup label %s in file \"%s\"", + backuplabel, BACKUP_LABEL_FILE))); + + /* + * START TIMELINE is new as of 11. Its parsing is not mandatory, still use + * it as a sanity check if present. + */ + if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1) + { + if (tli_from_walseg != tli_from_file) + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE), + errdetail("Timeline ID parsed is %u, but expected %u.", + tli_from_file, tli_from_walseg))); + + ereport(DEBUG1, + (errmsg_internal("backup timeline %u in file \"%s\"", + tli_from_file, BACKUP_LABEL_FILE))); + } + + if (ferror(lfp) || FreeFile(lfp)) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + BACKUP_LABEL_FILE))); + + return true; +} + +/* + * read_tablespace_map: check to see if a tablespace_map file is present + * + * If we see a tablespace_map file during recovery, we assume that we are + * recovering from a backup dump file, and we therefore need to create symlinks + * as per the information present in tablespace_map file. 
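+ *
+ * For illustration, each line of tablespace_map has the form
+ * "<tablespace OID> <absolute path>", e.g. (hypothetical values):
+ *
+ *   16385 /mnt/tblspc/ts1
+ *
+ * with backslashes used to escape special characters in the path.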
+ * + * Returns true if a tablespace_map file was found (and fills *tablespaces + * with a tablespaceinfo struct for each tablespace listed in the file); + * returns false if not. + */ +static bool +read_tablespace_map(List **tablespaces) +{ + tablespaceinfo *ti; + FILE *lfp; + char str[MAXPGPATH]; + int ch, + i, + n; + bool was_backslash; + + /* + * See if tablespace_map file is present + */ + lfp = AllocateFile(TABLESPACE_MAP, "r"); + if (!lfp) + { + if (errno != ENOENT) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + TABLESPACE_MAP))); + return false; /* it's not there, all is fine */ + } + + /* + * Read and parse the link name and path lines from tablespace_map file + * (this code is pretty crude, but we are not expecting any variability in + * the file format). De-escape any backslashes that were inserted. + */ + i = 0; + was_backslash = false; + while ((ch = fgetc(lfp)) != EOF) + { + if (!was_backslash && (ch == '\n' || ch == '\r')) + { + if (i == 0) + continue; /* \r immediately followed by \n */ + + /* + * The de-escaped line should contain an OID followed by exactly + * one space followed by a path. The path might start with + * spaces, so don't be too liberal about parsing. + */ + str[i] = '\0'; + n = 0; + while (str[n] && str[n] != ' ') + n++; + if (n < 1 || n >= i - 1) + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", TABLESPACE_MAP))); + str[n++] = '\0'; + + ti = palloc0(sizeof(tablespaceinfo)); + ti->oid = pstrdup(str); + ti->path = pstrdup(str + n); + *tablespaces = lappend(*tablespaces, ti); + + i = 0; + continue; + } + else if (!was_backslash && ch == '\\') + was_backslash = true; + else + { + if (i < sizeof(str) - 1) + str[i++] = ch; + was_backslash = false; + } + } + + if (i != 0 || was_backslash) /* last line not terminated? */ + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", TABLESPACE_MAP))); + + if (ferror(lfp) || FreeFile(lfp)) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + TABLESPACE_MAP))); + + return true; +} + +/* + * Finish WAL recovery. + * + * This does not close the 'xlogreader' yet, because in some cases the caller + * still wants to re-read the last checkpoint record by calling + * ReadCheckPointRecord(). + * + * Returns the position of the last valid or applied record, after which new + * WAL should be appended, information about why recovery was ended, and some + * other things. See the WalRecoveryResult struct for details. + */ +EndOfWalRecoveryInfo * +FinishWalRecovery(void) +{ + EndOfWalRecoveryInfo *result = palloc(sizeof(EndOfWalRecoveryInfo)); + XLogRecPtr lastRec; + TimeLineID lastRecTLI; + XLogRecPtr endOfLog; + + /* + * Kill WAL receiver, if it's still running, before we continue to write + * the startup checkpoint and aborted-contrecord records. It will trump + * over these records and subsequent ones if it's still alive when we + * start writing WAL. + */ + XLogShutdownWalRcv(); + + /* + * We are now done reading the xlog from stream. Turn off streaming + * recovery to force fetching the files (which would be required at end of + * recovery, e.g., timeline history file) from archive or pg_wal. + * + * Note that standby mode must be turned off after killing WAL receiver, + * i.e., calling XLogShutdownWalRcv(). + */ + Assert(!WalRcvStreaming()); + StandbyMode = false; + + /* + * Determine where to start writing WAL next. 
+ * + * Re-fetch the last valid or last applied record, so we can identify the + * exact endpoint of what we consider the valid portion of WAL. There may + * be an incomplete continuation record after that, in which case + * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will + * write a special OVERWRITE_CONTRECORD message to mark that the rest of + * it is intentionally missing. See CreateOverwriteContrecordRecord(). + * + * An important side-effect of this is to load the last page into + * xlogreader. The caller uses it to initialize the WAL for writing. + */ + if (!InRecovery) + { + lastRec = CheckPointLoc; + lastRecTLI = CheckPointTLI; + } + else + { + lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr; + lastRecTLI = XLogRecoveryCtl->lastReplayedTLI; + } + XLogPrefetcherBeginRead(xlogprefetcher, lastRec); + (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI); + endOfLog = xlogreader->EndRecPtr; + + /* + * Remember the TLI in the filename of the XLOG segment containing the + * end-of-log. It could be different from the timeline that endOfLog + * nominally belongs to, if there was a timeline switch in that segment, + * and we were reading the old WAL from a segment belonging to a higher + * timeline. + */ + result->endOfLogTLI = xlogreader->seg.ws_tli; + + if (ArchiveRecoveryRequested) + { + /* + * We are no longer in archive recovery state. + * + * We are now done reading the old WAL. Turn off archive fetching if + * it was active. + */ + Assert(InArchiveRecovery); + InArchiveRecovery = false; + + /* + * If the ending log segment is still open, close it (to avoid + * problems on Windows with trying to rename or delete an open file). + */ + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + } + + /* + * Copy the last partial block to the caller, for initializing the WAL + * buffer for appending new WAL. + */ + if (endOfLog % XLOG_BLCKSZ != 0) + { + char *page; + int len; + XLogRecPtr pageBeginPtr; + + pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ); + Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size)); + + /* Copy the valid part of the last block */ + len = endOfLog % XLOG_BLCKSZ; + page = palloc(len); + memcpy(page, xlogreader->readBuf, len); + + result->lastPageBeginPtr = pageBeginPtr; + result->lastPage = page; + } + else + { + /* There is no partial block to copy. */ + result->lastPageBeginPtr = endOfLog; + result->lastPage = NULL; + } + + /* + * Create a comment for the history file to explain why and where timeline + * changed. + */ + result->recoveryStopReason = getRecoveryStopReason(); + + result->lastRec = lastRec; + result->lastRecTLI = lastRecTLI; + result->endOfLog = endOfLog; + + result->abortedRecPtr = abortedRecPtr; + result->missingContrecPtr = missingContrecPtr; + + result->standby_signal_file_found = standby_signal_file_found; + result->recovery_signal_file_found = recovery_signal_file_found; + + return result; +} + +/* + * Clean up the WAL reader and leftovers from restoring WAL from archive + */ +void +ShutdownWalRecovery(void) +{ + char recoveryPath[MAXPGPATH]; + + /* Final update of pg_stat_recovery_prefetch. */ + XLogPrefetcherComputeStats(xlogprefetcher); + + /* Shut down xlogreader */ + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + XLogReaderFree(xlogreader); + XLogPrefetcherFree(xlogprefetcher); + + if (ArchiveRecoveryRequested) + { + /* + * Since there might be a partial WAL segment named RECOVERYXLOG, get + * rid of it. 
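+		 *
+		 * (RECOVERYXLOG and RECOVERYHISTORY are the temporary names under
+		 * which files fetched via restore_command are staged; see
+		 * RestoreArchivedFile() in xlogarchive.c.)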
+ */ + snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG"); + unlink(recoveryPath); /* ignore any error */ + + /* Get rid of any remaining recovered timeline-history file, too */ + snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY"); + unlink(recoveryPath); /* ignore any error */ + } + + /* + * We don't need the latch anymore. It's not strictly necessary to disown + * it, but let's do it for the sake of tidiness. + */ + if (ArchiveRecoveryRequested) + DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch); +} + +/* + * Perform WAL recovery. + * + * If the system was shut down cleanly, this is never called. + */ +void +PerformWalRecovery(void) +{ + XLogRecord *record; + bool reachedRecoveryTarget = false; + TimeLineID replayTLI; + + /* + * Initialize shared variables for tracking progress of WAL replay, as if + * we had just replayed the record before the REDO location (or the + * checkpoint record itself, if it's a shutdown checkpoint). + */ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + if (RedoStartLSN < CheckPointLoc) + { + XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr; + XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN; + XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI; + } + else + { + XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr; + XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr; + XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI; + } + XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr; + XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI; + XLogRecoveryCtl->recoveryLastXTime = 0; + XLogRecoveryCtl->currentChunkStartTime = 0; + XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + /* Also ensure XLogReceiptTime has a sane value */ + XLogReceiptTime = GetCurrentTimestamp(); + + /* + * Let postmaster know we've started redo now, so that it can launch the + * archiver if necessary. + */ + if (IsUnderPostmaster) + SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED); + + /* + * Allow read-only connections immediately if we're consistent already. + */ + CheckRecoveryConsistency(); + + /* + * Find the first record that logically follows the checkpoint --- it + * might physically precede it, though. + */ + if (RedoStartLSN < CheckPointLoc) + { + /* back up to find the record */ + replayTLI = RedoStartTLI; + XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN); + record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI); + } + else + { + /* just have to read next record after CheckPoint */ + Assert(xlogreader->ReadRecPtr == CheckPointLoc); + replayTLI = CheckPointTLI; + record = ReadRecord(xlogprefetcher, LOG, false, replayTLI); + } + + if (record != NULL) + { + TimestampTz xtime; + PGRUsage ru0; + + pg_rusage_init(&ru0); + + InRedo = true; + + RmgrStartup(); + + ereport(LOG, + (errmsg("redo starts at %X/%X", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)))); + + /* Prepare to report progress of the redo phase. 
*/ + if (!StandbyMode) + begin_startup_progress_phase(); + + /* + * main redo apply loop + */ + do + { + if (!StandbyMode) + ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)); + +#ifdef WAL_DEBUG + if (XLOG_DEBUG || + (record->xl_rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) || + (record->xl_rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3)) + { + StringInfoData buf; + + initStringInfo(&buf); + appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), + LSN_FORMAT_ARGS(xlogreader->EndRecPtr)); + xlog_outrec(&buf, xlogreader); + appendStringInfoString(&buf, " - "); + xlog_outdesc(&buf, xlogreader); + elog(LOG, "%s", buf.data); + pfree(buf.data); + } +#endif + + /* Handle interrupt signals of startup process */ + HandleStartupProcInterrupts(); + + /* + * Pause WAL replay, if requested by a hot-standby session via + * SetRecoveryPause(). + * + * Note that we intentionally don't take the info_lck spinlock + * here. We might therefore read a slightly stale value of the + * recoveryPause flag, but it can't be very stale (no worse than + * the last spinlock we did acquire). Since a pause request is a + * pretty asynchronous thing anyway, possibly responding to it one + * WAL record later than we otherwise would is a minor issue, so + * it doesn't seem worth adding another spinlock cycle to prevent + * that. + */ + if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState != + RECOVERY_NOT_PAUSED) + recoveryPausesHere(false); + + /* + * Have we reached our recovery target? + */ + if (recoveryStopsBefore(xlogreader)) + { + reachedRecoveryTarget = true; + break; + } + + /* + * If we've been asked to lag the primary, wait on latch until + * enough time has passed. + */ + if (recoveryApplyDelay(xlogreader)) + { + /* + * We test for paused recovery again here. If user sets + * delayed apply, it may be because they expect to pause + * recovery in case of problems, so we must test again here + * otherwise pausing during the delay-wait wouldn't work. + */ + if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState != + RECOVERY_NOT_PAUSED) + recoveryPausesHere(false); + } + + /* + * Apply the record + */ + ApplyWalRecord(xlogreader, record, &replayTLI); + + /* Exit loop if we reached inclusive recovery target */ + if (recoveryStopsAfter(xlogreader)) + { + reachedRecoveryTarget = true; + break; + } + + /* Else, try to fetch the next WAL record */ + record = ReadRecord(xlogprefetcher, LOG, false, replayTLI); + } while (record != NULL); + + /* + * end of main redo apply loop + */ + + if (reachedRecoveryTarget) + { + if (!reachedConsistency) + ereport(FATAL, + (errmsg("requested recovery stop point is before consistent recovery point"))); + + /* + * This is the last point where we can restart recovery with a new + * recovery target, if we shutdown and begin again. After this, + * Resource Managers may choose to do permanent corrective actions + * at end of recovery. + */ + switch (recoveryTargetAction) + { + case RECOVERY_TARGET_ACTION_SHUTDOWN: + + /* + * exit with special return code to request shutdown of + * postmaster. Log messages issued from postmaster. 
+ */ + proc_exit(3); + + case RECOVERY_TARGET_ACTION_PAUSE: + SetRecoveryPause(true); + recoveryPausesHere(true); + + /* drop into promote */ + + case RECOVERY_TARGET_ACTION_PROMOTE: + break; + } + } + + RmgrCleanup(); + + ereport(LOG, + (errmsg("redo done at %X/%X system usage: %s", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), + pg_rusage_show(&ru0)))); + xtime = GetLatestXTime(); + if (xtime) + ereport(LOG, + (errmsg("last completed transaction was at log time %s", + timestamptz_to_str(xtime)))); + + InRedo = false; + } + else + { + /* there are no WAL records following the checkpoint */ + ereport(LOG, + (errmsg("redo is not required"))); + } + + /* + * This check is intentionally after the above log messages that indicate + * how far recovery went. + */ + if (ArchiveRecoveryRequested && + recoveryTarget != RECOVERY_TARGET_UNSET && + !reachedRecoveryTarget) + ereport(FATAL, + (errmsg("recovery ended before configured recovery target was reached"))); +} + +/* + * Subroutine of PerformWalRecovery, to apply one WAL record. + */ +static void +ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI) +{ + ErrorContextCallback errcallback; + bool switchedTLI = false; + + /* Setup error traceback support for ereport() */ + errcallback.callback = rm_redo_error_callback; + errcallback.arg = (void *) xlogreader; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* + * ShmemVariableCache->nextXid must be beyond record's xid. + */ + AdvanceNextFullTransactionIdPastXid(record->xl_xid); + + /* + * Before replaying this record, check if this record causes the current + * timeline to change. The record is already considered to be part of the + * new timeline, so we update replayTLI before replaying it. That's + * important so that replayEndTLI, which is recorded as the minimum + * recovery point's TLI if recovery stops after this record, is set + * correctly. + */ + if (record->xl_rmid == RM_XLOG_ID) + { + TimeLineID newReplayTLI = *replayTLI; + TimeLineID prevReplayTLI = *replayTLI; + uint8 info = record->xl_info & ~XLR_INFO_MASK; + + if (info == XLOG_CHECKPOINT_SHUTDOWN) + { + CheckPoint checkPoint; + + memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); + newReplayTLI = checkPoint.ThisTimeLineID; + prevReplayTLI = checkPoint.PrevTimeLineID; + } + else if (info == XLOG_END_OF_RECOVERY) + { + xl_end_of_recovery xlrec; + + memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery)); + newReplayTLI = xlrec.ThisTimeLineID; + prevReplayTLI = xlrec.PrevTimeLineID; + } + + if (newReplayTLI != *replayTLI) + { + /* Check that it's OK to switch to this TLI */ + checkTimeLineSwitch(xlogreader->EndRecPtr, + newReplayTLI, prevReplayTLI, *replayTLI); + + /* Following WAL records should be run with new TLI */ + *replayTLI = newReplayTLI; + switchedTLI = true; + } + } + + /* + * Update shared replayEndRecPtr before replaying this record, so that + * XLogFlush will update minRecoveryPoint correctly. 
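+	 *
+	 * (During recovery, XLogFlush() does not flush WAL; it advances
+	 * minRecoveryPoint instead, based on the shared replay position, so
+	 * that position must already cover this record.  See
+	 * UpdateMinRecoveryPoint() in xlog.c.)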
+ */ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr; + XLogRecoveryCtl->replayEndTLI = *replayTLI; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + /* + * If we are attempting to enter Hot Standby mode, process XIDs we see + */ + if (standbyState >= STANDBY_INITIALIZED && + TransactionIdIsValid(record->xl_xid)) + RecordKnownAssignedTransactionIds(record->xl_xid); + + /* + * Some XLOG record types that are related to recovery are processed + * directly here, rather than in xlog_redo() + */ + if (record->xl_rmid == RM_XLOG_ID) + xlogrecovery_redo(xlogreader, *replayTLI); + + /* Now apply the WAL record itself */ + GetRmgr(record->xl_rmid).rm_redo(xlogreader); + + /* + * After redo, check whether the backup pages associated with the WAL + * record are consistent with the existing pages. This check is done only + * if consistency check is enabled for this record. + */ + if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0) + verifyBackupPageConsistency(xlogreader); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + + /* + * Update lastReplayedEndRecPtr after this record has been successfully + * replayed. + */ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr; + XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr; + XLogRecoveryCtl->lastReplayedTLI = *replayTLI; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + /* + * If rm_redo called XLogRequestWalReceiverReply, then we wake up the + * receiver so that it notices the updated lastReplayedEndRecPtr and sends + * a reply to the primary. + */ + if (doRequestWalReceiverReply) + { + doRequestWalReceiverReply = false; + WalRcvForceReply(); + } + + /* Allow read-only connections if we're consistent now */ + CheckRecoveryConsistency(); + + /* Is this a timeline switch? */ + if (switchedTLI) + { + /* + * Before we continue on the new timeline, clean up any (possibly + * bogus) future WAL segments on the old timeline. + */ + RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI); + + /* + * Wake up any walsenders to notice that we are on a new timeline. + */ + if (AllowCascadeReplication()) + WalSndWakeup(); + + /* Reset the prefetcher. */ + XLogPrefetchReconfigure(); + } +} + +/* + * Some XLOG RM record types that are directly related to WAL recovery are + * handled here rather than in the xlog_redo() + */ +static void +xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + XLogRecPtr lsn = record->EndRecPtr; + + Assert(XLogRecGetRmid(record) == RM_XLOG_ID); + + if (info == XLOG_OVERWRITE_CONTRECORD) + { + /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. 
*/ + xl_overwrite_contrecord xlrec; + + memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord)); + if (xlrec.overwritten_lsn != record->overwrittenRecPtr) + elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X", + LSN_FORMAT_ARGS(xlrec.overwritten_lsn), + LSN_FORMAT_ARGS(record->overwrittenRecPtr)); + + /* We have safely skipped the aborted record */ + abortedRecPtr = InvalidXLogRecPtr; + missingContrecPtr = InvalidXLogRecPtr; + + ereport(LOG, + (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s", + LSN_FORMAT_ARGS(xlrec.overwritten_lsn), + timestamptz_to_str(xlrec.overwrite_time)))); + + /* Verifying the record should only happen once */ + record->overwrittenRecPtr = InvalidXLogRecPtr; + } + else if (info == XLOG_BACKUP_END) + { + XLogRecPtr startpoint; + + memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint)); + + if (backupStartPoint == startpoint) + { + /* + * We have reached the end of base backup, the point where + * pg_backup_stop() was done. The data on disk is now consistent + * (assuming we have also reached minRecoveryPoint). Set + * backupEndPoint to the current LSN, so that the next call to + * CheckRecoveryConsistency() will notice it and do the + * end-of-backup processing. + */ + elog(DEBUG1, "end of backup record reached"); + + backupEndPoint = lsn; + } + else + elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X", + LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint)); + } +} + +/* + * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real + * directories. + * + * Replay of database creation XLOG records for databases that were later + * dropped can create fake directories in pg_tblspc. By the time consistency + * is reached these directories should have been removed; here we verify + * that this did indeed happen. This is to be called at the point where + * consistent state is reached. + * + * allow_in_place_tablespaces turns the PANIC into a WARNING, which is + * useful for testing purposes, and also allows for an escape hatch in case + * things go south. + */ +static void +CheckTablespaceDirectory(void) +{ + DIR *dir; + struct dirent *de; + + dir = AllocateDir("pg_tblspc"); + while ((de = ReadDir(dir, "pg_tblspc")) != NULL) + { + char path[MAXPGPATH + 10]; + + /* Skip entries of non-oid names */ + if (strspn(de->d_name, "0123456789") != strlen(de->d_name)) + continue; + + snprintf(path, sizeof(path), "pg_tblspc/%s", de->d_name); + + if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK) + ereport(allow_in_place_tablespaces ? WARNING : PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("unexpected directory entry \"%s\" found in %s", + de->d_name, "pg_tblspc/"), + errdetail("All directory entries in pg_tblspc/ should be symbolic links."), + errhint("Remove those directories, or set allow_in_place_tablespaces to ON transiently to let recovery complete."))); + } +} + +/* + * Checks if recovery has reached a consistent state. When consistency is + * reached and we have a valid starting standby snapshot, tell postmaster + * that it can start accepting read-only connections. + */ +static void +CheckRecoveryConsistency(void) +{ + XLogRecPtr lastReplayedEndRecPtr; + TimeLineID lastReplayedTLI; + + /* + * During crash recovery, we don't reach a consistent state until we've + * replayed all the WAL. 
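+	 * In that case InitWalRecovery() left the local minRecoveryPoint set
+	 * to InvalidXLogRecPtr, so the check below returns immediately.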
+ */ + if (XLogRecPtrIsInvalid(minRecoveryPoint)) + return; + + Assert(InArchiveRecovery); + + /* + * assume that we are called in the startup process, and hence don't need + * a lock to read lastReplayedEndRecPtr + */ + lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr; + lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI; + + /* + * Have we reached the point where our base backup was completed? + */ + if (!XLogRecPtrIsInvalid(backupEndPoint) && + backupEndPoint <= lastReplayedEndRecPtr) + { + elog(DEBUG1, "end of backup reached"); + + /* + * We have reached the end of base backup, as indicated by pg_control. + * Update the control file accordingly. + */ + ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI); + backupStartPoint = InvalidXLogRecPtr; + backupEndPoint = InvalidXLogRecPtr; + backupEndRequired = false; + } + + /* + * Have we passed our safe starting point? Note that minRecoveryPoint is + * known to be incorrectly set if recovering from a backup, until the + * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint. + * All we know prior to that is that we're not consistent yet. + */ + if (!reachedConsistency && !backupEndRequired && + minRecoveryPoint <= lastReplayedEndRecPtr) + { + /* + * Check to see if the XLOG sequence contained any unresolved + * references to uninitialized pages. + */ + XLogCheckInvalidPages(); + + /* + * Check that pg_tblspc doesn't contain any real directories. Replay + * of Database/CREATE_* records may have created ficticious tablespace + * directories that should have been removed by the time consistency + * was reached. + */ + CheckTablespaceDirectory(); + + reachedConsistency = true; + ereport(LOG, + (errmsg("consistent recovery state reached at %X/%X", + LSN_FORMAT_ARGS(lastReplayedEndRecPtr)))); + } + + /* + * Have we got a valid starting snapshot that will allow queries to be + * run? If so, we can tell postmaster that the database is consistent now, + * enabling connections. + */ + if (standbyState == STANDBY_SNAPSHOT_READY && + !LocalHotStandbyActive && + reachedConsistency && + IsUnderPostmaster) + { + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + XLogRecoveryCtl->SharedHotStandbyActive = true; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + LocalHotStandbyActive = true; + + SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY); + } +} + +/* + * Error context callback for errors occurring during rm_redo(). + */ +static void +rm_redo_error_callback(void *arg) +{ + XLogReaderState *record = (XLogReaderState *) arg; + StringInfoData buf; + + initStringInfo(&buf); + xlog_outdesc(&buf, record); + xlog_block_info(&buf, record); + + /* translator: %s is a WAL record description */ + errcontext("WAL redo at %X/%X for %s", + LSN_FORMAT_ARGS(record->ReadRecPtr), + buf.data); + + pfree(buf.data); +} + +/* + * Returns a string describing an XLogRecord, consisting of its identity + * optionally followed by a colon, a space, and a further description. 
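+ *
+ * (Illustrative output, with hypothetical values: "Heap/INSERT: off 3",
+ * i.e. the resource manager name, a slash, the record identity, then the
+ * rmgr-specific description.)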
+ */ +void +xlog_outdesc(StringInfo buf, XLogReaderState *record) +{ + RmgrData rmgr = GetRmgr(XLogRecGetRmid(record)); + uint8 info = XLogRecGetInfo(record); + const char *id; + + appendStringInfoString(buf, rmgr.rm_name); + appendStringInfoChar(buf, '/'); + + id = rmgr.rm_identify(info); + if (id == NULL) + appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK); + else + appendStringInfo(buf, "%s: ", id); + + rmgr.rm_desc(buf, record); +} + +#ifdef WAL_DEBUG + +static void +xlog_outrec(StringInfo buf, XLogReaderState *record) +{ + appendStringInfo(buf, "prev %X/%X; xid %u", + LSN_FORMAT_ARGS(XLogRecGetPrev(record)), + XLogRecGetXid(record)); + + appendStringInfo(buf, "; len %u", + XLogRecGetDataLen(record)); + + xlog_block_info(buf, record); +} +#endif /* WAL_DEBUG */ + +/* + * Returns a string giving information about all the blocks in an + * XLogRecord. + */ +static void +xlog_block_info(StringInfo buf, XLogReaderState *record) +{ + int block_id; + + /* decode block references */ + for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + { + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blk; + + if (!XLogRecGetBlockTagExtended(record, block_id, + &rnode, &forknum, &blk, NULL)) + continue; + + if (forknum != MAIN_FORKNUM) + appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u", + block_id, + rnode.spcNode, rnode.dbNode, rnode.relNode, + forknum, + blk); + else + appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u", + block_id, + rnode.spcNode, rnode.dbNode, rnode.relNode, + blk); + if (XLogRecHasBlockImage(record, block_id)) + appendStringInfoString(buf, " FPW"); + } +} + + +/* + * Check that it's OK to switch to new timeline during recovery. + * + * 'lsn' is the address of the shutdown checkpoint record we're about to + * replay. (Currently, timeline can only change at a shutdown checkpoint). + */ +static void +checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, + TimeLineID replayTLI) +{ + /* Check that the record agrees on what the current (old) timeline is */ + if (prevTLI != replayTLI) + ereport(PANIC, + (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record", + prevTLI, replayTLI))); + + /* + * The new timeline better be in the list of timelines we expect to see, + * according to the timeline history. It should also not decrease. + */ + if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs)) + ereport(PANIC, + (errmsg("unexpected timeline ID %u (after %u) in checkpoint record", + newTLI, replayTLI))); + + /* + * If we have not yet reached min recovery point, and we're about to + * switch to a timeline greater than the timeline of the min recovery + * point: trouble. After switching to the new timeline, we could not + * possibly visit the min recovery point on the correct timeline anymore. + * This can happen if there is a newer timeline in the archive that + * branched before the timeline the min recovery point is on, and you + * attempt to do PITR to the new timeline. + */ + if (!XLogRecPtrIsInvalid(minRecoveryPoint) && + lsn < minRecoveryPoint && + newTLI > minRecoveryPointTLI) + ereport(PANIC, + (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u", + newTLI, + LSN_FORMAT_ARGS(minRecoveryPoint), + minRecoveryPointTLI))); + + /* Looks good */ +} + + +/* + * Extract timestamp from WAL record. + * + * If the record contains a timestamp, returns true, and saves the timestamp + * in *recordXtime. 
If the record type has no timestamp, returns false. + * Currently, only transaction commit/abort records and restore points contain + * timestamps. + */ +static bool +getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + uint8 xact_info = info & XLOG_XACT_OPMASK; + uint8 rmid = XLogRecGetRmid(record); + + if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT) + { + *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time; + return true; + } + if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT || + xact_info == XLOG_XACT_COMMIT_PREPARED)) + { + *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time; + return true; + } + if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT || + xact_info == XLOG_XACT_ABORT_PREPARED)) + { + *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time; + return true; + } + return false; +} + +/* + * Checks whether the current buffer page and backup page stored in the + * WAL record are consistent or not. Before comparing the two pages, a + * masking can be applied to the pages to ignore certain areas like hint bits, + * unused space between pd_lower and pd_upper among other things. This + * function should be called once WAL replay has been completed for a + * given record. + */ +static void +verifyBackupPageConsistency(XLogReaderState *record) +{ + RmgrData rmgr = GetRmgr(XLogRecGetRmid(record)); + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + int block_id; + + /* Records with no backup blocks have no need for consistency checks. */ + if (!XLogRecHasAnyBlockRefs(record)) + return; + + Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0); + + for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + { + Buffer buf; + Page page; + + if (!XLogRecGetBlockTagExtended(record, block_id, + &rnode, &forknum, &blkno, NULL)) + { + /* + * WAL record doesn't contain a block reference with the given id. + * Do nothing. + */ + continue; + } + + Assert(XLogRecHasBlockImage(record, block_id)); + + if (XLogRecBlockImageApply(record, block_id)) + { + /* + * WAL record has already applied the page, so bypass the + * consistency check as that would result in comparing the full + * page stored in the record with itself. + */ + continue; + } + + /* + * Read the contents from the current buffer and store it in a + * temporary page. + */ + buf = XLogReadBufferExtended(rnode, forknum, blkno, + RBM_NORMAL_NO_LOG, + InvalidBuffer); + if (!BufferIsValid(buf)) + continue; + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buf); + + /* + * Take a copy of the local page where WAL has been applied to have a + * comparison base before masking it... + */ + memcpy(replay_image_masked, page, BLCKSZ); + + /* No need for this page anymore now that a copy is in. */ + UnlockReleaseBuffer(buf); + + /* + * If the block LSN is already ahead of this WAL record, we can't + * expect contents to match. This can happen if recovery is + * restarted. + */ + if (PageGetLSN(replay_image_masked) > record->EndRecPtr) + continue; + + /* + * Read the contents from the backup copy, stored in WAL record and + * store it in a temporary page. There is no need to allocate a new + * page here, a local buffer is fine to hold its contents and a mask + * can be directly applied on it. 
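+ *
+ * Masking is only done for resource managers that provide an rm_mask
+ * callback in their RmgrData.  A minimal sketch of such a callback,
+ * assuming the helpers from bufmask.h (this is not any rmgr's actual
+ * routine):
+ *
+ *     static void
+ *     example_mask(char *pagedata, BlockNumber blkno)
+ *     {
+ *         mask_page_lsn_and_checksum((Page) pagedata);
+ *         mask_unused_space((Page) pagedata);
+ *     }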
+ */ + if (!RestoreBlockImage(record, block_id, primary_image_masked)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg_internal("%s", record->errormsg_buf))); + + /* + * If masking function is defined, mask both the primary and replay + * images + */ + if (rmgr.rm_mask != NULL) + { + rmgr.rm_mask(replay_image_masked, blkno); + rmgr.rm_mask(primary_image_masked, blkno); + } + + /* Time to compare the primary and replay images. */ + if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0) + { + elog(FATAL, + "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u", + rnode.spcNode, rnode.dbNode, rnode.relNode, + forknum, blkno); + } + } +} + +/* + * For point-in-time recovery, this function decides whether we want to + * stop applying the XLOG before the current record. + * + * Returns true if we are stopping, false otherwise. If stopping, some + * information is saved in recoveryStopXid et al for use in annotating the + * new timeline's history file. + */ +static bool +recoveryStopsBefore(XLogReaderState *record) +{ + bool stopsHere = false; + uint8 xact_info; + bool isCommit; + TimestampTz recordXtime = 0; + TransactionId recordXid; + + /* + * Ignore recovery target settings when not in archive recovery (meaning + * we are in crash recovery). + */ + if (!ArchiveRecoveryRequested) + return false; + + /* Check if we should stop as soon as reaching consistency */ + if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency) + { + ereport(LOG, + (errmsg("recovery stopping after reaching consistency"))); + + recoveryStopAfter = false; + recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = InvalidXLogRecPtr; + recoveryStopTime = 0; + recoveryStopName[0] = '\0'; + return true; + } + + /* Check if target LSN has been reached */ + if (recoveryTarget == RECOVERY_TARGET_LSN && + !recoveryTargetInclusive && + record->ReadRecPtr >= recoveryTargetLSN) + { + recoveryStopAfter = false; + recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = record->ReadRecPtr; + recoveryStopTime = 0; + recoveryStopName[0] = '\0'; + ereport(LOG, + (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"", + LSN_FORMAT_ARGS(recoveryStopLSN)))); + return true; + } + + /* Otherwise we only consider stopping before COMMIT or ABORT records. */ + if (XLogRecGetRmid(record) != RM_XACT_ID) + return false; + + xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; + + if (xact_info == XLOG_XACT_COMMIT) + { + isCommit = true; + recordXid = XLogRecGetXid(record); + } + else if (xact_info == XLOG_XACT_COMMIT_PREPARED) + { + xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); + xl_xact_parsed_commit parsed; + + isCommit = true; + ParseCommitRecord(XLogRecGetInfo(record), + xlrec, + &parsed); + recordXid = parsed.twophase_xid; + } + else if (xact_info == XLOG_XACT_ABORT) + { + isCommit = false; + recordXid = XLogRecGetXid(record); + } + else if (xact_info == XLOG_XACT_ABORT_PREPARED) + { + xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); + xl_xact_parsed_abort parsed; + + isCommit = false; + ParseAbortRecord(XLogRecGetInfo(record), + xlrec, + &parsed); + recordXid = parsed.twophase_xid; + } + else + return false; + + if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive) + { + /* + * There can be only one transaction end record with this exact + * transactionid + * + * when testing for an xid, we MUST test for equality only, since + * transactions are numbered in the order they start, not the order + * they complete. 
A higher numbered xid will complete before you about + * 50% of the time... + */ + stopsHere = (recordXid == recoveryTargetXid); + } + + /* + * Note: we must fetch recordXtime regardless of recoveryTarget setting. + * We don't expect getRecordTimestamp ever to fail, since we already know + * this is a commit or abort record; but test its result anyway. + */ + if (getRecordTimestamp(record, &recordXtime) && + recoveryTarget == RECOVERY_TARGET_TIME) + { + /* + * There can be many transactions that share the same commit time, so + * we stop after the last one, if we are inclusive, or stop at the + * first one if we are exclusive + */ + if (recoveryTargetInclusive) + stopsHere = (recordXtime > recoveryTargetTime); + else + stopsHere = (recordXtime >= recoveryTargetTime); + } + + if (stopsHere) + { + recoveryStopAfter = false; + recoveryStopXid = recordXid; + recoveryStopTime = recordXtime; + recoveryStopLSN = InvalidXLogRecPtr; + recoveryStopName[0] = '\0'; + + if (isCommit) + { + ereport(LOG, + (errmsg("recovery stopping before commit of transaction %u, time %s", + recoveryStopXid, + timestamptz_to_str(recoveryStopTime)))); + } + else + { + ereport(LOG, + (errmsg("recovery stopping before abort of transaction %u, time %s", + recoveryStopXid, + timestamptz_to_str(recoveryStopTime)))); + } + } + + return stopsHere; +} + +/* + * Same as recoveryStopsBefore, but called after applying the record. + * + * We also track the timestamp of the latest applied COMMIT/ABORT + * record in XLogRecoveryCtl->recoveryLastXTime. + */ +static bool +recoveryStopsAfter(XLogReaderState *record) +{ + uint8 info; + uint8 xact_info; + uint8 rmid; + TimestampTz recordXtime = 0; + + /* + * Ignore recovery target settings when not in archive recovery (meaning + * we are in crash recovery). + */ + if (!ArchiveRecoveryRequested) + return false; + + info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + rmid = XLogRecGetRmid(record); + + /* + * There can be many restore points that share the same name; we stop at + * the first one. 
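+ *
+ * Restore points are created on the primary with the SQL function
+ * pg_create_restore_point(), which reduces to roughly this (simplified
+ * sketch; the name is made up):
+ *
+ *     XLogRecPtr rp_lsn = XLogRestorePoint("before_schema_change");
+ *
+ * Recovery is then aimed at that record by setting recovery_target_name.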
+ */ + if (recoveryTarget == RECOVERY_TARGET_NAME && + rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT) + { + xl_restore_point *recordRestorePointData; + + recordRestorePointData = (xl_restore_point *) XLogRecGetData(record); + + if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0) + { + recoveryStopAfter = true; + recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = InvalidXLogRecPtr; + (void) getRecordTimestamp(record, &recoveryStopTime); + strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN); + + ereport(LOG, + (errmsg("recovery stopping at restore point \"%s\", time %s", + recoveryStopName, + timestamptz_to_str(recoveryStopTime)))); + return true; + } + } + + /* Check if the target LSN has been reached */ + if (recoveryTarget == RECOVERY_TARGET_LSN && + recoveryTargetInclusive && + record->ReadRecPtr >= recoveryTargetLSN) + { + recoveryStopAfter = true; + recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = record->ReadRecPtr; + recoveryStopTime = 0; + recoveryStopName[0] = '\0'; + ereport(LOG, + (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"", + LSN_FORMAT_ARGS(recoveryStopLSN)))); + return true; + } + + if (rmid != RM_XACT_ID) + return false; + + xact_info = info & XLOG_XACT_OPMASK; + + if (xact_info == XLOG_XACT_COMMIT || + xact_info == XLOG_XACT_COMMIT_PREPARED || + xact_info == XLOG_XACT_ABORT || + xact_info == XLOG_XACT_ABORT_PREPARED) + { + TransactionId recordXid; + + /* Update the last applied transaction timestamp */ + if (getRecordTimestamp(record, &recordXtime)) + SetLatestXTime(recordXtime); + + /* Extract the XID of the committed/aborted transaction */ + if (xact_info == XLOG_XACT_COMMIT_PREPARED) + { + xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); + xl_xact_parsed_commit parsed; + + ParseCommitRecord(XLogRecGetInfo(record), + xlrec, + &parsed); + recordXid = parsed.twophase_xid; + } + else if (xact_info == XLOG_XACT_ABORT_PREPARED) + { + xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); + xl_xact_parsed_abort parsed; + + ParseAbortRecord(XLogRecGetInfo(record), + xlrec, + &parsed); + recordXid = parsed.twophase_xid; + } + else + recordXid = XLogRecGetXid(record); + + /* + * There can be only one transaction end record with this exact + * transactionid + * + * when testing for an xid, we MUST test for equality only, since + * transactions are numbered in the order they start, not the order + * they complete. A higher numbered xid will complete before you about + * 50% of the time... 
+ */ + if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive && + recordXid == recoveryTargetXid) + { + recoveryStopAfter = true; + recoveryStopXid = recordXid; + recoveryStopTime = recordXtime; + recoveryStopLSN = InvalidXLogRecPtr; + recoveryStopName[0] = '\0'; + + if (xact_info == XLOG_XACT_COMMIT || + xact_info == XLOG_XACT_COMMIT_PREPARED) + { + ereport(LOG, + (errmsg("recovery stopping after commit of transaction %u, time %s", + recoveryStopXid, + timestamptz_to_str(recoveryStopTime)))); + } + else if (xact_info == XLOG_XACT_ABORT || + xact_info == XLOG_XACT_ABORT_PREPARED) + { + ereport(LOG, + (errmsg("recovery stopping after abort of transaction %u, time %s", + recoveryStopXid, + timestamptz_to_str(recoveryStopTime)))); + } + return true; + } + } + + /* Check if we should stop as soon as reaching consistency */ + if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency) + { + ereport(LOG, + (errmsg("recovery stopping after reaching consistency"))); + + recoveryStopAfter = true; + recoveryStopXid = InvalidTransactionId; + recoveryStopTime = 0; + recoveryStopLSN = InvalidXLogRecPtr; + recoveryStopName[0] = '\0'; + return true; + } + + return false; +} + +/* + * Create a comment for the history file to explain why and where + * timeline changed. + */ +static char * +getRecoveryStopReason(void) +{ + char reason[200]; + + if (recoveryTarget == RECOVERY_TARGET_XID) + snprintf(reason, sizeof(reason), + "%s transaction %u", + recoveryStopAfter ? "after" : "before", + recoveryStopXid); + else if (recoveryTarget == RECOVERY_TARGET_TIME) + snprintf(reason, sizeof(reason), + "%s %s\n", + recoveryStopAfter ? "after" : "before", + timestamptz_to_str(recoveryStopTime)); + else if (recoveryTarget == RECOVERY_TARGET_LSN) + snprintf(reason, sizeof(reason), + "%s LSN %X/%X\n", + recoveryStopAfter ? "after" : "before", + LSN_FORMAT_ARGS(recoveryStopLSN)); + else if (recoveryTarget == RECOVERY_TARGET_NAME) + snprintf(reason, sizeof(reason), + "at restore point \"%s\"", + recoveryStopName); + else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) + snprintf(reason, sizeof(reason), "reached consistency"); + else + snprintf(reason, sizeof(reason), "no recovery target specified"); + + return pstrdup(reason); +} + +/* + * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED. + * + * endOfRecovery is true if the recovery target is reached and + * the paused state starts at the end of recovery because of + * recovery_target_action=pause, and false otherwise. + */ +static void +recoveryPausesHere(bool endOfRecovery) +{ + /* Don't pause unless users can connect! */ + if (!LocalHotStandbyActive) + return; + + /* Don't pause after standby promotion has been triggered */ + if (LocalPromoteIsTriggered) + return; + + if (endOfRecovery) + ereport(LOG, + (errmsg("pausing at the end of recovery"), + errhint("Execute pg_wal_replay_resume() to promote."))); + else + ereport(LOG, + (errmsg("recovery has paused"), + errhint("Execute pg_wal_replay_resume() to continue."))); + + /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */ + while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED) + { + HandleStartupProcInterrupts(); + if (CheckForStandbyTrigger()) + return; + + /* + * If recovery pause is requested then set it paused. While we are in + * the loop, user might resume and pause again so set this every time. 
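+ *
+ * The pause request itself comes from SetRecoveryPause(), which is what
+ * the SQL functions pg_wal_replay_pause() and pg_wal_replay_resume()
+ * call.  A sketch of the state transitions (not literal code from any one
+ * call site):
+ *
+ *     SetRecoveryPause(true);    -> RECOVERY_PAUSE_REQUESTED
+ *     ConfirmRecoveryPaused();   -> RECOVERY_PAUSED   (startup process)
+ *     SetRecoveryPause(false);   -> RECOVERY_NOT_PAUSED, CV is broadcast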
+ */ + ConfirmRecoveryPaused(); + + /* + * We wait on a condition variable that will wake us as soon as the + * pause ends, but we use a timeout so we can check the above exit + * condition periodically too. + */ + ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000, + WAIT_EVENT_RECOVERY_PAUSE); + } + ConditionVariableCancelSleep(); +} + +/* + * When recovery_min_apply_delay is set, we wait long enough to make sure + * certain record types are applied at least that interval behind the primary. + * + * Returns true if we waited. + * + * Note that the delay is calculated between the WAL record log time and + * the current time on standby. We would prefer to keep track of when this + * standby received each WAL record, which would allow a more consistent + * approach and one not affected by time synchronisation issues, but that + * is significantly more effort and complexity for little actual gain in + * usability. + */ +static bool +recoveryApplyDelay(XLogReaderState *record) +{ + uint8 xact_info; + TimestampTz xtime; + TimestampTz delayUntil; + long msecs; + + /* nothing to do if no delay configured */ + if (recovery_min_apply_delay <= 0) + return false; + + /* no delay is applied on a database not yet consistent */ + if (!reachedConsistency) + return false; + + /* nothing to do if crash recovery is requested */ + if (!ArchiveRecoveryRequested) + return false; + + /* + * Is it a COMMIT record? + * + * We deliberately choose not to delay aborts since they have no effect on + * MVCC. We already allow replay of records that don't have a timestamp, + * so there is already opportunity for issues caused by early conflicts on + * standbys. + */ + if (XLogRecGetRmid(record) != RM_XACT_ID) + return false; + + xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; + + if (xact_info != XLOG_XACT_COMMIT && + xact_info != XLOG_XACT_COMMIT_PREPARED) + return false; + + if (!getRecordTimestamp(record, &xtime)) + return false; + + delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay); + + /* + * Exit without arming the latch if it's already past time to apply this + * record + */ + msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil); + if (msecs <= 0) + return false; + + while (true) + { + ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch); + + /* + * This might change recovery_min_apply_delay or the trigger file's + * location. + */ + HandleStartupProcInterrupts(); + + if (CheckForStandbyTrigger()) + break; + + /* + * Recalculate delayUntil as recovery_min_apply_delay could have + * changed while waiting in this loop. + */ + delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay); + + /* + * Wait for difference between GetCurrentTimestamp() and delayUntil. + */ + msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), + delayUntil); + + if (msecs <= 0) + break; + + elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs); + + (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + msecs, + WAIT_EVENT_RECOVERY_APPLY_DELAY); + } + return true; +} + +/* + * Get the current state of the recovery pause request. + */ +RecoveryPauseState +GetRecoveryPauseState(void) +{ + RecoveryPauseState state; + + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + state = XLogRecoveryCtl->recoveryPauseState; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + return state; +} + +/* + * Set the recovery pause state. 
+ * + * If recovery pause is requested then sets the recovery pause state to + * 'pause requested' if it is not already 'paused'. Otherwise, sets it + * to 'not paused' to resume the recovery. The recovery pause will be + * confirmed by the ConfirmRecoveryPaused. + */ +void +SetRecoveryPause(bool recoveryPause) +{ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + + if (!recoveryPause) + XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED; + else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED) + XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED; + + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + if (!recoveryPause) + ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV); +} + +/* + * Confirm the recovery pause by setting the recovery pause state to + * RECOVERY_PAUSED. + */ +static void +ConfirmRecoveryPaused(void) +{ + /* If recovery pause is requested then set it paused */ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED) + XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED; + SpinLockRelease(&XLogRecoveryCtl->info_lck); +} + + +/* + * Attempt to read the next XLOG record. + * + * Before first call, the reader needs to be positioned to the first record + * by calling XLogPrefetcherBeginRead(). + * + * If no valid record is available, returns NULL, or fails if emode is PANIC. + * (emode must be either PANIC, LOG). In standby mode, retries until a valid + * record is available. + */ +static XLogRecord * +ReadRecord(XLogPrefetcher *xlogprefetcher, int emode, + bool fetching_ckpt, TimeLineID replayTLI) +{ + XLogRecord *record; + XLogReaderState *xlogreader = XLogPrefetcherGetReader(xlogprefetcher); + XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data; + + /* Pass through parameters to XLogPageRead */ + private->fetching_ckpt = fetching_ckpt; + private->emode = emode; + private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr); + private->replayTLI = replayTLI; + + /* This is the first attempt to read this page. */ + lastSourceFailed = false; + + for (;;) + { + char *errormsg; + + record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg); + if (record == NULL) + { + /* + * When we find that WAL ends in an incomplete record, keep track + * of that record. After recovery is done, we'll write a record to + * indicate to downstream WAL readers that that portion is to be + * ignored. + * + * However, when ArchiveRecoveryRequested = true, we're going to + * switch to a new timeline at the end of recovery. We will only + * copy WAL over to the new timeline up to the end of the last + * complete record, so if we did this, we would later create an + * overwrite contrecord in the wrong place, breaking everything. + */ + if (!ArchiveRecoveryRequested && + !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr)) + { + abortedRecPtr = xlogreader->abortedRecPtr; + missingContrecPtr = xlogreader->missingContrecPtr; + } + + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + + /* + * We only end up here without a message when XLogPageRead() + * failed - in that case we already logged something. In + * StandbyMode that only happens if we have been triggered, so we + * shouldn't loop anymore in that case. + */ + if (errormsg) + ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr), + (errmsg_internal("%s", errormsg) /* already translated */ )); + } + + /* + * Check page TLI is one of the expected values. 
+ */ + else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs)) + { + char fname[MAXFNAMELEN]; + XLogSegNo segno; + int32 offset; + + XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size); + offset = XLogSegmentOffset(xlogreader->latestPagePtr, + wal_segment_size); + XLogFileName(fname, xlogreader->seg.ws_tli, segno, + wal_segment_size); + ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr), + (errmsg("unexpected timeline ID %u in log segment %s, offset %u", + xlogreader->latestPageTLI, + fname, + offset))); + record = NULL; + } + + if (record) + { + /* Great, got a record */ + return record; + } + else + { + /* No valid record available from this source */ + lastSourceFailed = true; + + /* + * If archive recovery was requested, but we were still doing + * crash recovery, switch to archive recovery and retry using the + * offline archive. We have now replayed all the valid WAL in + * pg_wal, so we are presumably now consistent. + * + * We require that there's at least some valid WAL present in + * pg_wal, however (!fetching_ckpt). We could recover using the + * WAL from the archive, even if pg_wal is completely empty, but + * we'd have no idea how far we'd have to replay to reach + * consistency. So err on the safe side and give up. + */ + if (!InArchiveRecovery && ArchiveRecoveryRequested && + !fetching_ckpt) + { + ereport(DEBUG1, + (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery"))); + InArchiveRecovery = true; + if (StandbyModeRequested) + EnableStandbyMode(); + + SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI); + minRecoveryPoint = xlogreader->EndRecPtr; + minRecoveryPointTLI = replayTLI; + + CheckRecoveryConsistency(); + + /* + * Before we retry, reset lastSourceFailed and currentSource + * so that we will check the archive next. + */ + lastSourceFailed = false; + currentSource = XLOG_FROM_ANY; + + continue; + } + + /* In standby mode, loop back to retry. Otherwise, give up. */ + if (StandbyMode && !CheckForStandbyTrigger()) + continue; + else + return NULL; + } + } +} + +/* + * Read the XLOG page containing RecPtr into readBuf (if not read already). + * Returns number of bytes read, if the page is read successfully, or + * XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed, but + * only if they have not been previously reported. + * + * While prefetching, xlogreader->nonblocking may be set. In that case, + * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL. + * + * This is responsible for restoring files from archive as needed, as well + * as for waiting for the requested WAL record to arrive in standby mode. + * + * 'emode' specifies the log level used for reporting "file not found" or + * "end of WAL" situations in archive recovery, or in standby mode when a + * trigger file is found. If set to WARNING or below, XLogPageRead() returns + * XLREAD_FAIL in those situations, on higher log levels the ereport() won't + * return. + * + * In standby mode, if after a successful return of XLogPageRead() the + * caller finds the record it's interested in to be broken, it should + * ereport the error with the level determined by + * emode_for_corrupt_record(), and then set lastSourceFailed + * and call XLogPageRead() again with the same arguments. This lets + * XLogPageRead() to try fetching the record from another source, or to + * sleep and retry. 
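+ *
+ * XLogPageRead() is not called directly; it is installed as the
+ * xlogreader's page_read callback.  A sketch of that wiring (the exact
+ * arguments here are illustrative, not copied from the initialization
+ * code):
+ *
+ *     xlogreader = XLogReaderAllocate(wal_segment_size, NULL,
+ *                                     XL_ROUTINE(.page_read = &XLogPageRead,
+ *                                                .segment_open = NULL,
+ *                                                .segment_close = wal_segment_close),
+ *                                     private_data);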
+ */ +static int +XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, + XLogRecPtr targetRecPtr, char *readBuf) +{ + XLogPageReadPrivate *private = + (XLogPageReadPrivate *) xlogreader->private_data; + int emode = private->emode; + uint32 targetPageOff; + XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY; + int r; + + XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size); + targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size); + + /* + * See if we need to switch to a new segment because the requested record + * is not in the currently open one. + */ + if (readFile >= 0 && + !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size)) + { + /* + * Request a restartpoint if we've replayed too much xlog since the + * last one. + */ + if (ArchiveRecoveryRequested && IsUnderPostmaster) + { + if (XLogCheckpointNeeded(readSegNo)) + { + (void) GetRedoRecPtr(); + if (XLogCheckpointNeeded(readSegNo)) + RequestCheckpoint(CHECKPOINT_CAUSE_XLOG); + } + } + + close(readFile); + readFile = -1; + readSource = XLOG_FROM_ANY; + } + + XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size); + +retry: + /* See if we need to retrieve more data */ + if (readFile < 0 || + (readSource == XLOG_FROM_STREAM && + flushedUpto < targetPagePtr + reqLen)) + { + if (readFile >= 0 && + xlogreader->nonblocking && + readSource == XLOG_FROM_STREAM && + flushedUpto < targetPagePtr + reqLen) + return XLREAD_WOULDBLOCK; + + switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen, + private->randAccess, + private->fetching_ckpt, + targetRecPtr, + private->replayTLI, + xlogreader->EndRecPtr, + xlogreader->nonblocking)) + { + case XLREAD_WOULDBLOCK: + return XLREAD_WOULDBLOCK; + case XLREAD_FAIL: + if (readFile >= 0) + close(readFile); + readFile = -1; + readLen = 0; + readSource = XLOG_FROM_ANY; + return XLREAD_FAIL; + case XLREAD_SUCCESS: + break; + } + } + + /* + * At this point, we have the right segment open and if we're streaming we + * know the requested record is in it. + */ + Assert(readFile != -1); + + /* + * If the current segment is being streamed from the primary, calculate + * how much of the current page we have received already. We know the + * requested record has been received, but this is for the benefit of + * future calls, to allow quick exit at the top of this function. 
+ */ + if (readSource == XLOG_FROM_STREAM) + { + if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ)) + readLen = XLOG_BLCKSZ; + else + readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) - + targetPageOff; + } + else + readLen = XLOG_BLCKSZ; + + /* Read the requested page */ + readOff = targetPageOff; + + pgstat_report_wait_start(WAIT_EVENT_WAL_READ); + r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff); + if (r != XLOG_BLCKSZ) + { + char fname[MAXFNAMELEN]; + int save_errno = errno; + + pgstat_report_wait_end(); + XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size); + if (r < 0) + { + errno = save_errno; + ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), + (errcode_for_file_access(), + errmsg("could not read from log segment %s, offset %u: %m", + fname, readOff))); + } + else + ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read from log segment %s, offset %u: read %d of %zu", + fname, readOff, r, (Size) XLOG_BLCKSZ))); + goto next_record_is_invalid; + } + pgstat_report_wait_end(); + + Assert(targetSegNo == readSegNo); + Assert(targetPageOff == readOff); + Assert(reqLen <= readLen); + + xlogreader->seg.ws_tli = curFileTLI; + + /* + * Check the page header immediately, so that we can retry immediately if + * it's not valid. This may seem unnecessary, because ReadPageInternal() + * validates the page header anyway, and would propagate the failure up to + * ReadRecord(), which would retry. However, there's a corner case with + * continuation records, if a record is split across two pages such that + * we would need to read the two pages from different sources. For + * example, imagine a scenario where a streaming replica is started up, + * and replay reaches a record that's split across two WAL segments. The + * first page is only available locally, in pg_wal, because it's already + * been recycled on the primary. The second page, however, is not present + * in pg_wal, and we should stream it from the primary. There is a + * recycled WAL segment present in pg_wal, with garbage contents, however. + * We would read the first page from the local WAL segment, but when + * reading the second page, we would read the bogus, recycled, WAL + * segment. If we didn't catch that case here, we would never recover, + * because ReadRecord() would retry reading the whole record from the + * beginning. + * + * Of course, this only catches errors in the page header, which is what + * happens in the case of a recycled WAL segment. Other kinds of errors or + * corruption still has the same problem. But this at least fixes the + * common case, which can happen as part of normal operation. + * + * Validating the page header is cheap enough that doing it twice + * shouldn't be a big deal from a performance point of view. + * + * When not in standby mode, an invalid page header should cause recovery + * to end, not retry reading the page, so we don't need to validate the + * page header here for the retry. Instead, ReadPageInternal() is + * responsible for the validation. + */ + if (StandbyMode && + !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf)) + { + /* + * Emit this error right now then retry this page immediately. Use + * errmsg_internal() because the message was already translated. 
+ */ + if (xlogreader->errormsg_buf[0]) + ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr), + (errmsg_internal("%s", xlogreader->errormsg_buf))); + + /* reset any error XLogReaderValidatePageHeader() might have set */ + XLogReaderResetError(xlogreader); + goto next_record_is_invalid; + } + + return readLen; + +next_record_is_invalid: + + /* + * If we're reading ahead, give up fast. Retries and error reporting will + * be handled by a later read when recovery catches up to this point. + */ + if (xlogreader->nonblocking) + return XLREAD_WOULDBLOCK; + + lastSourceFailed = true; + + if (readFile >= 0) + close(readFile); + readFile = -1; + readLen = 0; + readSource = XLOG_FROM_ANY; + + /* In standby-mode, keep trying */ + if (StandbyMode) + goto retry; + else + return XLREAD_FAIL; +} + +/* + * Open the WAL segment containing WAL location 'RecPtr'. + * + * The segment can be fetched via restore_command, or via walreceiver having + * streamed the record, or it can already be present in pg_wal. Checking + * pg_wal is mainly for crash recovery, but it will be polled in standby mode + * too, in case someone copies a new segment directly to pg_wal. That is not + * documented or recommended, though. + * + * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should + * prepare to read WAL starting from RedoStartLSN after this. + * + * 'RecPtr' might not point to the beginning of the record we're interested + * in, it might also point to the page or segment header. In that case, + * 'tliRecPtr' is the position of the WAL record we're interested in. It is + * used to decide which timeline to stream the requested WAL from. + * + * 'replayLSN' is the current replay LSN, so that if we scan for new + * timelines, we can reject a switch to a timeline that branched off before + * this point. + * + * If the record is not immediately available, the function returns false + * if we're not in standby mode. In standby mode, waits for it to become + * available. + * + * When the requested record becomes available, the function opens the file + * containing it (if not open already), and returns XLREAD_SUCCESS. When end + * of standby mode is triggered by the user, and there is no more WAL + * available, returns XLREAD_FAIL. + * + * If nonblocking is true, then give up immediately if we can't satisfy the + * request, returning XLREAD_WOULDBLOCK instead of waiting. + */ +static XLogPageReadResult +WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, + bool fetching_ckpt, XLogRecPtr tliRecPtr, + TimeLineID replayTLI, XLogRecPtr replayLSN, + bool nonblocking) +{ + static TimestampTz last_fail_time = 0; + TimestampTz now; + bool streaming_reply_sent = false; + + /*------- + * Standby mode is implemented by a state machine: + * + * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just + * pg_wal (XLOG_FROM_PG_WAL) + * 2. Check trigger file + * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM) + * 4. Rescan timelines + * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1. + * + * Failure to read from the current source advances the state machine to + * the next state. + * + * 'currentSource' indicates the current state. There are no currentSource + * values for "check trigger", "rescan timelines", and "sleep" states, + * those actions are taken when reading from the previous source fails, as + * part of advancing to the next state. 
+ * + * If standby mode is turned off while reading WAL from stream, we move + * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching + * the files (which would be required at end of recovery, e.g., timeline + * history file) from archive or pg_wal. We don't need to kill WAL receiver + * here because it's already stopped when standby mode is turned off at + * the end of recovery. + *------- + */ + if (!InArchiveRecovery) + currentSource = XLOG_FROM_PG_WAL; + else if (currentSource == XLOG_FROM_ANY || + (!StandbyMode && currentSource == XLOG_FROM_STREAM)) + { + lastSourceFailed = false; + currentSource = XLOG_FROM_ARCHIVE; + } + + for (;;) + { + XLogSource oldSource = currentSource; + bool startWalReceiver = false; + + /* + * First check if we failed to read from the current source, and + * advance the state machine if so. The failure to read might've + * happened outside this function, e.g when a CRC check fails on a + * record, or within this loop. + */ + if (lastSourceFailed) + { + /* + * Don't allow any retry loops to occur during nonblocking + * readahead. Let the caller process everything that has been + * decoded already first. + */ + if (nonblocking) + return XLREAD_WOULDBLOCK; + + switch (currentSource) + { + case XLOG_FROM_ARCHIVE: + case XLOG_FROM_PG_WAL: + + /* + * Check to see if the trigger file exists. Note that we + * do this only after failure, so when you create the + * trigger file, we still finish replaying as much as we + * can from archive and pg_wal before failover. + */ + if (StandbyMode && CheckForStandbyTrigger()) + { + XLogShutdownWalRcv(); + return XLREAD_FAIL; + } + + /* + * Not in standby mode, and we've now tried the archive + * and pg_wal. + */ + if (!StandbyMode) + return XLREAD_FAIL; + + /* + * Move to XLOG_FROM_STREAM state, and set to start a + * walreceiver if necessary. + */ + currentSource = XLOG_FROM_STREAM; + startWalReceiver = true; + break; + + case XLOG_FROM_STREAM: + + /* + * Failure while streaming. Most likely, we got here + * because streaming replication was terminated, or + * promotion was triggered. But we also get here if we + * find an invalid record in the WAL streamed from the + * primary, in which case something is seriously wrong. + * There's little chance that the problem will just go + * away, but PANIC is not good for availability either, + * especially in hot standby mode. So, we treat that the + * same as disconnection, and retry from archive/pg_wal + * again. The WAL in the archive should be identical to + * what was streamed, so it's unlikely that it helps, but + * one can hope... + */ + + /* + * We should be able to move to XLOG_FROM_STREAM only in + * standby mode. + */ + Assert(StandbyMode); + + /* + * Before we leave XLOG_FROM_STREAM state, make sure that + * walreceiver is not active, so that it won't overwrite + * WAL that we restore from archive. + */ + XLogShutdownWalRcv(); + + /* + * Before we sleep, re-scan for possible new timelines if + * we were requested to recover to the latest timeline. + */ + if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST) + { + if (rescanLatestTimeLine(replayTLI, replayLSN)) + { + currentSource = XLOG_FROM_ARCHIVE; + break; + } + } + + /* + * XLOG_FROM_STREAM is the last state in our state + * machine, so we've exhausted all the options for + * obtaining the requested WAL. We're going to loop back + * and retry from the archive, but if it hasn't been long + * since last attempt, sleep wal_retrieve_retry_interval + * milliseconds to avoid busy-waiting. 
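+ *
+ * Worked example with illustrative numbers: if
+ * wal_retrieve_retry_interval is 5000 ms and 1200 ms have already
+ * passed since last_fail_time, the latch wait below uses
+ *
+ *     wait_time = 5000 - 1200 = 3800 ms.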
+ */ + now = GetCurrentTimestamp(); + if (!TimestampDifferenceExceeds(last_fail_time, now, + wal_retrieve_retry_interval)) + { + long wait_time; + + wait_time = wal_retrieve_retry_interval - + TimestampDifferenceMilliseconds(last_fail_time, now); + + elog(LOG, "waiting for WAL to become available at %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + + /* Do background tasks that might benefit us later. */ + KnownAssignedTransactionIdsIdleMaintenance(); + + (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch, + WL_LATCH_SET | WL_TIMEOUT | + WL_EXIT_ON_PM_DEATH, + wait_time, + WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL); + ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch); + now = GetCurrentTimestamp(); + + /* Handle interrupt signals of startup process */ + HandleStartupProcInterrupts(); + } + last_fail_time = now; + currentSource = XLOG_FROM_ARCHIVE; + break; + + default: + elog(ERROR, "unexpected WAL source %d", currentSource); + } + } + else if (currentSource == XLOG_FROM_PG_WAL) + { + /* + * We just successfully read a file in pg_wal. We prefer files in + * the archive over ones in pg_wal, so try the next file again + * from the archive first. + */ + if (InArchiveRecovery) + currentSource = XLOG_FROM_ARCHIVE; + } + + if (currentSource != oldSource) + elog(DEBUG2, "switched WAL source from %s to %s after %s", + xlogSourceNames[oldSource], xlogSourceNames[currentSource], + lastSourceFailed ? "failure" : "success"); + + /* + * We've now handled possible failure. Try to read from the chosen + * source. + */ + lastSourceFailed = false; + + switch (currentSource) + { + case XLOG_FROM_ARCHIVE: + case XLOG_FROM_PG_WAL: + + /* + * WAL receiver must not be running when reading WAL from + * archive or pg_wal. + */ + Assert(!WalRcvStreaming()); + + /* Close any old file we might have open. */ + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + /* Reset curFileTLI if random fetch. */ + if (randAccess) + curFileTLI = 0; + + /* + * Try to restore the file from archive, or read an existing + * file from pg_wal. + */ + readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, + currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY : + currentSource); + if (readFile >= 0) + return XLREAD_SUCCESS; /* success! */ + + /* + * Nope, not found in archive or pg_wal. + */ + lastSourceFailed = true; + break; + + case XLOG_FROM_STREAM: + { + bool havedata; + + /* + * We should be able to move to XLOG_FROM_STREAM only in + * standby mode. + */ + Assert(StandbyMode); + + /* + * First, shutdown walreceiver if its restart has been + * requested -- but no point if we're already slated for + * starting it. + */ + if (pendingWalRcvRestart && !startWalReceiver) + { + XLogShutdownWalRcv(); + + /* + * Re-scan for possible new timelines if we were + * requested to recover to the latest timeline. + */ + if (recoveryTargetTimeLineGoal == + RECOVERY_TARGET_TIMELINE_LATEST) + rescanLatestTimeLine(replayTLI, replayLSN); + + startWalReceiver = true; + } + pendingWalRcvRestart = false; + + /* + * Launch walreceiver if needed. + * + * If fetching_ckpt is true, RecPtr points to the initial + * checkpoint location. In that case, we use RedoStartLSN + * as the streaming start position instead of RecPtr, so + * that when we later jump backwards to start redo at + * RedoStartLSN, we will have the logs streamed already. 
+ */ + if (startWalReceiver && + PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0) + { + XLogRecPtr ptr; + TimeLineID tli; + + if (fetching_ckpt) + { + ptr = RedoStartLSN; + tli = RedoStartTLI; + } + else + { + ptr = RecPtr; + + /* + * Use the record begin position to determine the + * TLI, rather than the position we're reading. + */ + tli = tliOfPointInHistory(tliRecPtr, expectedTLEs); + + if (curFileTLI > 0 && tli < curFileTLI) + elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u", + LSN_FORMAT_ARGS(tliRecPtr), + tli, curFileTLI); + } + curFileTLI = tli; + SetInstallXLogFileSegmentActive(); + RequestXLogStreaming(tli, ptr, PrimaryConnInfo, + PrimarySlotName, + wal_receiver_create_temp_slot); + flushedUpto = 0; + } + + /* + * Check if WAL receiver is active or wait to start up. + */ + if (!WalRcvStreaming()) + { + lastSourceFailed = true; + break; + } + + /* + * Walreceiver is active, so see if new data has arrived. + * + * We only advance XLogReceiptTime when we obtain fresh + * WAL from walreceiver and observe that we had already + * processed everything before the most recent "chunk" + * that it flushed to disk. In steady state where we are + * keeping up with the incoming data, XLogReceiptTime will + * be updated on each cycle. When we are behind, + * XLogReceiptTime will not advance, so the grace time + * allotted to conflicting queries will decrease. + */ + if (RecPtr < flushedUpto) + havedata = true; + else + { + XLogRecPtr latestChunkStart; + + flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI); + if (RecPtr < flushedUpto && receiveTLI == curFileTLI) + { + havedata = true; + if (latestChunkStart <= RecPtr) + { + XLogReceiptTime = GetCurrentTimestamp(); + SetCurrentChunkStartTime(XLogReceiptTime); + } + } + else + havedata = false; + } + if (havedata) + { + /* + * Great, streamed far enough. Open the file if it's + * not open already. Also read the timeline history + * file if we haven't initialized timeline history + * yet; it should be streamed over and present in + * pg_wal by now. Use XLOG_FROM_STREAM so that source + * info is set correctly and XLogReceiptTime isn't + * changed. + * + * NB: We must set readTimeLineHistory based on + * recoveryTargetTLI, not receiveTLI. Normally they'll + * be the same, but if recovery_target_timeline is + * 'latest' and archiving is configured, then it's + * possible that we managed to retrieve one or more + * new timeline history files from the archive, + * updating recoveryTargetTLI. + */ + if (readFile < 0) + { + if (!expectedTLEs) + expectedTLEs = readTimeLineHistory(recoveryTargetTLI); + readFile = XLogFileRead(readSegNo, PANIC, + receiveTLI, + XLOG_FROM_STREAM, false); + Assert(readFile >= 0); + } + else + { + /* just make sure source info is correct... */ + readSource = XLOG_FROM_STREAM; + XLogReceiptSource = XLOG_FROM_STREAM; + return XLREAD_SUCCESS; + } + break; + } + + /* In nonblocking mode, return rather than sleeping. */ + if (nonblocking) + return XLREAD_WOULDBLOCK; + + /* + * Data not here yet. Check for trigger, then wait for + * walreceiver to wake us up when new WAL arrives. + */ + if (CheckForStandbyTrigger()) + { + /* + * Note that we don't return XLREAD_FAIL immediately + * here. After being triggered, we still want to + * replay all the WAL that was already streamed. 
It's + * in pg_wal now, so we just treat this as a failure, + * and the state machine will move on to replay the + * streamed WAL from pg_wal, and then recheck the + * trigger and exit replay. + */ + lastSourceFailed = true; + break; + } + + /* + * Since we have replayed everything we have received so + * far and are about to start waiting for more WAL, let's + * tell the upstream server our replay location now so + * that pg_stat_replication doesn't show stale + * information. + */ + if (!streaming_reply_sent) + { + WalRcvForceReply(); + streaming_reply_sent = true; + } + + /* Do any background tasks that might benefit us later. */ + KnownAssignedTransactionIdsIdleMaintenance(); + + /* Update pg_stat_recovery_prefetch before sleeping. */ + XLogPrefetcherComputeStats(xlogprefetcher); + + /* + * Wait for more WAL to arrive. Time out after 5 seconds + * to react to a trigger file promptly and to check if the + * WAL receiver is still active. + */ + (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch, + WL_LATCH_SET | WL_TIMEOUT | + WL_EXIT_ON_PM_DEATH, + 5000L, WAIT_EVENT_RECOVERY_WAL_STREAM); + ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch); + break; + } + + default: + elog(ERROR, "unexpected WAL source %d", currentSource); + } + + /* + * Check for recovery pause here so that we can confirm more quickly + * that a requested pause has actually taken effect. + */ + if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState != + RECOVERY_NOT_PAUSED) + recoveryPausesHere(false); + + /* + * This possibly-long loop needs to handle interrupts of startup + * process. + */ + HandleStartupProcInterrupts(); + } + + return XLREAD_FAIL; /* not reached */ +} + + +/* + * Determine what log level should be used to report a corrupt WAL record + * in the current WAL page, previously read by XLogPageRead(). + * + * 'emode' is the error mode that would be used to report a file-not-found + * or legitimate end-of-WAL situation. Generally, we use it as-is, but if + * we're retrying the exact same record that we've tried previously, only + * complain the first time to keep the noise down. However, we only do when + * reading from pg_wal, because we don't expect any invalid records in archive + * or in records streamed from the primary. Files in the archive should be complete, + * and we should never hit the end of WAL because we stop and wait for more WAL + * to arrive before replaying it. + * + * NOTE: This function remembers the RecPtr value it was last called with, + * to suppress repeated messages about the same record. Only call this when + * you are about to ereport(), or you might cause a later message to be + * erroneously suppressed. + */ +static int +emode_for_corrupt_record(int emode, XLogRecPtr RecPtr) +{ + static XLogRecPtr lastComplaint = 0; + + if (readSource == XLOG_FROM_PG_WAL && emode == LOG) + { + if (RecPtr == lastComplaint) + emode = DEBUG1; + else + lastComplaint = RecPtr; + } + return emode; +} + + +/* + * Subroutine to try to fetch and validate a prior checkpoint record. + * + * whichChkpt identifies the checkpoint (merely for reporting purposes). 
+ * 1 for "primary", 0 for "other" (backup_label) + */ +static XLogRecord * +ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr, + int whichChkpt, bool report, TimeLineID replayTLI) +{ + XLogRecord *record; + uint8 info; + + Assert(xlogreader != NULL); + + if (!XRecOffIsValid(RecPtr)) + { + if (!report) + return NULL; + + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid primary checkpoint link in control file"))); + break; + default: + ereport(LOG, + (errmsg("invalid checkpoint link in backup_label file"))); + break; + } + return NULL; + } + + XLogPrefetcherBeginRead(xlogprefetcher, RecPtr); + record = ReadRecord(xlogprefetcher, LOG, true, replayTLI); + + if (record == NULL) + { + if (!report) + return NULL; + + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid checkpoint record"))); + break; + } + return NULL; + } + if (record->xl_rmid != RM_XLOG_ID) + { + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid resource manager ID in primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid resource manager ID in checkpoint record"))); + break; + } + return NULL; + } + info = record->xl_info & ~XLR_INFO_MASK; + if (info != XLOG_CHECKPOINT_SHUTDOWN && + info != XLOG_CHECKPOINT_ONLINE) + { + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid xl_info in primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid xl_info in checkpoint record"))); + break; + } + return NULL; + } + if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint)) + { + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid length of primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid length of checkpoint record"))); + break; + } + return NULL; + } + return record; +} + +/* + * Scan for new timelines that might have appeared in the archive since we + * started recovery. + * + * If there are any, the function changes recovery target TLI to the latest + * one and returns 'true'. + */ +static bool +rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN) +{ + List *newExpectedTLEs; + bool found; + ListCell *cell; + TimeLineID newtarget; + TimeLineID oldtarget = recoveryTargetTLI; + TimeLineHistoryEntry *currentTle = NULL; + + newtarget = findNewestTimeLine(recoveryTargetTLI); + if (newtarget == recoveryTargetTLI) + { + /* No new timelines found */ + return false; + } + + /* + * Determine the list of expected TLIs for the new TLI + */ + + newExpectedTLEs = readTimeLineHistory(newtarget); + + /* + * If the current timeline is not part of the history of the new timeline, + * we cannot proceed to it. + */ + found = false; + foreach(cell, newExpectedTLEs) + { + currentTle = (TimeLineHistoryEntry *) lfirst(cell); + + if (currentTle->tli == recoveryTargetTLI) + { + found = true; + break; + } + } + if (!found) + { + ereport(LOG, + (errmsg("new timeline %u is not a child of database system timeline %u", + newtarget, + replayTLI))); + return false; + } + + /* + * The current timeline was found in the history file, but check that the + * next timeline was forked off from it *after* the current recovery + * location. 
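+ *
+ * Worked example (made-up LSNs): if the history file says the current
+ * timeline ends at 0/5000000, i.e. the new timeline forked off there,
+ * but replay has already progressed to 0/6000000, then
+ * currentTle->end < replayLSN and the switch is refused below.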
+ */ + if (currentTle->end < replayLSN) + { + ereport(LOG, + (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X", + newtarget, + replayTLI, + LSN_FORMAT_ARGS(replayLSN)))); + return false; + } + + /* The new timeline history seems valid. Switch target */ + recoveryTargetTLI = newtarget; + list_free_deep(expectedTLEs); + expectedTLEs = newExpectedTLEs; + + /* + * As in StartupXLOG(), try to ensure we have all the history files + * between the old target and new target in pg_wal. + */ + restoreTimeLineHistoryFiles(oldtarget + 1, newtarget); + + ereport(LOG, + (errmsg("new target timeline is %u", + recoveryTargetTLI))); + + return true; +} + + +/* + * Open a logfile segment for reading (during recovery). + * + * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive. + * Otherwise, it's assumed to be already available in pg_wal. + */ +static int +XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, + XLogSource source, bool notfoundOk) +{ + char xlogfname[MAXFNAMELEN]; + char activitymsg[MAXFNAMELEN + 16]; + char path[MAXPGPATH]; + int fd; + + XLogFileName(xlogfname, tli, segno, wal_segment_size); + + switch (source) + { + case XLOG_FROM_ARCHIVE: + /* Report recovery progress in PS display */ + snprintf(activitymsg, sizeof(activitymsg), "waiting for %s", + xlogfname); + set_ps_display(activitymsg); + + if (!RestoreArchivedFile(path, xlogfname, + "RECOVERYXLOG", + wal_segment_size, + InRedo)) + return -1; + break; + + case XLOG_FROM_PG_WAL: + case XLOG_FROM_STREAM: + XLogFilePath(path, tli, segno, wal_segment_size); + break; + + default: + elog(ERROR, "invalid XLogFileRead source %d", source); + } + + /* + * If the segment was fetched from archival storage, replace the existing + * xlog segment (if any) with the archival version. + */ + if (source == XLOG_FROM_ARCHIVE) + { + Assert(!IsInstallXLogFileSegmentActive()); + KeepFileRestoredFromArchive(path, xlogfname); + + /* + * Set path to point at the new file in pg_wal. + */ + snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname); + } + + fd = BasicOpenFile(path, O_RDONLY | PG_BINARY); + if (fd >= 0) + { + /* Success! */ + curFileTLI = tli; + + /* Report recovery progress in PS display */ + snprintf(activitymsg, sizeof(activitymsg), "recovering %s", + xlogfname); + set_ps_display(activitymsg); + + /* Track source of data in assorted state variables */ + readSource = source; + XLogReceiptSource = source; + /* In FROM_STREAM case, caller tracks receipt time, not me */ + if (source != XLOG_FROM_STREAM) + XLogReceiptTime = GetCurrentTimestamp(); + + return fd; + } + if (errno != ENOENT || !notfoundOk) /* unexpected failure? */ + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + return -1; +} + +/* + * Open a logfile segment for reading (during recovery). + * + * This version searches for the segment with any TLI listed in expectedTLEs. + */ +static int +XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source) +{ + char path[MAXPGPATH]; + ListCell *cell; + int fd; + List *tles; + + /* + * Loop looking for a suitable timeline ID: we might need to read any of + * the timelines listed in expectedTLEs. + * + * We expect curFileTLI on entry to be the TLI of the preceding file in + * sequence, or 0 if there was no predecessor. We do not allow curFileTLI + * to go backwards; this prevents us from picking up the wrong file when a + * parent timeline extends to higher segment numbers than the child we + * want to read. 
+ * + * If we haven't read the timeline history file yet, read it now, so that + * we know which TLIs to scan. We don't save the list in expectedTLEs, + * however, unless we actually find a valid segment. That way if there is + * neither a timeline history file nor a WAL segment in the archive, and + * streaming replication is set up, we'll read the timeline history file + * streamed from the primary when we start streaming, instead of + * recovering with a dummy history generated here. + */ + if (expectedTLEs) + tles = expectedTLEs; + else + tles = readTimeLineHistory(recoveryTargetTLI); + + foreach(cell, tles) + { + TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell); + TimeLineID tli = hent->tli; + + if (tli < curFileTLI) + break; /* don't bother looking at too-old TLIs */ + + /* + * Skip scanning the timeline ID that the logfile segment to read + * doesn't belong to + */ + if (hent->begin != InvalidXLogRecPtr) + { + XLogSegNo beginseg = 0; + + XLByteToSeg(hent->begin, beginseg, wal_segment_size); + + /* + * The logfile segment that doesn't belong to the timeline is + * older or newer than the segment that the timeline started or + * ended at, respectively. It's sufficient to check only the + * starting segment of the timeline here. Since the timelines are + * scanned in descending order in this loop, any segments newer + * than the ending segment should belong to newer timeline and + * have already been read before. So it's not necessary to check + * the ending segment of the timeline here. + */ + if (segno < beginseg) + continue; + } + + if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE) + { + fd = XLogFileRead(segno, emode, tli, + XLOG_FROM_ARCHIVE, true); + if (fd != -1) + { + elog(DEBUG1, "got WAL segment from archive"); + if (!expectedTLEs) + expectedTLEs = tles; + return fd; + } + } + + if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL) + { + fd = XLogFileRead(segno, emode, tli, + XLOG_FROM_PG_WAL, true); + if (fd != -1) + { + if (!expectedTLEs) + expectedTLEs = tles; + return fd; + } + } + } + + /* Couldn't find it. For simplicity, complain about front timeline */ + XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size); + errno = ENOENT; + ereport(emode, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + return -1; +} + +/* + * Set flag to signal the walreceiver to restart. (The startup process calls + * this on noticing a relevant configuration change.) + */ +void +StartupRequestWalReceiverRestart(void) +{ + if (currentSource == XLOG_FROM_STREAM && WalRcvRunning()) + { + ereport(LOG, + (errmsg("WAL receiver process shutdown requested"))); + + pendingWalRcvRestart = true; + } +} + + +/* + * Has a standby promotion already been triggered? + * + * Unlike CheckForStandbyTrigger(), this works in any process + * that's connected to shared memory. + */ +bool +PromoteIsTriggered(void) +{ + /* + * We check shared state each time only until a standby promotion is + * triggered. We can't trigger a promotion again, so there's no need to + * keep checking after the shared variable has once been seen true. 
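+ *
+ * For context: a promotion request normally arrives via "pg_ctl promote"
+ * or the SQL function pg_promote(); both create PROMOTE_SIGNAL_FILE and
+ * wake the startup process, after which CheckForStandbyTrigger() below
+ * calls SetPromoteIsTriggered().  A hypothetical caller might do no more
+ * than:
+ *
+ *     if (PromoteIsTriggered())
+ *         return;    /* promotion under way, skip standby-only work */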
+ */ + if (LocalPromoteIsTriggered) + return true; + + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + return LocalPromoteIsTriggered; +} + +static void +SetPromoteIsTriggered(void) +{ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + XLogRecoveryCtl->SharedPromoteIsTriggered = true; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + /* + * Mark the recovery pause state as 'not paused' because the paused state + * ends and promotion continues if a promotion is triggered while recovery + * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly + * return 'paused' while a promotion is ongoing. + */ + SetRecoveryPause(false); + + LocalPromoteIsTriggered = true; +} + +/* + * Check to see whether the user-specified trigger file exists and whether a + * promote request has arrived. If either condition holds, return true. + */ +static bool +CheckForStandbyTrigger(void) +{ + struct stat stat_buf; + + if (LocalPromoteIsTriggered) + return true; + + if (IsPromoteSignaled() && CheckPromoteSignal()) + { + ereport(LOG, (errmsg("received promote request"))); + RemovePromoteSignalFiles(); + ResetPromoteSignaled(); + SetPromoteIsTriggered(); + return true; + } + + if (PromoteTriggerFile == NULL || strcmp(PromoteTriggerFile, "") == 0) + return false; + + if (stat(PromoteTriggerFile, &stat_buf) == 0) + { + ereport(LOG, + (errmsg("promote trigger file found: %s", PromoteTriggerFile))); + unlink(PromoteTriggerFile); + SetPromoteIsTriggered(); + return true; + } + else if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat promote trigger file \"%s\": %m", + PromoteTriggerFile))); + + return false; +} + +/* + * Remove the files signaling a standby promotion request. + */ +void +RemovePromoteSignalFiles(void) +{ + unlink(PROMOTE_SIGNAL_FILE); +} + +/* + * Check to see if a promote request has arrived. + */ +bool +CheckPromoteSignal(void) +{ + struct stat stat_buf; + + if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0) + return true; + + return false; +} + +/* + * Wake up startup process to replay newly arrived WAL, or to notice that + * failover has been requested. + */ +void +WakeupRecovery(void) +{ + SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch); +} + +/* + * Schedule a walreceiver wakeup in the main recovery loop. + */ +void +XLogRequestWalReceiverReply(void) +{ + doRequestWalReceiverReply = true; +} + +/* + * Is HotStandby active yet? This is only important in special backends + * since normal backends won't ever be able to connect until this returns + * true. Postmaster knows this by way of signal, not via shared memory. + * + * Unlike testing standbyState, this works in any process that's connected to + * shared memory. (And note that standbyState alone doesn't tell the truth + * anyway.) + */ +bool +HotStandbyActive(void) +{ + /* + * We check shared state each time only until Hot Standby is active. We + * can't de-activate Hot Standby, so there's no need to keep checking + * after the shared variable has once been seen true. + */ + if (LocalHotStandbyActive) + return true; + else + { + /* spinlock is essential on machines with weak memory ordering! 
*/ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + return LocalHotStandbyActive; + } +} + +/* + * Like HotStandbyActive(), but to be used only in WAL replay code, + * where we don't need to ask any other process what the state is. + */ +static bool +HotStandbyActiveInReplay(void) +{ + Assert(AmStartupProcess() || !IsPostmasterEnvironment); + return LocalHotStandbyActive; +} + +/* + * Get latest redo apply position. + * + * Exported to allow WALReceiver to read the pointer directly. + */ +XLogRecPtr +GetXLogReplayRecPtr(TimeLineID *replayTLI) +{ + XLogRecPtr recptr; + TimeLineID tli; + + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + recptr = XLogRecoveryCtl->lastReplayedEndRecPtr; + tli = XLogRecoveryCtl->lastReplayedTLI; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + if (replayTLI) + *replayTLI = tli; + return recptr; +} + + +/* + * Get position of last applied, or the record being applied. + * + * This is different from GetXLogReplayRecPtr() in that if a WAL + * record is currently being applied, this includes that record. + */ +XLogRecPtr +GetCurrentReplayRecPtr(TimeLineID *replayEndTLI) +{ + XLogRecPtr recptr; + TimeLineID tli; + + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + recptr = XLogRecoveryCtl->replayEndRecPtr; + tli = XLogRecoveryCtl->replayEndTLI; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + if (replayEndTLI) + *replayEndTLI = tli; + return recptr; +} + +/* + * Save timestamp of latest processed commit/abort record. + * + * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be + * seen by processes other than the startup process. Note in particular + * that CreateRestartPoint is executed in the checkpointer. + */ +static void +SetLatestXTime(TimestampTz xtime) +{ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + XLogRecoveryCtl->recoveryLastXTime = xtime; + SpinLockRelease(&XLogRecoveryCtl->info_lck); +} + +/* + * Fetch timestamp of latest processed commit/abort record. + */ +TimestampTz +GetLatestXTime(void) +{ + TimestampTz xtime; + + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + xtime = XLogRecoveryCtl->recoveryLastXTime; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + return xtime; +} + +/* + * Save timestamp of the next chunk of WAL records to apply. + * + * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be + * seen by all backends. + */ +static void +SetCurrentChunkStartTime(TimestampTz xtime) +{ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + XLogRecoveryCtl->currentChunkStartTime = xtime; + SpinLockRelease(&XLogRecoveryCtl->info_lck); +} + +/* + * Fetch timestamp of latest processed commit/abort record. + * Startup process maintains an accurate local copy in XLogReceiptTime + */ +TimestampTz +GetCurrentChunkReplayStartTime(void) +{ + TimestampTz xtime; + + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + xtime = XLogRecoveryCtl->currentChunkStartTime; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + return xtime; +} + +/* + * Returns time of receipt of current chunk of XLOG data, as well as + * whether it was received from streaming replication or from archives. + */ +void +GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream) +{ + /* + * This must be executed in the startup process, since we don't export the + * relevant state to shared memory. 
+ */ + Assert(InRecovery); + + *rtime = XLogReceiptTime; + *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM); +} + +/* + * Note that text field supplied is a parameter name and does not require + * translation + */ +void +RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue) +{ + if (currValue < minValue) + { + if (HotStandbyActiveInReplay()) + { + bool warned_for_promote = false; + + ereport(WARNING, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("hot standby is not possible because of insufficient parameter settings"), + errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", + param_name, + currValue, + minValue))); + + SetRecoveryPause(true); + + ereport(LOG, + (errmsg("recovery has paused"), + errdetail("If recovery is unpaused, the server will shut down."), + errhint("You can then restart the server after making the necessary configuration changes."))); + + while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED) + { + HandleStartupProcInterrupts(); + + if (CheckForStandbyTrigger()) + { + if (!warned_for_promote) + ereport(WARNING, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("promotion is not possible because of insufficient parameter settings"), + + /* + * Repeat the detail from above so it's easy to find + * in the log. + */ + errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", + param_name, + currValue, + minValue), + errhint("Restart the server after making the necessary configuration changes."))); + warned_for_promote = true; + } + + /* + * If recovery pause is requested then set it paused. While + * we are in the loop, user might resume and pause again so + * set this every time. + */ + ConfirmRecoveryPaused(); + + /* + * We wait on a condition variable that will wake us as soon + * as the pause ends, but we use a timeout so we can check the + * above conditions periodically too. + */ + ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000, + WAIT_EVENT_RECOVERY_PAUSE); + } + ConditionVariableCancelSleep(); + } + + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("recovery aborted because of insufficient parameter settings"), + /* Repeat the detail from above so it's easy to find in the log. */ + errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", + param_name, + currValue, + minValue), + errhint("You can restart the server after making the necessary configuration changes."))); + } +} diff --git a/src/backend/access/transam/xlogstats.c b/src/backend/access/transam/xlogstats.c new file mode 100644 index 0000000..5141817 --- /dev/null +++ b/src/backend/access/transam/xlogstats.c @@ -0,0 +1,96 @@ +/*------------------------------------------------------------------------- + * + * xlogstats.c + * Functions for WAL Statitstics + * + * Copyright (c) 2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/transam/xlogstats.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xlogreader.h" +#include "access/xlogstats.h" + +/* + * Calculate the size of a record, split into !FPI and FPI parts. + */ +void +XLogRecGetLen(XLogReaderState *record, uint32 *rec_len, + uint32 *fpi_len) +{ + int block_id; + + /* + * Calculate the amount of FPI data in the record. + * + * XXX: We peek into xlogreader's private decoded backup blocks for the + * bimg_len indicating the length of FPI data. 
+ */ + *fpi_len = 0; + for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + { + if (!XLogRecHasBlockRef(record, block_id)) + continue; + + if (XLogRecHasBlockImage(record, block_id)) + *fpi_len += XLogRecGetBlock(record, block_id)->bimg_len; + } + + /* + * Calculate the length of the record as the total length - the length of + * all the block images. + */ + *rec_len = XLogRecGetTotalLen(record) - *fpi_len; +} + +/* + * Store per-rmgr and per-record statistics for a given record. + */ +void +XLogRecStoreStats(XLogStats *stats, XLogReaderState *record) +{ + RmgrId rmid; + uint8 recid; + uint32 rec_len; + uint32 fpi_len; + + Assert(stats != NULL && record != NULL); + + stats->count++; + + rmid = XLogRecGetRmid(record); + + XLogRecGetLen(record, &rec_len, &fpi_len); + + /* Update per-rmgr statistics */ + + stats->rmgr_stats[rmid].count++; + stats->rmgr_stats[rmid].rec_len += rec_len; + stats->rmgr_stats[rmid].fpi_len += fpi_len; + + /* + * Update per-record statistics, where the record is identified by a + * combination of the RmgrId and the four bits of the xl_info field that + * are the rmgr's domain (resulting in sixteen possible entries per + * RmgrId). + */ + + recid = XLogRecGetInfo(record) >> 4; + + /* + * XACT records need to be handled differently. Those records use the + * first bit of those four bits for an optional flag variable and the + * following three bits for the opcode. We filter opcode out of xl_info + * and use it as the identifier of the record. + */ + if (rmid == RM_XACT_ID) + recid &= 0x07; + + stats->record_stats[rmid][recid].count++; + stats->record_stats[rmid][recid].rec_len += rec_len; + stats->record_stats[rmid][recid].fpi_len += fpi_len; +} diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c new file mode 100644 index 0000000..702c8c1 --- /dev/null +++ b/src/backend/access/transam/xlogutils.c @@ -0,0 +1,1064 @@ +/*------------------------------------------------------------------------- + * + * xlogutils.c + * + * PostgreSQL write-ahead log manager utility routines + * + * This file contains support routines that are used by XLOG replay functions. + * None of this code is used during normal system operation. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/xlogutils.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <unistd.h> + +#include "access/timeline.h" +#include "access/xlogrecovery.h" +#include "access/xlog_internal.h" +#include "access/xlogprefetcher.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/fd.h" +#include "storage/smgr.h" +#include "utils/guc.h" +#include "utils/hsearch.h" +#include "utils/rel.h" + + +/* GUC variable */ +bool ignore_invalid_pages = false; + +/* + * Are we doing recovery from XLOG? + * + * This is only ever true in the startup process; it should be read as meaning + * "this process is replaying WAL records", rather than "the system is in + * recovery mode". It should be examined primarily by functions that need + * to act differently when called from a WAL redo function (e.g., to skip WAL + * logging). To check whether the system is in recovery regardless of which + * process you're running in, use RecoveryInProgress() but only after shared + * memory startup and lock initialization. 
+ * + * This is updated from xlog.c and xlogrecovery.c, but lives here because + * it's mostly read by WAL redo functions. + */ +bool InRecovery = false; + +/* Are we in Hot Standby mode? Only valid in startup process, see xlogutils.h */ +HotStandbyState standbyState = STANDBY_DISABLED; + +/* + * During XLOG replay, we may see XLOG records for incremental updates of + * pages that no longer exist, because their relation was later dropped or + * truncated. (Note: this is only possible when full_page_writes = OFF, + * since when it's ON, the first reference we see to a page should always + * be a full-page rewrite not an incremental update.) Rather than simply + * ignoring such records, we make a note of the referenced page, and then + * complain if we don't actually see a drop or truncate covering the page + * later in replay. + */ +typedef struct xl_invalid_page_key +{ + RelFileNode node; /* the relation */ + ForkNumber forkno; /* the fork number */ + BlockNumber blkno; /* the page */ +} xl_invalid_page_key; + +typedef struct xl_invalid_page +{ + xl_invalid_page_key key; /* hash key ... must be first */ + bool present; /* page existed but contained zeroes */ +} xl_invalid_page; + +static HTAB *invalid_page_tab = NULL; + +static int read_local_xlog_page_guts(XLogReaderState *state, XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, + char *cur_page, bool wait_for_wal); + +/* Report a reference to an invalid page */ +static void +report_invalid_page(int elevel, RelFileNode node, ForkNumber forkno, + BlockNumber blkno, bool present) +{ + char *path = relpathperm(node, forkno); + + if (present) + elog(elevel, "page %u of relation %s is uninitialized", + blkno, path); + else + elog(elevel, "page %u of relation %s does not exist", + blkno, path); + pfree(path); +} + +/* Log a reference to an invalid page */ +static void +log_invalid_page(RelFileNode node, ForkNumber forkno, BlockNumber blkno, + bool present) +{ + xl_invalid_page_key key; + xl_invalid_page *hentry; + bool found; + + /* + * Once recovery has reached a consistent state, the invalid-page table + * should be empty and remain so. If a reference to an invalid page is + * found after consistency is reached, PANIC immediately. This might seem + * aggressive, but it's better than letting the invalid reference linger + * in the hash table until the end of recovery and PANIC there, which + * might come only much later if this is a standby server. + */ + if (reachedConsistency) + { + report_invalid_page(WARNING, node, forkno, blkno, present); + elog(ignore_invalid_pages ? WARNING : PANIC, + "WAL contains references to invalid pages"); + } + + /* + * Log references to invalid pages at DEBUG1 level. This allows some + * tracing of the cause (note the elog context mechanism will tell us + * something about the XLOG record that generated the reference). 
+ */ + if (message_level_is_interesting(DEBUG1)) + report_invalid_page(DEBUG1, node, forkno, blkno, present); + + if (invalid_page_tab == NULL) + { + /* create hash table when first needed */ + HASHCTL ctl; + + ctl.keysize = sizeof(xl_invalid_page_key); + ctl.entrysize = sizeof(xl_invalid_page); + + invalid_page_tab = hash_create("XLOG invalid-page table", + 100, + &ctl, + HASH_ELEM | HASH_BLOBS); + } + + /* we currently assume xl_invalid_page_key contains no padding */ + key.node = node; + key.forkno = forkno; + key.blkno = blkno; + hentry = (xl_invalid_page *) + hash_search(invalid_page_tab, (void *) &key, HASH_ENTER, &found); + + if (!found) + { + /* hash_search already filled in the key */ + hentry->present = present; + } + else + { + /* repeat reference ... leave "present" as it was */ + } +} + +/* Forget any invalid pages >= minblkno, because they've been dropped */ +static void +forget_invalid_pages(RelFileNode node, ForkNumber forkno, BlockNumber minblkno) +{ + HASH_SEQ_STATUS status; + xl_invalid_page *hentry; + + if (invalid_page_tab == NULL) + return; /* nothing to do */ + + hash_seq_init(&status, invalid_page_tab); + + while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL) + { + if (RelFileNodeEquals(hentry->key.node, node) && + hentry->key.forkno == forkno && + hentry->key.blkno >= minblkno) + { + if (message_level_is_interesting(DEBUG2)) + { + char *path = relpathperm(hentry->key.node, forkno); + + elog(DEBUG2, "page %u of relation %s has been dropped", + hentry->key.blkno, path); + pfree(path); + } + + if (hash_search(invalid_page_tab, + (void *) &hentry->key, + HASH_REMOVE, NULL) == NULL) + elog(ERROR, "hash table corrupted"); + } + } +} + +/* Forget any invalid pages in a whole database */ +static void +forget_invalid_pages_db(Oid dbid) +{ + HASH_SEQ_STATUS status; + xl_invalid_page *hentry; + + if (invalid_page_tab == NULL) + return; /* nothing to do */ + + hash_seq_init(&status, invalid_page_tab); + + while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL) + { + if (hentry->key.node.dbNode == dbid) + { + if (message_level_is_interesting(DEBUG2)) + { + char *path = relpathperm(hentry->key.node, hentry->key.forkno); + + elog(DEBUG2, "page %u of relation %s has been dropped", + hentry->key.blkno, path); + pfree(path); + } + + if (hash_search(invalid_page_tab, + (void *) &hentry->key, + HASH_REMOVE, NULL) == NULL) + elog(ERROR, "hash table corrupted"); + } + } +} + +/* Are there any unresolved references to invalid pages? */ +bool +XLogHaveInvalidPages(void) +{ + if (invalid_page_tab != NULL && + hash_get_num_entries(invalid_page_tab) > 0) + return true; + return false; +} + +/* Complain about any remaining invalid-page entries */ +void +XLogCheckInvalidPages(void) +{ + HASH_SEQ_STATUS status; + xl_invalid_page *hentry; + bool foundone = false; + + if (invalid_page_tab == NULL) + return; /* nothing to do */ + + hash_seq_init(&status, invalid_page_tab); + + /* + * Our strategy is to emit WARNING messages for all remaining entries and + * only PANIC after we've dumped all the available info. + */ + while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL) + { + report_invalid_page(WARNING, hentry->key.node, hentry->key.forkno, + hentry->key.blkno, hentry->present); + foundone = true; + } + + if (foundone) + elog(ignore_invalid_pages ? 
WARNING : PANIC, + "WAL contains references to invalid pages"); + + hash_destroy(invalid_page_tab); + invalid_page_tab = NULL; +} + + +/* + * XLogReadBufferForRedo + * Read a page during XLOG replay + * + * Reads a block referenced by a WAL record into shared buffer cache, and + * determines what needs to be done to redo the changes to it. If the WAL + * record includes a full-page image of the page, it is restored. + * + * 'record.EndRecPtr' is compared to the page's LSN to determine if the record + * has already been replayed. 'block_id' is the ID number the block was + * registered with, when the WAL record was created. + * + * Returns one of the following: + * + * BLK_NEEDS_REDO - changes from the WAL record need to be applied + * BLK_DONE - block doesn't need replaying + * BLK_RESTORED - block was restored from a full-page image included in + * the record + * BLK_NOTFOUND - block was not found (because it was truncated away by + * an operation later in the WAL stream) + * + * On return, the buffer is locked in exclusive-mode, and returned in *buf. + * Note that the buffer is locked and returned even if it doesn't need + * replaying. (Getting the buffer lock is not really necessary during + * single-process crash recovery, but some subroutines such as MarkBufferDirty + * will complain if we don't have the lock. In hot standby mode it's + * definitely necessary.) + * + * Note: when a backup block is available in XLOG with the BKPIMAGE_APPLY flag + * set, we restore it, even if the page in the database appears newer. This + * is to protect ourselves against database pages that were partially or + * incorrectly written during a crash. We assume that the XLOG data must be + * good because it has passed a CRC check, while the database page might not + * be. This will force us to replay all subsequent modifications of the page + * that appear in XLOG, rather than possibly ignoring them as already + * applied, but that's not a huge drawback. + */ +XLogRedoAction +XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id, + Buffer *buf) +{ + return XLogReadBufferForRedoExtended(record, block_id, RBM_NORMAL, + false, buf); +} + +/* + * Pin and lock a buffer referenced by a WAL record, for the purpose of + * re-initializing it. + */ +Buffer +XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id) +{ + Buffer buf; + + XLogReadBufferForRedoExtended(record, block_id, RBM_ZERO_AND_LOCK, false, + &buf); + return buf; +} + +/* + * XLogReadBufferForRedoExtended + * Like XLogReadBufferForRedo, but with extra options. + * + * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended + * with all-zeroes pages up to the referenced block number. In + * RBM_ZERO_AND_LOCK and RBM_ZERO_AND_CLEANUP_LOCK modes, the return value + * is always BLK_NEEDS_REDO. + * + * (The RBM_ZERO_AND_CLEANUP_LOCK mode is redundant with the get_cleanup_lock + * parameter. Do not use an inconsistent combination!) + * + * If 'get_cleanup_lock' is true, a "cleanup lock" is acquired on the buffer + * using LockBufferForCleanup(), instead of a regular exclusive lock. 
+ */ +XLogRedoAction +XLogReadBufferForRedoExtended(XLogReaderState *record, + uint8 block_id, + ReadBufferMode mode, bool get_cleanup_lock, + Buffer *buf) +{ + XLogRecPtr lsn = record->EndRecPtr; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + Buffer prefetch_buffer; + Page page; + bool zeromode; + bool willinit; + + if (!XLogRecGetBlockTagExtended(record, block_id, &rnode, &forknum, &blkno, + &prefetch_buffer)) + { + /* Caller specified a bogus block_id */ + elog(PANIC, "failed to locate backup block with ID %d in WAL record", + block_id); + } + + /* + * Make sure that if the block is marked with WILL_INIT, the caller is + * going to initialize it. And vice versa. + */ + zeromode = (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK); + willinit = (XLogRecGetBlock(record, block_id)->flags & BKPBLOCK_WILL_INIT) != 0; + if (willinit && !zeromode) + elog(PANIC, "block with WILL_INIT flag in WAL record must be zeroed by redo routine"); + if (!willinit && zeromode) + elog(PANIC, "block to be initialized in redo routine must be marked with WILL_INIT flag in the WAL record"); + + /* If it has a full-page image and it should be restored, do it. */ + if (XLogRecBlockImageApply(record, block_id)) + { + Assert(XLogRecHasBlockImage(record, block_id)); + *buf = XLogReadBufferExtended(rnode, forknum, blkno, + get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK, + prefetch_buffer); + page = BufferGetPage(*buf); + if (!RestoreBlockImage(record, block_id, page)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg_internal("%s", record->errormsg_buf))); + + /* + * The page may be uninitialized. If so, we can't set the LSN because + * that would corrupt the page. + */ + if (!PageIsNew(page)) + { + PageSetLSN(page, lsn); + } + + MarkBufferDirty(*buf); + + /* + * At the end of crash recovery the init forks of unlogged relations + * are copied, without going through shared buffers. So we need to + * force the on-disk state of init forks to always be in sync with the + * state in shared buffers. + */ + if (forknum == INIT_FORKNUM) + FlushOneBuffer(*buf); + + return BLK_RESTORED; + } + else + { + *buf = XLogReadBufferExtended(rnode, forknum, blkno, mode, prefetch_buffer); + if (BufferIsValid(*buf)) + { + if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK) + { + if (get_cleanup_lock) + LockBufferForCleanup(*buf); + else + LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE); + } + if (lsn <= PageGetLSN(BufferGetPage(*buf))) + return BLK_DONE; + else + return BLK_NEEDS_REDO; + } + else + return BLK_NOTFOUND; + } +} + +/* + * XLogReadBufferExtended + * Read a page during XLOG replay + * + * This is functionally comparable to ReadBufferExtended. There's some + * differences in the behavior wrt. the "mode" argument: + * + * In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we + * return InvalidBuffer. In this case the caller should silently skip the + * update on this page. (In this situation, we expect that the page was later + * dropped or truncated. If we don't see evidence of that later in the WAL + * sequence, we'll complain at the end of WAL replay.) + * + * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended + * with all-zeroes pages up to the given block number. + * + * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't + * exist, and we don't check for all-zeroes. Thus, no log entry is made + * to imply that the page should be dropped or truncated later. 
+ * + * Optionally, recent_buffer can be used to provide a hint about the location + * of the page in the buffer pool; it does not have to be correct, but avoids + * a buffer mapping table probe if it is. + * + * NB: A redo function should normally not call this directly. To get a page + * to modify, use XLogReadBufferForRedoExtended instead. It is important that + * all pages modified by a WAL record are registered in the WAL records, or + * they will be invisible to tools that need to know which pages are modified. + */ +Buffer +XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, + BlockNumber blkno, ReadBufferMode mode, + Buffer recent_buffer) +{ + BlockNumber lastblock; + Buffer buffer; + SMgrRelation smgr; + + Assert(blkno != P_NEW); + + /* Do we have a clue where the buffer might be already? */ + if (BufferIsValid(recent_buffer) && + mode == RBM_NORMAL && + ReadRecentBuffer(rnode, forknum, blkno, recent_buffer)) + { + buffer = recent_buffer; + goto recent_buffer_fast_path; + } + + /* Open the relation at smgr level */ + smgr = smgropen(rnode, InvalidBackendId); + + /* + * Create the target file if it doesn't already exist. This lets us cope + * if the replay sequence contains writes to a relation that is later + * deleted. (The original coding of this routine would instead suppress + * the writes, but that seems like it risks losing valuable data if the + * filesystem loses an inode during a crash. Better to write the data + * until we are actually told to delete the file.) + */ + smgrcreate(smgr, forknum, true); + + lastblock = smgrnblocks(smgr, forknum); + + if (blkno < lastblock) + { + /* page exists in file */ + buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno, + mode, NULL, true); + } + else + { + /* hm, page doesn't exist in file */ + if (mode == RBM_NORMAL) + { + log_invalid_page(rnode, forknum, blkno, false); + return InvalidBuffer; + } + if (mode == RBM_NORMAL_NO_LOG) + return InvalidBuffer; + /* OK to extend the file */ + /* we do this in recovery only - no rel-extension lock needed */ + Assert(InRecovery); + buffer = InvalidBuffer; + do + { + if (buffer != InvalidBuffer) + { + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + } + buffer = ReadBufferWithoutRelcache(rnode, forknum, + P_NEW, mode, NULL, true); + } + while (BufferGetBlockNumber(buffer) < blkno); + /* Handle the corner case that P_NEW returns non-consecutive pages */ + if (BufferGetBlockNumber(buffer) != blkno) + { + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno, + mode, NULL, true); + } + } + +recent_buffer_fast_path: + if (mode == RBM_NORMAL) + { + /* check that page has been initialized */ + Page page = (Page) BufferGetPage(buffer); + + /* + * We assume that PageIsNew is safe without a lock. During recovery, + * there should be no other backends that could modify the buffer at + * the same time. + */ + if (PageIsNew(page)) + { + ReleaseBuffer(buffer); + log_invalid_page(rnode, forknum, blkno, true); + return InvalidBuffer; + } + } + + return buffer; +} + +/* + * Struct actually returned by CreateFakeRelcacheEntry, though the declared + * return type is Relation. 
+ */ +typedef struct +{ + RelationData reldata; /* Note: this must be first */ + FormData_pg_class pgc; +} FakeRelCacheEntryData; + +typedef FakeRelCacheEntryData *FakeRelCacheEntry; + +/* + * Create a fake relation cache entry for a physical relation + * + * It's often convenient to use the same functions in XLOG replay as in the + * main codepath, but those functions typically work with a relcache entry. + * We don't have a working relation cache during XLOG replay, but this + * function can be used to create a fake relcache entry instead. Only the + * fields related to physical storage, like rd_rel, are initialized, so the + * fake entry is only usable in low-level operations like ReadBuffer(). + * + * This is also used for syncing WAL-skipped files. + * + * Caller must free the returned entry with FreeFakeRelcacheEntry(). + */ +Relation +CreateFakeRelcacheEntry(RelFileNode rnode) +{ + FakeRelCacheEntry fakeentry; + Relation rel; + + /* Allocate the Relation struct and all related space in one block. */ + fakeentry = palloc0(sizeof(FakeRelCacheEntryData)); + rel = (Relation) fakeentry; + + rel->rd_rel = &fakeentry->pgc; + rel->rd_node = rnode; + + /* + * We will never be working with temp rels during recovery or while + * syncing WAL-skipped files. + */ + rel->rd_backend = InvalidBackendId; + + /* It must be a permanent table here */ + rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT; + + /* We don't know the name of the relation; use relfilenode instead */ + sprintf(RelationGetRelationName(rel), "%u", rnode.relNode); + + /* + * We set up the lockRelId in case anything tries to lock the dummy + * relation. Note that this is fairly bogus since relNode may be + * different from the relation's OID. It shouldn't really matter though. + * In recovery, we are running by ourselves and can't have any lock + * conflicts. While syncing, we already hold AccessExclusiveLock. + */ + rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode; + rel->rd_lockInfo.lockRelId.relId = rnode.relNode; + + rel->rd_smgr = NULL; + + return rel; +} + +/* + * Free a fake relation cache entry. + */ +void +FreeFakeRelcacheEntry(Relation fakerel) +{ + /* make sure the fakerel is not referenced by the SmgrRelation anymore */ + if (fakerel->rd_smgr != NULL) + smgrclearowner(&fakerel->rd_smgr, fakerel->rd_smgr); + pfree(fakerel); +} + +/* + * Drop a relation during XLOG replay + * + * This is called when the relation is about to be deleted; we need to remove + * any open "invalid-page" records for the relation. + */ +void +XLogDropRelation(RelFileNode rnode, ForkNumber forknum) +{ + forget_invalid_pages(rnode, forknum, 0); +} + +/* + * Drop a whole database during XLOG replay + * + * As above, but for DROP DATABASE instead of dropping a single rel + */ +void +XLogDropDatabase(Oid dbid) +{ + /* + * This is unnecessarily heavy-handed, as it will close SMgrRelation + * objects for other databases as well. DROP DATABASE occurs seldom enough + * that it's not worth introducing a variant of smgrclose for just this + * purpose. XXX: Or should we rather leave the smgr entries dangling? + */ + smgrcloseall(); + + forget_invalid_pages_db(dbid); +} + +/* + * Truncate a relation during XLOG replay + * + * We need to clean up any open "invalid-page" records for the dropped pages. 
+ */ +void +XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum, + BlockNumber nblocks) +{ + forget_invalid_pages(rnode, forkNum, nblocks); +} + +/* + * Determine which timeline to read an xlog page from and set the + * XLogReaderState's currTLI to that timeline ID. + * + * We care about timelines in xlogreader when we might be reading xlog + * generated prior to a promotion, either if we're currently a standby in + * recovery or if we're a promoted primary reading xlogs generated by the old + * primary before our promotion. + * + * wantPage must be set to the start address of the page to read and + * wantLength to the amount of the page that will be read, up to + * XLOG_BLCKSZ. If the amount to be read isn't known, pass XLOG_BLCKSZ. + * + * The currTLI argument should be the system-wide current timeline. + * Note that this may be different from state->currTLI, which is the timeline + * from which the caller is currently reading previous xlog records. + * + * We switch to an xlog segment from the new timeline eagerly when on a + * historical timeline, as soon as we reach the start of the xlog segment + * containing the timeline switch. The server copied the segment to the new + * timeline so all the data up to the switch point is the same, but there's no + * guarantee the old segment will still exist. It may have been deleted or + * renamed with a .partial suffix so we can't necessarily keep reading from + * the old TLI even though tliSwitchPoint says it's OK. + * + * We can't just check the timeline when we read a page on a different segment + * to the last page. We could've received a timeline switch from a cascading + * upstream, so the current segment ends abruptly (possibly getting renamed to + * .partial) and we have to switch to a new one. Even in the middle of reading + * a page we could have to dump the cached page and switch to a new TLI. + * + * Because of this, callers MAY NOT assume that currTLI is the timeline that + * will be in a page's xlp_tli; the page may begin on an older timeline or we + * might be reading from historical timeline data on a segment that's been + * copied to a new timeline. + * + * The caller must also make sure it doesn't read past the current replay + * position (using GetXLogReplayRecPtr) if executing in recovery, so it + * doesn't fail to notice that the current timeline became historical. + */ +void +XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage, + uint32 wantLength, TimeLineID currTLI) +{ + const XLogRecPtr lastReadPage = (state->seg.ws_segno * + state->segcxt.ws_segsize + state->segoff); + + Assert(wantPage != InvalidXLogRecPtr && wantPage % XLOG_BLCKSZ == 0); + Assert(wantLength <= XLOG_BLCKSZ); + Assert(state->readLen == 0 || state->readLen <= XLOG_BLCKSZ); + Assert(currTLI != 0); + + /* + * If the desired page is currently read in and valid, we have nothing to + * do. + * + * The caller should've ensured that it didn't previously advance readOff + * past the valid limit of this timeline, so it doesn't matter if the + * current TLI has since become historical. + */ + if (lastReadPage == wantPage && + state->readLen != 0 && + lastReadPage + state->readLen >= wantPage + Min(wantLength, XLOG_BLCKSZ - 1)) + return; + + /* + * If we're reading from the current timeline, it hasn't become historical + * and the page we're reading is after the last page read, we can again + * just carry on. (Seeking backwards requires a check to make sure the + * older page isn't on a prior timeline). 
+ * + * currTLI might've become historical since the caller obtained the value, + * but the caller is required not to read past the flush limit it saw at + * the time it looked up the timeline. There's nothing we can do about it + * if StartupXLOG() renames it to .partial concurrently. + */ + if (state->currTLI == currTLI && wantPage >= lastReadPage) + { + Assert(state->currTLIValidUntil == InvalidXLogRecPtr); + return; + } + + /* + * If we're just reading pages from a previously validated historical + * timeline and the timeline we're reading from is valid until the end of + * the current segment we can just keep reading. + */ + if (state->currTLIValidUntil != InvalidXLogRecPtr && + state->currTLI != currTLI && + state->currTLI != 0 && + ((wantPage + wantLength) / state->segcxt.ws_segsize) < + (state->currTLIValidUntil / state->segcxt.ws_segsize)) + return; + + /* + * If we reach this point we're either looking up a page for random + * access, the current timeline just became historical, or we're reading + * from a new segment containing a timeline switch. In all cases we need + * to determine the newest timeline on the segment. + * + * If it's the current timeline we can just keep reading from here unless + * we detect a timeline switch that makes the current timeline historical. + * If it's a historical timeline we can read all the segment on the newest + * timeline because it contains all the old timelines' data too. So only + * one switch check is required. + */ + { + /* + * We need to re-read the timeline history in case it's been changed + * by a promotion or replay from a cascaded replica. + */ + List *timelineHistory = readTimeLineHistory(currTLI); + XLogRecPtr endOfSegment; + + endOfSegment = ((wantPage / state->segcxt.ws_segsize) + 1) * + state->segcxt.ws_segsize - 1; + Assert(wantPage / state->segcxt.ws_segsize == + endOfSegment / state->segcxt.ws_segsize); + + /* + * Find the timeline of the last LSN on the segment containing + * wantPage. + */ + state->currTLI = tliOfPointInHistory(endOfSegment, timelineHistory); + state->currTLIValidUntil = tliSwitchPoint(state->currTLI, timelineHistory, + &state->nextTLI); + + Assert(state->currTLIValidUntil == InvalidXLogRecPtr || + wantPage + wantLength < state->currTLIValidUntil); + + list_free_deep(timelineHistory); + + elog(DEBUG3, "switched to timeline %u valid until %X/%X", + state->currTLI, + LSN_FORMAT_ARGS(state->currTLIValidUntil)); + } +} + +/* XLogReaderRoutine->segment_open callback for local pg_wal files */ +void +wal_segment_open(XLogReaderState *state, XLogSegNo nextSegNo, + TimeLineID *tli_p) +{ + TimeLineID tli = *tli_p; + char path[MAXPGPATH]; + + XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize); + state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); + if (state->seg.ws_file >= 0) + return; + + if (errno == ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("requested WAL segment %s has already been removed", + path))); + else + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); +} + +/* stock XLogReaderRoutine->segment_close callback */ +void +wal_segment_close(XLogReaderState *state) +{ + close(state->seg.ws_file); + /* need to check errno? */ + state->seg.ws_file = -1; +} + +/* + * XLogReaderRoutine->page_read callback for reading local xlog files + * + * Public because it would likely be very helpful for someone writing another + * output method outside walsender, e.g. in a bgworker. 
+ * + * TODO: The walsender has its own version of this, but it relies on the + * walsender's latch being set whenever WAL is flushed. No such infrastructure + * exists for normal backends, so we have to do a check/sleep/repeat style of + * loop for now. + */ +int +read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, char *cur_page) +{ + return read_local_xlog_page_guts(state, targetPagePtr, reqLen, + targetRecPtr, cur_page, true); +} + +/* + * Same as read_local_xlog_page except that it doesn't wait for future WAL + * to be available. + */ +int +read_local_xlog_page_no_wait(XLogReaderState *state, XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, + char *cur_page) +{ + return read_local_xlog_page_guts(state, targetPagePtr, reqLen, + targetRecPtr, cur_page, false); +} + +/* + * Implementation of read_local_xlog_page and its no wait version. + */ +static int +read_local_xlog_page_guts(XLogReaderState *state, XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, + char *cur_page, bool wait_for_wal) +{ + XLogRecPtr read_upto, + loc; + TimeLineID tli; + int count; + WALReadError errinfo; + TimeLineID currTLI; + + loc = targetPagePtr + reqLen; + + /* Loop waiting for xlog to be available if necessary */ + while (1) + { + /* + * Determine the limit of xlog we can currently read to, and what the + * most recent timeline is. + */ + if (!RecoveryInProgress()) + read_upto = GetFlushRecPtr(&currTLI); + else + read_upto = GetXLogReplayRecPtr(&currTLI); + tli = currTLI; + + /* + * Check which timeline to get the record from. + * + * We have to do it each time through the loop because if we're in + * recovery as a cascading standby, the current timeline might've + * become historical. We can't rely on RecoveryInProgress() because in + * a standby configuration like + * + * A => B => C + * + * if we're a logical decoding session on C, and B gets promoted, our + * timeline will change while we remain in recovery. + * + * We can't just keep reading from the old timeline as the last WAL + * archive in the timeline will get renamed to .partial by + * StartupXLOG(). + * + * If that happens after our caller determined the TLI but before we + * actually read the xlog page, we might still try to read from the + * old (now renamed) segment and fail. There's not much we can do + * about this, but it can only happen when we're a leaf of a cascading + * standby whose primary gets promoted while we're decoding, so a + * one-off ERROR isn't too bad. + */ + XLogReadDetermineTimeline(state, targetPagePtr, reqLen, tli); + + if (state->currTLI == currTLI) + { + + if (loc <= read_upto) + break; + + /* If asked, let's not wait for future WAL. */ + if (!wait_for_wal) + { + ReadLocalXLogPageNoWaitPrivate *private_data; + + /* + * Inform the caller of read_local_xlog_page_no_wait that the + * end of WAL has been reached. + */ + private_data = (ReadLocalXLogPageNoWaitPrivate *) + state->private_data; + private_data->end_of_wal = true; + break; + } + + CHECK_FOR_INTERRUPTS(); + pg_usleep(1000L); + } + else + { + /* + * We're on a historical timeline, so limit reading to the switch + * point where we moved to the next timeline. + * + * We don't need to GetFlushRecPtr or GetXLogReplayRecPtr. We know + * about the new timeline, so we must've received past the end of + * it. 
+ */ + read_upto = state->currTLIValidUntil; + + /* + * Setting tli to our wanted record's TLI is slightly wrong; the + * page might begin on an older timeline if it contains a timeline + * switch, since its xlog segment will have been copied from the + * prior timeline. This is pretty harmless though, as nothing + * cares so long as the timeline doesn't go backwards. We should + * read the page header instead; FIXME someday. + */ + tli = state->currTLI; + + /* No need to wait on a historical timeline */ + break; + } + } + + if (targetPagePtr + XLOG_BLCKSZ <= read_upto) + { + /* + * more than one block available; read only that block, have caller + * come back if they need more. + */ + count = XLOG_BLCKSZ; + } + else if (targetPagePtr + reqLen > read_upto) + { + /* not enough data there */ + return -1; + } + else + { + /* enough bytes available to satisfy the request */ + count = read_upto - targetPagePtr; + } + + /* + * Even though we just determined how much of the page can be validly read + * as 'count', read the whole page anyway. It's guaranteed to be + * zero-padded up to the page boundary if it's incomplete. + */ + if (!WALRead(state, cur_page, targetPagePtr, XLOG_BLCKSZ, tli, + &errinfo)) + WALReadRaiseError(&errinfo); + + /* number of valid bytes in the buffer */ + return count; +} + +/* + * Backend-specific convenience code to handle read errors encountered by + * WALRead(). + */ +void +WALReadRaiseError(WALReadError *errinfo) +{ + WALOpenSegment *seg = &errinfo->wre_seg; + char fname[MAXFNAMELEN]; + + XLogFileName(fname, seg->ws_tli, seg->ws_segno, wal_segment_size); + + if (errinfo->wre_read < 0) + { + errno = errinfo->wre_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from log segment %s, offset %d: %m", + fname, errinfo->wre_off))); + } + else if (errinfo->wre_read == 0) + { + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read from log segment %s, offset %d: read %d of %d", + fname, errinfo->wre_off, errinfo->wre_read, + errinfo->wre_req))); + } +} |
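Editor's note: as a rough illustration of the XLogReadBufferForRedo() pattern that the comments in xlogutils.c above describe, the sketch below shows what a redo routine for a hypothetical resource manager might look like. It is not part of the patch; the function name xyz_redo_example, the choice of block ID 0, and the omitted record-specific page change are assumptions for illustration only.

#include "postgres.h"

#include "access/xlogutils.h"
#include "storage/bufmgr.h"

/*
 * Hypothetical redo routine following the pattern documented above.
 * Block 0 is assumed to have been registered when the WAL record was
 * originally created.
 */
void
xyz_redo_example(XLogReaderState *record)
{
	Buffer		buffer;

	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
	{
		Page		page = BufferGetPage(buffer);

		/* ... re-apply the page change described by the record here ... */

		/* Stamp the page with the record's end LSN and dirty it. */
		PageSetLSN(page, record->EndRecPtr);
		MarkBufferDirty(buffer);
	}

	/*
	 * BLK_DONE and BLK_RESTORED also return the buffer locked, as noted in
	 * the XLogReadBufferForRedo comments; only BLK_NOTFOUND leaves it
	 * invalid.  Release it in every valid case.
	 */
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);
}

A real redo routine would first decode the record's main data (for example via XLogRecGetData()) before touching the page; the unconditional unlock-and-release at the end mirrors the documented behaviour that the buffer comes back locked even when no replay is needed.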