author     Daniel Baumann <daniel.baumann@progress-linux.org>    2024-05-04 12:17:33 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>    2024-05-04 12:17:33 +0000
commit     5e45211a64149b3c659b90ff2de6fa982a5a93ed
tree       739caf8c461053357daa9f162bef34516c7bf452  /src/backend/access/transam
parent     Initial commit.
Adding upstream version 15.5.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/access/transam')
26 files changed, 42364 insertions, 0 deletions
diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile new file mode 100644 index 0000000..3e5444a --- /dev/null +++ b/src/backend/access/transam/Makefile @@ -0,0 +1,43 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/transam +# +# IDENTIFICATION +# src/backend/access/transam/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/transam +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + clog.o \ + commit_ts.o \ + generic_xlog.o \ + multixact.o \ + parallel.o \ + rmgr.o \ + slru.o \ + subtrans.o \ + timeline.o \ + transam.o \ + twophase.o \ + twophase_rmgr.o \ + varsup.o \ + xact.o \ + xlog.o \ + xlogarchive.o \ + xlogfuncs.o \ + xloginsert.o \ + xlogprefetcher.o \ + xlogreader.o \ + xlogrecovery.o \ + xlogstats.o \ + xlogutils.o + +include $(top_srcdir)/src/backend/common.mk + +# ensure that version checks in xlog.c get recompiled when catversion.h changes +xlog.o: xlog.c $(top_srcdir)/src/include/catalog/catversion.h diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README new file mode 100644 index 0000000..26fd77b --- /dev/null +++ b/src/backend/access/transam/README @@ -0,0 +1,896 @@ +src/backend/access/transam/README + +The Transaction System +====================== + +PostgreSQL's transaction system is a three-layer system. The bottom layer +implements low-level transactions and subtransactions, on top of which rests +the mainloop's control code, which in turn implements user-visible +transactions and savepoints. + +The middle layer of code is called by postgres.c before and after the +processing of each query, or after detecting an error: + + StartTransactionCommand + CommitTransactionCommand + AbortCurrentTransaction + +Meanwhile, the user can alter the system's state by issuing the SQL commands +BEGIN, COMMIT, ROLLBACK, SAVEPOINT, ROLLBACK TO or RELEASE. The traffic cop +redirects these calls to the toplevel routines + + BeginTransactionBlock + EndTransactionBlock + UserAbortTransactionBlock + DefineSavepoint + RollbackToSavepoint + ReleaseSavepoint + +respectively. Depending on the current state of the system, these functions +call low level functions to activate the real transaction system: + + StartTransaction + CommitTransaction + AbortTransaction + CleanupTransaction + StartSubTransaction + CommitSubTransaction + AbortSubTransaction + CleanupSubTransaction + +Additionally, within a transaction, CommandCounterIncrement is called to +increment the command counter, which allows future commands to "see" the +effects of previous commands within the same transaction. Note that this is +done automatically by CommitTransactionCommand after each query inside a +transaction block, but some utility functions also do it internally to allow +some operations (usually in the system catalogs) to be seen by future +operations in the same utility command. (For example, in DefineRelation it is +done after creating the heap so the pg_class row is visible, to be able to +lock it.) + + +For example, consider the following sequence of user commands: + +1) BEGIN +2) SELECT * FROM foo +3) INSERT INTO foo VALUES (...) 
+4) COMMIT + +In the main processing loop, this results in the following function call +sequence: + + / StartTransactionCommand; + / StartTransaction; +1) < ProcessUtility; << BEGIN + \ BeginTransactionBlock; + \ CommitTransactionCommand; + + / StartTransactionCommand; +2) / PortalRunSelect; << SELECT ... + \ CommitTransactionCommand; + \ CommandCounterIncrement; + + / StartTransactionCommand; +3) / ProcessQuery; << INSERT ... + \ CommitTransactionCommand; + \ CommandCounterIncrement; + + / StartTransactionCommand; + / ProcessUtility; << COMMIT +4) < EndTransactionBlock; + \ CommitTransactionCommand; + \ CommitTransaction; + +The point of this example is to demonstrate the need for +StartTransactionCommand and CommitTransactionCommand to be state smart -- they +should call CommandCounterIncrement between the calls to BeginTransactionBlock +and EndTransactionBlock and outside these calls they need to do normal start, +commit or abort processing. + +Furthermore, suppose the "SELECT * FROM foo" caused an abort condition. In +this case AbortCurrentTransaction is called, and the transaction is put in +aborted state. In this state, any user input is ignored except for +transaction-termination statements, or ROLLBACK TO <savepoint> commands. + +Transaction aborts can occur in two ways: + +1) system dies from some internal cause (syntax error, etc) +2) user types ROLLBACK + +The reason we have to distinguish them is illustrated by the following two +situations: + + case 1 case 2 + ------ ------ +1) user types BEGIN 1) user types BEGIN +2) user does something 2) user does something +3) user does not like what 3) system aborts for some reason + she sees and types ABORT (syntax error, etc) + +In case 1, we want to abort the transaction and return to the default state. +In case 2, there may be more commands coming our way which are part of the +same transaction block; we have to ignore these commands until we see a COMMIT +or ROLLBACK. + +Internal aborts are handled by AbortCurrentTransaction, while user aborts are +handled by UserAbortTransactionBlock. Both of them rely on AbortTransaction +to do all the real work. The only difference is what state we enter after +AbortTransaction does its work: + +* AbortCurrentTransaction leaves us in TBLOCK_ABORT, +* UserAbortTransactionBlock leaves us in TBLOCK_ABORT_END + +Low-level transaction abort handling is divided in two phases: +* AbortTransaction executes as soon as we realize the transaction has + failed. It should release all shared resources (locks etc) so that we do + not delay other backends unnecessarily. +* CleanupTransaction executes when we finally see a user COMMIT + or ROLLBACK command; it cleans things up and gets us out of the transaction + completely. In particular, we mustn't destroy TopTransactionContext until + this point. + +Also, note that when a transaction is committed, we don't close it right away. +Rather it's put in TBLOCK_END state, which means that when +CommitTransactionCommand is called after the query has finished processing, +the transaction has to be closed. The distinction is subtle but important, +because it means that control will leave the xact.c code with the transaction +open, and the main loop will be able to keep processing inside the same +transaction. So, in a sense, transaction commit is also handled in two +phases, the first at EndTransactionBlock and the second at +CommitTransactionCommand (which is where CommitTransaction is actually +called). 
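
To make that "state smart" requirement concrete, the following is a much-simplified,
standalone sketch -- not the real xact.c code. The TBLOCK_* names are the actual block
states used by xact.c, but the stub routines and the dispatch shown here are illustrative
only, assuming the behaviour described above:

	/*
	 * Illustrative sketch only; compiles standalone because the real
	 * state enum and worker routines are stubbed out.
	 */
	#include <stdio.h>

	typedef enum
	{
		TBLOCK_DEFAULT,			/* idle, no transaction in progress */
		TBLOCK_STARTED,			/* single-query transaction, no BEGIN issued */
		TBLOCK_BEGIN,			/* BEGIN just received */
		TBLOCK_INPROGRESS,		/* inside a transaction block */
		TBLOCK_END,				/* COMMIT received; commit at command end */
		TBLOCK_ABORT,			/* failed block, awaiting COMMIT/ROLLBACK */
		TBLOCK_ABORT_END		/* failed block, ROLLBACK received */
	} TBlockState;

	static TBlockState blockState = TBLOCK_DEFAULT;

	/* Stubs standing in for the real xact.c routines. */
	static void CommitTransaction(void)       { puts("CommitTransaction"); }
	static void CleanupTransaction(void)      { puts("CleanupTransaction"); }
	static void CommandCounterIncrement(void) { puts("CommandCounterIncrement"); }

	static void
	SketchCommitTransactionCommand(void)
	{
		switch (blockState)
		{
			case TBLOCK_STARTED:
				/* single statement outside any block: commit immediately */
				CommitTransaction();
				blockState = TBLOCK_DEFAULT;
				break;
			case TBLOCK_BEGIN:
				/* BEGIN processed: leave the transaction open */
				blockState = TBLOCK_INPROGRESS;
				break;
			case TBLOCK_INPROGRESS:
				/* query inside a block: just make its effects visible */
				CommandCounterIncrement();
				break;
			case TBLOCK_END:
				/* EndTransactionBlock marked us commit-pending: commit now */
				CommitTransaction();
				blockState = TBLOCK_DEFAULT;
				break;
			case TBLOCK_ABORT:
				/* failed block: keep ignoring commands until termination */
				break;
			case TBLOCK_ABORT_END:
				/* user issued ROLLBACK: finish cleanup, leave the block */
				CleanupTransaction();
				blockState = TBLOCK_DEFAULT;
				break;
			default:
				break;
		}
	}

	int
	main(void)
	{
		blockState = TBLOCK_END;
		SketchCommitTransactionCommand();	/* prints "CommitTransaction" */
		return 0;
	}

The real function handles many more states (subtransactions, prepared transactions,
implicit blocks), but the shape is the same: the traffic-cop routines only record intent,
and CommitTransactionCommand performs the deferred start, commit or abort work.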
+ +The rest of the code in xact.c are routines to support the creation and +finishing of transactions and subtransactions. For example, AtStart_Memory +takes care of initializing the memory subsystem at main transaction start. + + +Subtransaction Handling +----------------------- + +Subtransactions are implemented using a stack of TransactionState structures, +each of which has a pointer to its parent transaction's struct. When a new +subtransaction is to be opened, PushTransaction is called, which creates a new +TransactionState, with its parent link pointing to the current transaction. +StartSubTransaction is in charge of initializing the new TransactionState to +sane values, and properly initializing other subsystems (AtSubStart routines). + +When closing a subtransaction, either CommitSubTransaction has to be called +(if the subtransaction is committing), or AbortSubTransaction and +CleanupSubTransaction (if it's aborting). In either case, PopTransaction is +called so the system returns to the parent transaction. + +One important point regarding subtransaction handling is that several may need +to be closed in response to a single user command. That's because savepoints +have names, and we allow to commit or rollback a savepoint by name, which is +not necessarily the one that was last opened. Also a COMMIT or ROLLBACK +command must be able to close out the entire stack. We handle this by having +the utility command subroutine mark all the state stack entries as commit- +pending or abort-pending, and then when the main loop reaches +CommitTransactionCommand, the real work is done. The main point of doing +things this way is that if we get an error while popping state stack entries, +the remaining stack entries still show what we need to do to finish up. + +In the case of ROLLBACK TO <savepoint>, we abort all the subtransactions up +through the one identified by the savepoint name, and then re-create that +subtransaction level with the same name. So it's a completely new +subtransaction as far as the internals are concerned. + +Other subsystems are allowed to start "internal" subtransactions, which are +handled by BeginInternalSubTransaction. This is to allow implementing +exception handling, e.g. in PL/pgSQL. ReleaseCurrentSubTransaction and +RollbackAndReleaseCurrentSubTransaction allows the subsystem to close said +subtransactions. The main difference between this and the savepoint/release +path is that we execute the complete state transition immediately in each +subroutine, rather than deferring some work until CommitTransactionCommand. +Another difference is that BeginInternalSubTransaction is allowed when no +explicit transaction block has been established, while DefineSavepoint is not. + + +Transaction and Subtransaction Numbering +---------------------------------------- + +Transactions and subtransactions are assigned permanent XIDs only when/if +they first do something that requires one --- typically, insert/update/delete +a tuple, though there are a few other places that need an XID assigned. +If a subtransaction requires an XID, we always first assign one to its +parent. This maintains the invariant that child transactions have XIDs later +than their parents, which is assumed in a number of places. + +The subsidiary actions of obtaining a lock on the XID and entering it into +pg_subtrans and PG_PROC are done at the time it is assigned. + +A transaction that has no XID still needs to be identified for various +purposes, notably holding locks. 
For this purpose we assign a "virtual +transaction ID" or VXID to each top-level transaction. VXIDs are formed from +two fields, the backendID and a backend-local counter; this arrangement allows +assignment of a new VXID at transaction start without any contention for +shared memory. To ensure that a VXID isn't re-used too soon after backend +exit, we store the last local counter value into shared memory at backend +exit, and initialize it from the previous value for the same backendID slot +at backend start. All these counters go back to zero at shared memory +re-initialization, but that's OK because VXIDs never appear anywhere on-disk. + +Internally, a backend needs a way to identify subtransactions whether or not +they have XIDs; but this need only lasts as long as the parent top transaction +endures. Therefore, we have SubTransactionId, which is somewhat like +CommandId in that it's generated from a counter that we reset at the start of +each top transaction. The top-level transaction itself has SubTransactionId 1, +and subtransactions have IDs 2 and up. (Zero is reserved for +InvalidSubTransactionId.) Note that subtransactions do not have their +own VXIDs; they use the parent top transaction's VXID. + + +Interlocking Transaction Begin, Transaction End, and Snapshots +-------------------------------------------------------------- + +We try hard to minimize the amount of overhead and lock contention involved +in the frequent activities of beginning/ending a transaction and taking a +snapshot. Unfortunately, we must have some interlocking for this, because +we must ensure consistency about the commit order of transactions. +For example, suppose an UPDATE in xact A is blocked by xact B's prior +update of the same row, and xact B is doing commit while xact C gets a +snapshot. Xact A can complete and commit as soon as B releases its locks. +If xact C's GetSnapshotData sees xact B as still running, then it had +better see xact A as still running as well, or it will be able to see two +tuple versions - one deleted by xact B and one inserted by xact A. Another +reason why this would be bad is that C would see (in the row inserted by A) +earlier changes by B, and it would be inconsistent for C not to see any +of B's changes elsewhere in the database. + +Formally, the correctness requirement is "if a snapshot A considers +transaction X as committed, and any of transaction X's snapshots considered +transaction Y as committed, then snapshot A must consider transaction Y as +committed". + +What we actually enforce is strict serialization of commits and rollbacks +with snapshot-taking: we do not allow any transaction to exit the set of +running transactions while a snapshot is being taken. (This rule is +stronger than necessary for consistency, but is relatively simple to +enforce, and it assists with some other issues as explained below.) The +implementation of this is that GetSnapshotData takes the ProcArrayLock in +shared mode (so that multiple backends can take snapshots in parallel), +but ProcArrayEndTransaction must take the ProcArrayLock in exclusive mode +while clearing the ProcGlobal->xids[] entry at transaction end (either +commit or abort). (To reduce context switching, when multiple transactions +commit nearly simultaneously, we have one backend take ProcArrayLock and +clear the XIDs of multiple processes at once.) + +ProcArrayEndTransaction also holds the lock while advancing the shared +latestCompletedXid variable. 
This allows GetSnapshotData to use +latestCompletedXid + 1 as xmax for its snapshot: there can be no +transaction >= this xid value that the snapshot needs to consider as +completed. + +In short, then, the rule is that no transaction may exit the set of +currently-running transactions between the time we fetch latestCompletedXid +and the time we finish building our snapshot. However, this restriction +only applies to transactions that have an XID --- read-only transactions +can end without acquiring ProcArrayLock, since they don't affect anyone +else's snapshot nor latestCompletedXid. + +Transaction start, per se, doesn't have any interlocking with these +considerations, since we no longer assign an XID immediately at transaction +start. But when we do decide to allocate an XID, GetNewTransactionId must +store the new XID into the shared ProcArray before releasing XidGenLock. +This ensures that all top-level XIDs <= latestCompletedXid are either +present in the ProcArray, or not running anymore. (This guarantee doesn't +apply to subtransaction XIDs, because of the possibility that there's not +room for them in the subxid array; instead we guarantee that they are +present or the overflow flag is set.) If a backend released XidGenLock +before storing its XID into ProcGlobal->xids[], then it would be possible for +another backend to allocate and commit a later XID, causing latestCompletedXid +to pass the first backend's XID, before that value became visible in the +ProcArray. That would break ComputeXidHorizons, as discussed below. + +We allow GetNewTransactionId to store the XID into ProcGlobal->xids[] (or the +subxid array) without taking ProcArrayLock. This was once necessary to +avoid deadlock; while that is no longer the case, it's still beneficial for +performance. We are thereby relying on fetch/store of an XID to be atomic, +else other backends might see a partially-set XID. This also means that +readers of the ProcArray xid fields must be careful to fetch a value only +once, rather than assume they can read it multiple times and get the same +answer each time. (Use volatile-qualified pointers when doing this, to +ensure that the C compiler does exactly what you tell it to.) + +Another important activity that uses the shared ProcArray is +ComputeXidHorizons, which must determine a lower bound for the oldest xmin +of any active MVCC snapshot, system-wide. Each individual backend +advertises the smallest xmin of its own snapshots in MyProc->xmin, or zero +if it currently has no live snapshots (eg, if it's between transactions or +hasn't yet set a snapshot for a new transaction). ComputeXidHorizons takes +the MIN() of the valid xmin fields. It does this with only shared lock on +ProcArrayLock, which means there is a potential race condition against other +backends doing GetSnapshotData concurrently: we must be certain that a +concurrent backend that is about to set its xmin does not compute an xmin +less than what ComputeXidHorizons determines. We ensure that by including +all the active XIDs into the MIN() calculation, along with the valid xmins. +The rule that transactions can't exit without taking exclusive ProcArrayLock +ensures that concurrent holders of shared ProcArrayLock will compute the +same minimum of currently-active XIDs: no xact, in particular not the +oldest, can exit while we hold shared ProcArrayLock. So +ComputeXidHorizons's view of the minimum active XID will be the same as that +of any concurrent GetSnapshotData, and so it can't produce an overestimate. 
+If there is no active transaction at all, ComputeXidHorizons uses +latestCompletedXid + 1, which is a lower bound for the xmin that might +be computed by concurrent or later GetSnapshotData calls. (We know that no +XID less than this could be about to appear in the ProcArray, because of the +XidGenLock interlock discussed above.) + +As GetSnapshotData is performance critical, it does not perform an accurate +oldest-xmin calculation (it used to, until v14). The contents of a snapshot +only depend on the xids of other backends, not their xmin. As backend's xmin +changes much more often than its xid, having GetSnapshotData look at xmins +can lead to a lot of unnecessary cacheline ping-pong. Instead +GetSnapshotData updates approximate thresholds (one that guarantees that all +deleted rows older than it can be removed, another determining that deleted +rows newer than it can not be removed). GlobalVisTest* uses those thresholds +to make invisibility decision, falling back to ComputeXidHorizons if +necessary. + +Note that while it is certain that two concurrent executions of +GetSnapshotData will compute the same xmin for their own snapshots, there is +no such guarantee for the horizons computed by ComputeXidHorizons. This is +because we allow XID-less transactions to clear their MyProc->xmin +asynchronously (without taking ProcArrayLock), so one execution might see +what had been the oldest xmin, and another not. This is OK since the +thresholds need only be a valid lower bound. As noted above, we are already +assuming that fetch/store of the xid fields is atomic, so assuming it for +xmin as well is no extra risk. + + +pg_xact and pg_subtrans +----------------------- + +pg_xact and pg_subtrans are permanent (on-disk) storage of transaction related +information. There is a limited number of pages of each kept in memory, so +in many cases there is no need to actually read from disk. However, if +there's a long running transaction or a backend sitting idle with an open +transaction, it may be necessary to be able to read and write this information +from disk. They also allow information to be permanent across server restarts. + +pg_xact records the commit status for each transaction that has been assigned +an XID. A transaction can be in progress, committed, aborted, or +"sub-committed". This last state means that it's a subtransaction that's no +longer running, but its parent has not updated its state yet. It is not +necessary to update a subtransaction's transaction status to subcommit, so we +can just defer it until main transaction commit. The main role of marking +transactions as sub-committed is to provide an atomic commit protocol when +transaction status is spread across multiple clog pages. As a result, whenever +transaction status spreads across multiple pages we must use a two-phase commit +protocol: the first phase is to mark the subtransactions as sub-committed, then +we mark the top level transaction and all its subtransactions committed (in +that order). Thus, subtransactions that have not aborted appear as in-progress +even when they have already finished, and the subcommit status appears as a +very short transitory state during main transaction commit. Subtransaction +abort is always marked in clog as soon as it occurs. When the transaction +status all fit in a single CLOG page, we atomically mark them all as committed +without bothering with the intermediate sub-commit state. + +Savepoints are implemented using subtransactions. 
A subtransaction is a +transaction inside a transaction; its commit or abort status is not only +dependent on whether it committed itself, but also whether its parent +transaction committed. To implement multiple savepoints in a transaction we +allow unlimited transaction nesting depth, so any particular subtransaction's +commit state is dependent on the commit status of each and every ancestor +transaction. + +The "subtransaction parent" (pg_subtrans) mechanism records, for each +transaction with an XID, the TransactionId of its parent transaction. This +information is stored as soon as the subtransaction is assigned an XID. +Top-level transactions do not have a parent, so they leave their pg_subtrans +entries set to the default value of zero (InvalidTransactionId). + +pg_subtrans is used to check whether the transaction in question is still +running --- the main Xid of a transaction is recorded in ProcGlobal->xids[], +with a copy in PGPROC->xid, but since we allow arbitrary nesting of +subtransactions, we can't fit all Xids in shared memory, so we have to store +them on disk. Note, however, that for each transaction we keep a "cache" of +Xids that are known to be part of the transaction tree, so we can skip looking +at pg_subtrans unless we know the cache has been overflowed. See +storage/ipc/procarray.c for the gory details. + +slru.c is the supporting mechanism for both pg_xact and pg_subtrans. It +implements the LRU policy for in-memory buffer pages. The high-level routines +for pg_xact are implemented in transam.c, while the low-level functions are in +clog.c. pg_subtrans is contained completely in subtrans.c. + + +Write-Ahead Log Coding +---------------------- + +The WAL subsystem (also called XLOG in the code) exists to guarantee crash +recovery. It can also be used to provide point-in-time recovery, as well as +hot-standby replication via log shipping. Here are some notes about +non-obvious aspects of its design. + +A basic assumption of a write AHEAD log is that log entries must reach stable +storage before the data-page changes they describe. This ensures that +replaying the log to its end will bring us to a consistent state where there +are no partially-performed transactions. To guarantee this, each data page +(either heap or index) is marked with the LSN (log sequence number --- in +practice, a WAL file location) of the latest XLOG record affecting the page. +Before the bufmgr can write out a dirty page, it must ensure that xlog has +been flushed to disk at least up to the page's LSN. This low-level +interaction improves performance by not waiting for XLOG I/O until necessary. +The LSN check exists only in the shared-buffer manager, not in the local +buffer manager used for temp tables; hence operations on temp tables must not +be WAL-logged. + +During WAL replay, we can check the LSN of a page to detect whether the change +recorded by the current log entry is already applied (it has been, if the page +LSN is >= the log entry's WAL location). + +Usually, log entries contain just enough information to redo a single +incremental update on a page (or small group of pages). This will work only +if the filesystem and hardware implement data page writes as atomic actions, +so that a page is never left in a corrupt partly-written state. Since that's +often an untenable assumption in practice, we log additional information to +allow complete reconstruction of modified pages. 
The first WAL record +affecting a given page after a checkpoint is made to contain a copy of the +entire page, and we implement replay by restoring that page copy instead of +redoing the update. (This is more reliable than the data storage itself would +be because we can check the validity of the WAL record's CRC.) We can detect +the "first change after checkpoint" by noting whether the page's old LSN +precedes the end of WAL as of the last checkpoint (the RedoRecPtr). + +The general schema for executing a WAL-logged action is + +1. Pin and exclusive-lock the shared buffer(s) containing the data page(s) +to be modified. + +2. START_CRIT_SECTION() (Any error during the next three steps must cause a +PANIC because the shared buffers will contain unlogged changes, which we +have to ensure don't get to disk. Obviously, you should check conditions +such as whether there's enough free space on the page before you start the +critical section.) + +3. Apply the required changes to the shared buffer(s). + +4. Mark the shared buffer(s) as dirty with MarkBufferDirty(). (This must +happen before the WAL record is inserted; see notes in SyncOneBuffer().) +Note that marking a buffer dirty with MarkBufferDirty() should only +happen iff you write a WAL record; see Writing Hints below. + +5. If the relation requires WAL-logging, build a WAL record using +XLogBeginInsert and XLogRegister* functions, and insert it. (See +"Constructing a WAL record" below). Then update the page's LSN using the +returned XLOG location. For instance, + + XLogBeginInsert(); + XLogRegisterBuffer(...) + XLogRegisterData(...) + recptr = XLogInsert(rmgr_id, info); + + PageSetLSN(dp, recptr); + +6. END_CRIT_SECTION() + +7. Unlock and unpin the buffer(s). + +Complex changes (such as a multilevel index insertion) normally need to be +described by a series of atomic-action WAL records. The intermediate states +must be self-consistent, so that if the replay is interrupted between any +two actions, the system is fully functional. In btree indexes, for example, +a page split requires a new page to be allocated, and an insertion of a new +key in the parent btree level, but for locking reasons this has to be +reflected by two separate WAL records. Replaying the first record, to +allocate the new page and move tuples to it, sets a flag on the page to +indicate that the key has not been inserted to the parent yet. Replaying the +second record clears the flag. This intermediate state is never seen by +other backends during normal operation, because the lock on the child page +is held across the two actions, but will be seen if the operation is +interrupted before writing the second WAL record. The search algorithm works +with the intermediate state as normal, but if an insertion encounters a page +with the incomplete-split flag set, it will finish the interrupted split by +inserting the key to the parent, before proceeding. + + +Constructing a WAL record +------------------------- + +A WAL record consists of a header common to all WAL record types, +record-specific data, and information about the data blocks modified. Each +modified data block is identified by an ID number, and can optionally have +more record-specific data associated with the block. If XLogInsert decides +that a full-page image of a block needs to be taken, the data associated +with that block is not included. + +The API for constructing a WAL record consists of five functions: +XLogBeginInsert, XLogRegisterBuffer, XLogRegisterData, XLogRegisterBufData, +and XLogInsert. 
First, call XLogBeginInsert(). Then register all the buffers +modified, and data needed to replay the changes, using XLogRegister* +functions. Finally, insert the constructed record to the WAL by calling +XLogInsert(). + + XLogBeginInsert(); + + /* register buffers modified as part of this WAL-logged action */ + XLogRegisterBuffer(0, lbuffer, REGBUF_STANDARD); + XLogRegisterBuffer(1, rbuffer, REGBUF_STANDARD); + + /* register data that is always included in the WAL record */ + XLogRegisterData(&xlrec, SizeOfFictionalAction); + + /* + * register data associated with a buffer. This will not be included + * in the record if a full-page image is taken. + */ + XLogRegisterBufData(0, tuple->data, tuple->len); + + /* more data associated with the buffer */ + XLogRegisterBufData(0, data2, len2); + + /* + * Ok, all the data and buffers to include in the WAL record have + * been registered. Insert the record. + */ + recptr = XLogInsert(RM_FOO_ID, XLOG_FOOBAR_DO_STUFF); + +Details of the API functions: + +void XLogBeginInsert(void) + + Must be called before XLogRegisterBuffer and XLogRegisterData. + +void XLogResetInsertion(void) + + Clear any currently registered data and buffers from the WAL record + construction workspace. This is only needed if you have already called + XLogBeginInsert(), but decide to not insert the record after all. + +void XLogEnsureRecordSpace(int max_block_id, int ndatas) + + Normally, the WAL record construction buffers have the following limits: + + * highest block ID that can be used is 4 (allowing five block references) + * Max 20 chunks of registered data + + These default limits are enough for most record types that change some + on-disk structures. For the odd case that requires more data, or needs to + modify more buffers, these limits can be raised by calling + XLogEnsureRecordSpace(). XLogEnsureRecordSpace() must be called before + XLogBeginInsert(), and outside a critical section. + +void XLogRegisterBuffer(uint8 block_id, Buffer buf, uint8 flags); + + XLogRegisterBuffer adds information about a data block to the WAL record. + block_id is an arbitrary number used to identify this page reference in + the redo routine. The information needed to re-find the page at redo - + relfilenode, fork, and block number - are included in the WAL record. + + XLogInsert will automatically include a full copy of the page contents, if + this is the first modification of the buffer since the last checkpoint. + It is important to register every buffer modified by the action with + XLogRegisterBuffer, to avoid torn-page hazards. + + The flags control when and how the buffer contents are included in the + WAL record. Normally, a full-page image is taken only if the page has not + been modified since the last checkpoint, and only if full_page_writes=on + or an online backup is in progress. The REGBUF_FORCE_IMAGE flag can be + used to force a full-page image to always be included; that is useful + e.g. for an operation that rewrites most of the page, so that tracking the + details is not worth it. For the rare case where it is not necessary to + protect from torn pages, REGBUF_NO_IMAGE flag can be used to suppress + full page image from being taken. REGBUF_WILL_INIT also suppresses a full + page image, but the redo routine must re-generate the page from scratch, + without looking at the old page contents. Re-initializing the page + protects from torn page hazards like a full page image does. 
+ + The REGBUF_STANDARD flag can be specified together with the other flags to + indicate that the page follows the standard page layout. It causes the + area between pd_lower and pd_upper to be left out from the image, reducing + WAL volume. + + If the REGBUF_KEEP_DATA flag is given, any per-buffer data registered with + XLogRegisterBufData() is included in the WAL record even if a full-page + image is taken. + +void XLogRegisterData(char *data, int len); + + XLogRegisterData is used to include arbitrary data in the WAL record. If + XLogRegisterData() is called multiple times, the data are appended, and + will be made available to the redo routine as one contiguous chunk. + +void XLogRegisterBufData(uint8 block_id, char *data, int len); + + XLogRegisterBufData is used to include data associated with a particular + buffer that was registered earlier with XLogRegisterBuffer(). If + XLogRegisterBufData() is called multiple times with the same block ID, the + data are appended, and will be made available to the redo routine as one + contiguous chunk. + + If a full-page image of the buffer is taken at insertion, the data is not + included in the WAL record, unless the REGBUF_KEEP_DATA flag is used. + + +Writing a REDO routine +---------------------- + +A REDO routine uses the data and page references included in the WAL record +to reconstruct the new state of the page. The record decoding functions +and macros in xlogreader.c/h can be used to extract the data from the record. + +When replaying a WAL record that describes changes on multiple pages, you +must be careful to lock the pages properly to prevent concurrent Hot Standby +queries from seeing an inconsistent state. If this requires that two +or more buffer locks be held concurrently, you must lock the pages in +appropriate order, and not release the locks until all the changes are done. + +Note that we must only use PageSetLSN/PageGetLSN() when we know the action +is serialised. Only Startup process may modify data blocks during recovery, +so Startup process may execute PageGetLSN() without fear of serialisation +problems. All other processes must only call PageSet/GetLSN when holding +either an exclusive buffer lock or a shared lock plus buffer header lock, +or be writing the data block directly rather than through shared buffers +while holding AccessExclusiveLock on the relation. + + +Writing Hints +------------- + +In some cases, we write additional information to data blocks without +writing a preceding WAL record. This should only happen iff the data can +be reconstructed later following a crash and the action is simply a way +of optimising for performance. When a hint is written we use +MarkBufferDirtyHint() to mark the block dirty. + +If the buffer is clean and checksums are in use then MarkBufferDirtyHint() +inserts an XLOG_FPI_FOR_HINT record to ensure that we take a full page image +that includes the hint. We do this to avoid a partial page write, when we +write the dirtied page. WAL is not written during recovery, so we simply skip +dirtying blocks because of hints when in recovery. + +If you do decide to optimise away a WAL record, then any calls to +MarkBufferDirty() must be replaced by MarkBufferDirtyHint(), +otherwise you will expose the risk of partial page writes. + + +Write-Ahead Logging for Filesystem Actions +------------------------------------------ + +The previous section described how to WAL-log actions that only change page +contents within shared buffers. 
For that type of action it is generally +possible to check all likely error cases (such as insufficient space on the +page) before beginning to make the actual change. Therefore we can make +the change and the creation of the associated WAL log record "atomic" by +wrapping them into a critical section --- the odds of failure partway +through are low enough that PANIC is acceptable if it does happen. + +Clearly, that approach doesn't work for cases where there's a significant +probability of failure within the action to be logged, such as creation +of a new file or database. We don't want to PANIC, and we especially don't +want to PANIC after having already written a WAL record that says we did +the action --- if we did, replay of the record would probably fail again +and PANIC again, making the failure unrecoverable. This means that the +ordinary WAL rule of "write WAL before the changes it describes" doesn't +work, and we need a different design for such cases. + +There are several basic types of filesystem actions that have this +issue. Here is how we deal with each: + +1. Adding a disk page to an existing table. + +This action isn't WAL-logged at all. We extend a table by writing a page +of zeroes at its end. We must actually do this write so that we are sure +the filesystem has allocated the space. If the write fails we can just +error out normally. Once the space is known allocated, we can initialize +and fill the page via one or more normal WAL-logged actions. Because it's +possible that we crash between extending the file and writing out the WAL +entries, we have to treat discovery of an all-zeroes page in a table or +index as being a non-error condition. In such cases we can just reclaim +the space for re-use. + +2. Creating a new table, which requires a new file in the filesystem. + +We try to create the file, and if successful we make a WAL record saying +we did it. If not successful, we can just throw an error. Notice that +there is a window where we have created the file but not yet written any +WAL about it to disk. If we crash during this window, the file remains +on disk as an "orphan". It would be possible to clean up such orphans +by having database restart search for files that don't have any committed +entry in pg_class, but that currently isn't done because of the possibility +of deleting data that is useful for forensic analysis of the crash. +Orphan files are harmless --- at worst they waste a bit of disk space --- +because we check for on-disk collisions when allocating new relfilenode +OIDs. So cleaning up isn't really necessary. + +3. Deleting a table, which requires an unlink() that could fail. + +Our approach here is to WAL-log the operation first, but to treat failure +of the actual unlink() call as a warning rather than error condition. +Again, this can leave an orphan file behind, but that's cheap compared to +the alternatives. Since we can't actually do the unlink() until after +we've committed the DROP TABLE transaction, throwing an error would be out +of the question anyway. (It may be worth noting that the WAL entry about +the file deletion is actually part of the commit record for the dropping +transaction.) + +4. Creating and deleting databases and tablespaces, which requires creating +and deleting directories and entire directory trees. + +These cases are handled similarly to creating individual files, ie, we +try to do the action first and then write a WAL entry if it succeeded. +The potential amount of wasted disk space is rather larger, of course. 
+In the creation case we try to delete the directory tree again if creation +fails, so as to reduce the risk of wasted space. Failure partway through +a deletion operation results in a corrupt database: the DROP failed, but +some of the data is gone anyway. There is little we can do about that, +though, and in any case it was presumably data the user no longer wants. + +In all of these cases, if WAL replay fails to redo the original action +we must panic and abort recovery. The DBA will have to manually clean up +(for instance, free up some disk space or fix directory permissions) and +then restart recovery. This is part of the reason for not writing a WAL +entry until we've successfully done the original action. + + +Skipping WAL for New RelFileNode +-------------------------------- + +Under wal_level=minimal, if a change modifies a relfilenode that ROLLBACK +would unlink, in-tree access methods write no WAL for that change. Code that +writes WAL without calling RelationNeedsWAL() must check for this case. This +skipping is mandatory. If a WAL-writing change preceded a WAL-skipping change +for the same block, REDO could overwrite the WAL-skipping change. If a +WAL-writing change followed a WAL-skipping change for the same block, a +related problem would arise. When a WAL record contains no full-page image, +REDO expects the page to match its contents from just before record insertion. +A WAL-skipping change may not reach disk at all, violating REDO's expectation +under full_page_writes=off. For any access method, CommitTransaction() writes +and fsyncs affected blocks before recording the commit. + +Prefer to do the same in future access methods. However, two other approaches +can work. First, an access method can irreversibly transition a given fork +from WAL-skipping to WAL-writing by calling FlushRelationBuffers() and +smgrimmedsync(). Second, an access method can opt to write WAL +unconditionally for permanent relations. Under these approaches, the access +method callbacks must not call functions that react to RelationNeedsWAL(). + +This applies only to WAL records whose replay would modify bytes stored in the +new relfilenode. It does not apply to other records about the relfilenode, +such as XLOG_SMGR_CREATE. Because it operates at the level of individual +relfilenodes, RelationNeedsWAL() can differ for tightly-coupled relations. +Consider "CREATE TABLE t (); BEGIN; ALTER TABLE t ADD c text; ..." in which +ALTER TABLE adds a TOAST relation. The TOAST relation will skip WAL, while +the table owning it will not. ALTER TABLE SET TABLESPACE will cause a table +to skip WAL, but that won't affect its indexes. + + +Asynchronous Commit +------------------- + +As of PostgreSQL 8.3 it is possible to perform asynchronous commits - i.e., +we don't wait while the WAL record for the commit is fsync'ed. +We perform an asynchronous commit when synchronous_commit = off. Instead +of performing an XLogFlush() up to the LSN of the commit, we merely note +the LSN in shared memory. The backend then continues with other work. +We record the LSN only for an asynchronous commit, not an abort; there's +never any need to flush an abort record, since the presumption after a +crash would be that the transaction aborted anyway. + +We always force synchronous commit when the transaction is deleting +relations, to ensure the commit record is down to disk before the relations +are removed from the filesystem. 
Also, certain utility commands that have +non-roll-backable side effects (such as filesystem changes) force sync +commit to minimize the window in which the filesystem change has been made +but the transaction isn't guaranteed committed. + +The walwriter regularly wakes up (via wal_writer_delay) or is woken up +(via its latch, which is set by backends committing asynchronously) and +performs an XLogBackgroundFlush(). This checks the location of the last +completely filled WAL page. If that has moved forwards, then we write all +the changed buffers up to that point, so that under full load we write +only whole buffers. If there has been a break in activity and the current +WAL page is the same as before, then we find out the LSN of the most +recent asynchronous commit, and write up to that point, if required (i.e. +if it's in the current WAL page). If more than wal_writer_delay has +passed, or more than wal_writer_flush_after blocks have been written, since +the last flush, WAL is also flushed up to the current location. This +arrangement in itself would guarantee that an async commit record reaches +disk after at most two times wal_writer_delay after the transaction +completes. However, we also allow XLogFlush to write/flush full buffers +"flexibly" (ie, not wrapping around at the end of the circular WAL buffer +area), so as to minimize the number of writes issued under high load when +multiple WAL pages are filled per walwriter cycle. This makes the worst-case +delay three wal_writer_delay cycles. + +There are some other subtle points to consider with asynchronous commits. +First, for each page of CLOG we must remember the LSN of the latest commit +affecting the page, so that we can enforce the same flush-WAL-before-write +rule that we do for ordinary relation pages. Otherwise the record of the +commit might reach disk before the WAL record does. Again, abort records +need not factor into this consideration. + +In fact, we store more than one LSN for each clog page. This relates to +the way we set transaction status hint bits during visibility tests. +We must not set a transaction-committed hint bit on a relation page and +have that record make it to disk prior to the WAL record of the commit. +Since visibility tests are normally made while holding buffer share locks, +we do not have the option of changing the page's LSN to guarantee WAL +synchronization. Instead, we defer the setting of the hint bit if we have +not yet flushed WAL as far as the LSN associated with the transaction. +This requires tracking the LSN of each unflushed async commit. It is +convenient to associate this data with clog buffers: because we will flush +WAL before writing a clog page, we know that we do not need to remember a +transaction's LSN longer than the clog page holding its commit status +remains in memory. However, the naive approach of storing an LSN for each +clog position is unattractive: the LSNs are 32x bigger than the two-bit +commit status fields, and so we'd need 256K of additional shared memory for +each 8K clog buffer page. We choose instead to store a smaller number of +LSNs per page, where each LSN is the highest LSN associated with any +transaction commit in a contiguous range of transaction IDs on that page. +This saves storage at the price of some possibly-unnecessary delay in +setting transaction hint bits. + +How many transactions should share the same cached LSN (N)? 
If the +system's workload consists only of small async-commit transactions, then +it's reasonable to have N similar to the number of transactions per +walwriter cycle, since that is the granularity with which transactions will +become truly committed (and thus hintable) anyway. The worst case is where +a sync-commit xact shares a cached LSN with an async-commit xact that +commits a bit later; even though we paid to sync the first xact to disk, +we won't be able to hint its outputs until the second xact is sync'd, up to +three walwriter cycles later. This argues for keeping N (the group size) +as small as possible. For the moment we are setting the group size to 32, +which makes the LSN cache space the same size as the actual clog buffer +space (independently of BLCKSZ). + +It is useful that we can run both synchronous and asynchronous commit +transactions concurrently, but the safety of this is perhaps not +immediately obvious. Assume we have two transactions, T1 and T2. The Log +Sequence Number (LSN) is the point in the WAL sequence where a transaction +commit is recorded, so LSN1 and LSN2 are the commit records of those +transactions. If T2 can see changes made by T1 then when T2 commits it +must be true that LSN2 follows LSN1. Thus when T2 commits it is certain +that all of the changes made by T1 are also now recorded in the WAL. This +is true whether T1 was asynchronous or synchronous. As a result, it is +safe for asynchronous commits and synchronous commits to work concurrently +without endangering data written by synchronous commits. Sub-transactions +are not important here since the final write to disk only occurs at the +commit of the top level transaction. + +Changes to data blocks cannot reach disk unless WAL is flushed up to the +point of the LSN of the data blocks. Any attempt to write unsafe data to +disk will trigger a write which ensures the safety of all data written by +that and prior transactions. Data blocks and clog pages are both protected +by LSNs. + +Changes to a temp table are not WAL-logged, hence could reach disk in +advance of T1's commit, but we don't care since temp table contents don't +survive crashes anyway. + +Database writes that skip WAL for new relfilenodes are also safe. In these +cases it's entirely possible for the data to reach disk before T1's commit, +because T1 will fsync it down to disk without any sort of interlock. However, +all these paths are designed to write data that no other transaction can see +until after T1 commits. The situation is thus not different from ordinary +WAL-logged updates. + +Transaction Emulation during Recovery +------------------------------------- + +During Recovery we replay transaction changes in the order they occurred. +As part of this replay we emulate some transactional behaviour, so that +read only backends can take MVCC snapshots. We do this by maintaining a +list of XIDs belonging to transactions that are being replayed, so that +each transaction that has recorded WAL records for database writes exist +in the array until it commits. Further details are given in comments in +procarray.c. + +Many actions write no WAL records at all, for example read only transactions. +These have no effect on MVCC in recovery and we can pretend they never +occurred at all. Subtransaction commit does not write a WAL record either +and has very little effect, since lock waiters need to wait for the +parent transaction to complete. 
+ +Not all transactional behaviour is emulated, for example we do not insert +a transaction entry into the lock table, nor do we maintain the transaction +stack in memory. Clog, multixact and commit_ts entries are made normally. +Subtrans is maintained during recovery but the details of the transaction +tree are ignored and all subtransactions reference the top-level TransactionId +directly. Since commit is atomic this provides correct lock wait behaviour +yet simplifies emulation of subtransactions considerably. + +Further details on locking mechanics in recovery are given in comments +with the Lock rmgr code. diff --git a/src/backend/access/transam/README.parallel b/src/backend/access/transam/README.parallel new file mode 100644 index 0000000..99c588d --- /dev/null +++ b/src/backend/access/transam/README.parallel @@ -0,0 +1,237 @@ +Overview +======== + +PostgreSQL provides some simple facilities to make writing parallel algorithms +easier. Using a data structure called a ParallelContext, you can arrange to +launch background worker processes, initialize their state to match that of +the backend which initiated parallelism, communicate with them via dynamic +shared memory, and write reasonably complex code that can run either in the +user backend or in one of the parallel workers without needing to be aware of +where it's running. + +The backend which starts a parallel operation (hereafter, the initiating +backend) starts by creating a dynamic shared memory segment which will last +for the lifetime of the parallel operation. This dynamic shared memory segment +will contain (1) a shm_mq that can be used to transport errors (and other +messages reported via elog/ereport) from the worker back to the initiating +backend; (2) serialized representations of the initiating backend's private +state, so that the worker can synchronize its state with of the initiating +backend; and (3) any other data structures which a particular user of the +ParallelContext data structure may wish to add for its own purposes. Once +the initiating backend has initialized the dynamic shared memory segment, it +asks the postmaster to launch the appropriate number of parallel workers. +These workers then connect to the dynamic shared memory segment, initiate +their state, and then invoke the appropriate entrypoint, as further detailed +below. + +Error Reporting +=============== + +When started, each parallel worker begins by attaching the dynamic shared +memory segment and locating the shm_mq to be used for error reporting; it +redirects all of its protocol messages to this shm_mq. Prior to this point, +any failure of the background worker will not be reported to the initiating +backend; from the point of view of the initiating backend, the worker simply +failed to start. The initiating backend must anyway be prepared to cope +with fewer parallel workers than it originally requested, so catering to +this case imposes no additional burden. + +Whenever a new message (or partial message; very large messages may wrap) is +sent to the error-reporting queue, PROCSIG_PARALLEL_MESSAGE is sent to the +initiating backend. This causes the next CHECK_FOR_INTERRUPTS() in the +initiating backend to read and rethrow the message. For the most part, this +makes error reporting in parallel mode "just work". 
Of course, to work +properly, it is important that the code the initiating backend is executing +CHECK_FOR_INTERRUPTS() regularly and avoid blocking interrupt processing for +long periods of time, but those are good things to do anyway. + +(A currently-unsolved problem is that some messages may get written to the +system log twice, once in the backend where the report was originally +generated, and again when the initiating backend rethrows the message. If +we decide to suppress one of these reports, it should probably be second one; +otherwise, if the worker is for some reason unable to propagate the message +back to the initiating backend, the message will be lost altogether.) + +State Sharing +============= + +It's possible to write C code which works correctly without parallelism, but +which fails when parallelism is used. No parallel infrastructure can +completely eliminate this problem, because any global variable is a risk. +There's no general mechanism for ensuring that every global variable in the +worker will have the same value that it does in the initiating backend; even +if we could ensure that, some function we're calling could update the variable +after each call, and only the backend where that update is performed will see +the new value. Similar problems can arise with any more-complex data +structure we might choose to use. For example, a pseudo-random number +generator should, given a particular seed value, produce the same predictable +series of values every time. But it does this by relying on some private +state which won't automatically be shared between cooperating backends. A +parallel-safe PRNG would need to store its state in dynamic shared memory, and +would require locking. The parallelism infrastructure has no way of knowing +whether the user intends to call code that has this sort of problem, and can't +do anything about it anyway. + +Instead, we take a more pragmatic approach. First, we try to make as many of +the operations that are safe outside of parallel mode work correctly in +parallel mode as well. Second, we try to prohibit common unsafe operations +via suitable error checks. These checks are intended to catch 100% of +unsafe things that a user might do from the SQL interface, but code written +in C can do unsafe things that won't trigger these checks. The error checks +are engaged via EnterParallelMode(), which should be called before creating +a parallel context, and disarmed via ExitParallelMode(), which should be +called after all parallel contexts have been destroyed. The most +significant restriction imposed by parallel mode is that all operations must +be strictly read-only; we allow no writes to the database and no DDL. We +might try to relax these restrictions in the future. + +To make as many operations as possible safe in parallel mode, we try to copy +the most important pieces of state from the initiating backend to each parallel +worker. This includes: + + - The set of libraries dynamically loaded by dfmgr.c. + + - The authenticated user ID and current database. Each parallel worker + will connect to the same database as the initiating backend, using the + same user ID. + + - The values of all GUCs. Accordingly, permanent changes to the value of + any GUC are forbidden while in parallel mode; but temporary changes, + such as entering a function with non-NULL proconfig, are OK. 
+ + - The current subtransaction's XID, the top-level transaction's XID, and + the list of XIDs considered current (that is, they are in-progress or + subcommitted). This information is needed to ensure that tuple visibility + checks return the same results in the worker as they do in the + initiating backend. See also the section Transaction Integration, below. + + - The combo CID mappings. This is needed to ensure consistent answers to + tuple visibility checks. The need to synchronize this data structure is + a major reason why we can't support writes in parallel mode: such writes + might create new combo CIDs, and we have no way to let other workers + (or the initiating backend) know about them. + + - The transaction snapshot. + + - The active snapshot, which might be different from the transaction + snapshot. + + - The currently active user ID and security context. Note that this is + the fourth user ID we restore: the initial step of binding to the correct + database also involves restoring the authenticated user ID. When GUC + values are restored, this incidentally sets SessionUserId and OuterUserId + to the correct values. This final step restores CurrentUserId. + + - State related to pending REINDEX operations, which prevents access to + an index that is currently being rebuilt. + + - Active relmapper.c mapping state. This is needed to allow consistent + answers when fetching the current relfilenode for relation oids of + mapped relations. + +To prevent unprincipled deadlocks when running in parallel mode, this code +also arranges for the leader and all workers to participate in group +locking. See src/backend/storage/lmgr/README for more details. + +Transaction Integration +======================= + +Regardless of what the TransactionState stack looks like in the parallel +leader, each parallel worker ends up with a stack of depth 1. This stack +entry is marked with the special transaction block state +TBLOCK_PARALLEL_INPROGRESS so that it's not confused with an ordinary +toplevel transaction. The XID of this TransactionState is set to the XID of +the innermost currently-active subtransaction in the initiating backend. The +initiating backend's toplevel XID, and the XIDs of all current (in-progress +or subcommitted) XIDs are stored separately from the TransactionState stack, +but in such a way that GetTopTransactionId(), GetTopTransactionIdIfAny(), and +TransactionIdIsCurrentTransactionId() return the same values that they would +in the initiating backend. We could copy the entire transaction state stack, +but most of it would be useless: for example, you can't roll back to a +savepoint from within a parallel worker, and there are no resources to +associated with the memory contexts or resource owners of intermediate +subtransactions. + +No meaningful change to the transaction state can be made while in parallel +mode. No XIDs can be assigned, and no subtransactions can start or end, +because we have no way of communicating these state changes to cooperating +backends, or of synchronizing them. It's clearly unworkable for the initiating +backend to exit any transaction or subtransaction that was in progress when +parallelism was started before all parallel workers have exited; and it's even +more clearly crazy for a parallel worker to try to subcommit or subabort the +current subtransaction and execute in some other transaction context than was +present in the initiating backend. It might be practical to allow internal +sub-transactions (e.g. 
to implement a PL/pgSQL EXCEPTION block) to be used in +parallel mode, provided that they are XID-less, because other backends +wouldn't really need to know about those transactions or do anything +differently because of them. Right now, we don't even allow that. + +At the end of a parallel operation, which can happen either because it +completed successfully or because it was interrupted by an error, parallel +workers associated with that operation exit. In the error case, transaction +abort processing in the parallel leader kills off any remaining workers, and +the parallel leader then waits for them to die. In the case of a successful +parallel operation, the parallel leader does not send any signals, but must +wait for workers to complete and exit of their own volition. In either +case, it is very important that all workers actually exit before the +parallel leader cleans up the (sub)transaction in which they were created; +otherwise, chaos can ensue. For example, if the leader is rolling back the +transaction that created the relation being scanned by a worker, the +relation could disappear while the worker is still busy scanning it. That's +not safe. + +Generally, the cleanup performed by each worker at this point is similar to +top-level commit or abort. Each backend has its own resource owners: buffer +pins, catcache or relcache reference counts, tuple descriptors, and so on +are managed separately by each backend, and must free them before exiting. +There are, however, some important differences between parallel worker +commit or abort and a real top-level transaction commit or abort. Most +importantly: + + - No commit or abort record is written; the initiating backend is + responsible for this. + + - Cleanup of pg_temp namespaces is not done. Parallel workers cannot + safely access the initiating backend's pg_temp namespace, and should + not create one of their own. + +Coding Conventions +=================== + +Before beginning any parallel operation, call EnterParallelMode(); after all +parallel operations are completed, call ExitParallelMode(). To actually +parallelize a particular operation, use a ParallelContext. The basic coding +pattern looks like this: + + EnterParallelMode(); /* prohibit unsafe state changes */ + + pcxt = CreateParallelContext("library_name", "function_name", nworkers); + + /* Allow space for application-specific data here. */ + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, keys); + + InitializeParallelDSM(pcxt); /* create DSM and copy state to it */ + + /* Store the data for which we reserved space. */ + space = shm_toc_allocate(pcxt->toc, size); + shm_toc_insert(pcxt->toc, key, space); + + LaunchParallelWorkers(pcxt); + + /* do parallel stuff */ + + WaitForParallelWorkersToFinish(pcxt); + + /* read any final results from dynamic shared memory */ + + DestroyParallelContext(pcxt); + + ExitParallelMode(); + +If desired, after WaitForParallelWorkersToFinish() has been called, the +context can be reset so that workers can be launched anew using the same +parallel context. To do this, first call ReinitializeParallelDSM() to +reinitialize state managed by the parallel context machinery itself; then, +perform any other necessary resetting of state; after that, you can again +call LaunchParallelWorkers. 
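As an illustrative extension of the pattern above (an editor's sketch rather
than code from the tree), relaunching workers with the same context might look
like this; the pass count and the application-specific reset step are
hypothetical:

    for (i = 0; i < npasses; i++)
    {
        if (i > 0)
        {
            /* reset state managed by the parallel context machinery */
            ReinitializeParallelDSM(pcxt);
            /* then reset any application-specific state in the DSM */
        }

        LaunchParallelWorkers(pcxt);

        /* do parallel stuff for this pass */

        WaitForParallelWorkersToFinish(pcxt);

        /* read this pass's results from dynamic shared memory */
    }

    DestroyParallelContext(pcxt);

    ExitParallelMode();

Reusing the context this way means the estimator and DSM setup steps shown
earlier run only once, before the first pass.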
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c new file mode 100644 index 0000000..3d9088a --- /dev/null +++ b/src/backend/access/transam/clog.c @@ -0,0 +1,1030 @@ +/*------------------------------------------------------------------------- + * + * clog.c + * PostgreSQL transaction-commit-log manager + * + * This module replaces the old "pg_log" access code, which treated pg_log + * essentially like a relation, in that it went through the regular buffer + * manager. The problem with that was that there wasn't any good way to + * recycle storage space for transactions so old that they'll never be + * looked up again. Now we use specialized access code so that the commit + * log can be broken into relatively small, independent segments. + * + * XLOG interactions: this module generates an XLOG record whenever a new + * CLOG page is initialized to zeroes. Other writes of CLOG come from + * recording of transaction commit or abort in xact.c, which generates its + * own XLOG records for these events and will re-perform the status update + * on redo; so we need make no additional XLOG entry here. For synchronous + * transaction commits, the XLOG is guaranteed flushed through the XLOG commit + * record before we are called to log a commit, so the WAL rule "write xlog + * before data" is satisfied automatically. However, for async commits we + * must track the latest LSN affecting each CLOG page, so that we can flush + * XLOG that far and satisfy the WAL rule. We don't have to worry about this + * for aborts (whether sync or async), since the post-crash assumption would + * be that such transactions failed anyway. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/clog.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/clog.h" +#include "access/slru.h" +#include "access/transam.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "storage/proc.h" +#include "storage/sync.h" + +/* + * Defines for CLOG page sizes. A page is the same BLCKSZ as is used + * everywhere else in Postgres. + * + * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, + * CLOG page numbering also wraps around at 0xFFFFFFFF/CLOG_XACTS_PER_PAGE, + * and CLOG segment numbering at + * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no + * explicit notice of that fact in this module, except when comparing segment + * and page numbers in TruncateCLOG (see CLOGPagePrecedes). 
+ */ + +/* We need two bits per xact, so four xacts fit in a byte */ +#define CLOG_BITS_PER_XACT 2 +#define CLOG_XACTS_PER_BYTE 4 +#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE) +#define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1) + +#define TransactionIdToPage(xid) ((xid) / (TransactionId) CLOG_XACTS_PER_PAGE) +#define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) +#define TransactionIdToByte(xid) (TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE) +#define TransactionIdToBIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_BYTE) + +/* We store the latest async LSN for each group of transactions */ +#define CLOG_XACTS_PER_LSN_GROUP 32 /* keep this a power of 2 */ +#define CLOG_LSNS_PER_PAGE (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP) + +#define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \ + ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP) + +/* + * The number of subtransactions below which we consider to apply clog group + * update optimization. Testing reveals that the number higher than this can + * hurt performance. + */ +#define THRESHOLD_SUBTRANS_CLOG_OPT 5 + +/* + * Link to shared-memory data structures for CLOG control + */ +static SlruCtlData XactCtlData; + +#define XactCtl (&XactCtlData) + + +static int ZeroCLOGPage(int pageno, bool writeXlog); +static bool CLOGPagePrecedes(int page1, int page2); +static void WriteZeroPageXlogRec(int pageno); +static void WriteTruncateXlogRec(int pageno, TransactionId oldestXact, + Oid oldestXactDb); +static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, XidStatus status, + XLogRecPtr lsn, int pageno, + bool all_xact_same_page); +static void TransactionIdSetStatusBit(TransactionId xid, XidStatus status, + XLogRecPtr lsn, int slotno); +static void set_status_by_pages(int nsubxids, TransactionId *subxids, + XidStatus status, XLogRecPtr lsn); +static bool TransactionGroupUpdateXidStatus(TransactionId xid, + XidStatus status, XLogRecPtr lsn, int pageno); +static void TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids, + TransactionId *subxids, XidStatus status, + XLogRecPtr lsn, int pageno); + + +/* + * TransactionIdSetTreeStatus + * + * Record the final state of transaction entries in the commit log for + * a transaction and its subtransaction tree. Take care to ensure this is + * efficient, and as atomic as possible. + * + * xid is a single xid to set status for. This will typically be + * the top level transactionid for a top level commit or abort. It can + * also be a subtransaction when we record transaction aborts. + * + * subxids is an array of xids of length nsubxids, representing subtransactions + * in the tree of xid. In various cases nsubxids may be zero. + * + * lsn must be the WAL location of the commit record when recording an async + * commit. For a synchronous commit it can be InvalidXLogRecPtr, since the + * caller guarantees the commit record is already flushed in that case. It + * should be InvalidXLogRecPtr for abort cases, too. + * + * In the commit case, atomicity is limited by whether all the subxids are in + * the same CLOG page as xid. If they all are, then the lock will be grabbed + * only once, and the status will be set to committed directly. Otherwise + * we must + * 1. set sub-committed all subxids that are not on the same page as the + * main xid + * 2. atomically set committed the main xid and the subxids on the same page + * 3. 
go over the first bunch again and set them committed + * Note that as far as concurrent checkers are concerned, main transaction + * commit as a whole is still atomic. + * + * Example: + * TransactionId t commits and has subxids t1, t2, t3, t4 + * t is on page p1, t1 is also on p1, t2 and t3 are on p2, t4 is on p3 + * 1. update pages2-3: + * page2: set t2,t3 as sub-committed + * page3: set t4 as sub-committed + * 2. update page1: + * set t1 as sub-committed, + * then set t as committed, + then set t1 as committed + * 3. update pages2-3: + * page2: set t2,t3 as committed + * page3: set t4 as committed + * + * NB: this is a low-level routine and is NOT the preferred entry point + * for most uses; functions in transam.c are the intended callers. + * + * XXX Think about issuing POSIX_FADV_WILLNEED on pages that we will need, + * but aren't yet in cache, as well as hinting pages not to fall out of + * cache yet. + */ +void +TransactionIdSetTreeStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, XidStatus status, XLogRecPtr lsn) +{ + int pageno = TransactionIdToPage(xid); /* get page of parent */ + int i; + + Assert(status == TRANSACTION_STATUS_COMMITTED || + status == TRANSACTION_STATUS_ABORTED); + + /* + * See how many subxids, if any, are on the same page as the parent, if + * any. + */ + for (i = 0; i < nsubxids; i++) + { + if (TransactionIdToPage(subxids[i]) != pageno) + break; + } + + /* + * Do all items fit on a single page? + */ + if (i == nsubxids) + { + /* + * Set the parent and all subtransactions in a single call + */ + TransactionIdSetPageStatus(xid, nsubxids, subxids, status, lsn, + pageno, true); + } + else + { + int nsubxids_on_first_page = i; + + /* + * If this is a commit then we care about doing this correctly (i.e. + * using the subcommitted intermediate status). By here, we know + * we're updating more than one page of clog, so we must mark entries + * that are *not* on the first page so that they show as subcommitted + * before we then return to update the status to fully committed. + * + * To avoid touching the first page twice, skip marking subcommitted + * for the subxids on that first page. + */ + if (status == TRANSACTION_STATUS_COMMITTED) + set_status_by_pages(nsubxids - nsubxids_on_first_page, + subxids + nsubxids_on_first_page, + TRANSACTION_STATUS_SUB_COMMITTED, lsn); + + /* + * Now set the parent and subtransactions on same page as the parent, + * if any + */ + pageno = TransactionIdToPage(xid); + TransactionIdSetPageStatus(xid, nsubxids_on_first_page, subxids, status, + lsn, pageno, false); + + /* + * Now work through the rest of the subxids one clog page at a time, + * starting from the second page onwards, like we did above. + */ + set_status_by_pages(nsubxids - nsubxids_on_first_page, + subxids + nsubxids_on_first_page, + status, lsn); + } +} + +/* + * Helper for TransactionIdSetTreeStatus: set the status for a bunch of + * transactions, chunking in the separate CLOG pages involved. We never + * pass the whole transaction tree to this function, only subtransactions + * that are on different pages to the top level transaction id. 
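A standalone illustration of the page chunking that the function below performs
(a minimal sketch assuming the default BLCKSZ of 8192, so CLOG_XACTS_PER_PAGE
is 32768; the XID values are made up):

    #include <stdio.h>
    #include <stdint.h>

    #define CLOG_XACTS_PER_PAGE 32768  /* BLCKSZ (8192) * CLOG_XACTS_PER_BYTE (4) */

    static uint32_t xid_to_page(uint32_t xid)
    {
        return xid / CLOG_XACTS_PER_PAGE;
    }

    int main(void)
    {
        /* hypothetical, page-ordered subtransaction XIDs */
        uint32_t subxids[] = {32760, 32765, 32770, 32790, 98400};
        int      nsubxids = 5;
        int      i = 0;

        while (i < nsubxids)
        {
            uint32_t pageno = xid_to_page(subxids[i]);
            int      start = i;

            /* advance over every subxid that falls on the same CLOG page */
            while (i < nsubxids && xid_to_page(subxids[i]) == pageno)
                i++;

            printf("page %u: %d xid(s) starting at subxids[%d]\n",
                   pageno, i - start, start);
        }
        return 0;
    }

Each group printed above corresponds to one TransactionIdSetPageStatus() call
in the loop below.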
+ */ +static void +set_status_by_pages(int nsubxids, TransactionId *subxids, + XidStatus status, XLogRecPtr lsn) +{ + int pageno = TransactionIdToPage(subxids[0]); + int offset = 0; + int i = 0; + + Assert(nsubxids > 0); /* else the pageno fetch above is unsafe */ + + while (i < nsubxids) + { + int num_on_page = 0; + int nextpageno; + + do + { + nextpageno = TransactionIdToPage(subxids[i]); + if (nextpageno != pageno) + break; + num_on_page++; + i++; + } while (i < nsubxids); + + TransactionIdSetPageStatus(InvalidTransactionId, + num_on_page, subxids + offset, + status, lsn, pageno, false); + offset = i; + pageno = nextpageno; + } +} + +/* + * Record the final state of transaction entries in the commit log for all + * entries on a single page. Atomic only on this page. + */ +static void +TransactionIdSetPageStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, XidStatus status, + XLogRecPtr lsn, int pageno, + bool all_xact_same_page) +{ + /* Can't use group update when PGPROC overflows. */ + StaticAssertStmt(THRESHOLD_SUBTRANS_CLOG_OPT <= PGPROC_MAX_CACHED_SUBXIDS, + "group clog threshold less than PGPROC cached subxids"); + + /* + * When there is contention on XactSLRULock, we try to group multiple + * updates; a single leader process will perform transaction status + * updates for multiple backends so that the number of times XactSLRULock + * needs to be acquired is reduced. + * + * For this optimization to be safe, the XID and subxids in MyProc must be + * the same as the ones for which we're setting the status. Check that + * this is the case. + * + * For this optimization to be efficient, we shouldn't have too many + * sub-XIDs and all of the XIDs for which we're adjusting clog should be + * on the same page. Check those conditions, too. + */ + if (all_xact_same_page && xid == MyProc->xid && + nsubxids <= THRESHOLD_SUBTRANS_CLOG_OPT && + nsubxids == MyProc->subxidStatus.count && + (nsubxids == 0 || + memcmp(subxids, MyProc->subxids.xids, + nsubxids * sizeof(TransactionId)) == 0)) + { + /* + * If we can immediately acquire XactSLRULock, we update the status of + * our own XID and release the lock. If not, try use group XID + * update. If that doesn't work out, fall back to waiting for the + * lock to perform an update for this transaction only. + */ + if (LWLockConditionalAcquire(XactSLRULock, LW_EXCLUSIVE)) + { + /* Got the lock without waiting! Do the update. */ + TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status, + lsn, pageno); + LWLockRelease(XactSLRULock); + return; + } + else if (TransactionGroupUpdateXidStatus(xid, status, lsn, pageno)) + { + /* Group update mechanism has done the work. */ + return; + } + + /* Fall through only if update isn't done yet. */ + } + + /* Group update not applicable, or couldn't accept this page number. */ + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status, + lsn, pageno); + LWLockRelease(XactSLRULock); +} + +/* + * Record the final state of transaction entry in the commit log + * + * We don't do any locking here; caller must handle that. 
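When the group-update preconditions hold, TransactionIdSetPageStatus above
first tries to take XactSLRULock without waiting, then falls back to the
group-update path, and finally blocks on the lock. A plain-pthreads analogue of
just that decision structure (an editor's sketch: try_group_update() is a
made-up stand-in for TransactionGroupUpdateXidStatus(), and none of the
server's locking primitives are involved):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t slru_lock = PTHREAD_MUTEX_INITIALIZER;

    /* stand-in for the group-update path; always declines in this sketch */
    static bool try_group_update(void)
    {
        return false;
    }

    static void set_status(unsigned xid)
    {
        if (pthread_mutex_trylock(&slru_lock) == 0)
        {
            /* got the lock without waiting: do the update ourselves */
            printf("xid %u: direct update\n", xid);
            pthread_mutex_unlock(&slru_lock);
            return;
        }

        if (try_group_update())
            return;             /* a leader did the work for us */

        /* fall back to waiting for the lock and updating alone */
        pthread_mutex_lock(&slru_lock);
        printf("xid %u: update after waiting\n", xid);
        pthread_mutex_unlock(&slru_lock);
    }

    int main(void)
    {
        set_status(1000);
        return 0;
    }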
+ */ +static void +TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids, + TransactionId *subxids, XidStatus status, + XLogRecPtr lsn, int pageno) +{ + int slotno; + int i; + + Assert(status == TRANSACTION_STATUS_COMMITTED || + status == TRANSACTION_STATUS_ABORTED || + (status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid(xid))); + Assert(LWLockHeldByMeInMode(XactSLRULock, LW_EXCLUSIVE)); + + /* + * If we're doing an async commit (ie, lsn is valid), then we must wait + * for any active write on the page slot to complete. Otherwise our + * update could reach disk in that write, which will not do since we + * mustn't let it reach disk until we've done the appropriate WAL flush. + * But when lsn is invalid, it's OK to scribble on a page while it is + * write-busy, since we don't care if the update reaches disk sooner than + * we think. + */ + slotno = SimpleLruReadPage(XactCtl, pageno, XLogRecPtrIsInvalid(lsn), xid); + + /* + * Set the main transaction id, if any. + * + * If we update more than one xid on this page while it is being written + * out, we might find that some of the bits go to disk and others don't. + * If we are updating commits on the page with the top-level xid that + * could break atomicity, so we subcommit the subxids first before we mark + * the top-level commit. + */ + if (TransactionIdIsValid(xid)) + { + /* Subtransactions first, if needed ... */ + if (status == TRANSACTION_STATUS_COMMITTED) + { + for (i = 0; i < nsubxids; i++) + { + Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i])); + TransactionIdSetStatusBit(subxids[i], + TRANSACTION_STATUS_SUB_COMMITTED, + lsn, slotno); + } + } + + /* ... then the main transaction */ + TransactionIdSetStatusBit(xid, status, lsn, slotno); + } + + /* Set the subtransactions */ + for (i = 0; i < nsubxids; i++) + { + Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i])); + TransactionIdSetStatusBit(subxids[i], status, lsn, slotno); + } + + XactCtl->shared->page_dirty[slotno] = true; +} + +/* + * When we cannot immediately acquire XactSLRULock in exclusive mode at + * commit time, add ourselves to a list of processes that need their XIDs + * status update. The first process to add itself to the list will acquire + * XactSLRULock in exclusive mode and set transaction status as required + * on behalf of all group members. This avoids a great deal of contention + * around XactSLRULock when many processes are trying to commit at once, + * since the lock need not be repeatedly handed off from one committing + * process to the next. + * + * Returns true when transaction status has been updated in clog; returns + * false if we decided against applying the optimization because the page + * number we need to update differs from those processes already waiting. + */ +static bool +TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, + XLogRecPtr lsn, int pageno) +{ + volatile PROC_HDR *procglobal = ProcGlobal; + PGPROC *proc = MyProc; + uint32 nextidx; + uint32 wakeidx; + + /* We should definitely have an XID whose status needs to be updated. */ + Assert(TransactionIdIsValid(xid)); + + /* + * Add ourselves to the list of processes needing a group XID status + * update. 
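The list push in the loop that follows is a compare-and-swap retry over an
index-based linked list. A self-contained C11 illustration of that idiom (an
editor's sketch with made-up slot numbers; it is not the server's PGPROC
machinery):

    #include <stdatomic.h>
    #include <stdio.h>

    #define INVALID_SLOT (-1)

    /* tiny stand-in for the proc array: each slot carries a "next" link */
    static int          next_link[8];
    static _Atomic int  list_head = INVALID_SLOT;

    /* push a slot onto the front of the list without taking any lock */
    static void push_slot(int slot)
    {
        int head = atomic_load(&list_head);

        for (;;)
        {
            next_link[slot] = head;     /* publish our view of the old head */
            if (atomic_compare_exchange_weak(&list_head, &head, slot))
                break;                  /* success: we are the new head */
            /* on failure, head now holds the current value; retry */
        }
    }

    int main(void)
    {
        push_slot(3);
        push_slot(5);

        for (int s = atomic_load(&list_head); s != INVALID_SLOT; s = next_link[s])
            printf("slot %d\n", s);     /* prints 5, then 3 */
        return 0;
    }

The real code additionally declines (returning false) when the page it needs to
update differs from the page the current group is updating, as described below.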
+ */ + proc->clogGroupMember = true; + proc->clogGroupMemberXid = xid; + proc->clogGroupMemberXidStatus = status; + proc->clogGroupMemberPage = pageno; + proc->clogGroupMemberLsn = lsn; + + nextidx = pg_atomic_read_u32(&procglobal->clogGroupFirst); + + while (true) + { + /* + * Add the proc to list, if the clog page where we need to update the + * current transaction status is same as group leader's clog page. + * + * There is a race condition here, which is that after doing the below + * check and before adding this proc's clog update to a group, the + * group leader might have already finished the group update for this + * page and becomes group leader of another group. This will lead to a + * situation where a single group can have different clog page + * updates. This isn't likely and will still work, just maybe a bit + * less efficiently. + */ + if (nextidx != INVALID_PGPROCNO && + ProcGlobal->allProcs[nextidx].clogGroupMemberPage != proc->clogGroupMemberPage) + { + /* + * Ensure that this proc is not a member of any clog group that + * needs an XID status update. + */ + proc->clogGroupMember = false; + pg_atomic_write_u32(&proc->clogGroupNext, INVALID_PGPROCNO); + return false; + } + + pg_atomic_write_u32(&proc->clogGroupNext, nextidx); + + if (pg_atomic_compare_exchange_u32(&procglobal->clogGroupFirst, + &nextidx, + (uint32) proc->pgprocno)) + break; + } + + /* + * If the list was not empty, the leader will update the status of our + * XID. It is impossible to have followers without a leader because the + * first process that has added itself to the list will always have + * nextidx as INVALID_PGPROCNO. + */ + if (nextidx != INVALID_PGPROCNO) + { + int extraWaits = 0; + + /* Sleep until the leader updates our XID status. */ + pgstat_report_wait_start(WAIT_EVENT_XACT_GROUP_UPDATE); + for (;;) + { + /* acts as a read barrier */ + PGSemaphoreLock(proc->sem); + if (!proc->clogGroupMember) + break; + extraWaits++; + } + pgstat_report_wait_end(); + + Assert(pg_atomic_read_u32(&proc->clogGroupNext) == INVALID_PGPROCNO); + + /* Fix semaphore count for any absorbed wakeups */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(proc->sem); + return true; + } + + /* We are the leader. Acquire the lock on behalf of everyone. */ + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + + /* + * Now that we've got the lock, clear the list of processes waiting for + * group XID status update, saving a pointer to the head of the list. + * Trying to pop elements one at a time could lead to an ABA problem. + */ + nextidx = pg_atomic_exchange_u32(&procglobal->clogGroupFirst, + INVALID_PGPROCNO); + + /* Remember head of list so we can perform wakeups after dropping lock. */ + wakeidx = nextidx; + + /* Walk the list and update the status of all XIDs. */ + while (nextidx != INVALID_PGPROCNO) + { + PGPROC *proc = &ProcGlobal->allProcs[nextidx]; + + /* + * Transactions with more than THRESHOLD_SUBTRANS_CLOG_OPT sub-XIDs + * should not use group XID status update mechanism. + */ + Assert(proc->subxidStatus.count <= THRESHOLD_SUBTRANS_CLOG_OPT); + + TransactionIdSetPageStatusInternal(proc->clogGroupMemberXid, + proc->subxidStatus.count, + proc->subxids.xids, + proc->clogGroupMemberXidStatus, + proc->clogGroupMemberLsn, + proc->clogGroupMemberPage); + + /* Move to next proc in list. */ + nextidx = pg_atomic_read_u32(&proc->clogGroupNext); + } + + /* We're done with the lock now. */ + LWLockRelease(XactSLRULock); + + /* + * Now that we've released the lock, go back and wake everybody up. 
We + * don't do this under the lock so as to keep lock hold times to a + * minimum. + */ + while (wakeidx != INVALID_PGPROCNO) + { + PGPROC *proc = &ProcGlobal->allProcs[wakeidx]; + + wakeidx = pg_atomic_read_u32(&proc->clogGroupNext); + pg_atomic_write_u32(&proc->clogGroupNext, INVALID_PGPROCNO); + + /* ensure all previous writes are visible before follower continues. */ + pg_write_barrier(); + + proc->clogGroupMember = false; + + if (proc != MyProc) + PGSemaphoreUnlock(proc->sem); + } + + return true; +} + +/* + * Sets the commit status of a single transaction. + * + * Must be called with XactSLRULock held + */ +static void +TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, int slotno) +{ + int byteno = TransactionIdToByte(xid); + int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; + char *byteptr; + char byteval; + char curval; + + byteptr = XactCtl->shared->page_buffer[slotno] + byteno; + curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK; + + /* + * When replaying transactions during recovery we still need to perform + * the two phases of subcommit and then commit. However, some transactions + * are already correctly marked, so we just treat those as a no-op which + * allows us to keep the following Assert as restrictive as possible. + */ + if (InRecovery && status == TRANSACTION_STATUS_SUB_COMMITTED && + curval == TRANSACTION_STATUS_COMMITTED) + return; + + /* + * Current state change should be from 0 or subcommitted to target state + * or we should already be there when replaying changes during recovery. + */ + Assert(curval == 0 || + (curval == TRANSACTION_STATUS_SUB_COMMITTED && + status != TRANSACTION_STATUS_IN_PROGRESS) || + curval == status); + + /* note this assumes exclusive access to the clog page */ + byteval = *byteptr; + byteval &= ~(((1 << CLOG_BITS_PER_XACT) - 1) << bshift); + byteval |= (status << bshift); + *byteptr = byteval; + + /* + * Update the group LSN if the transaction completion LSN is higher. + * + * Note: lsn will be invalid when supplied during InRecovery processing, + * so we don't need to do anything special to avoid LSN updates during + * recovery. After recovery completes the next clog change will set the + * LSN correctly. + */ + if (!XLogRecPtrIsInvalid(lsn)) + { + int lsnindex = GetLSNIndex(slotno, xid); + + if (XactCtl->shared->group_lsn[lsnindex] < lsn) + XactCtl->shared->group_lsn[lsnindex] = lsn; + } +} + +/* + * Interrogate the state of a transaction in the commit log. + * + * Aside from the actual commit status, this function returns (into *lsn) + * an LSN that is late enough to be able to guarantee that if we flush up to + * that LSN then we will have flushed the transaction's commit record to disk. + * The result is not necessarily the exact LSN of the transaction's commit + * record! For example, for long-past transactions (those whose clog pages + * already migrated to disk), we'll return InvalidXLogRecPtr. Also, because + * we group transactions on the same clog page to conserve storage, we might + * return the LSN of a later transaction that falls into the same group. + * + * NB: this is a low-level routine and is NOT the preferred entry point + * for most uses; TransactionLogFetch() in transam.c is the intended caller. 
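The byte and bit-shift arithmetic used by TransactionIdSetStatusBit above and
TransactionIdGetStatus below can be exercised outside the server. A minimal
standalone sketch, assuming the default BLCKSZ of 8192 and using 0x01 as a
stand-in status value (the real status codes are defined in clog.h and are not
part of this excerpt):

    #include <stdio.h>
    #include <stdint.h>

    #define CLOG_BITS_PER_XACT   2
    #define CLOG_XACTS_PER_BYTE  4
    #define CLOG_XACTS_PER_PAGE  32768              /* BLCKSZ (8192) * 4 */
    #define CLOG_XACT_BITMASK    ((1 << CLOG_BITS_PER_XACT) - 1)

    int main(void)
    {
        uint32_t xid = 1000003;                     /* arbitrary example XID */
        uint32_t pageno = xid / CLOG_XACTS_PER_PAGE;
        uint32_t byteno = (xid % CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_BYTE;
        uint32_t bshift = (xid % CLOG_XACTS_PER_BYTE) * CLOG_BITS_PER_XACT;

        unsigned char page[8192] = {0};             /* one zeroed CLOG page */
        unsigned char status = 0x01;                /* stand-in for "committed" */
        unsigned char readback;

        /* set: clear the xact's two bits, then OR in the new status */
        page[byteno] &= ~(CLOG_XACT_BITMASK << bshift);
        page[byteno] |= status << bshift;

        /* get: shift the byte down and mask off the two bits */
        readback = (page[byteno] >> bshift) & CLOG_XACT_BITMASK;

        printf("xid %u -> page %u, byte %u, shift %u, status read back %u\n",
               xid, pageno, byteno, bshift, readback);
        return 0;
    }

With these numbers the XID lands on page 30, byte 4240, bit shift 6, so four
transactions share each byte exactly as the defines earlier in this file lay
out.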
+ */ +XidStatus +TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn) +{ + int pageno = TransactionIdToPage(xid); + int byteno = TransactionIdToByte(xid); + int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; + int slotno; + int lsnindex; + char *byteptr; + XidStatus status; + + /* lock is acquired by SimpleLruReadPage_ReadOnly */ + + slotno = SimpleLruReadPage_ReadOnly(XactCtl, pageno, xid); + byteptr = XactCtl->shared->page_buffer[slotno] + byteno; + + status = (*byteptr >> bshift) & CLOG_XACT_BITMASK; + + lsnindex = GetLSNIndex(slotno, xid); + *lsn = XactCtl->shared->group_lsn[lsnindex]; + + LWLockRelease(XactSLRULock); + + return status; +} + +/* + * Number of shared CLOG buffers. + * + * On larger multi-processor systems, it is possible to have many CLOG page + * requests in flight at one time which could lead to disk access for CLOG + * page if the required page is not found in memory. Testing revealed that we + * can get the best performance by having 128 CLOG buffers, more than that it + * doesn't improve performance. + * + * Unconditionally keeping the number of CLOG buffers to 128 did not seem like + * a good idea, because it would increase the minimum amount of shared memory + * required to start, which could be a problem for people running very small + * configurations. The following formula seems to represent a reasonable + * compromise: people with very low values for shared_buffers will get fewer + * CLOG buffers as well, and everyone else will get 128. + */ +Size +CLOGShmemBuffers(void) +{ + return Min(128, Max(4, NBuffers / 512)); +} + +/* + * Initialization of shared memory for CLOG + */ +Size +CLOGShmemSize(void) +{ + return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE); +} + +void +CLOGShmemInit(void) +{ + XactCtl->PagePrecedes = CLOGPagePrecedes; + SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE, + XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER, + SYNC_HANDLER_CLOG); + SlruPagePrecedesUnitTests(XactCtl, CLOG_XACTS_PER_PAGE); +} + +/* + * This func must be called ONCE on system install. It creates + * the initial CLOG segment. (The CLOG directory is assumed to + * have been created by initdb, and CLOGShmemInit must have been + * called already.) + */ +void +BootStrapCLOG(void) +{ + int slotno; + + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + + /* Create and zero the first page of the commit log */ + slotno = ZeroCLOGPage(0, false); + + /* Make sure it's written out */ + SimpleLruWritePage(XactCtl, slotno); + Assert(!XactCtl->shared->page_dirty[slotno]); + + LWLockRelease(XactSLRULock); +} + +/* + * Initialize (or reinitialize) a page of CLOG to zeroes. + * If writeXlog is true, also emit an XLOG record saying we did this. + * + * The page is not actually written, just set up in shared memory. + * The slot number of the new page is returned. + * + * Control lock must be held at entry, and will be held at exit. + */ +static int +ZeroCLOGPage(int pageno, bool writeXlog) +{ + int slotno; + + slotno = SimpleLruZeroPage(XactCtl, pageno); + + if (writeXlog) + WriteZeroPageXlogRec(pageno); + + return slotno; +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup, + * after StartupXLOG has initialized ShmemVariableCache->nextXid. + */ +void +StartupCLOG(void) +{ + TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + int pageno = TransactionIdToPage(xid); + + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + + /* + * Initialize our idea of the latest page number. 
+ */ + XactCtl->shared->latest_page_number = pageno; + + LWLockRelease(XactSLRULock); +} + +/* + * This must be called ONCE at the end of startup/recovery. + */ +void +TrimCLOG(void) +{ + TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + int pageno = TransactionIdToPage(xid); + + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + + /* + * Zero out the remainder of the current clog page. Under normal + * circumstances it should be zeroes already, but it seems at least + * theoretically possible that XLOG replay will have settled on a nextXID + * value that is less than the last XID actually used and marked by the + * previous database lifecycle (since subtransaction commit writes clog + * but makes no WAL entry). Let's just be safe. (We need not worry about + * pages beyond the current one, since those will be zeroed when first + * used. For the same reason, there is no need to do anything when + * nextXid is exactly at a page boundary; and it's likely that the + * "current" page doesn't exist yet in that case.) + */ + if (TransactionIdToPgIndex(xid) != 0) + { + int byteno = TransactionIdToByte(xid); + int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; + int slotno; + char *byteptr; + + slotno = SimpleLruReadPage(XactCtl, pageno, false, xid); + byteptr = XactCtl->shared->page_buffer[slotno] + byteno; + + /* Zero so-far-unused positions in the current byte */ + *byteptr &= (1 << bshift) - 1; + /* Zero the rest of the page */ + MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1); + + XactCtl->shared->page_dirty[slotno] = true; + } + + LWLockRelease(XactSLRULock); +} + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + */ +void +CheckPointCLOG(void) +{ + /* + * Write dirty CLOG pages to disk. This may result in sync requests + * queued for later handling by ProcessSyncRequests(), as part of the + * checkpoint. + */ + TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true); + SimpleLruWriteAll(XactCtl, true); + TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true); +} + + +/* + * Make sure that CLOG has room for a newly-allocated XID. + * + * NB: this is called while holding XidGenLock. We want it to be very fast + * most of the time; even when it's not so fast, no actual I/O need happen + * unless we're forced to write out a dirty clog or xlog page to make room + * in shared memory. + */ +void +ExtendCLOG(TransactionId newestXact) +{ + int pageno; + + /* + * No work except at first XID of a page. But beware: just after + * wraparound, the first XID of page zero is FirstNormalTransactionId. + */ + if (TransactionIdToPgIndex(newestXact) != 0 && + !TransactionIdEquals(newestXact, FirstNormalTransactionId)) + return; + + pageno = TransactionIdToPage(newestXact); + + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + + /* Zero the page and make an XLOG entry about it */ + ZeroCLOGPage(pageno, true); + + LWLockRelease(XactSLRULock); +} + + +/* + * Remove all CLOG segments before the one holding the passed transaction ID + * + * Before removing any CLOG data, we must flush XLOG to disk, to ensure + * that any recently-emitted FREEZE_PAGE records have reached disk; otherwise + * a crash and restart might leave us with some unfrozen tuples referencing + * removed CLOG data. We choose to emit a special TRUNCATE XLOG record too. + * Replaying the deletion from XLOG is not critical, since the files could + * just as well be removed later, but doing so prevents a long-running hot + * standby server from acquiring an unreasonably bloated CLOG directory. 
+ * + * Since CLOG segments hold a large number of transactions, the opportunity to + * actually remove a segment is fairly rare, and so it seems best not to do + * the XLOG flush unless we have confirmed that there is a removable segment. + */ +void +TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid) +{ + int cutoffPage; + + /* + * The cutoff point is the start of the segment containing oldestXact. We + * pass the *page* containing oldestXact to SimpleLruTruncate. + */ + cutoffPage = TransactionIdToPage(oldestXact); + + /* Check to see if there's any files that could be removed */ + if (!SlruScanDirectory(XactCtl, SlruScanDirCbReportPresence, &cutoffPage)) + return; /* nothing to remove */ + + /* + * Advance oldestClogXid before truncating clog, so concurrent xact status + * lookups can ensure they don't attempt to access truncated-away clog. + * + * It's only necessary to do this if we will actually truncate away clog + * pages. + */ + AdvanceOldestClogXid(oldestXact); + + /* + * Write XLOG record and flush XLOG to disk. We record the oldest xid + * we're keeping information about here so we can ensure that it's always + * ahead of clog truncation in case we crash, and so a standby finds out + * the new valid xid before the next checkpoint. + */ + WriteTruncateXlogRec(cutoffPage, oldestXact, oldestxid_datoid); + + /* Now we can remove the old CLOG segment(s) */ + SimpleLruTruncate(XactCtl, cutoffPage); +} + + +/* + * Decide whether a CLOG page number is "older" for truncation purposes. + * + * We need to use comparison of TransactionIds here in order to do the right + * thing with wraparound XID arithmetic. However, TransactionIdPrecedes() + * would get weird about permanent xact IDs. So, offset both such that xid1, + * xid2, and xid2 + CLOG_XACTS_PER_PAGE - 1 are all normal XIDs; this offset + * is relevant to page 0 and to the page preceding page 0. + * + * The page containing oldestXact-2^31 is the important edge case. The + * portion of that page equaling or following oldestXact-2^31 is expendable, + * but the portion preceding oldestXact-2^31 is not. When oldestXact-2^31 is + * the first XID of a page and segment, the entire page and segment is + * expendable, and we could truncate the segment. Recognizing that case would + * require making oldestXact, not just the page containing oldestXact, + * available to this callback. The benefit would be rare and small, so we + * don't optimize that edge case. + */ +static bool +CLOGPagePrecedes(int page1, int page2) +{ + TransactionId xid1; + TransactionId xid2; + + xid1 = ((TransactionId) page1) * CLOG_XACTS_PER_PAGE; + xid1 += FirstNormalTransactionId + 1; + xid2 = ((TransactionId) page2) * CLOG_XACTS_PER_PAGE; + xid2 += FirstNormalTransactionId + 1; + + return (TransactionIdPrecedes(xid1, xid2) && + TransactionIdPrecedes(xid1, xid2 + CLOG_XACTS_PER_PAGE - 1)); +} + + +/* + * Write a ZEROPAGE xlog record + */ +static void +WriteZeroPageXlogRec(int pageno) +{ + XLogBeginInsert(); + XLogRegisterData((char *) (&pageno), sizeof(int)); + (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE); +} + +/* + * Write a TRUNCATE xlog record + * + * We must flush the xlog record to disk before returning --- see notes + * in TruncateCLOG(). 
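CLOGPagePrecedes above leans on TransactionIdPrecedes(), whose comparison of
normal XIDs is modulo 2^31. A self-contained sketch of the page test, ignoring
the special cases for permanent XIDs and assuming the default BLCKSZ of 8192
(so page numbers wrap at 0xFFFFFFFF / 32768 = 131071):

    #include <stdio.h>
    #include <stdint.h>

    #define CLOG_XACTS_PER_PAGE         32768   /* BLCKSZ (8192) * 4 */
    #define FirstNormalTransactionId    3

    /* modulo-2^31 ordering of normal XIDs (permanent-XID cases omitted) */
    static int xid_precedes(uint32_t xid1, uint32_t xid2)
    {
        return (int32_t) (xid1 - xid2) < 0;
    }

    /* mirrors CLOGPagePrecedes(): offset so all compared XIDs are normal */
    static int clog_page_precedes(uint32_t page1, uint32_t page2)
    {
        uint32_t xid1 = page1 * CLOG_XACTS_PER_PAGE + FirstNormalTransactionId + 1;
        uint32_t xid2 = page2 * CLOG_XACTS_PER_PAGE + FirstNormalTransactionId + 1;

        return xid_precedes(xid1, xid2) &&
               xid_precedes(xid1, xid2 + CLOG_XACTS_PER_PAGE - 1);
    }

    int main(void)
    {
        /* near wraparound, "older" is not the same as "numerically smaller" */
        printf("%d\n", clog_page_precedes(131070, 2));  /* 1: precedes */
        printf("%d\n", clog_page_precedes(2, 131070));  /* 0: does not */
        return 0;
    }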
+ */ +static void +WriteTruncateXlogRec(int pageno, TransactionId oldestXact, Oid oldestXactDb) +{ + XLogRecPtr recptr; + xl_clog_truncate xlrec; + + xlrec.pageno = pageno; + xlrec.oldestXact = oldestXact; + xlrec.oldestXactDb = oldestXactDb; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), sizeof(xl_clog_truncate)); + recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE); + XLogFlush(recptr); +} + +/* + * CLOG resource manager's routines + */ +void +clog_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + /* Backup blocks are not used in clog records */ + Assert(!XLogRecHasAnyBlockRefs(record)); + + if (info == CLOG_ZEROPAGE) + { + int pageno; + int slotno; + + memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + + LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + + slotno = ZeroCLOGPage(pageno, false); + SimpleLruWritePage(XactCtl, slotno); + Assert(!XactCtl->shared->page_dirty[slotno]); + + LWLockRelease(XactSLRULock); + } + else if (info == CLOG_TRUNCATE) + { + xl_clog_truncate xlrec; + + memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_clog_truncate)); + + AdvanceOldestClogXid(xlrec.oldestXact); + + SimpleLruTruncate(XactCtl, xlrec.pageno); + } + else + elog(PANIC, "clog_redo: unknown op code %u", info); +} + +/* + * Entrypoint for sync.c to sync clog files. + */ +int +clogsyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(XactCtl, ftag, path); +} diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c new file mode 100644 index 0000000..4dc8d40 --- /dev/null +++ b/src/backend/access/transam/commit_ts.c @@ -0,0 +1,1035 @@ +/*------------------------------------------------------------------------- + * + * commit_ts.c + * PostgreSQL commit timestamp manager + * + * This module is a pg_xact-like system that stores the commit timestamp + * for each transaction. + * + * XLOG interactions: this module generates an XLOG record whenever a new + * CommitTs page is initialized to zeroes. Also, one XLOG record is + * generated for setting of values when the caller requests it; this allows + * us to support values coming from places other than transaction commit. + * Other writes of CommitTS come from recording of transaction commit in + * xact.c, which generates its own XLOG records for these events and will + * re-perform the status update on redo; so we need make no additional XLOG + * entry here. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/commit_ts.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/commit_ts.h" +#include "access/htup_details.h" +#include "access/slru.h" +#include "access/transam.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" +#include "catalog/pg_type.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "storage/shmem.h" +#include "utils/builtins.h" +#include "utils/snapmgr.h" +#include "utils/timestamp.h" + +/* + * Defines for CommitTs page sizes. A page is the same BLCKSZ as is used + * everywhere else in Postgres. + * + * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, + * CommitTs page numbering also wraps around at + * 0xFFFFFFFF/COMMIT_TS_XACTS_PER_PAGE, and CommitTs segment numbering at + * 0xFFFFFFFF/COMMIT_TS_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. 
We need take no + * explicit notice of that fact in this module, except when comparing segment + * and page numbers in TruncateCommitTs (see CommitTsPagePrecedes). + */ + +/* + * We need 8+2 bytes per xact. Note that enlarging this struct might mean + * the largest possible file name is more than 5 chars long; see + * SlruScanDirectory. + */ +typedef struct CommitTimestampEntry +{ + TimestampTz time; + RepOriginId nodeid; +} CommitTimestampEntry; + +#define SizeOfCommitTimestampEntry (offsetof(CommitTimestampEntry, nodeid) + \ + sizeof(RepOriginId)) + +#define COMMIT_TS_XACTS_PER_PAGE \ + (BLCKSZ / SizeOfCommitTimestampEntry) + +#define TransactionIdToCTsPage(xid) \ + ((xid) / (TransactionId) COMMIT_TS_XACTS_PER_PAGE) +#define TransactionIdToCTsEntry(xid) \ + ((xid) % (TransactionId) COMMIT_TS_XACTS_PER_PAGE) + +/* + * Link to shared-memory data structures for CommitTs control + */ +static SlruCtlData CommitTsCtlData; + +#define CommitTsCtl (&CommitTsCtlData) + +/* + * We keep a cache of the last value set in shared memory. + * + * This is also good place to keep the activation status. We keep this + * separate from the GUC so that the standby can activate the module if the + * primary has it active independently of the value of the GUC. + * + * This is protected by CommitTsLock. In some places, we use commitTsActive + * without acquiring the lock; where this happens, a comment explains the + * rationale for it. + */ +typedef struct CommitTimestampShared +{ + TransactionId xidLastCommit; + CommitTimestampEntry dataLastCommit; + bool commitTsActive; +} CommitTimestampShared; + +static CommitTimestampShared *commitTsShared; + + +/* GUC variable */ +bool track_commit_timestamp; + +static void SetXidCommitTsInPage(TransactionId xid, int nsubxids, + TransactionId *subxids, TimestampTz ts, + RepOriginId nodeid, int pageno); +static void TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts, + RepOriginId nodeid, int slotno); +static void error_commit_ts_disabled(void); +static int ZeroCommitTsPage(int pageno, bool writeXlog); +static bool CommitTsPagePrecedes(int page1, int page2); +static void ActivateCommitTs(void); +static void DeactivateCommitTs(void); +static void WriteZeroPageXlogRec(int pageno); +static void WriteTruncateXlogRec(int pageno, TransactionId oldestXid); + +/* + * TransactionTreeSetCommitTsData + * + * Record the final commit timestamp of transaction entries in the commit log + * for a transaction and its subtransaction tree, as efficiently as possible. + * + * xid is the top level transaction id. + * + * subxids is an array of xids of length nsubxids, representing subtransactions + * in the tree of xid. In various cases nsubxids may be zero. + * The reason why tracking just the parent xid commit timestamp is not enough + * is that the subtrans SLRU does not stay valid across crashes (it's not + * permanent) so we need to keep the information about them here. If the + * subtrans implementation changes in the future, we might want to revisit the + * decision of storing timestamp info for each subxid. + */ +void +TransactionTreeSetCommitTsData(TransactionId xid, int nsubxids, + TransactionId *subxids, TimestampTz timestamp, + RepOriginId nodeid) +{ + int i; + TransactionId headxid; + TransactionId newestXact; + + /* + * No-op if the module is not active. 
+ * + * An unlocked read here is fine, because in a standby (the only place + * where the flag can change in flight) this routine is only called by the + * recovery process, which is also the only process which can change the + * flag. + */ + if (!commitTsShared->commitTsActive) + return; + + /* + * Figure out the latest Xid in this batch: either the last subxid if + * there's any, otherwise the parent xid. + */ + if (nsubxids > 0) + newestXact = subxids[nsubxids - 1]; + else + newestXact = xid; + + /* + * We split the xids to set the timestamp to in groups belonging to the + * same SLRU page; the first element in each such set is its head. The + * first group has the main XID as the head; subsequent sets use the first + * subxid not on the previous page as head. This way, we only have to + * lock/modify each SLRU page once. + */ + headxid = xid; + i = 0; + for (;;) + { + int pageno = TransactionIdToCTsPage(headxid); + int j; + + for (j = i; j < nsubxids; j++) + { + if (TransactionIdToCTsPage(subxids[j]) != pageno) + break; + } + /* subxids[i..j] are on the same page as the head */ + + SetXidCommitTsInPage(headxid, j - i, subxids + i, timestamp, nodeid, + pageno); + + /* if we wrote out all subxids, we're done. */ + if (j >= nsubxids) + break; + + /* + * Set the new head and skip over it, as well as over the subxids we + * just wrote. + */ + headxid = subxids[j]; + i = j + 1; + } + + /* update the cached value in shared memory */ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + commitTsShared->xidLastCommit = xid; + commitTsShared->dataLastCommit.time = timestamp; + commitTsShared->dataLastCommit.nodeid = nodeid; + + /* and move forwards our endpoint, if needed */ + if (TransactionIdPrecedes(ShmemVariableCache->newestCommitTsXid, newestXact)) + ShmemVariableCache->newestCommitTsXid = newestXact; + LWLockRelease(CommitTsLock); +} + +/* + * Record the commit timestamp of transaction entries in the commit log for all + * entries on a single page. Atomic only on this page. + */ +static void +SetXidCommitTsInPage(TransactionId xid, int nsubxids, + TransactionId *subxids, TimestampTz ts, + RepOriginId nodeid, int pageno) +{ + int slotno; + int i; + + LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + + slotno = SimpleLruReadPage(CommitTsCtl, pageno, true, xid); + + TransactionIdSetCommitTs(xid, ts, nodeid, slotno); + for (i = 0; i < nsubxids; i++) + TransactionIdSetCommitTs(subxids[i], ts, nodeid, slotno); + + CommitTsCtl->shared->page_dirty[slotno] = true; + + LWLockRelease(CommitTsSLRULock); +} + +/* + * Sets the commit timestamp of a single transaction. + * + * Must be called with CommitTsSLRULock held + */ +static void +TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts, + RepOriginId nodeid, int slotno) +{ + int entryno = TransactionIdToCTsEntry(xid); + CommitTimestampEntry entry; + + Assert(TransactionIdIsNormal(xid)); + + entry.time = ts; + entry.nodeid = nodeid; + + memcpy(CommitTsCtl->shared->page_buffer[slotno] + + SizeOfCommitTimestampEntry * entryno, + &entry, SizeOfCommitTimestampEntry); +} + +/* + * Interrogate the commit timestamp of a transaction. + * + * The return value indicates whether a commit timestamp record was found for + * the given xid. The timestamp value is returned in *ts (which may not be + * null), and the origin node for the Xid is returned in *nodeid, if it's not + * null. 
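A worked example of the page and entry arithmetic the function below performs,
assuming the default BLCKSZ of 8192 so that the 10-byte CommitTimestampEntry
yields 819 entries per page (the XID is arbitrary):

    #include <stdio.h>
    #include <stdint.h>

    #define BLCKSZ                      8192
    #define SizeOfCommitTimestampEntry  10      /* 8-byte timestamp + 2-byte origin */
    #define COMMIT_TS_XACTS_PER_PAGE    (BLCKSZ / SizeOfCommitTimestampEntry)

    int main(void)
    {
        uint32_t xid = 1000003;                 /* arbitrary example XID */
        uint32_t pageno = xid / COMMIT_TS_XACTS_PER_PAGE;
        uint32_t entryno = xid % COMMIT_TS_XACTS_PER_PAGE;
        uint32_t offset = entryno * SizeOfCommitTimestampEntry;

        printf("entries per page: %d\n", COMMIT_TS_XACTS_PER_PAGE);
        printf("xid %u -> page %u, entry %u, byte offset %u\n",
               xid, pageno, entryno, offset);
        return 0;
    }

The byte offset is exactly where the memcpy() in TransactionIdGetCommitTsData()
reads the entry from the page buffer.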
+ */ +bool +TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts, + RepOriginId *nodeid) +{ + int pageno = TransactionIdToCTsPage(xid); + int entryno = TransactionIdToCTsEntry(xid); + int slotno; + CommitTimestampEntry entry; + TransactionId oldestCommitTsXid; + TransactionId newestCommitTsXid; + + if (!TransactionIdIsValid(xid)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("cannot retrieve commit timestamp for transaction %u", xid))); + else if (!TransactionIdIsNormal(xid)) + { + /* frozen and bootstrap xids are always committed far in the past */ + *ts = 0; + if (nodeid) + *nodeid = 0; + return false; + } + + LWLockAcquire(CommitTsLock, LW_SHARED); + + /* Error if module not enabled */ + if (!commitTsShared->commitTsActive) + error_commit_ts_disabled(); + + /* + * If we're asked for the cached value, return that. Otherwise, fall + * through to read from SLRU. + */ + if (commitTsShared->xidLastCommit == xid) + { + *ts = commitTsShared->dataLastCommit.time; + if (nodeid) + *nodeid = commitTsShared->dataLastCommit.nodeid; + + LWLockRelease(CommitTsLock); + return *ts != 0; + } + + oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid; + newestCommitTsXid = ShmemVariableCache->newestCommitTsXid; + /* neither is invalid, or both are */ + Assert(TransactionIdIsValid(oldestCommitTsXid) == TransactionIdIsValid(newestCommitTsXid)); + LWLockRelease(CommitTsLock); + + /* + * Return empty if the requested value is outside our valid range. + */ + if (!TransactionIdIsValid(oldestCommitTsXid) || + TransactionIdPrecedes(xid, oldestCommitTsXid) || + TransactionIdPrecedes(newestCommitTsXid, xid)) + { + *ts = 0; + if (nodeid) + *nodeid = InvalidRepOriginId; + return false; + } + + /* lock is acquired by SimpleLruReadPage_ReadOnly */ + slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid); + memcpy(&entry, + CommitTsCtl->shared->page_buffer[slotno] + + SizeOfCommitTimestampEntry * entryno, + SizeOfCommitTimestampEntry); + + *ts = entry.time; + if (nodeid) + *nodeid = entry.nodeid; + + LWLockRelease(CommitTsSLRULock); + return *ts != 0; +} + +/* + * Return the Xid of the latest committed transaction. (As far as this module + * is concerned, anyway; it's up to the caller to ensure the value is useful + * for its purposes.) + * + * ts and nodeid are filled with the corresponding data; they can be passed + * as NULL if not wanted. + */ +TransactionId +GetLatestCommitTsData(TimestampTz *ts, RepOriginId *nodeid) +{ + TransactionId xid; + + LWLockAcquire(CommitTsLock, LW_SHARED); + + /* Error if module not enabled */ + if (!commitTsShared->commitTsActive) + error_commit_ts_disabled(); + + xid = commitTsShared->xidLastCommit; + if (ts) + *ts = commitTsShared->dataLastCommit.time; + if (nodeid) + *nodeid = commitTsShared->dataLastCommit.nodeid; + LWLockRelease(CommitTsLock); + + return xid; +} + +static void +error_commit_ts_disabled(void) +{ + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not get commit timestamp data"), + RecoveryInProgress() ? 
+ errhint("Make sure the configuration parameter \"%s\" is set on the primary server.", + "track_commit_timestamp") : + errhint("Make sure the configuration parameter \"%s\" is set.", + "track_commit_timestamp"))); +} + +/* + * SQL-callable wrapper to obtain commit time of a transaction + */ +Datum +pg_xact_commit_timestamp(PG_FUNCTION_ARGS) +{ + TransactionId xid = PG_GETARG_TRANSACTIONID(0); + TimestampTz ts; + bool found; + + found = TransactionIdGetCommitTsData(xid, &ts, NULL); + + if (!found) + PG_RETURN_NULL(); + + PG_RETURN_TIMESTAMPTZ(ts); +} + + +/* + * pg_last_committed_xact + * + * SQL-callable wrapper to obtain some information about the latest + * committed transaction: transaction ID, timestamp and replication + * origin. + */ +Datum +pg_last_committed_xact(PG_FUNCTION_ARGS) +{ + TransactionId xid; + RepOriginId nodeid; + TimestampTz ts; + Datum values[3]; + bool nulls[3]; + TupleDesc tupdesc; + HeapTuple htup; + + /* and construct a tuple with our data */ + xid = GetLatestCommitTsData(&ts, &nodeid); + + /* + * Construct a tuple descriptor for the result row. This must match this + * function's pg_proc entry! + */ + tupdesc = CreateTemplateTupleDesc(3); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "xid", + XIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "timestamp", + TIMESTAMPTZOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "roident", + OIDOID, -1, 0); + tupdesc = BlessTupleDesc(tupdesc); + + if (!TransactionIdIsNormal(xid)) + { + memset(nulls, true, sizeof(nulls)); + } + else + { + values[0] = TransactionIdGetDatum(xid); + nulls[0] = false; + + values[1] = TimestampTzGetDatum(ts); + nulls[1] = false; + + values[2] = ObjectIdGetDatum((Oid) nodeid); + nulls[2] = false; + } + + htup = heap_form_tuple(tupdesc, values, nulls); + + PG_RETURN_DATUM(HeapTupleGetDatum(htup)); +} + +/* + * pg_xact_commit_timestamp_origin + * + * SQL-callable wrapper to obtain commit timestamp and replication origin + * of a given transaction. + */ +Datum +pg_xact_commit_timestamp_origin(PG_FUNCTION_ARGS) +{ + TransactionId xid = PG_GETARG_TRANSACTIONID(0); + RepOriginId nodeid; + TimestampTz ts; + Datum values[2]; + bool nulls[2]; + TupleDesc tupdesc; + HeapTuple htup; + bool found; + + found = TransactionIdGetCommitTsData(xid, &ts, &nodeid); + + /* + * Construct a tuple descriptor for the result row. This must match this + * function's pg_proc entry! + */ + tupdesc = CreateTemplateTupleDesc(2); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "timestamp", + TIMESTAMPTZOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "roident", + OIDOID, -1, 0); + tupdesc = BlessTupleDesc(tupdesc); + + if (!found) + { + memset(nulls, true, sizeof(nulls)); + } + else + { + values[0] = TimestampTzGetDatum(ts); + nulls[0] = false; + + values[1] = ObjectIdGetDatum((Oid) nodeid); + nulls[1] = false; + } + + htup = heap_form_tuple(tupdesc, values, nulls); + + PG_RETURN_DATUM(HeapTupleGetDatum(htup)); +} + +/* + * Number of shared CommitTS buffers. + * + * We use a very similar logic as for the number of CLOG buffers (except we + * scale up twice as fast with shared buffers, and the maximum is twice as + * high); see comments in CLOGShmemBuffers. 
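To make that comparison concrete, a small standalone calculation of both buffer
formulas for a few example NBuffers values (NBuffers counts 8 kB shared
buffers; the sample sizes are only illustrative):

    #include <stdio.h>

    #define Min(a, b)   ((a) < (b) ? (a) : (b))
    #define Max(a, b)   ((a) > (b) ? (a) : (b))

    int main(void)
    {
        /* example shared_buffers settings, as a number of 8 kB buffers */
        int nbuffers[] = {1024, 16384, 131072};     /* 8 MB, 128 MB, 1 GB */

        for (int i = 0; i < 3; i++)
        {
            int n = nbuffers[i];
            int clog_bufs = Min(128, Max(4, n / 512));
            int committs_bufs = Min(256, Max(4, n / 256));

            printf("NBuffers=%6d  clog=%3d  commit_ts=%3d\n",
                   n, clog_bufs, committs_bufs);
        }
        return 0;
    }

At 1 GB of shared buffers both formulas hit their caps (128 and 256 buffers
respectively); at 8 MB both bottom out at the minimum of 4.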
+ */ +Size +CommitTsShmemBuffers(void) +{ + return Min(256, Max(4, NBuffers / 256)); +} + +/* + * Shared memory sizing for CommitTs + */ +Size +CommitTsShmemSize(void) +{ + return SimpleLruShmemSize(CommitTsShmemBuffers(), 0) + + sizeof(CommitTimestampShared); +} + +/* + * Initialize CommitTs at system startup (postmaster start or standalone + * backend) + */ +void +CommitTsShmemInit(void) +{ + bool found; + + CommitTsCtl->PagePrecedes = CommitTsPagePrecedes; + SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0, + CommitTsSLRULock, "pg_commit_ts", + LWTRANCHE_COMMITTS_BUFFER, + SYNC_HANDLER_COMMIT_TS); + SlruPagePrecedesUnitTests(CommitTsCtl, COMMIT_TS_XACTS_PER_PAGE); + + commitTsShared = ShmemInitStruct("CommitTs shared", + sizeof(CommitTimestampShared), + &found); + + if (!IsUnderPostmaster) + { + Assert(!found); + + commitTsShared->xidLastCommit = InvalidTransactionId; + TIMESTAMP_NOBEGIN(commitTsShared->dataLastCommit.time); + commitTsShared->dataLastCommit.nodeid = InvalidRepOriginId; + commitTsShared->commitTsActive = false; + } + else + Assert(found); +} + +/* + * This function must be called ONCE on system install. + * + * (The CommitTs directory is assumed to have been created by initdb, and + * CommitTsShmemInit must have been called already.) + */ +void +BootStrapCommitTs(void) +{ + /* + * Nothing to do here at present, unlike most other SLRU modules; segments + * are created when the server is started with this module enabled. See + * ActivateCommitTs. + */ +} + +/* + * Initialize (or reinitialize) a page of CommitTs to zeroes. + * If writeXlog is true, also emit an XLOG record saying we did this. + * + * The page is not actually written, just set up in shared memory. + * The slot number of the new page is returned. + * + * Control lock must be held at entry, and will be held at exit. + */ +static int +ZeroCommitTsPage(int pageno, bool writeXlog) +{ + int slotno; + + slotno = SimpleLruZeroPage(CommitTsCtl, pageno); + + if (writeXlog) + WriteZeroPageXlogRec(pageno); + + return slotno; +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup, + * after StartupXLOG has initialized ShmemVariableCache->nextXid. + */ +void +StartupCommitTs(void) +{ + ActivateCommitTs(); +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup, + * after recovery has finished. + */ +void +CompleteCommitTsInitialization(void) +{ + /* + * If the feature is not enabled, turn it off for good. This also removes + * any leftover data. + * + * Conversely, we activate the module if the feature is enabled. This is + * necessary for primary and standby as the activation depends on the + * control file contents at the beginning of recovery or when a + * XLOG_PARAMETER_CHANGE is replayed. + */ + if (!track_commit_timestamp) + DeactivateCommitTs(); + else + ActivateCommitTs(); +} + +/* + * Activate or deactivate CommitTs' upon reception of a XLOG_PARAMETER_CHANGE + * XLog record during recovery. + */ +void +CommitTsParameterChange(bool newvalue, bool oldvalue) +{ + /* + * If the commit_ts module is disabled in this server and we get word from + * the primary server that it is enabled there, activate it so that we can + * replay future WAL records involving it; also mark it as active on + * pg_control. If the old value was already set, we already did this, so + * don't do anything. + * + * If the module is disabled in the primary, disable it here too, unless + * the module is enabled locally. 
+ * + * Note this only runs in the recovery process, so an unlocked read is + * fine. + */ + if (newvalue) + { + if (!commitTsShared->commitTsActive) + ActivateCommitTs(); + } + else if (commitTsShared->commitTsActive) + DeactivateCommitTs(); +} + +/* + * Activate this module whenever necessary. + * This must happen during postmaster or standalone-backend startup, + * or during WAL replay anytime the track_commit_timestamp setting is + * changed in the primary. + * + * The reason why this SLRU needs separate activation/deactivation functions is + * that it can be enabled/disabled during start and the activation/deactivation + * on the primary is propagated to the standby via replay. Other SLRUs don't + * have this property and they can be just initialized during normal startup. + * + * This is in charge of creating the currently active segment, if it's not + * already there. The reason for this is that the server might have been + * running with this module disabled for a while and thus might have skipped + * the normal creation point. + */ +static void +ActivateCommitTs(void) +{ + TransactionId xid; + int pageno; + + /* If we've done this already, there's nothing to do */ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + if (commitTsShared->commitTsActive) + { + LWLockRelease(CommitTsLock); + return; + } + LWLockRelease(CommitTsLock); + + xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + pageno = TransactionIdToCTsPage(xid); + + /* + * Re-Initialize our idea of the latest page number. + */ + LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + CommitTsCtl->shared->latest_page_number = pageno; + LWLockRelease(CommitTsSLRULock); + + /* + * If CommitTs is enabled, but it wasn't in the previous server run, we + * need to set the oldest and newest values to the next Xid; that way, we + * will not try to read data that might not have been set. + * + * XXX does this have a problem if a server is started with commitTs + * enabled, then started with commitTs disabled, then restarted with it + * enabled again? It doesn't look like it does, because there should be a + * checkpoint that sets the value to InvalidTransactionId at end of + * recovery; and so any chance of injecting new transactions without + * CommitTs values would occur after the oldestCommitTsXid has been set to + * Invalid temporarily. + */ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + if (ShmemVariableCache->oldestCommitTsXid == InvalidTransactionId) + { + ShmemVariableCache->oldestCommitTsXid = + ShmemVariableCache->newestCommitTsXid = ReadNextTransactionId(); + } + LWLockRelease(CommitTsLock); + + /* Create the current segment file, if necessary */ + if (!SimpleLruDoesPhysicalPageExist(CommitTsCtl, pageno)) + { + int slotno; + + LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + slotno = ZeroCommitTsPage(pageno, false); + SimpleLruWritePage(CommitTsCtl, slotno); + Assert(!CommitTsCtl->shared->page_dirty[slotno]); + LWLockRelease(CommitTsSLRULock); + } + + /* Change the activation status in shared memory. */ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + commitTsShared->commitTsActive = true; + LWLockRelease(CommitTsLock); +} + +/* + * Deactivate this module. + * + * This must be called when the track_commit_timestamp parameter is turned off. + * This happens during postmaster or standalone-backend startup, or during WAL + * replay. + * + * Resets CommitTs into invalid state to make sure we don't hand back + * possibly-invalid data; also removes segments of old data. 
+ */ +static void +DeactivateCommitTs(void) +{ + /* + * Cleanup the status in the shared memory. + * + * We reset everything in the commitTsShared record to prevent user from + * getting confusing data about last committed transaction on the standby + * when the module was activated repeatedly on the primary. + */ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + + commitTsShared->commitTsActive = false; + commitTsShared->xidLastCommit = InvalidTransactionId; + TIMESTAMP_NOBEGIN(commitTsShared->dataLastCommit.time); + commitTsShared->dataLastCommit.nodeid = InvalidRepOriginId; + + ShmemVariableCache->oldestCommitTsXid = InvalidTransactionId; + ShmemVariableCache->newestCommitTsXid = InvalidTransactionId; + + LWLockRelease(CommitTsLock); + + /* + * Remove *all* files. This is necessary so that there are no leftover + * files; in the case where this feature is later enabled after running + * with it disabled for some time there may be a gap in the file sequence. + * (We can probably tolerate out-of-sequence files, as they are going to + * be overwritten anyway when we wrap around, but it seems better to be + * tidy.) + */ + LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + (void) SlruScanDirectory(CommitTsCtl, SlruScanDirCbDeleteAll, NULL); + LWLockRelease(CommitTsSLRULock); +} + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + */ +void +CheckPointCommitTs(void) +{ + /* + * Write dirty CommitTs pages to disk. This may result in sync requests + * queued for later handling by ProcessSyncRequests(), as part of the + * checkpoint. + */ + SimpleLruWriteAll(CommitTsCtl, true); +} + +/* + * Make sure that CommitTs has room for a newly-allocated XID. + * + * NB: this is called while holding XidGenLock. We want it to be very fast + * most of the time; even when it's not so fast, no actual I/O need happen + * unless we're forced to write out a dirty CommitTs or xlog page to make room + * in shared memory. + * + * NB: the current implementation relies on track_commit_timestamp being + * PGC_POSTMASTER. + */ +void +ExtendCommitTs(TransactionId newestXact) +{ + int pageno; + + /* + * Nothing to do if module not enabled. Note we do an unlocked read of + * the flag here, which is okay because this routine is only called from + * GetNewTransactionId, which is never called in a standby. + */ + Assert(!InRecovery); + if (!commitTsShared->commitTsActive) + return; + + /* + * No work except at first XID of a page. But beware: just after + * wraparound, the first XID of page zero is FirstNormalTransactionId. + */ + if (TransactionIdToCTsEntry(newestXact) != 0 && + !TransactionIdEquals(newestXact, FirstNormalTransactionId)) + return; + + pageno = TransactionIdToCTsPage(newestXact); + + LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + + /* Zero the page and make an XLOG entry about it */ + ZeroCommitTsPage(pageno, !InRecovery); + + LWLockRelease(CommitTsSLRULock); +} + +/* + * Remove all CommitTs segments before the one holding the passed + * transaction ID. + * + * Note that we don't need to flush XLOG here. + */ +void +TruncateCommitTs(TransactionId oldestXact) +{ + int cutoffPage; + + /* + * The cutoff point is the start of the segment containing oldestXact. We + * pass the *page* containing oldestXact to SimpleLruTruncate. 
+ */ + cutoffPage = TransactionIdToCTsPage(oldestXact); + + /* Check to see if there's any files that could be removed */ + if (!SlruScanDirectory(CommitTsCtl, SlruScanDirCbReportPresence, + &cutoffPage)) + return; /* nothing to remove */ + + /* Write XLOG record */ + WriteTruncateXlogRec(cutoffPage, oldestXact); + + /* Now we can remove the old CommitTs segment(s) */ + SimpleLruTruncate(CommitTsCtl, cutoffPage); +} + +/* + * Set the limit values between which commit TS can be consulted. + */ +void +SetCommitTsLimit(TransactionId oldestXact, TransactionId newestXact) +{ + /* + * Be careful not to overwrite values that are either further into the + * "future" or signal a disabled committs. + */ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + if (ShmemVariableCache->oldestCommitTsXid != InvalidTransactionId) + { + if (TransactionIdPrecedes(ShmemVariableCache->oldestCommitTsXid, oldestXact)) + ShmemVariableCache->oldestCommitTsXid = oldestXact; + if (TransactionIdPrecedes(newestXact, ShmemVariableCache->newestCommitTsXid)) + ShmemVariableCache->newestCommitTsXid = newestXact; + } + else + { + Assert(ShmemVariableCache->newestCommitTsXid == InvalidTransactionId); + ShmemVariableCache->oldestCommitTsXid = oldestXact; + ShmemVariableCache->newestCommitTsXid = newestXact; + } + LWLockRelease(CommitTsLock); +} + +/* + * Move forwards the oldest commitTS value that can be consulted + */ +void +AdvanceOldestCommitTsXid(TransactionId oldestXact) +{ + LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); + if (ShmemVariableCache->oldestCommitTsXid != InvalidTransactionId && + TransactionIdPrecedes(ShmemVariableCache->oldestCommitTsXid, oldestXact)) + ShmemVariableCache->oldestCommitTsXid = oldestXact; + LWLockRelease(CommitTsLock); +} + + +/* + * Decide whether a commitTS page number is "older" for truncation purposes. + * Analogous to CLOGPagePrecedes(). + * + * At default BLCKSZ, (1 << 31) % COMMIT_TS_XACTS_PER_PAGE == 128. This + * introduces differences compared to CLOG and the other SLRUs having (1 << + * 31) % per_page == 0. This function never tests exactly + * TransactionIdPrecedes(x-2^31, x). When the system reaches xidStopLimit, + * there are two possible counts of page boundaries between oldestXact and the + * latest XID assigned, depending on whether oldestXact is within the first + * 128 entries of its page. Since this function doesn't know the location of + * oldestXact within page2, it returns false for one page that actually is + * expendable. This is a wider (yet still negligible) version of the + * truncation opportunity that CLOGPagePrecedes() cannot recognize. + * + * For the sake of a worked example, number entries with decimal values such + * that page1==1 entries range from 1.0 to 1.999. Let N+0.15 be the number of + * pages that 2^31 entries will span (N is an integer). If oldestXact=N+2.1, + * then the final safe XID assignment leaves newestXact=1.95. We keep page 2, + * because entry=2.85 is the border that toggles whether entries precede the + * last entry of the oldestXact page. While page 2 is expendable at + * oldestXact=N+2.1, it would be precious at oldestXact=N+2.9. 
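+ *
+ * Concretely, assuming the default 8 kB BLCKSZ and the 10-byte commit
+ * timestamp entry (an 8-byte TimestampTz plus a 2-byte RepOriginId),
+ * COMMIT_TS_XACTS_PER_PAGE is 819 and 2147483648 % 819 == 128, which is
+ * where the "128" mentioned above comes from.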
+ */ +static bool +CommitTsPagePrecedes(int page1, int page2) +{ + TransactionId xid1; + TransactionId xid2; + + xid1 = ((TransactionId) page1) * COMMIT_TS_XACTS_PER_PAGE; + xid1 += FirstNormalTransactionId + 1; + xid2 = ((TransactionId) page2) * COMMIT_TS_XACTS_PER_PAGE; + xid2 += FirstNormalTransactionId + 1; + + return (TransactionIdPrecedes(xid1, xid2) && + TransactionIdPrecedes(xid1, xid2 + COMMIT_TS_XACTS_PER_PAGE - 1)); +} + + +/* + * Write a ZEROPAGE xlog record + */ +static void +WriteZeroPageXlogRec(int pageno) +{ + XLogBeginInsert(); + XLogRegisterData((char *) (&pageno), sizeof(int)); + (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE); +} + +/* + * Write a TRUNCATE xlog record + */ +static void +WriteTruncateXlogRec(int pageno, TransactionId oldestXid) +{ + xl_commit_ts_truncate xlrec; + + xlrec.pageno = pageno; + xlrec.oldestXid = oldestXid; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), SizeOfCommitTsTruncate); + (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_TRUNCATE); +} + +/* + * CommitTS resource manager's routines + */ +void +commit_ts_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + /* Backup blocks are not used in commit_ts records */ + Assert(!XLogRecHasAnyBlockRefs(record)); + + if (info == COMMIT_TS_ZEROPAGE) + { + int pageno; + int slotno; + + memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + + LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + + slotno = ZeroCommitTsPage(pageno, false); + SimpleLruWritePage(CommitTsCtl, slotno); + Assert(!CommitTsCtl->shared->page_dirty[slotno]); + + LWLockRelease(CommitTsSLRULock); + } + else if (info == COMMIT_TS_TRUNCATE) + { + xl_commit_ts_truncate *trunc = (xl_commit_ts_truncate *) XLogRecGetData(record); + + AdvanceOldestCommitTsXid(trunc->oldestXid); + + /* + * During XLOG replay, latest_page_number isn't set up yet; insert a + * suitable value to bypass the sanity test in SimpleLruTruncate. + */ + CommitTsCtl->shared->latest_page_number = trunc->pageno; + + SimpleLruTruncate(CommitTsCtl, trunc->pageno); + } + else + elog(PANIC, "commit_ts_redo: unknown op code %u", info); +} + +/* + * Entrypoint for sync.c to sync commit_ts files. + */ +int +committssyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(CommitTsCtl, ftag, path); +} diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c new file mode 100644 index 0000000..0136ca7 --- /dev/null +++ b/src/backend/access/transam/generic_xlog.c @@ -0,0 +1,540 @@ +/*------------------------------------------------------------------------- + * + * generic_xlog.c + * Implementation of generic xlog records. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/generic_xlog.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/bufmask.h" +#include "access/generic_xlog.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "utils/memutils.h" + +/*------------------------------------------------------------------------- + * Internally, a delta between pages consists of a set of fragments. Each + * fragment represents changes made in a given region of a page. 
A fragment + * is made up as follows: + * + * - offset of page region (OffsetNumber) + * - length of page region (OffsetNumber) + * - data - the data to place into the region ('length' number of bytes) + * + * Unchanged regions of a page are not represented in its delta. As a result, + * a delta can be more compact than the full page image. But having an + * unchanged region between two fragments that is smaller than the fragment + * header (offset+length) does not pay off in terms of the overall size of + * the delta. For this reason, we merge adjacent fragments if the unchanged + * region between them is <= MATCH_THRESHOLD bytes. + * + * We do not bother to merge fragments across the "lower" and "upper" parts + * of a page; it's very seldom the case that pd_lower and pd_upper are within + * MATCH_THRESHOLD bytes of each other, and handling that infrequent case + * would complicate and slow down the delta-computation code unduly. + * Therefore, the worst-case delta size includes two fragment headers plus + * a full page's worth of data. + *------------------------------------------------------------------------- + */ +#define FRAGMENT_HEADER_SIZE (2 * sizeof(OffsetNumber)) +#define MATCH_THRESHOLD FRAGMENT_HEADER_SIZE +#define MAX_DELTA_SIZE (BLCKSZ + 2 * FRAGMENT_HEADER_SIZE) + +/* Struct of generic xlog data for single page */ +typedef struct +{ + Buffer buffer; /* registered buffer */ + int flags; /* flags for this buffer */ + int deltaLen; /* space consumed in delta field */ + char *image; /* copy of page image for modification, do not + * do it in-place to have aligned memory chunk */ + char delta[MAX_DELTA_SIZE]; /* delta between page images */ +} PageData; + +/* State of generic xlog record construction */ +struct GenericXLogState +{ + /* Info about each page, see above */ + PageData pages[MAX_GENERIC_XLOG_PAGES]; + bool isLogged; + /* Page images (properly aligned) */ + PGAlignedBlock images[MAX_GENERIC_XLOG_PAGES]; +}; + +static void writeFragment(PageData *pageData, OffsetNumber offset, + OffsetNumber len, const char *data); +static void computeRegionDelta(PageData *pageData, + const char *curpage, const char *targetpage, + int targetStart, int targetEnd, + int validStart, int validEnd); +static void computeDelta(PageData *pageData, Page curpage, Page targetpage); +static void applyPageRedo(Page page, const char *delta, Size deltaSize); + + +/* + * Write next fragment into pageData's delta. + * + * The fragment has the given offset and length, and data points to the + * actual data (of length length). + */ +static void +writeFragment(PageData *pageData, OffsetNumber offset, OffsetNumber length, + const char *data) +{ + char *ptr = pageData->delta + pageData->deltaLen; + + /* Verify we have enough space */ + Assert(pageData->deltaLen + sizeof(offset) + + sizeof(length) + length <= sizeof(pageData->delta)); + + /* Write fragment data */ + memcpy(ptr, &offset, sizeof(offset)); + ptr += sizeof(offset); + memcpy(ptr, &length, sizeof(length)); + ptr += sizeof(length); + memcpy(ptr, data, length); + ptr += length; + + pageData->deltaLen = ptr - pageData->delta; +} + +/* + * Compute the XLOG fragments needed to transform a region of curpage into the + * corresponding region of targetpage, and append them to pageData's delta + * field. The region to transform runs from targetStart to targetEnd-1. + * Bytes in curpage outside the range validStart to validEnd-1 should be + * considered invalid, and always overwritten with target data. 
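+ *
+ * Hypothetical example (with 2-byte OffsetNumbers, MATCH_THRESHOLD is 4):
+ * if bytes 10..13 and 16..19 differ while the two bytes between them
+ * match, the matching run does not exceed MATCH_THRESHOLD, so a single
+ * fragment with offset 10 and length 10 is emitted instead of two
+ * fragments that would pay for a second 4-byte header.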
+ * + * This function is a hot spot, so it's worth being as tense as possible + * about the data-matching loops. + */ +static void +computeRegionDelta(PageData *pageData, + const char *curpage, const char *targetpage, + int targetStart, int targetEnd, + int validStart, int validEnd) +{ + int i, + loopEnd, + fragmentBegin = -1, + fragmentEnd = -1; + + /* Deal with any invalid start region by including it in first fragment */ + if (validStart > targetStart) + { + fragmentBegin = targetStart; + targetStart = validStart; + } + + /* We'll deal with any invalid end region after the main loop */ + loopEnd = Min(targetEnd, validEnd); + + /* Examine all the potentially matchable bytes */ + i = targetStart; + while (i < loopEnd) + { + if (curpage[i] != targetpage[i]) + { + /* On unmatched byte, start new fragment if not already in one */ + if (fragmentBegin < 0) + fragmentBegin = i; + /* Mark unmatched-data endpoint as uncertain */ + fragmentEnd = -1; + /* Extend the fragment as far as possible in a tight loop */ + i++; + while (i < loopEnd && curpage[i] != targetpage[i]) + i++; + if (i >= loopEnd) + break; + } + + /* Found a matched byte, so remember end of unmatched fragment */ + fragmentEnd = i; + + /* + * Extend the match as far as possible in a tight loop. (On typical + * workloads, this inner loop is the bulk of this function's runtime.) + */ + i++; + while (i < loopEnd && curpage[i] == targetpage[i]) + i++; + + /* + * There are several possible cases at this point: + * + * 1. We have no unwritten fragment (fragmentBegin < 0). There's + * nothing to write; and it doesn't matter what fragmentEnd is. + * + * 2. We found more than MATCH_THRESHOLD consecutive matching bytes. + * Dump out the unwritten fragment, stopping at fragmentEnd. + * + * 3. The match extends to loopEnd. We'll do nothing here, exit the + * loop, and then dump the unwritten fragment, after merging it with + * the invalid end region if any. If we don't so merge, fragmentEnd + * establishes how much the final writeFragment call needs to write. + * + * 4. We found an unmatched byte before loopEnd. The loop will repeat + * and will enter the unmatched-byte stanza above. So in this case + * also, it doesn't matter what fragmentEnd is. The matched bytes + * will get merged into the continuing unmatched fragment. + * + * Only in case 3 do we reach the bottom of the loop with a meaningful + * fragmentEnd value, which is why it's OK that we unconditionally + * assign "fragmentEnd = i" above. + */ + if (fragmentBegin >= 0 && i - fragmentEnd > MATCH_THRESHOLD) + { + writeFragment(pageData, fragmentBegin, + fragmentEnd - fragmentBegin, + targetpage + fragmentBegin); + fragmentBegin = -1; + fragmentEnd = -1; /* not really necessary */ + } + } + + /* Deal with any invalid end region by including it in final fragment */ + if (loopEnd < targetEnd) + { + if (fragmentBegin < 0) + fragmentBegin = loopEnd; + fragmentEnd = targetEnd; + } + + /* Write final fragment if any */ + if (fragmentBegin >= 0) + { + if (fragmentEnd < 0) + fragmentEnd = targetEnd; + writeFragment(pageData, fragmentBegin, + fragmentEnd - fragmentBegin, + targetpage + fragmentBegin); + } +} + +/* + * Compute the XLOG delta record needed to transform curpage into targetpage, + * and store it in pageData's delta field. 
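+ *
+ * Hypothetical example, assuming BLCKSZ is 8192: if the target page has
+ * pd_lower = 120 and pd_upper = 7900 while curpage has pd_lower = 100 and
+ * pd_upper = 8000, the first computeRegionDelta() call below compares
+ * bytes 0..119 (treating curpage bytes 100..119 as invalid) and the second
+ * compares bytes 7900..8191 (treating curpage bytes 7900..7999 as
+ * invalid); the hole in between is never examined.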
+ */ +static void +computeDelta(PageData *pageData, Page curpage, Page targetpage) +{ + int targetLower = ((PageHeader) targetpage)->pd_lower, + targetUpper = ((PageHeader) targetpage)->pd_upper, + curLower = ((PageHeader) curpage)->pd_lower, + curUpper = ((PageHeader) curpage)->pd_upper; + + pageData->deltaLen = 0; + + /* Compute delta records for lower part of page ... */ + computeRegionDelta(pageData, curpage, targetpage, + 0, targetLower, + 0, curLower); + /* ... and for upper part, ignoring what's between */ + computeRegionDelta(pageData, curpage, targetpage, + targetUpper, BLCKSZ, + curUpper, BLCKSZ); + + /* + * If xlog debug is enabled, then check produced delta. Result of delta + * application to curpage should be equivalent to targetpage. + */ +#ifdef WAL_DEBUG + if (XLOG_DEBUG) + { + PGAlignedBlock tmp; + + memcpy(tmp.data, curpage, BLCKSZ); + applyPageRedo(tmp.data, pageData->delta, pageData->deltaLen); + if (memcmp(tmp.data, targetpage, targetLower) != 0 || + memcmp(tmp.data + targetUpper, targetpage + targetUpper, + BLCKSZ - targetUpper) != 0) + elog(ERROR, "result of generic xlog apply does not match"); + } +#endif +} + +/* + * Start new generic xlog record for modifications to specified relation. + */ +GenericXLogState * +GenericXLogStart(Relation relation) +{ + GenericXLogState *state; + int i; + + state = (GenericXLogState *) palloc(sizeof(GenericXLogState)); + state->isLogged = RelationNeedsWAL(relation); + + for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) + { + state->pages[i].image = state->images[i].data; + state->pages[i].buffer = InvalidBuffer; + } + + return state; +} + +/* + * Register new buffer for generic xlog record. + * + * Returns pointer to the page's image in the GenericXLogState, which + * is what the caller should modify. + * + * If the buffer is already registered, just return its existing entry. + * (It's not very clear what to do with the flags in such a case, but + * for now we stay with the original flags.) + */ +Page +GenericXLogRegisterBuffer(GenericXLogState *state, Buffer buffer, int flags) +{ + int block_id; + + /* Search array for existing entry or first unused slot */ + for (block_id = 0; block_id < MAX_GENERIC_XLOG_PAGES; block_id++) + { + PageData *page = &state->pages[block_id]; + + if (BufferIsInvalid(page->buffer)) + { + /* Empty slot, so use it (there cannot be a match later) */ + page->buffer = buffer; + page->flags = flags; + memcpy(page->image, BufferGetPage(buffer), BLCKSZ); + return (Page) page->image; + } + else if (page->buffer == buffer) + { + /* + * Buffer is already registered. Just return the image, which is + * already prepared. + */ + return (Page) page->image; + } + } + + elog(ERROR, "maximum number %d of generic xlog buffers is exceeded", + MAX_GENERIC_XLOG_PAGES); + /* keep compiler quiet */ + return NULL; +} + +/* + * Apply changes represented by GenericXLogState to the actual buffers, + * and emit a generic xlog record. + */ +XLogRecPtr +GenericXLogFinish(GenericXLogState *state) +{ + XLogRecPtr lsn; + int i; + + if (state->isLogged) + { + /* Logged relation: make xlog record in critical section. */ + XLogBeginInsert(); + + START_CRIT_SECTION(); + + /* + * Compute deltas if necessary, write changes to buffers, mark + * buffers dirty, and register changes. 
+ */ + for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) + { + PageData *pageData = &state->pages[i]; + Page page; + PageHeader pageHeader; + + if (BufferIsInvalid(pageData->buffer)) + continue; + + page = BufferGetPage(pageData->buffer); + pageHeader = (PageHeader) pageData->image; + + /* + * Compute delta while we still have both the unmodified page and + * the new image. Not needed if we are logging the full image. + */ + if (!(pageData->flags & GENERIC_XLOG_FULL_IMAGE)) + computeDelta(pageData, page, (Page) pageData->image); + + /* + * Apply the image, being careful to zero the "hole" between + * pd_lower and pd_upper in order to avoid divergence between + * actual page state and what replay would produce. + */ + memcpy(page, pageData->image, pageHeader->pd_lower); + memset(page + pageHeader->pd_lower, 0, + pageHeader->pd_upper - pageHeader->pd_lower); + memcpy(page + pageHeader->pd_upper, + pageData->image + pageHeader->pd_upper, + BLCKSZ - pageHeader->pd_upper); + + MarkBufferDirty(pageData->buffer); + + if (pageData->flags & GENERIC_XLOG_FULL_IMAGE) + { + XLogRegisterBuffer(i, pageData->buffer, + REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + } + else + { + XLogRegisterBuffer(i, pageData->buffer, REGBUF_STANDARD); + XLogRegisterBufData(i, pageData->delta, pageData->deltaLen); + } + } + + /* Insert xlog record */ + lsn = XLogInsert(RM_GENERIC_ID, 0); + + /* Set LSN */ + for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) + { + PageData *pageData = &state->pages[i]; + + if (BufferIsInvalid(pageData->buffer)) + continue; + PageSetLSN(BufferGetPage(pageData->buffer), lsn); + } + END_CRIT_SECTION(); + } + else + { + /* Unlogged relation: skip xlog-related stuff */ + START_CRIT_SECTION(); + for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) + { + PageData *pageData = &state->pages[i]; + + if (BufferIsInvalid(pageData->buffer)) + continue; + memcpy(BufferGetPage(pageData->buffer), + pageData->image, + BLCKSZ); + /* We don't worry about zeroing the "hole" in this case */ + MarkBufferDirty(pageData->buffer); + } + END_CRIT_SECTION(); + /* We don't have a LSN to return, in this case */ + lsn = InvalidXLogRecPtr; + } + + pfree(state); + + return lsn; +} + +/* + * Abort generic xlog record construction. No changes are applied to buffers. + * + * Note: caller is responsible for releasing locks/pins on buffers, if needed. + */ +void +GenericXLogAbort(GenericXLogState *state) +{ + pfree(state); +} + +/* + * Apply delta to given page image. + */ +static void +applyPageRedo(Page page, const char *delta, Size deltaSize) +{ + const char *ptr = delta; + const char *end = delta + deltaSize; + + while (ptr < end) + { + OffsetNumber offset, + length; + + memcpy(&offset, ptr, sizeof(offset)); + ptr += sizeof(offset); + memcpy(&length, ptr, sizeof(length)); + ptr += sizeof(length); + + memcpy(page + offset, ptr, length); + + ptr += length; + } +} + +/* + * Redo function for generic xlog record. 
+ */ +void +generic_redo(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer buffers[MAX_GENERIC_XLOG_PAGES]; + uint8 block_id; + + /* Protect limited size of buffers[] array */ + Assert(XLogRecMaxBlockId(record) < MAX_GENERIC_XLOG_PAGES); + + /* Iterate over blocks */ + for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + { + XLogRedoAction action; + + if (!XLogRecHasBlockRef(record, block_id)) + { + buffers[block_id] = InvalidBuffer; + continue; + } + + action = XLogReadBufferForRedo(record, block_id, &buffers[block_id]); + + /* Apply redo to given block if needed */ + if (action == BLK_NEEDS_REDO) + { + Page page; + PageHeader pageHeader; + char *blockDelta; + Size blockDeltaSize; + + page = BufferGetPage(buffers[block_id]); + blockDelta = XLogRecGetBlockData(record, block_id, &blockDeltaSize); + applyPageRedo(page, blockDelta, blockDeltaSize); + + /* + * Since the delta contains no information about what's in the + * "hole" between pd_lower and pd_upper, set that to zero to + * ensure we produce the same page state that application of the + * logged action by GenericXLogFinish did. + */ + pageHeader = (PageHeader) page; + memset(page + pageHeader->pd_lower, 0, + pageHeader->pd_upper - pageHeader->pd_lower); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffers[block_id]); + } + } + + /* Changes are done: unlock and release all buffers */ + for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + { + if (BufferIsValid(buffers[block_id])) + UnlockReleaseBuffer(buffers[block_id]); + } +} + +/* + * Mask a generic page before performing consistency checks on it. + */ +void +generic_mask(char *page, BlockNumber blkno) +{ + mask_page_lsn_and_checksum(page); + + mask_unused_space(page); +} diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c new file mode 100644 index 0000000..b8b1773 --- /dev/null +++ b/src/backend/access/transam/multixact.c @@ -0,0 +1,3428 @@ +/*------------------------------------------------------------------------- + * + * multixact.c + * PostgreSQL multi-transaction-log manager + * + * The pg_multixact manager is a pg_xact-like manager that stores an array of + * MultiXactMember for each MultiXactId. It is a fundamental part of the + * shared-row-lock implementation. Each MultiXactMember is comprised of a + * TransactionId and a set of flag bits. The name is a bit historical: + * originally, a MultiXactId consisted of more than one TransactionId (except + * in rare corner cases), hence "multi". Nowadays, however, it's perfectly + * legitimate to have MultiXactIds that only include a single Xid. + * + * The meaning of the flag bits is opaque to this module, but they are mostly + * used in heapam.c to identify lock modes that each of the member transactions + * is holding on any given tuple. This module just contains support to store + * and retrieve the arrays. + * + * We use two SLRU areas, one for storing the offsets at which the data + * starts for each MultiXactId in the other one. This trick allows us to + * store variable length arrays of TransactionIds. (We could alternatively + * use one area containing counts and TransactionIds, with valid MultiXactId + * values pointing at slots containing counts; but that way seems less robust + * since it would get completely confused if someone inquired about a bogus + * MultiXactId that pointed to an intermediate slot containing an XID.) 
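+ *
+ * To illustrate the two-area scheme with made-up numbers: if the offsets
+ * SLRU records offset 5000 for multi 100 and offset 5003 for multi 101,
+ * then multi 100 has exactly three members, stored at member offsets 5000,
+ * 5001 and 5002 in the members SLRU.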
+ * + * XLOG interactions: this module generates a record whenever a new OFFSETs or + * MEMBERs page is initialized to zeroes, as well as an + * XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined. + * This module ignores the WAL rule "write xlog before data," because it + * suffices that actions recording a MultiXactId in a heap xmax do follow that + * rule. The only way for the MXID to be referenced from any data page is for + * heap_lock_tuple() or heap_update() to have put it there, and each generates + * an XLOG record that must follow ours. The normal LSN interlock between the + * data page and that XLOG record will ensure that our XLOG record reaches + * disk first. If the SLRU members/offsets data reaches disk sooner than the + * XLOG records, we do not care; after recovery, no xmax will refer to it. On + * the flip side, to ensure that all referenced entries _do_ reach disk, this + * module's XLOG records completely rebuild the data entered since the last + * checkpoint. We flush and sync all dirty OFFSETs and MEMBERs pages to disk + * before each checkpoint is considered complete. + * + * Like clog.c, and unlike subtrans.c, we have to preserve state across + * crashes and ensure that MXID and offset numbering increases monotonically + * across a crash. We do this in the same way as it's done for transaction + * IDs: the WAL record is guaranteed to contain evidence of every MXID we + * could need to worry about, and we just make sure that at the end of + * replay, the next-MXID and next-offset counters are at least as large as + * anything we saw during replay. + * + * We are able to remove segments no longer necessary by carefully tracking + * each table's used values: during vacuum, any multixact older than a certain + * value is removed; the cutoff value is stored in pg_class. The minimum value + * across all tables in each database is stored in pg_database, and the global + * minimum across all databases is part of pg_control and is kept in shared + * memory. Whenever that minimum is advanced, the SLRUs are truncated. + * + * When new multixactid values are to be created, care is taken that the + * counter does not fall within the wraparound horizon considering the global + * minimum value. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/multixact.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/multixact.h" +#include "access/slru.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/twophase_rmgr.h" +#include "access/xact.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" +#include "catalog/pg_type.h" +#include "commands/dbcommands.h" +#include "funcapi.h" +#include "lib/ilist.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "postmaster/autovacuum.h" +#include "storage/lmgr.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/snapmgr.h" + + +/* + * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is + * used everywhere else in Postgres. 
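+ *
+ * Worked example, assuming the default 8 kB BLCKSZ: with 4-byte offsets,
+ * MULTIXACT_OFFSETS_PER_PAGE (defined below) is 2048, so MultiXactId 10000
+ * lives on offsets page 4, at entry 1808 within that page.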
+ * + * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF, + * MultiXact page numbering also wraps around at + * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at + * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need + * take no explicit notice of that fact in this module, except when comparing + * segment and page numbers in TruncateMultiXact (see + * MultiXactOffsetPagePrecedes). + */ + +/* We need four bytes per offset */ +#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) + +#define MultiXactIdToOffsetPage(xid) \ + ((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE) +#define MultiXactIdToOffsetEntry(xid) \ + ((xid) % (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE) +#define MultiXactIdToOffsetSegment(xid) (MultiXactIdToOffsetPage(xid) / SLRU_PAGES_PER_SEGMENT) + +/* + * The situation for members is a bit more complex: we store one byte of + * additional flag bits for each TransactionId. To do this without getting + * into alignment issues, we store four bytes of flags, and then the + * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and + * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups + * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and + * performance) trumps space efficiency here. + * + * Note that the "offset" macros work with byte offset, not array indexes, so + * arithmetic must be done using "char *" pointers. + */ +/* We need eight bits per xact, so one xact fits in a byte */ +#define MXACT_MEMBER_BITS_PER_XACT 8 +#define MXACT_MEMBER_FLAGS_PER_BYTE 1 +#define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) + +/* how many full bytes of flags are there in a group? */ +#define MULTIXACT_FLAGBYTES_PER_GROUP 4 +#define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ + (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) +/* size in bytes of a complete group */ +#define MULTIXACT_MEMBERGROUP_SIZE \ + (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) +#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +#define MULTIXACT_MEMBERS_PER_PAGE \ + (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) + +/* + * Because the number of items per page is not a divisor of the last item + * number (member 0xFFFFFFFF), the last segment does not use the maximum number + * of pages, and moreover the last used page therein does not use the same + * number of items as previous pages. (Another way to say it is that the + * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page + * has some empty space after that item.) + * + * This constant is the number of members in the last page of the last segment. 
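+ *
+ * With the default 8 kB BLCKSZ that works out as follows: each 20-byte
+ * group holds 4 members and 409 groups fit per page (8180 bytes, 12
+ * wasted), so MULTIXACT_MEMBERS_PER_PAGE is 1636 and this constant
+ * evaluates to (0xFFFFFFFF % 1636) + 1 = 1036.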
+ */ +#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \ + ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1)) + +/* page in which a member is to be found */ +#define MXOffsetToMemberPage(xid) ((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE) +#define MXOffsetToMemberSegment(xid) (MXOffsetToMemberPage(xid) / SLRU_PAGES_PER_SEGMENT) + +/* Location (byte offset within page) of flag word for a given member */ +#define MXOffsetToFlagsOffset(xid) \ + ((((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) % \ + (TransactionId) MULTIXACT_MEMBERGROUPS_PER_PAGE) * \ + (TransactionId) MULTIXACT_MEMBERGROUP_SIZE) +#define MXOffsetToFlagsBitShift(xid) \ + (((xid) % (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) * \ + MXACT_MEMBER_BITS_PER_XACT) + +/* Location (byte offset within page) of TransactionId of given member */ +#define MXOffsetToMemberOffset(xid) \ + (MXOffsetToFlagsOffset(xid) + MULTIXACT_FLAGBYTES_PER_GROUP + \ + ((xid) % MULTIXACT_MEMBERS_PER_MEMBERGROUP) * sizeof(TransactionId)) + +/* Multixact members wraparound thresholds. */ +#define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2) +#define MULTIXACT_MEMBER_DANGER_THRESHOLD \ + (MaxMultiXactOffset - MaxMultiXactOffset / 4) + +#define PreviousMultiXactId(xid) \ + ((xid) == FirstMultiXactId ? MaxMultiXactId : (xid) - 1) + +/* + * Links to shared-memory data structures for MultiXact control + */ +static SlruCtlData MultiXactOffsetCtlData; +static SlruCtlData MultiXactMemberCtlData; + +#define MultiXactOffsetCtl (&MultiXactOffsetCtlData) +#define MultiXactMemberCtl (&MultiXactMemberCtlData) + +/* + * MultiXact state shared across all backends. All this state is protected + * by MultiXactGenLock. (We also use MultiXactOffsetSLRULock and + * MultiXactMemberSLRULock to guard accesses to the two sets of SLRU + * buffers. For concurrency's sake, we avoid holding more than one of these + * locks at a time.) + */ +typedef struct MultiXactStateData +{ + /* next-to-be-assigned MultiXactId */ + MultiXactId nextMXact; + + /* next-to-be-assigned offset */ + MultiXactOffset nextOffset; + + /* Have we completed multixact startup? */ + bool finishedStartup; + + /* + * Oldest multixact that is still potentially referenced by a relation. + * Anything older than this should not be consulted. These values are + * updated by vacuum. + */ + MultiXactId oldestMultiXactId; + Oid oldestMultiXactDB; + + /* + * Oldest multixact offset that is potentially referenced by a multixact + * referenced by a relation. We don't always know this value, so there's + * a flag here to indicate whether or not we currently do. + */ + MultiXactOffset oldestOffset; + bool oldestOffsetKnown; + + /* support for anti-wraparound measures */ + MultiXactId multiVacLimit; + MultiXactId multiWarnLimit; + MultiXactId multiStopLimit; + MultiXactId multiWrapLimit; + + /* support for members anti-wraparound measures */ + MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */ + + /* + * Per-backend data starts here. We have two arrays stored in the area + * immediately following the MultiXactStateData struct. Each is indexed by + * BackendId. + * + * In both arrays, there's a slot for all normal backends (1..MaxBackends) + * followed by a slot for max_prepared_xacts prepared transactions. Valid + * BackendIds start from 1; element zero of each array is never used. 
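+ *
+ * For instance, with MaxBackends = 100 and max_prepared_xacts = 5
+ * (arbitrary values), slots 1..100 belong to regular backends, slots
+ * 101..105 to prepared transactions, slot 0 is unused, and MaxOldestSlot
+ * (defined below) is 105.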
+ * + * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current + * transaction(s) could possibly be a member of, or InvalidMultiXactId + * when the backend has no live transaction that could possibly be a + * member of a MultiXact. Each backend sets its entry to the current + * nextMXact counter just before first acquiring a shared lock in a given + * transaction, and clears it at transaction end. (This works because only + * during or after acquiring a shared lock could an XID possibly become a + * member of a MultiXact, and that MultiXact would have to be created + * during or after the lock acquisition.) + * + * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's + * current transaction(s) think is potentially live, or InvalidMultiXactId + * when not in a transaction or not in a transaction that's paid any + * attention to MultiXacts yet. This is computed when first needed in a + * given transaction, and cleared at transaction end. We can compute it + * as the minimum of the valid OldestMemberMXactId[] entries at the time + * we compute it (using nextMXact if none are valid). Each backend is + * required not to attempt to access any SLRU data for MultiXactIds older + * than its own OldestVisibleMXactId[] setting; this is necessary because + * the checkpointer could truncate away such data at any instant. + * + * The oldest valid value among all of the OldestMemberMXactId[] and + * OldestVisibleMXactId[] entries is considered by vacuum as the earliest + * possible value still having any live member transaction. Subtracting + * vacuum_multixact_freeze_min_age from that value we obtain the freezing + * point for multixacts for that table. Any value older than that is + * removed from tuple headers (or "frozen"; see FreezeMultiXactId. Note + * that multis that have member xids that are older than the cutoff point + * for xids must also be frozen, even if the multis themselves are newer + * than the multixid cutoff point). Whenever a full table vacuum happens, + * the freezing point so computed is used as the new pg_class.relminmxid + * value. The minimum of all those values in a database is stored as + * pg_database.datminmxid. In turn, the minimum of all of those values is + * stored in pg_control and used as truncation point for pg_multixact. At + * checkpoint or restartpoint, unneeded segments are removed. + */ + MultiXactId perBackendXactIds[FLEXIBLE_ARRAY_MEMBER]; +} MultiXactStateData; + +/* + * Last element of OldestMemberMXactId and OldestVisibleMXactId arrays. + * Valid elements are (1..MaxOldestSlot); element 0 is never used. + */ +#define MaxOldestSlot (MaxBackends + max_prepared_xacts) + +/* Pointers to the state data in shared memory */ +static MultiXactStateData *MultiXactState; +static MultiXactId *OldestMemberMXactId; +static MultiXactId *OldestVisibleMXactId; + + +/* + * Definitions for the backend-local MultiXactId cache. + * + * We use this cache to store known MultiXacts, so we don't need to go to + * SLRU areas every time. + * + * The cache lasts for the duration of a single transaction, the rationale + * for this being that most entries will contain our own TransactionId and + * so they will be uninteresting by the time our next transaction starts. + * (XXX not clear that this is correct --- other members of the MultiXact + * could hang around longer than we did. However, it's not clear what a + * better policy for flushing old cache entries would be.) FIXME actually + * this is plain wrong now that multixact's may contain update Xids. 
+ * + * We allocate the cache entries in a memory context that is deleted at + * transaction end, so we don't need to do retail freeing of entries. + */ +typedef struct mXactCacheEnt +{ + MultiXactId multi; + int nmembers; + dlist_node node; + MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]; +} mXactCacheEnt; + +#define MAX_CACHE_ENTRIES 256 +static dlist_head MXactCache = DLIST_STATIC_INIT(MXactCache); +static int MXactCacheMembers = 0; +static MemoryContext MXactContext = NULL; + +#ifdef MULTIXACT_DEBUG +#define debug_elog2(a,b) elog(a,b) +#define debug_elog3(a,b,c) elog(a,b,c) +#define debug_elog4(a,b,c,d) elog(a,b,c,d) +#define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e) +#define debug_elog6(a,b,c,d,e,f) elog(a,b,c,d,e,f) +#else +#define debug_elog2(a,b) +#define debug_elog3(a,b,c) +#define debug_elog4(a,b,c,d) +#define debug_elog5(a,b,c,d,e) +#define debug_elog6(a,b,c,d,e,f) +#endif + +/* internal MultiXactId management */ +static void MultiXactIdSetOldestVisible(void); +static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, + int nmembers, MultiXactMember *members); +static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset); + +/* MultiXact cache management */ +static int mxactMemberComparator(const void *arg1, const void *arg2); +static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members); +static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members); +static void mXactCachePut(MultiXactId multi, int nmembers, + MultiXactMember *members); + +static char *mxstatus_to_string(MultiXactStatus status); + +/* management of SLRU infrastructure */ +static int ZeroMultiXactOffsetPage(int pageno, bool writeXlog); +static int ZeroMultiXactMemberPage(int pageno, bool writeXlog); +static bool MultiXactOffsetPagePrecedes(int page1, int page2); +static bool MultiXactMemberPagePrecedes(int page1, int page2); +static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, + MultiXactOffset offset2); +static void ExtendMultiXactOffset(MultiXactId multi); +static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers); +static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, + MultiXactOffset start, uint32 distance); +static bool SetOffsetVacuumLimit(bool is_startup); +static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result); +static void WriteMZeroPageXlogRec(int pageno, uint8 info); +static void WriteMTruncateXlogRec(Oid oldestMultiDB, + MultiXactId startTruncOff, + MultiXactId endTruncOff, + MultiXactOffset startTruncMemb, + MultiXactOffset endTruncMemb); + + +/* + * MultiXactIdCreate + * Construct a MultiXactId representing two TransactionIds. + * + * The two XIDs must be different, or be requesting different statuses. + * + * NB - we don't worry about our local MultiXactId cache here, because that + * is handled by the lower-level routines. + */ +MultiXactId +MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, + TransactionId xid2, MultiXactStatus status2) +{ + MultiXactId newMulti; + MultiXactMember members[2]; + + AssertArg(TransactionIdIsValid(xid1)); + AssertArg(TransactionIdIsValid(xid2)); + + Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2)); + + /* MultiXactIdSetOldestMember() must have been called already. */ + Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])); + + /* + * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs + * are still running. 
In typical usage, xid2 will be our own XID and the + * caller just did a check on xid1, so it'd be wasted effort. + */ + + members[0].xid = xid1; + members[0].status = status1; + members[1].xid = xid2; + members[1].status = status2; + + newMulti = MultiXactIdCreateFromMembers(2, members); + + debug_elog3(DEBUG2, "Create: %s", + mxid_to_string(newMulti, 2, members)); + + return newMulti; +} + +/* + * MultiXactIdExpand + * Add a TransactionId to a pre-existing MultiXactId. + * + * If the TransactionId is already a member of the passed MultiXactId with the + * same status, just return it as-is. + * + * Note that we do NOT actually modify the membership of a pre-existing + * MultiXactId; instead we create a new one. This is necessary to avoid + * a race condition against code trying to wait for one MultiXactId to finish; + * see notes in heapam.c. + * + * NB - we don't worry about our local MultiXactId cache here, because that + * is handled by the lower-level routines. + * + * Note: It is critical that MultiXactIds that come from an old cluster (i.e. + * one upgraded by pg_upgrade from a cluster older than this feature) are not + * passed in. + */ +MultiXactId +MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status) +{ + MultiXactId newMulti; + MultiXactMember *members; + MultiXactMember *newMembers; + int nmembers; + int i; + int j; + + AssertArg(MultiXactIdIsValid(multi)); + AssertArg(TransactionIdIsValid(xid)); + + /* MultiXactIdSetOldestMember() must have been called already. */ + Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])); + + debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s", + multi, xid, mxstatus_to_string(status)); + + /* + * Note: we don't allow for old multis here. The reason is that the only + * caller of this function does a check that the multixact is no longer + * running. + */ + nmembers = GetMultiXactIdMembers(multi, &members, false, false); + + if (nmembers < 0) + { + MultiXactMember member; + + /* + * The MultiXactId is obsolete. This can only happen if all the + * MultiXactId members stop running between the caller checking and + * passing it to us. It would be better to return that fact to the + * caller, but it would complicate the API and it's unlikely to happen + * too often, so just deal with it by creating a singleton MultiXact. + */ + member.xid = xid; + member.status = status; + newMulti = MultiXactIdCreateFromMembers(1, &member); + + debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u", + multi, newMulti); + return newMulti; + } + + /* + * If the TransactionId is already a member of the MultiXactId with the + * same status, just return the existing MultiXactId. + */ + for (i = 0; i < nmembers; i++) + { + if (TransactionIdEquals(members[i].xid, xid) && + (members[i].status == status)) + { + debug_elog4(DEBUG2, "Expand: %u is already a member of %u", + xid, multi); + pfree(members); + return multi; + } + } + + /* + * Determine which of the members of the MultiXactId are still of + * interest. This is any running transaction, and also any transaction + * that grabbed something stronger than just a lock and was committed. (An + * update that aborted is of no interest here; and having more than one + * update Xid in a multixact would cause errors elsewhere.) + * + * Removing dead members is not just an optimization: freezing of tuples + * whose Xmax are multis depends on this behavior. + * + * Note we have the same race condition here as above: j could be 0 at the + * end of the loop. 
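+ *
+ * Hypothetical example of the filtering below: given members
+ * {10 key-share (committed), 11 update (aborted), 12 key-share (still
+ * running)} plus new xid 13, members 10 and 11 are dropped (a finished
+ * locker and an aborted updater are both irrelevant) and the new multi
+ * contains just {12, 13}.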
+ */ + newMembers = (MultiXactMember *) + palloc(sizeof(MultiXactMember) * (nmembers + 1)); + + for (i = 0, j = 0; i < nmembers; i++) + { + if (TransactionIdIsInProgress(members[i].xid) || + (ISUPDATE_from_mxstatus(members[i].status) && + TransactionIdDidCommit(members[i].xid))) + { + newMembers[j].xid = members[i].xid; + newMembers[j++].status = members[i].status; + } + } + + newMembers[j].xid = xid; + newMembers[j++].status = status; + newMulti = MultiXactIdCreateFromMembers(j, newMembers); + + pfree(members); + pfree(newMembers); + + debug_elog3(DEBUG2, "Expand: returning new multi %u", newMulti); + + return newMulti; +} + +/* + * MultiXactIdIsRunning + * Returns whether a MultiXactId is "running". + * + * We return true if at least one member of the given MultiXactId is still + * running. Note that a "false" result is certain not to change, + * because it is not legal to add members to an existing MultiXactId. + * + * Caller is expected to have verified that the multixact does not come from + * a pg_upgraded share-locked tuple. + */ +bool +MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly) +{ + MultiXactMember *members; + int nmembers; + int i; + + debug_elog3(DEBUG2, "IsRunning %u?", multi); + + /* + * "false" here means we assume our callers have checked that the given + * multi cannot possibly come from a pg_upgraded database. + */ + nmembers = GetMultiXactIdMembers(multi, &members, false, isLockOnly); + + if (nmembers <= 0) + { + debug_elog2(DEBUG2, "IsRunning: no members"); + return false; + } + + /* + * Checking for myself is cheap compared to looking in shared memory; + * return true if any live subtransaction of the current top-level + * transaction is a member. + * + * This is not needed for correctness, it's just a fast path. + */ + for (i = 0; i < nmembers; i++) + { + if (TransactionIdIsCurrentTransactionId(members[i].xid)) + { + debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i); + pfree(members); + return true; + } + } + + /* + * This could be made faster by having another entry point in procarray.c, + * walking the PGPROC array only once for all the members. But in most + * cases nmembers should be small enough that it doesn't much matter. + */ + for (i = 0; i < nmembers; i++) + { + if (TransactionIdIsInProgress(members[i].xid)) + { + debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running", + i, members[i].xid); + pfree(members); + return true; + } + } + + pfree(members); + + debug_elog3(DEBUG2, "IsRunning: %u is not running", multi); + + return false; +} + +/* + * MultiXactIdSetOldestMember + * Save the oldest MultiXactId this transaction could be a member of. + * + * We set the OldestMemberMXactId for a given transaction the first time it's + * going to do some operation that might require a MultiXactId (tuple lock, + * update or delete). We need to do this even if we end up using a + * TransactionId instead of a MultiXactId, because there is a chance that + * another transaction would add our XID to a MultiXactId. + * + * The value to set is the next-to-be-assigned MultiXactId, so this is meant to + * be called just before doing any such possibly-MultiXactId-able operation. + */ +void +MultiXactIdSetOldestMember(void) +{ + if (!MultiXactIdIsValid(OldestMemberMXactId[MyBackendId])) + { + MultiXactId nextMXact; + + /* + * You might think we don't need to acquire a lock here, since + * fetching and storing of TransactionIds is probably atomic, but in + * fact we do: suppose we pick up nextMXact and then lose the CPU for + * a long time. 
Someone else could advance nextMXact, and then + * another someone else could compute an OldestVisibleMXactId that + * would be after the value we are going to store when we get control + * back. Which would be wrong. + * + * Note that a shared lock is sufficient, because it's enough to stop + * someone from advancing nextMXact; and nobody else could be trying + * to write to our OldestMember entry, only reading (and we assume + * storing it is atomic.) + */ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + + /* + * We have to beware of the possibility that nextMXact is in the + * wrapped-around state. We don't fix the counter itself here, but we + * must be sure to store a valid value in our array entry. + */ + nextMXact = MultiXactState->nextMXact; + if (nextMXact < FirstMultiXactId) + nextMXact = FirstMultiXactId; + + OldestMemberMXactId[MyBackendId] = nextMXact; + + LWLockRelease(MultiXactGenLock); + + debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u", + MyBackendId, nextMXact); + } +} + +/* + * MultiXactIdSetOldestVisible + * Save the oldest MultiXactId this transaction considers possibly live. + * + * We set the OldestVisibleMXactId for a given transaction the first time + * it's going to inspect any MultiXactId. Once we have set this, we are + * guaranteed that the checkpointer won't truncate off SLRU data for + * MultiXactIds at or after our OldestVisibleMXactId. + * + * The value to set is the oldest of nextMXact and all the valid per-backend + * OldestMemberMXactId[] entries. Because of the locking we do, we can be + * certain that no subsequent call to MultiXactIdSetOldestMember can set + * an OldestMemberMXactId[] entry older than what we compute here. Therefore + * there is no live transaction, now or later, that can be a member of any + * MultiXactId older than the OldestVisibleMXactId we compute here. + */ +static void +MultiXactIdSetOldestVisible(void) +{ + if (!MultiXactIdIsValid(OldestVisibleMXactId[MyBackendId])) + { + MultiXactId oldestMXact; + int i; + + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + + /* + * We have to beware of the possibility that nextMXact is in the + * wrapped-around state. We don't fix the counter itself here, but we + * must be sure to store a valid value in our array entry. + */ + oldestMXact = MultiXactState->nextMXact; + if (oldestMXact < FirstMultiXactId) + oldestMXact = FirstMultiXactId; + + for (i = 1; i <= MaxOldestSlot; i++) + { + MultiXactId thisoldest = OldestMemberMXactId[i]; + + if (MultiXactIdIsValid(thisoldest) && + MultiXactIdPrecedes(thisoldest, oldestMXact)) + oldestMXact = thisoldest; + } + + OldestVisibleMXactId[MyBackendId] = oldestMXact; + + LWLockRelease(MultiXactGenLock); + + debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u", + MyBackendId, oldestMXact); + } +} + +/* + * ReadNextMultiXactId + * Return the next MultiXactId to be assigned, but don't allocate it + */ +MultiXactId +ReadNextMultiXactId(void) +{ + MultiXactId mxid; + + /* XXX we could presumably do this without a lock. */ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + mxid = MultiXactState->nextMXact; + LWLockRelease(MultiXactGenLock); + + if (mxid < FirstMultiXactId) + mxid = FirstMultiXactId; + + return mxid; +} + +/* + * ReadMultiXactIdRange + * Get the range of IDs that may still be referenced by a relation. 
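+ *
+ * A hypothetical caller could use it to decide whether a multi is still
+ * safe to look up:
+ *
+ *		MultiXactId oldest, next;
+ *
+ *		ReadMultiXactIdRange(&oldest, &next);
+ *		if (MultiXactIdPrecedes(multi, oldest) ||
+ *			!MultiXactIdPrecedes(multi, next))
+ *			elog(ERROR, "multi is outside the [oldest, next) range");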
+ */ +void +ReadMultiXactIdRange(MultiXactId *oldest, MultiXactId *next) +{ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + *oldest = MultiXactState->oldestMultiXactId; + *next = MultiXactState->nextMXact; + LWLockRelease(MultiXactGenLock); + + if (*oldest < FirstMultiXactId) + *oldest = FirstMultiXactId; + if (*next < FirstMultiXactId) + *next = FirstMultiXactId; +} + + +/* + * MultiXactIdCreateFromMembers + * Make a new MultiXactId from the specified set of members + * + * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the + * given TransactionIds as members. Returns the newly created MultiXactId. + * + * NB: the passed members[] array will be sorted in-place. + */ +MultiXactId +MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members) +{ + MultiXactId multi; + MultiXactOffset offset; + xl_multixact_create xlrec; + + debug_elog3(DEBUG2, "Create: %s", + mxid_to_string(InvalidMultiXactId, nmembers, members)); + + /* + * See if the same set of members already exists in our cache; if so, just + * re-use that MultiXactId. (Note: it might seem that looking in our + * cache is insufficient, and we ought to search disk to see if a + * duplicate definition already exists. But since we only ever create + * MultiXacts containing our own XID, in most cases any such MultiXacts + * were in fact created by us, and so will be in our cache. There are + * corner cases where someone else added us to a MultiXact without our + * knowledge, but it's not worth checking for.) + */ + multi = mXactCacheGetBySet(nmembers, members); + if (MultiXactIdIsValid(multi)) + { + debug_elog2(DEBUG2, "Create: in cache!"); + return multi; + } + + /* Verify that there is a single update Xid among the given members. */ + { + int i; + bool has_update = false; + + for (i = 0; i < nmembers; i++) + { + if (ISUPDATE_from_mxstatus(members[i].status)) + { + if (has_update) + elog(ERROR, "new multixact has more than one updating member: %s", + mxid_to_string(InvalidMultiXactId, nmembers, members)); + has_update = true; + } + } + } + + /* + * Assign the MXID and offsets range to use, and make sure there is space + * in the OFFSETs and MEMBERs files. NB: this routine does + * START_CRIT_SECTION(). + * + * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check + * that we've called MultiXactIdSetOldestMember here. This is because + * this routine is used in some places to create new MultiXactIds of which + * the current backend is not a member, notably during freezing of multis + * in vacuum. During vacuum, in particular, it would be unacceptable to + * keep OldestMulti set, in case it runs for long. + */ + multi = GetNewMultiXactId(nmembers, &offset); + + /* Make an XLOG entry describing the new MXID. */ + xlrec.mid = multi; + xlrec.moff = offset; + xlrec.nmembers = nmembers; + + /* + * XXX Note: there's a lot of padding space in MultiXactMember. We could + * find a more compact representation of this Xlog record -- perhaps all + * the status flags in one XLogRecData, then all the xids in another one? + * Not clear that it's worth the trouble though. 
+ */ + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), SizeOfMultiXactCreate); + XLogRegisterData((char *) members, nmembers * sizeof(MultiXactMember)); + + (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID); + + /* Now enter the information into the OFFSETs and MEMBERs logs */ + RecordNewMultiXact(multi, offset, nmembers, members); + + /* Done with critical section */ + END_CRIT_SECTION(); + + /* Store the new MultiXactId in the local cache, too */ + mXactCachePut(multi, nmembers, members); + + debug_elog2(DEBUG2, "Create: all done"); + + return multi; +} + +/* + * RecordNewMultiXact + * Write info about a new multixact into the offsets and members files + * + * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can + * use it. + */ +static void +RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, + int nmembers, MultiXactMember *members) +{ + int pageno; + int prev_pageno; + int entryno; + int slotno; + MultiXactOffset *offptr; + int i; + + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + pageno = MultiXactIdToOffsetPage(multi); + entryno = MultiXactIdToOffsetEntry(multi); + + /* + * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction" + * to complain about if there's any I/O error. This is kinda bogus, but + * since the errors will always give the full pathname, it should be clear + * enough that a MultiXactId is really involved. Perhaps someday we'll + * take the trouble to generalize the slru.c error reporting code. + */ + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi); + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr += entryno; + + *offptr = offset; + + MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + + /* Exchange our lock */ + LWLockRelease(MultiXactOffsetSLRULock); + + LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + + prev_pageno = -1; + + for (i = 0; i < nmembers; i++, offset++) + { + TransactionId *memberptr; + uint32 *flagsptr; + uint32 flagsval; + int bshift; + int flagsoff; + int memberoff; + + Assert(members[i].status <= MultiXactStatusUpdate); + + pageno = MXOffsetToMemberPage(offset); + memberoff = MXOffsetToMemberOffset(offset); + flagsoff = MXOffsetToFlagsOffset(offset); + bshift = MXOffsetToFlagsBitShift(offset); + + if (pageno != prev_pageno) + { + slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi); + prev_pageno = pageno; + } + + memberptr = (TransactionId *) + (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); + + *memberptr = members[i].xid; + + flagsptr = (uint32 *) + (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); + + flagsval = *flagsptr; + flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); + flagsval |= (members[i].status << bshift); + *flagsptr = flagsval; + + MultiXactMemberCtl->shared->page_dirty[slotno] = true; + } + + LWLockRelease(MultiXactMemberSLRULock); +} + +/* + * GetNewMultiXactId + * Get the next MultiXactId. + * + * Also, reserve the needed amount of space in the "members" area. The + * starting offset of the reserved space is returned in *offset. + * + * This may generate XLOG records for expansion of the offsets and/or members + * files. Unfortunately, we have to do that while holding MultiXactGenLock + * to avoid race conditions --- the XLOG record for zeroing a page must appear + * before any backend can possibly try to store data in that page! + * + * We start a critical section before advancing the shared counters. 
The + * caller must end the critical section after writing SLRU data. + */ +static MultiXactId +GetNewMultiXactId(int nmembers, MultiXactOffset *offset) +{ + MultiXactId result; + MultiXactOffset nextOffset; + + debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers); + + /* safety check, we should never get this far in a HS standby */ + if (RecoveryInProgress()) + elog(ERROR, "cannot assign MultiXactIds during recovery"); + + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + + /* Handle wraparound of the nextMXact counter */ + if (MultiXactState->nextMXact < FirstMultiXactId) + MultiXactState->nextMXact = FirstMultiXactId; + + /* Assign the MXID */ + result = MultiXactState->nextMXact; + + /*---------- + * Check to see if it's safe to assign another MultiXactId. This protects + * against catastrophic data loss due to multixact wraparound. The basic + * rules are: + * + * If we're past multiVacLimit or the safe threshold for member storage + * space, or we don't know what the safe threshold for member storage is, + * start trying to force autovacuum cycles. + * If we're past multiWarnLimit, start issuing warnings. + * If we're past multiStopLimit, refuse to create new MultiXactIds. + * + * Note these are pretty much the same protections in GetNewTransactionId. + *---------- + */ + if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit)) + { + /* + * For safety's sake, we release MultiXactGenLock while sending + * signals, warnings, etc. This is not so much because we care about + * preserving concurrency in this situation, as to avoid any + * possibility of deadlock while doing get_database_name(). First, + * copy all the shared values we'll need in this path. + */ + MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit; + MultiXactId multiStopLimit = MultiXactState->multiStopLimit; + MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit; + Oid oldest_datoid = MultiXactState->oldestMultiXactDB; + + LWLockRelease(MultiXactGenLock); + + if (IsUnderPostmaster && + !MultiXactIdPrecedes(result, multiStopLimit)) + { + char *oldest_datname = get_database_name(oldest_datoid); + + /* + * Immediately kick autovacuum into action as we're already in + * ERROR territory. + */ + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + /* complain even if that DB has disappeared */ + if (oldest_datname) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database \"%s\"", + oldest_datname), + errhint("Execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + else + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database with OID %u", + oldest_datoid), + errhint("Execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + } + + /* + * To avoid swamping the postmaster with signals, we issue the autovac + * request only once per 64K multis generated. This still gives + * plenty of chances before we get into real trouble. 
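+ * + * Concretely, the modulo test below fires only when the multi being + * assigned is an exact multiple of 65536, e.g. at 65536, 131072, and so + * on; all other allocations in between skip the signal entirely.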
+ */ + if (IsUnderPostmaster && (result % 65536) == 0) + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + if (!MultiXactIdPrecedes(result, multiWarnLimit)) + { + char *oldest_datname = get_database_name(oldest_datoid); + + /* complain even if that DB has disappeared */ + if (oldest_datname) + ereport(WARNING, + (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used", + "database \"%s\" must be vacuumed before %u more MultiXactIds are used", + multiWrapLimit - result, + oldest_datname, + multiWrapLimit - result), + errhint("Execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + else + ereport(WARNING, + (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used", + "database with OID %u must be vacuumed before %u more MultiXactIds are used", + multiWrapLimit - result, + oldest_datoid, + multiWrapLimit - result), + errhint("Execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + } + + /* Re-acquire lock and start over */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + result = MultiXactState->nextMXact; + if (result < FirstMultiXactId) + result = FirstMultiXactId; + } + + /* Make sure there is room for the MXID in the file. */ + ExtendMultiXactOffset(result); + + /* + * Reserve the members space, similarly to above. Also, be careful not to + * return zero as the starting offset for any multixact. See + * GetMultiXactIdMembers() for motivation. + */ + nextOffset = MultiXactState->nextOffset; + if (nextOffset == 0) + { + *offset = 1; + nmembers++; /* allocate member slot 0 too */ + } + else + *offset = nextOffset; + + /*---------- + * Protect against overrun of the members space as well, with the + * following rules: + * + * If we're past offsetStopLimit, refuse to generate more multis. + * If we're close to offsetStopLimit, emit a warning. + * + * Arbitrarily, we start emitting warnings when we're 20 segments or less + * from offsetStopLimit. + * + * Note we haven't updated the shared state yet, so if we fail at this + * point, the multixact ID we grabbed can still be used by the next guy. + * + * Note that there is no point in forcing autovacuum runs here: the + * multixact freeze settings would have to be reduced for that to have any + * effect. 
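+ * + * As a rough worked figure (leaning on the roughly-50000-members-per-segment + * estimate quoted for the autovacuum check further down), the 20-segment + * warning window corresponds to on the order of a million member slots of + * headroom between the first warning and the hard stop.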
+ *---------- + */ +#define OFFSET_WARN_SEGMENTS 20 + if (MultiXactState->oldestOffsetKnown && + MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset, + nmembers)) + { + /* see comment in the corresponding offsets wraparound case */ + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("multixact \"members\" limit exceeded"), + errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.", + "This command would create a multixact with %u members, but the remaining space is only enough for %u members.", + MultiXactState->offsetStopLimit - nextOffset - 1, + nmembers, + MultiXactState->offsetStopLimit - nextOffset - 1), + errhint("Execute a database-wide VACUUM in database with OID %u with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings.", + MultiXactState->oldestMultiXactDB))); + } + + /* + * Check whether we should kick autovacuum into action, to prevent members + * wraparound. NB we use a much larger window to trigger autovacuum than + * just the warning limit. The warning is just a measure of last resort - + * this is in line with GetNewTransactionId's behaviour. + */ + if (!MultiXactState->oldestOffsetKnown || + (MultiXactState->nextOffset - MultiXactState->oldestOffset + > MULTIXACT_MEMBER_SAFE_THRESHOLD)) + { + /* + * To avoid swamping the postmaster with signals, we issue the autovac + * request only when crossing a segment boundary. With default + * compilation settings that's roughly after 50k members. This still + * gives plenty of chances before we get into real trouble. + */ + if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) != + (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT)) + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + } + + if (MultiXactState->oldestOffsetKnown && + MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, + nextOffset, + nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS)) + ereport(WARNING, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used", + "database with OID %u must be vacuumed before %d more multixact members are used", + MultiXactState->offsetStopLimit - nextOffset + nmembers, + MultiXactState->oldestMultiXactDB, + MultiXactState->offsetStopLimit - nextOffset + nmembers), + errhint("Execute a database-wide VACUUM in that database with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings."))); + + ExtendMultiXactMember(nextOffset, nmembers); + + /* + * Critical section from here until caller has written the data into the + * just-reserved SLRU space; we don't want to error out with a partly + * written MultiXact structure. (In particular, failing to write our + * start offset after advancing nextMXact would effectively corrupt the + * previous MultiXact.) + */ + START_CRIT_SECTION(); + + /* + * Advance counters. As in GetNewTransactionId(), this must not happen + * until after file extension has succeeded! + * + * We don't care about MultiXactId wraparound here; it will be handled by + * the next iteration. But note that nextMXact may be InvalidMultiXactId + * or the first value on a segment-beginning page after this routine + * exits, so anyone else looking at the variable must be prepared to deal + * with either case. 
Similarly, nextOffset may be zero, but we won't use + * that as the actual start offset of the next multixact. + */ + (MultiXactState->nextMXact)++; + + MultiXactState->nextOffset += nmembers; + + LWLockRelease(MultiXactGenLock); + + debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset); + return result; +} + +/* + * GetMultiXactIdMembers + * Return the set of MultiXactMembers that make up a MultiXactId + * + * Return value is the number of members found, or -1 if there are none, + * and *members is set to a newly palloc'ed array of members. It's the + * caller's responsibility to free it when done with it. + * + * from_pgupgrade must be passed as true if and only if only the multixact + * corresponds to a value from a tuple that was locked in a 9.2-or-older + * installation and later pg_upgrade'd (that is, the infomask is + * HEAP_LOCKED_UPGRADED). In this case, we know for certain that no members + * can still be running, so we return -1 just like for an empty multixact + * without any further checking. It would be wrong to try to resolve such a + * multixact: either the multixact is within the current valid multixact + * range, in which case the returned result would be bogus, or outside that + * range, in which case an error would be raised. + * + * In all other cases, the passed multixact must be within the known valid + * range, that is, greater to or equal than oldestMultiXactId, and less than + * nextMXact. Otherwise, an error is raised. + * + * onlyLock must be set to true if caller is certain that the given multi + * is used only to lock tuples; can be false without loss of correctness, + * but passing a true means we can return quickly without checking for + * old updates. + */ +int +GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, + bool from_pgupgrade, bool onlyLock) +{ + int pageno; + int prev_pageno; + int entryno; + int slotno; + MultiXactOffset *offptr; + MultiXactOffset offset; + int length; + int truelength; + int i; + MultiXactId oldestMXact; + MultiXactId nextMXact; + MultiXactId tmpMXact; + MultiXactOffset nextOffset; + MultiXactMember *ptr; + + debug_elog3(DEBUG2, "GetMembers: asked for %u", multi); + + if (!MultiXactIdIsValid(multi) || from_pgupgrade) + { + *members = NULL; + return -1; + } + + /* See if the MultiXactId is in the local cache */ + length = mXactCacheGetById(multi, members); + if (length >= 0) + { + debug_elog3(DEBUG2, "GetMembers: found %s in the cache", + mxid_to_string(multi, length, *members)); + return length; + } + + /* Set our OldestVisibleMXactId[] entry if we didn't already */ + MultiXactIdSetOldestVisible(); + + /* + * If we know the multi is used only for locking and not for updates, then + * we can skip checking if the value is older than our oldest visible + * multi. It cannot possibly still be running. + */ + if (onlyLock && + MultiXactIdPrecedes(multi, OldestVisibleMXactId[MyBackendId])) + { + debug_elog2(DEBUG2, "GetMembers: a locker-only multi is too old"); + *members = NULL; + return -1; + } + + /* + * We check known limits on MultiXact before resorting to the SLRU area. + * + * An ID older than MultiXactState->oldestMultiXactId cannot possibly be + * useful; it has already been removed, or will be removed shortly, by + * truncation. If one is passed, an error is raised. + * + * Also, an ID >= nextMXact shouldn't ever be seen here; if it is seen, it + * implies undetected ID wraparound has occurred. This raises a hard + * error. 
+ * + * Shared lock is enough here since we aren't modifying any global state. + * Acquire it just long enough to grab the current counter values. We may + * need both nextMXact and nextOffset; see below. + */ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + + oldestMXact = MultiXactState->oldestMultiXactId; + nextMXact = MultiXactState->nextMXact; + nextOffset = MultiXactState->nextOffset; + + LWLockRelease(MultiXactGenLock); + + if (MultiXactIdPrecedes(multi, oldestMXact)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("MultiXactId %u no longer exists -- apparent wraparound", + multi))); + + if (!MultiXactIdPrecedes(multi, nextMXact)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("MultiXactId %u has not been created yet -- apparent wraparound", + multi))); + + /* + * Find out the offset at which we need to start reading MultiXactMembers + * and the number of members in the multixact. We determine the latter as + * the difference between this multixact's starting offset and the next + * one's. However, there are some corner cases to worry about: + * + * 1. This multixact may be the latest one created, in which case there is + * no next one to look at. In this case the nextOffset value we just + * saved is the correct endpoint. + * + * 2. The next multixact may still be in process of being filled in: that + * is, another process may have done GetNewMultiXactId but not yet written + * the offset entry for that ID. In that scenario, it is guaranteed that + * the offset entry for that multixact exists (because GetNewMultiXactId + * won't release MultiXactGenLock until it does) but contains zero + * (because we are careful to pre-zero offset pages). Because + * GetNewMultiXactId will never return zero as the starting offset for a + * multixact, when we read zero as the next multixact's offset, we know we + * have this case. We sleep for a bit and try again. + * + * 3. Because GetNewMultiXactId increments offset zero to offset one to + * handle case #2, there is an ambiguity near the point of offset + * wraparound. If we see next multixact's offset is one, is that our + * multixact's actual endpoint, or did it end at zero with a subsequent + * increment? We handle this using the knowledge that if the zero'th + * member slot wasn't filled, it'll contain zero, and zero isn't a valid + * transaction ID so it can't be a multixact member. Therefore, if we + * read a zero from the members array, just ignore it. + * + * This is all pretty messy, but the mess occurs only in infrequent corner + * cases, so it seems better than holding the MultiXactGenLock for a long + * time on every multixact creation. + */ +retry: + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + pageno = MultiXactIdToOffsetPage(multi); + entryno = MultiXactIdToOffsetEntry(multi); + + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi); + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr += entryno; + offset = *offptr; + + Assert(offset != 0); + + /* + * Use the same increment rule as GetNewMultiXactId(), that is, don't + * handle wraparound explicitly until needed.
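+ * + * Illustrative numbers only: if this multixact's offset entry reads 500 + * and the next multixact's entry reads 503, there are 503 - 500 = 3 + * members to fetch; if the next entry still reads zero, we are in corner + * case 2 above and simply retry.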
+ */ + tmpMXact = multi + 1; + + if (nextMXact == tmpMXact) + { + /* Corner case 1: there is no next multixact */ + length = nextOffset - offset; + } + else + { + MultiXactOffset nextMXOffset; + + /* handle wraparound if needed */ + if (tmpMXact < FirstMultiXactId) + tmpMXact = FirstMultiXactId; + + prev_pageno = pageno; + + pageno = MultiXactIdToOffsetPage(tmpMXact); + entryno = MultiXactIdToOffsetEntry(tmpMXact); + + if (pageno != prev_pageno) + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact); + + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr += entryno; + nextMXOffset = *offptr; + + if (nextMXOffset == 0) + { + /* Corner case 2: next multixact is still being filled in */ + LWLockRelease(MultiXactOffsetSLRULock); + CHECK_FOR_INTERRUPTS(); + pg_usleep(1000L); + goto retry; + } + + length = nextMXOffset - offset; + } + + LWLockRelease(MultiXactOffsetSLRULock); + + ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); + + /* Now get the members themselves. */ + LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + + truelength = 0; + prev_pageno = -1; + for (i = 0; i < length; i++, offset++) + { + TransactionId *xactptr; + uint32 *flagsptr; + int flagsoff; + int bshift; + int memberoff; + + pageno = MXOffsetToMemberPage(offset); + memberoff = MXOffsetToMemberOffset(offset); + + if (pageno != prev_pageno) + { + slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi); + prev_pageno = pageno; + } + + xactptr = (TransactionId *) + (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); + + if (!TransactionIdIsValid(*xactptr)) + { + /* Corner case 3: we must be looking at unused slot zero */ + Assert(offset == 0); + continue; + } + + flagsoff = MXOffsetToFlagsOffset(offset); + bshift = MXOffsetToFlagsBitShift(offset); + flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); + + ptr[truelength].xid = *xactptr; + ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; + truelength++; + } + + LWLockRelease(MultiXactMemberSLRULock); + + /* A multixid with zero members should not happen */ + Assert(truelength > 0); + + /* + * Copy the result into the local cache. + */ + mXactCachePut(multi, truelength, ptr); + + debug_elog3(DEBUG2, "GetMembers: no cache for %s", + mxid_to_string(multi, truelength, ptr)); + *members = ptr; + return truelength; +} + +/* + * mxactMemberComparator + * qsort comparison function for MultiXactMember + * + * We can't use wraparound comparison for XIDs because that does not respect + * the triangle inequality! Any old sort order will do. + */ +static int +mxactMemberComparator(const void *arg1, const void *arg2) +{ + MultiXactMember member1 = *(const MultiXactMember *) arg1; + MultiXactMember member2 = *(const MultiXactMember *) arg2; + + if (member1.xid > member2.xid) + return 1; + if (member1.xid < member2.xid) + return -1; + if (member1.status > member2.status) + return 1; + if (member1.status < member2.status) + return -1; + return 0; +} + +/* + * mXactCacheGetBySet + * returns a MultiXactId from the cache based on the set of + * TransactionIds that compose it, or InvalidMultiXactId if + * none matches. + * + * This is helpful, for example, if two transactions want to lock a huge + * table. By using the cache, the second will use the same MultiXactId + * for the majority of tuples, thus keeping MultiXactId usage low (saving + * both I/O and wraparound issues). + * + * NB: the passed members array will be sorted in-place. 
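+ * + * Illustrative example (transaction ids invented): if transactions 1001 + * and 1002 both share-lock many rows of one table, the first row they both + * touch creates a multi with member set {1001, 1002}; for every further row + * the same sorted set is found here and the existing MultiXactId is reused + * instead of allocating a new one.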
+ */ +static MultiXactId +mXactCacheGetBySet(int nmembers, MultiXactMember *members) +{ + dlist_iter iter; + + debug_elog3(DEBUG2, "CacheGet: looking for %s", + mxid_to_string(InvalidMultiXactId, nmembers, members)); + + /* sort the array so comparison is easy */ + qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator); + + dlist_foreach(iter, &MXactCache) + { + mXactCacheEnt *entry = dlist_container(mXactCacheEnt, node, iter.cur); + + if (entry->nmembers != nmembers) + continue; + + /* + * We assume the cache entries are sorted, and that the unused bits in + * "status" are zeroed. + */ + if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0) + { + debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi); + dlist_move_head(&MXactCache, iter.cur); + return entry->multi; + } + } + + debug_elog2(DEBUG2, "CacheGet: not found :-("); + return InvalidMultiXactId; +} + +/* + * mXactCacheGetById + * returns the composing MultiXactMember set from the cache for a + * given MultiXactId, if present. + * + * If successful, *xids is set to the address of a palloc'd copy of the + * MultiXactMember set. Return value is number of members, or -1 on failure. + */ +static int +mXactCacheGetById(MultiXactId multi, MultiXactMember **members) +{ + dlist_iter iter; + + debug_elog3(DEBUG2, "CacheGet: looking for %u", multi); + + dlist_foreach(iter, &MXactCache) + { + mXactCacheEnt *entry = dlist_container(mXactCacheEnt, node, iter.cur); + + if (entry->multi == multi) + { + MultiXactMember *ptr; + Size size; + + size = sizeof(MultiXactMember) * entry->nmembers; + ptr = (MultiXactMember *) palloc(size); + + memcpy(ptr, entry->members, size); + + debug_elog3(DEBUG2, "CacheGet: found %s", + mxid_to_string(multi, + entry->nmembers, + entry->members)); + + /* + * Note we modify the list while not using a modifiable iterator. + * This is acceptable only because we exit the iteration + * immediately afterwards. + */ + dlist_move_head(&MXactCache, iter.cur); + + *members = ptr; + return entry->nmembers; + } + } + + debug_elog2(DEBUG2, "CacheGet: not found"); + return -1; +} + +/* + * mXactCachePut + * Add a new MultiXactId and its composing set into the local cache. 
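+ * + * The cache is kept in most-recently-used order: the new entry is pushed at + * the head of MXactCache, and once more than MAX_CACHE_ENTRIES entries exist + * the tail (least recently used) entry is pruned, as done at the end of this + * function.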
+ */ +static void +mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members) +{ + mXactCacheEnt *entry; + + debug_elog3(DEBUG2, "CachePut: storing %s", + mxid_to_string(multi, nmembers, members)); + + if (MXactContext == NULL) + { + /* The cache only lives as long as the current transaction */ + debug_elog2(DEBUG2, "CachePut: initializing memory context"); + MXactContext = AllocSetContextCreate(TopTransactionContext, + "MultiXact cache context", + ALLOCSET_SMALL_SIZES); + } + + entry = (mXactCacheEnt *) + MemoryContextAlloc(MXactContext, + offsetof(mXactCacheEnt, members) + + nmembers * sizeof(MultiXactMember)); + + entry->multi = multi; + entry->nmembers = nmembers; + memcpy(entry->members, members, nmembers * sizeof(MultiXactMember)); + + /* mXactCacheGetBySet assumes the entries are sorted, so sort them */ + qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator); + + dlist_push_head(&MXactCache, &entry->node); + if (MXactCacheMembers++ >= MAX_CACHE_ENTRIES) + { + dlist_node *node; + mXactCacheEnt *entry; + + node = dlist_tail_node(&MXactCache); + dlist_delete(node); + MXactCacheMembers--; + + entry = dlist_container(mXactCacheEnt, node, node); + debug_elog3(DEBUG2, "CachePut: pruning cached multi %u", + entry->multi); + + pfree(entry); + } +} + +static char * +mxstatus_to_string(MultiXactStatus status) +{ + switch (status) + { + case MultiXactStatusForKeyShare: + return "keysh"; + case MultiXactStatusForShare: + return "sh"; + case MultiXactStatusForNoKeyUpdate: + return "fornokeyupd"; + case MultiXactStatusForUpdate: + return "forupd"; + case MultiXactStatusNoKeyUpdate: + return "nokeyupd"; + case MultiXactStatusUpdate: + return "upd"; + default: + elog(ERROR, "unrecognized multixact status %d", status); + return ""; + } +} + +char * +mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members) +{ + static char *str = NULL; + StringInfoData buf; + int i; + + if (str != NULL) + pfree(str); + + initStringInfo(&buf); + + appendStringInfo(&buf, "%u %d[%u (%s)", multi, nmembers, members[0].xid, + mxstatus_to_string(members[0].status)); + + for (i = 1; i < nmembers; i++) + appendStringInfo(&buf, ", %u (%s)", members[i].xid, + mxstatus_to_string(members[i].status)); + + appendStringInfoChar(&buf, ']'); + str = MemoryContextStrdup(TopMemoryContext, buf.data); + pfree(buf.data); + return str; +} + +/* + * AtEOXact_MultiXact + * Handle transaction end for MultiXact + * + * This is called at top transaction commit or abort (we don't care which). + */ +void +AtEOXact_MultiXact(void) +{ + /* + * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of + * which should only be valid while within a transaction. + * + * We assume that storing a MultiXactId is atomic and so we need not take + * MultiXactGenLock to do this. + */ + OldestMemberMXactId[MyBackendId] = InvalidMultiXactId; + OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId; + + /* + * Discard the local MultiXactId cache. Since MXactContext was created as + * a child of TopTransactionContext, we needn't delete it explicitly. + */ + MXactContext = NULL; + dlist_init(&MXactCache); + MXactCacheMembers = 0; +} + +/* + * AtPrepare_MultiXact + * Save multixact state at 2PC transaction prepare + * + * In this phase, we only store our OldestMemberMXactId value in the two-phase + * state file. 
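+ * + * Sketch of what is written below: a TWOPHASE_RM_MULTIXACT_ID record with + * info 0 whose payload is the single MultiXactId, sizeof(MultiXactId) bytes; + * nothing is written at all if this backend never set an oldest member + * value.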
+ */ +void +AtPrepare_MultiXact(void) +{ + MultiXactId myOldestMember = OldestMemberMXactId[MyBackendId]; + + if (MultiXactIdIsValid(myOldestMember)) + RegisterTwoPhaseRecord(TWOPHASE_RM_MULTIXACT_ID, 0, + &myOldestMember, sizeof(MultiXactId)); +} + +/* + * PostPrepare_MultiXact + * Clean up after successful PREPARE TRANSACTION + */ +void +PostPrepare_MultiXact(TransactionId xid) +{ + MultiXactId myOldestMember; + + /* + * Transfer our OldestMemberMXactId value to the slot reserved for the + * prepared transaction. + */ + myOldestMember = OldestMemberMXactId[MyBackendId]; + if (MultiXactIdIsValid(myOldestMember)) + { + BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, false); + + /* + * Even though storing MultiXactId is atomic, acquire lock to make + * sure others see both changes, not just the reset of the slot of the + * current backend. Using a volatile pointer might suffice, but this + * isn't a hot spot. + */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + + OldestMemberMXactId[dummyBackendId] = myOldestMember; + OldestMemberMXactId[MyBackendId] = InvalidMultiXactId; + + LWLockRelease(MultiXactGenLock); + } + + /* + * We don't need to transfer OldestVisibleMXactId value, because the + * transaction is not going to be looking at any more multixacts once it's + * prepared. + * + * We assume that storing a MultiXactId is atomic and so we need not take + * MultiXactGenLock to do this. + */ + OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId; + + /* + * Discard the local MultiXactId cache like in AtEOXact_MultiXact. + */ + MXactContext = NULL; + dlist_init(&MXactCache); + MXactCacheMembers = 0; +} + +/* + * multixact_twophase_recover + * Recover the state of a prepared transaction at startup + */ +void +multixact_twophase_recover(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, false); + MultiXactId oldestMember; + + /* + * Get the oldest member XID from the state file record, and set it in the + * OldestMemberMXactId slot reserved for this prepared transaction. + */ + Assert(len == sizeof(MultiXactId)); + oldestMember = *((MultiXactId *) recdata); + + OldestMemberMXactId[dummyBackendId] = oldestMember; +} + +/* + * multixact_twophase_postcommit + * Similar to AtEOXact_MultiXact but for COMMIT PREPARED + */ +void +multixact_twophase_postcommit(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + BackendId dummyBackendId = TwoPhaseGetDummyBackendId(xid, true); + + Assert(len == sizeof(MultiXactId)); + + OldestMemberMXactId[dummyBackendId] = InvalidMultiXactId; +} + +/* + * multixact_twophase_postabort + * This is actually just the same as the COMMIT case. + */ +void +multixact_twophase_postabort(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + multixact_twophase_postcommit(xid, info, recdata, len); +} + +/* + * Initialization of shared memory for MultiXact. We use two SLRU areas, + * thus double memory. Also, reserve space for the shared MultiXactState + * struct and the per-backend MultiXactId arrays (two of those, too). 
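+ * + * A rough sketch of the total, matching SHARED_MULTIXACT_STATE_SIZE and the + * calls below: + * + * offsetof(MultiXactStateData, perBackendXactIds) + sizeof(MultiXactId) + * + 2 * MaxOldestSlot * sizeof(MultiXactId) + * + SimpleLruShmemSize(NUM_MULTIXACTOFFSET_BUFFERS, 0) + * + SimpleLruShmemSize(NUM_MULTIXACTMEMBER_BUFFERS, 0)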
+ */ +Size +MultiXactShmemSize(void) +{ + Size size; + + /* We need 2*MaxOldestSlot + 1 perBackendXactIds[] entries */ +#define SHARED_MULTIXACT_STATE_SIZE \ + add_size(offsetof(MultiXactStateData, perBackendXactIds) + sizeof(MultiXactId), \ + mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot)) + + size = SHARED_MULTIXACT_STATE_SIZE; + size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTOFFSET_BUFFERS, 0)); + size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTMEMBER_BUFFERS, 0)); + + return size; +} + +void +MultiXactShmemInit(void) +{ + bool found; + + debug_elog2(DEBUG2, "Shared Memory Init for MultiXact"); + + MultiXactOffsetCtl->PagePrecedes = MultiXactOffsetPagePrecedes; + MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes; + + SimpleLruInit(MultiXactOffsetCtl, + "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0, + MultiXactOffsetSLRULock, "pg_multixact/offsets", + LWTRANCHE_MULTIXACTOFFSET_BUFFER, + SYNC_HANDLER_MULTIXACT_OFFSET); + SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE); + SimpleLruInit(MultiXactMemberCtl, + "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0, + MultiXactMemberSLRULock, "pg_multixact/members", + LWTRANCHE_MULTIXACTMEMBER_BUFFER, + SYNC_HANDLER_MULTIXACT_MEMBER); + /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */ + + /* Initialize our shared state struct */ + MultiXactState = ShmemInitStruct("Shared MultiXact State", + SHARED_MULTIXACT_STATE_SIZE, + &found); + if (!IsUnderPostmaster) + { + Assert(!found); + + /* Make sure we zero out the per-backend state */ + MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE); + } + else + Assert(found); + + /* + * Set up array pointers. Note that perBackendXactIds[0] is wasted space + * since we only use indexes 1..MaxOldestSlot in each array. + */ + OldestMemberMXactId = MultiXactState->perBackendXactIds; + OldestVisibleMXactId = OldestMemberMXactId + MaxOldestSlot; +} + +/* + * This func must be called ONCE on system install. It creates the initial + * MultiXact segments. (The MultiXacts directories are assumed to have been + * created by initdb, and MultiXactShmemInit must have been called already.) + */ +void +BootStrapMultiXact(void) +{ + int slotno; + + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + /* Create and zero the first page of the offsets log */ + slotno = ZeroMultiXactOffsetPage(0, false); + + /* Make sure it's written out */ + SimpleLruWritePage(MultiXactOffsetCtl, slotno); + Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); + + LWLockRelease(MultiXactOffsetSLRULock); + + LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + + /* Create and zero the first page of the members log */ + slotno = ZeroMultiXactMemberPage(0, false); + + /* Make sure it's written out */ + SimpleLruWritePage(MultiXactMemberCtl, slotno); + Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); + + LWLockRelease(MultiXactMemberSLRULock); +} + +/* + * Initialize (or reinitialize) a page of MultiXactOffset to zeroes. + * If writeXlog is true, also emit an XLOG record saying we did this. + * + * The page is not actually written, just set up in shared memory. + * The slot number of the new page is returned. + * + * Control lock must be held at entry, and will be held at exit. 
+ */ +static int +ZeroMultiXactOffsetPage(int pageno, bool writeXlog) +{ + int slotno; + + slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); + + if (writeXlog) + WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE); + + return slotno; +} + +/* + * Ditto, for MultiXactMember + */ +static int +ZeroMultiXactMemberPage(int pageno, bool writeXlog) +{ + int slotno; + + slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno); + + if (writeXlog) + WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE); + + return slotno; +} + +/* + * MaybeExtendOffsetSlru + * Extend the offsets SLRU area, if necessary + * + * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might + * contain files that are shorter than necessary; this would occur if the old + * installation had used multixacts beyond the first page (files cannot be + * copied, because the on-disk representation is different). pg_upgrade would + * update pg_control to set the next offset value to be at that position, so + * that tuples marked as locked by such MultiXacts would be seen as visible + * without having to consult multixact. However, trying to create and use a + * new MultiXactId would result in an error because the page on which the new + * value would reside does not exist. This routine is in charge of creating + * such pages. + */ +static void +MaybeExtendOffsetSlru(void) +{ + int pageno; + + pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact); + + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) + { + int slotno; + + /* + * Fortunately for us, SimpleLruWritePage is already prepared to deal + * with creating a new segment file even if the page we're writing is + * not the first in it, so this is enough. + */ + slotno = ZeroMultiXactOffsetPage(pageno, false); + SimpleLruWritePage(MultiXactOffsetCtl, slotno); + } + + LWLockRelease(MultiXactOffsetSLRULock); +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup. + * + * StartupXLOG has already established nextMXact/nextOffset by calling + * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti + * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet + * replayed WAL. + */ +void +StartupMultiXact(void) +{ + MultiXactId multi = MultiXactState->nextMXact; + MultiXactOffset offset = MultiXactState->nextOffset; + int pageno; + + /* + * Initialize offset's idea of the latest page number. + */ + pageno = MultiXactIdToOffsetPage(multi); + MultiXactOffsetCtl->shared->latest_page_number = pageno; + + /* + * Initialize member's idea of the latest page number. + */ + pageno = MXOffsetToMemberPage(offset); + MultiXactMemberCtl->shared->latest_page_number = pageno; +} + +/* + * This must be called ONCE at the end of startup/recovery. + */ +void +TrimMultiXact(void) +{ + MultiXactId nextMXact; + MultiXactOffset offset; + MultiXactId oldestMXact; + Oid oldestMXactDB; + int pageno; + int entryno; + int flagsoff; + + LWLockAcquire(MultiXactGenLock, LW_SHARED); + nextMXact = MultiXactState->nextMXact; + offset = MultiXactState->nextOffset; + oldestMXact = MultiXactState->oldestMultiXactId; + oldestMXactDB = MultiXactState->oldestMultiXactDB; + LWLockRelease(MultiXactGenLock); + + /* Clean up offsets state */ + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + /* + * (Re-)Initialize our idea of the latest page number for offsets. 
+ */ + pageno = MultiXactIdToOffsetPage(nextMXact); + MultiXactOffsetCtl->shared->latest_page_number = pageno; + + /* + * Zero out the remainder of the current offsets page. See notes in + * TrimCLOG() for background. Unlike CLOG, some WAL record covers every + * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL + * rule "write xlog before data," nextMXact successors may carry obsolete, + * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers() + * operates normally. + */ + entryno = MultiXactIdToOffsetEntry(nextMXact); + if (entryno != 0) + { + int slotno; + MultiXactOffset *offptr; + + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact); + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr += entryno; + + MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset))); + + MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + } + + LWLockRelease(MultiXactOffsetSLRULock); + + /* And the same for members */ + LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + + /* + * (Re-)Initialize our idea of the latest page number for members. + */ + pageno = MXOffsetToMemberPage(offset); + MultiXactMemberCtl->shared->latest_page_number = pageno; + + /* + * Zero out the remainder of the current members page. See notes in + * TrimCLOG() for motivation. + */ + flagsoff = MXOffsetToFlagsOffset(offset); + if (flagsoff != 0) + { + int slotno; + TransactionId *xidptr; + int memberoff; + + memberoff = MXOffsetToMemberOffset(offset); + slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset); + xidptr = (TransactionId *) + (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); + + MemSet(xidptr, 0, BLCKSZ - memberoff); + + /* + * Note: we don't need to zero out the flag bits in the remaining + * members of the current group, because they are always reset before + * writing. + */ + + MultiXactMemberCtl->shared->page_dirty[slotno] = true; + } + + LWLockRelease(MultiXactMemberSLRULock); + + /* signal that we're officially up */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->finishedStartup = true; + LWLockRelease(MultiXactGenLock); + + /* Now compute how far away the next members wraparound is. */ + SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true); +} + +/* + * Get the MultiXact data to save in a checkpoint record + */ +void +MultiXactGetCheckptMulti(bool is_shutdown, + MultiXactId *nextMulti, + MultiXactOffset *nextMultiOffset, + MultiXactId *oldestMulti, + Oid *oldestMultiDB) +{ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + *nextMulti = MultiXactState->nextMXact; + *nextMultiOffset = MultiXactState->nextOffset; + *oldestMulti = MultiXactState->oldestMultiXactId; + *oldestMultiDB = MultiXactState->oldestMultiXactDB; + LWLockRelease(MultiXactGenLock); + + debug_elog6(DEBUG2, + "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u", + *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB); +} + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + */ +void +CheckPointMultiXact(void) +{ + TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true); + + /* + * Write dirty MultiXact pages to disk. This may result in sync requests + * queued for later handling by ProcessSyncRequests(), as part of the + * checkpoint. 
+ */ + SimpleLruWriteAll(MultiXactOffsetCtl, true); + SimpleLruWriteAll(MultiXactMemberCtl, true); + + TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true); +} + +/* + * Set the next-to-be-assigned MultiXactId and offset + * + * This is used when we can determine the correct next ID/offset exactly + * from a checkpoint record. Although this is only called during bootstrap + * and XLog replay, we take the lock in case any hot-standby backends are + * examining the values. + */ +void +MultiXactSetNextMXact(MultiXactId nextMulti, + MultiXactOffset nextMultiOffset) +{ + debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u", + nextMulti, nextMultiOffset); + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->nextMXact = nextMulti; + MultiXactState->nextOffset = nextMultiOffset; + LWLockRelease(MultiXactGenLock); + + /* + * During a binary upgrade, make sure that the offsets SLRU is large + * enough to contain the next value that would be created. + * + * We need to do this pretty early during the first startup in binary + * upgrade mode: before StartupMultiXact() in fact, because this routine + * is called even before that by StartupXLOG(). And we can't do it + * earlier than at this point, because during that first call of this + * routine we determine the MultiXactState->nextMXact value that + * MaybeExtendOffsetSlru needs. + */ + if (IsBinaryUpgrade) + MaybeExtendOffsetSlru(); +} + +/* + * Determine the last safe MultiXactId to allocate given the currently oldest + * datminmxid (ie, the oldest MultiXactId that might exist in any database + * of our cluster), and the OID of the (or a) database with that value. + * + * is_startup is true when we are just starting the cluster, false when we + * are updating state in a running cluster. This only affects log messages. + */ +void +SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, + bool is_startup) +{ + MultiXactId multiVacLimit; + MultiXactId multiWarnLimit; + MultiXactId multiStopLimit; + MultiXactId multiWrapLimit; + MultiXactId curMulti; + bool needs_offset_vacuum; + + Assert(MultiXactIdIsValid(oldest_datminmxid)); + + /* + * We pretend that a wrap will happen halfway through the multixact ID + * space, but that's not really true, because multixacts wrap differently + * from transaction IDs. Note that, separately from any concern about + * multixact IDs wrapping, we must ensure that multixact members do not + * wrap. Limits for that are set in SetOffsetVacuumLimit, not here. + */ + multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1); + if (multiWrapLimit < FirstMultiXactId) + multiWrapLimit += FirstMultiXactId; + + /* + * We'll refuse to continue assigning MultiXactIds once we get within 3M + * multi of data loss. See SetTransactionIdLimit. + */ + multiStopLimit = multiWrapLimit - 3000000; + if (multiStopLimit < FirstMultiXactId) + multiStopLimit -= FirstMultiXactId; + + /* + * We'll start complaining loudly when we get within 40M multis of data + * loss. This is kind of arbitrary, but if you let your gas gauge get + * down to 2% of full, would you be looking for the next gas station? We + * need to be fairly liberal about this number because there are lots of + * scenarios where most transactions are done by automatic clients that + * won't pay attention to warnings. (No, we're not gonna make this + * configurable. If you know enough to configure it, you know enough to + * not get in this kind of trouble in the first place.) 
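+ * + * Illustrative arithmetic, assuming oldest_datminmxid = 1000 and the default + * autovacuum_multixact_freeze_max_age of 400 million: multiWrapLimit lands + * near 2.15 billion, multiStopLimit 3 million below that, multiWarnLimit 40 + * million below that, while multiVacLimit (set below) is only about 400 + * million, so autovacuum pressure begins long before any warnings or errors + * appear.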
+ */ + multiWarnLimit = multiWrapLimit - 40000000; + if (multiWarnLimit < FirstMultiXactId) + multiWarnLimit -= FirstMultiXactId; + + /* + * We'll start trying to force autovacuums when oldest_datminmxid gets to + * be more than autovacuum_multixact_freeze_max_age mxids old. + * + * Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter + * so that we don't have to worry about dealing with on-the-fly changes in + * its value. See SetTransactionIdLimit. + */ + multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age; + if (multiVacLimit < FirstMultiXactId) + multiVacLimit += FirstMultiXactId; + + /* Grab lock for just long enough to set the new limit values */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->oldestMultiXactId = oldest_datminmxid; + MultiXactState->oldestMultiXactDB = oldest_datoid; + MultiXactState->multiVacLimit = multiVacLimit; + MultiXactState->multiWarnLimit = multiWarnLimit; + MultiXactState->multiStopLimit = multiStopLimit; + MultiXactState->multiWrapLimit = multiWrapLimit; + curMulti = MultiXactState->nextMXact; + LWLockRelease(MultiXactGenLock); + + /* Log the info */ + ereport(DEBUG1, + (errmsg_internal("MultiXactId wrap limit is %u, limited by database with OID %u", + multiWrapLimit, oldest_datoid))); + + /* + * Computing the actual limits is only possible once the data directory is + * in a consistent state. There's no need to compute the limits while + * still replaying WAL - no decisions about new multis are made even + * though multixact creations might be replayed. So we'll only do further + * checks after TrimMultiXact() has been called. + */ + if (!MultiXactState->finishedStartup) + return; + + Assert(!InRecovery); + + /* Set limits for offset vacuum. */ + needs_offset_vacuum = SetOffsetVacuumLimit(is_startup); + + /* + * If past the autovacuum force point, immediately signal an autovac + * request. The reason for this is that autovac only processes one + * database per invocation. Once it's finished cleaning up the oldest + * database, it'll call here, and we'll signal the postmaster to start + * another iteration immediately if there are still any old databases. + */ + if ((MultiXactIdPrecedes(multiVacLimit, curMulti) || + needs_offset_vacuum) && IsUnderPostmaster) + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + /* Give an immediate warning if past the wrap warn point */ + if (MultiXactIdPrecedes(multiWarnLimit, curMulti)) + { + char *oldest_datname; + + /* + * We can be called when not inside a transaction, for example during + * StartupXLOG(). In such a case we cannot do database access, so we + * must just report the oldest DB's OID. + * + * Note: it's also possible that get_database_name fails and returns + * NULL, for example because the database just got dropped. We'll + * still warn, even though the warning might now be unnecessary. 
+ */ + if (IsTransactionState()) + oldest_datname = get_database_name(oldest_datoid); + else + oldest_datname = NULL; + + if (oldest_datname) + ereport(WARNING, + (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used", + "database \"%s\" must be vacuumed before %u more MultiXactIds are used", + multiWrapLimit - curMulti, + oldest_datname, + multiWrapLimit - curMulti), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + else + ereport(WARNING, + (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used", + "database with OID %u must be vacuumed before %u more MultiXactIds are used", + multiWrapLimit - curMulti, + oldest_datoid, + multiWrapLimit - curMulti), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + } +} + +/* + * Ensure the next-to-be-assigned MultiXactId is at least minMulti, + * and similarly nextOffset is at least minMultiOffset. + * + * This is used when we can determine minimum safe values from an XLog + * record (either an on-line checkpoint or an mxact creation log entry). + * Although this is only called during XLog replay, we take the lock in case + * any hot-standby backends are examining the values. + */ +void +MultiXactAdvanceNextMXact(MultiXactId minMulti, + MultiXactOffset minMultiOffset) +{ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + if (MultiXactIdPrecedes(MultiXactState->nextMXact, minMulti)) + { + debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti); + MultiXactState->nextMXact = minMulti; + } + if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset)) + { + debug_elog3(DEBUG2, "MultiXact: setting next offset to %u", + minMultiOffset); + MultiXactState->nextOffset = minMultiOffset; + } + LWLockRelease(MultiXactGenLock); +} + +/* + * Update our oldestMultiXactId value, but only if it's more recent than what + * we had. + * + * This may only be called during WAL replay. + */ +void +MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB) +{ + Assert(InRecovery); + + if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti)) + SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false); +} + +/* + * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId. + * + * NB: this is called while holding MultiXactGenLock. We want it to be very + * fast most of the time; even when it's not so fast, no actual I/O need + * happen unless we're forced to write out a dirty log or xlog page to make + * room in shared memory. + */ +static void +ExtendMultiXactOffset(MultiXactId multi) +{ + int pageno; + + /* + * No work except at first MultiXactId of a page. But beware: just after + * wraparound, the first MultiXactId of page zero is FirstMultiXactId. + */ + if (MultiXactIdToOffsetEntry(multi) != 0 && + multi != FirstMultiXactId) + return; + + pageno = MultiXactIdToOffsetPage(multi); + + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + /* Zero the page and make an XLOG entry about it */ + ZeroMultiXactOffsetPage(pageno, true); + + LWLockRelease(MultiXactOffsetSLRULock); +} + +/* + * Make sure that MultiXactMember has room for the members of a newly- + * allocated MultiXactId. 
+ * + * Like the above routine, this is called while holding MultiXactGenLock; + * same comments apply. + */ +static void +ExtendMultiXactMember(MultiXactOffset offset, int nmembers) +{ + /* + * It's possible that the members span more than one page of the members + * file, so we loop to ensure we consider each page. The coding is not + * optimal if the members span several pages, but that seems unusual + * enough to not worry much about. + */ + while (nmembers > 0) + { + int flagsoff; + int flagsbit; + uint32 difference; + + /* + * Only zero when at first entry of a page. + */ + flagsoff = MXOffsetToFlagsOffset(offset); + flagsbit = MXOffsetToFlagsBitShift(offset); + if (flagsoff == 0 && flagsbit == 0) + { + int pageno; + + pageno = MXOffsetToMemberPage(offset); + + LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + + /* Zero the page and make an XLOG entry about it */ + ZeroMultiXactMemberPage(pageno, true); + + LWLockRelease(MultiXactMemberSLRULock); + } + + /* + * Compute the number of items till end of current page. Careful: if + * addition of unsigned ints wraps around, we're at the last page of + * the last segment; since that page holds a different number of items + * than other pages, we need to do it differently. + */ + if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset) + { + /* + * This is the last page of the last segment; we can compute the + * number of items left to allocate in it without modulo + * arithmetic. + */ + difference = MaxMultiXactOffset - offset + 1; + } + else + difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; + + /* + * Advance to next page, taking care to properly handle the wraparound + * case. OK if nmembers goes negative. + */ + nmembers -= difference; + offset += difference; + } +} + +/* + * GetOldestMultiXactId + * + * Return the oldest MultiXactId that's still possibly still seen as live by + * any running transaction. Older ones might still exist on disk, but they no + * longer have any running member transaction. + * + * It's not safe to truncate MultiXact SLRU segments on the value returned by + * this function; however, it can be used by a full-table vacuum to set the + * point at which it will be possible to truncate SLRU for that table. + */ +MultiXactId +GetOldestMultiXactId(void) +{ + MultiXactId oldestMXact; + MultiXactId nextMXact; + int i; + + /* + * This is the oldest valid value among all the OldestMemberMXactId[] and + * OldestVisibleMXactId[] entries, or nextMXact if none are valid. + */ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + + /* + * We have to beware of the possibility that nextMXact is in the + * wrapped-around state. We don't fix the counter itself here, but we + * must be sure to use a valid value in our calculation. + */ + nextMXact = MultiXactState->nextMXact; + if (nextMXact < FirstMultiXactId) + nextMXact = FirstMultiXactId; + + oldestMXact = nextMXact; + for (i = 1; i <= MaxOldestSlot; i++) + { + MultiXactId thisoldest; + + thisoldest = OldestMemberMXactId[i]; + if (MultiXactIdIsValid(thisoldest) && + MultiXactIdPrecedes(thisoldest, oldestMXact)) + oldestMXact = thisoldest; + thisoldest = OldestVisibleMXactId[i]; + if (MultiXactIdIsValid(thisoldest) && + MultiXactIdPrecedes(thisoldest, oldestMXact)) + oldestMXact = thisoldest; + } + + LWLockRelease(MultiXactGenLock); + + return oldestMXact; +} + +/* + * Determine how aggressively we need to vacuum in order to prevent member + * wraparound. 
+ * + * To do so determine what's the oldest member offset and install the limit + * info in MultiXactState, where it can be used to prevent overrun of old data + * in the members SLRU area. + * + * The return value is true if emergency autovacuum is required and false + * otherwise. + */ +static bool +SetOffsetVacuumLimit(bool is_startup) +{ + MultiXactId oldestMultiXactId; + MultiXactId nextMXact; + MultiXactOffset oldestOffset = 0; /* placate compiler */ + MultiXactOffset prevOldestOffset; + MultiXactOffset nextOffset; + bool oldestOffsetKnown = false; + bool prevOldestOffsetKnown; + MultiXactOffset offsetStopLimit = 0; + MultiXactOffset prevOffsetStopLimit; + + /* + * NB: Have to prevent concurrent truncation, we might otherwise try to + * lookup an oldestMulti that's concurrently getting truncated away. + */ + LWLockAcquire(MultiXactTruncationLock, LW_SHARED); + + /* Read relevant fields from shared memory. */ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + oldestMultiXactId = MultiXactState->oldestMultiXactId; + nextMXact = MultiXactState->nextMXact; + nextOffset = MultiXactState->nextOffset; + prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown; + prevOldestOffset = MultiXactState->oldestOffset; + prevOffsetStopLimit = MultiXactState->offsetStopLimit; + Assert(MultiXactState->finishedStartup); + LWLockRelease(MultiXactGenLock); + + /* + * Determine the offset of the oldest multixact. Normally, we can read + * the offset from the multixact itself, but there's an important special + * case: if there are no multixacts in existence at all, oldestMXact + * obviously can't point to one. It will instead point to the multixact + * ID that will be assigned the next time one is needed. + */ + if (oldestMultiXactId == nextMXact) + { + /* + * When the next multixact gets created, it will be stored at the next + * offset. + */ + oldestOffset = nextOffset; + oldestOffsetKnown = true; + } + else + { + /* + * Figure out where the oldest existing multixact's offsets are + * stored. Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X, + * the supposedly-earliest multixact might not really exist. We are + * careful not to fail in that case. + */ + oldestOffsetKnown = + find_multixact_start(oldestMultiXactId, &oldestOffset); + + if (oldestOffsetKnown) + ereport(DEBUG1, + (errmsg_internal("oldest MultiXactId member is at offset %u", + oldestOffset))); + else + ereport(LOG, + (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk", + oldestMultiXactId))); + } + + LWLockRelease(MultiXactTruncationLock); + + /* + * If we can, compute limits (and install them MultiXactState) to prevent + * overrun of old data in the members SLRU area. We can only do so if the + * oldest offset is known though. 
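+ * + * In rough terms, matching the code below, when the oldest offset is known: + * + * offsetStopLimit = (oldestOffset rounded down to a whole SLRU segment) + * - one full segment of members + * + * so new member space may be consumed up to, but never into, the segment + * immediately preceding the one that holds the oldest live member data.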
+ */ + if (oldestOffsetKnown) + { + /* move back to start of the corresponding segment */ + offsetStopLimit = oldestOffset - (oldestOffset % + (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT)); + + /* always leave one segment before the wraparound point */ + offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT); + + if (!prevOldestOffsetKnown && !is_startup) + ereport(LOG, + (errmsg("MultiXact member wraparound protections are now enabled"))); + + ereport(DEBUG1, + (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u", + offsetStopLimit, oldestMultiXactId))); + } + else if (prevOldestOffsetKnown) + { + /* + * If we failed to get the oldest offset this time, but we have a + * value from a previous pass through this function, use the old + * values rather than automatically forcing an emergency autovacuum + * cycle again. + */ + oldestOffset = prevOldestOffset; + oldestOffsetKnown = true; + offsetStopLimit = prevOffsetStopLimit; + } + + /* Install the computed values */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->oldestOffset = oldestOffset; + MultiXactState->oldestOffsetKnown = oldestOffsetKnown; + MultiXactState->offsetStopLimit = offsetStopLimit; + LWLockRelease(MultiXactGenLock); + + /* + * Do we need an emergency autovacuum? If we're not sure, assume yes. + */ + return !oldestOffsetKnown || + (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD); +} + +/* + * Return whether adding "distance" to "start" would move past "boundary". + * + * We use this to determine whether the addition is "wrapping around" the + * boundary point, hence the name. The reason we don't want to use the regular + * 2^31-modulo arithmetic here is that we want to be able to use the whole of + * the 2^32-1 space here, allowing for more multixacts than would fit + * otherwise. + */ +static bool +MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, + uint32 distance) +{ + MultiXactOffset finish; + + /* + * Note that offset number 0 is not used (see GetMultiXactIdMembers), so + * if the addition wraps around the UINT_MAX boundary, skip that value. + */ + finish = start + distance; + if (finish < start) + finish++; + + /*----------------------------------------------------------------------- + * When the boundary is numerically greater than the starting point, any + * value numerically between the two is not wrapped: + * + * <----S----B----> + * [---) = F wrapped past B (and UINT_MAX) + * [---) = F not wrapped + * [----] = F wrapped past B + * + * When the boundary is numerically less than the starting point (i.e. the + * UINT_MAX wraparound occurs somewhere in between) then all values in + * between are wrapped: + * + * <----B----S----> + * [---) = F not wrapped past B (but wrapped past UINT_MAX) + * [---) = F wrapped past B (and UINT_MAX) + * [----] = F not wrapped + *----------------------------------------------------------------------- + */ + if (start < boundary) + return finish >= boundary || finish < start; + else + return finish >= boundary && finish < start; +} + +/* + * Find the starting offset of the given MultiXactId. + * + * Returns false if the file containing the multi does not exist on disk. + * Otherwise, returns true and sets *result to the starting member offset. + * + * This function does not prevent concurrent truncation, so if that's + * required, the caller has to protect against that. 
+ */ +static bool +find_multixact_start(MultiXactId multi, MultiXactOffset *result) +{ + MultiXactOffset offset; + int pageno; + int entryno; + int slotno; + MultiXactOffset *offptr; + + Assert(MultiXactState->finishedStartup); + + pageno = MultiXactIdToOffsetPage(multi); + entryno = MultiXactIdToOffsetEntry(multi); + + /* + * Write out dirty data, so PhysicalPageExists can work correctly. + */ + SimpleLruWriteAll(MultiXactOffsetCtl, true); + SimpleLruWriteAll(MultiXactMemberCtl, true); + + if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) + return false; + + /* lock is acquired by SimpleLruReadPage_ReadOnly */ + slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi); + offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr += entryno; + offset = *offptr; + LWLockRelease(MultiXactOffsetSLRULock); + + *result = offset; + return true; +} + +/* + * Determine how many multixacts, and how many multixact members, currently + * exist. Return false if unable to determine. + */ +static bool +ReadMultiXactCounts(uint32 *multixacts, MultiXactOffset *members) +{ + MultiXactOffset nextOffset; + MultiXactOffset oldestOffset; + MultiXactId oldestMultiXactId; + MultiXactId nextMultiXactId; + bool oldestOffsetKnown; + + LWLockAcquire(MultiXactGenLock, LW_SHARED); + nextOffset = MultiXactState->nextOffset; + oldestMultiXactId = MultiXactState->oldestMultiXactId; + nextMultiXactId = MultiXactState->nextMXact; + oldestOffset = MultiXactState->oldestOffset; + oldestOffsetKnown = MultiXactState->oldestOffsetKnown; + LWLockRelease(MultiXactGenLock); + + if (!oldestOffsetKnown) + return false; + + *members = nextOffset - oldestOffset; + *multixacts = nextMultiXactId - oldestMultiXactId; + return true; +} + +/* + * Multixact members can be removed once the multixacts that refer to them + * are older than every datminmxid. autovacuum_multixact_freeze_max_age and + * vacuum_multixact_freeze_table_age work together to make sure we never have + * too many multixacts; we hope that, at least under normal circumstances, + * this will also be sufficient to keep us from using too many offsets. + * However, if the average multixact has many members, we might exhaust the + * members space while still using few enough members that these limits fail + * to trigger full table scans for relminmxid advancement. At that point, + * we'd have no choice but to start failing multixact-creating operations + * with an error. + * + * To prevent that, if more than a threshold portion of the members space is + * used, we effectively reduce autovacuum_multixact_freeze_max_age and + * to a value just less than the number of multixacts in use. We hope that + * this will quickly trigger autovacuuming on the table or tables with the + * oldest relminmxid, thus allowing datminmxid values to advance and removing + * some members. + * + * As the fraction of the member space currently in use grows, we become + * more aggressive in clamping this value. That not only causes autovacuum + * to ramp up, but also makes any manual vacuums the user issues more + * aggressive. This happens because vacuum_set_xid_limits() clamps the + * freeze table and the minimum freeze age based on the effective + * autovacuum_multixact_freeze_max_age this function returns. In the worst + * case, we'll claim the freeze_max_age to zero, and every vacuum of any + * table will try to freeze every multixact. + * + * It's possible that these thresholds should be user-tunable, but for now + * we keep it simple. 
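+ *
+ * Worked example (illustrative numbers, not upstream text): if member usage
+ * sits exactly halfway between MULTIXACT_MEMBER_SAFE_THRESHOLD and
+ * MULTIXACT_MEMBER_DANGER_THRESHOLD and 1,000,000 multixacts exist, the
+ * fraction computed below is 0.5, so roughly 500,000 multixacts become the
+ * removal target and the effective freeze max age returned is about
+ * 500,000. At or beyond the danger threshold the result reaches zero and
+ * every vacuum tries to freeze every multixact.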
+ */ +int +MultiXactMemberFreezeThreshold(void) +{ + MultiXactOffset members; + uint32 multixacts; + uint32 victim_multixacts; + double fraction; + + /* If we can't determine member space utilization, assume the worst. */ + if (!ReadMultiXactCounts(&multixacts, &members)) + return 0; + + /* If member space utilization is low, no special action is required. */ + if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD) + return autovacuum_multixact_freeze_max_age; + + /* + * Compute a target for relminmxid advancement. The number of multixacts + * we try to eliminate from the system is based on how far we are past + * MULTIXACT_MEMBER_SAFE_THRESHOLD. + */ + fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) / + (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD); + victim_multixacts = multixacts * fraction; + + /* fraction could be > 1.0, but lowest possible freeze age is zero */ + if (victim_multixacts > multixacts) + return 0; + return multixacts - victim_multixacts; +} + +typedef struct mxtruncinfo +{ + int earliestExistingPage; +} mxtruncinfo; + +/* + * SlruScanDirectory callback + * This callback determines the earliest existing page number. + */ +static bool +SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int segpage, void *data) +{ + mxtruncinfo *trunc = (mxtruncinfo *) data; + + if (trunc->earliestExistingPage == -1 || + ctl->PagePrecedes(segpage, trunc->earliestExistingPage)) + { + trunc->earliestExistingPage = segpage; + } + + return false; /* keep going */ +} + + +/* + * Delete members segments [oldest, newOldest) + * + * The members SLRU can, in contrast to the offsets one, be filled to almost + * the full range at once. This means SimpleLruTruncate() can't trivially be + * used - instead the to-be-deleted range is computed using the offsets + * SLRU. C.f. TruncateMultiXact(). + */ +static void +PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset) +{ + const int maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset); + int startsegment = MXOffsetToMemberSegment(oldestOffset); + int endsegment = MXOffsetToMemberSegment(newOldestOffset); + int segment = startsegment; + + /* + * Delete all the segments but the last one. The last segment can still + * contain, possibly partially, valid data. + */ + while (segment != endsegment) + { + elog(DEBUG2, "truncating multixact members segment %x", segment); + SlruDeleteSegment(MultiXactMemberCtl, segment); + + /* move to next segment, handling wraparound correctly */ + if (segment == maxsegment) + segment = 0; + else + segment += 1; + } +} + +/* + * Delete offsets segments [oldest, newOldest) + */ +static void +PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti) +{ + /* + * We step back one multixact to avoid passing a cutoff page that hasn't + * been created yet in the rare case that oldestMulti would be the first + * item on a page and oldestMulti == nextMulti. In that case, if we + * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound + * detection. + */ + SimpleLruTruncate(MultiXactOffsetCtl, + MultiXactIdToOffsetPage(PreviousMultiXactId(newOldestMulti))); +} + +/* + * Remove all MultiXactOffset and MultiXactMember segments before the oldest + * ones still of interest. + * + * This is only called on a primary as part of vacuum (via + * vac_truncate_clog()). During recovery truncation is done by replaying + * truncation WAL records logged here. 
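+ *
+ * (Cross-reference: the WAL record mentioned here is the
+ * XLOG_MULTIXACT_TRUNCATE_ID record emitted by WriteMTruncateXlogRec() and
+ * replayed by the corresponding branch of multixact_redo() later in this
+ * file.)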
+ * + * newOldestMulti is the oldest currently required multixact, newOldestMultiDB + * is one of the databases preventing newOldestMulti from increasing. + */ +void +TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) +{ + MultiXactId oldestMulti; + MultiXactId nextMulti; + MultiXactOffset newOldestOffset; + MultiXactOffset oldestOffset; + MultiXactOffset nextOffset; + mxtruncinfo trunc; + MultiXactId earliest; + + Assert(!RecoveryInProgress()); + Assert(MultiXactState->finishedStartup); + + /* + * We can only allow one truncation to happen at once. Otherwise parts of + * members might vanish while we're doing lookups or similar. There's no + * need to have an interlock with creating new multis or such, since those + * are constrained by the limits (which only grow, never shrink). + */ + LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE); + + LWLockAcquire(MultiXactGenLock, LW_SHARED); + nextMulti = MultiXactState->nextMXact; + nextOffset = MultiXactState->nextOffset; + oldestMulti = MultiXactState->oldestMultiXactId; + LWLockRelease(MultiXactGenLock); + Assert(MultiXactIdIsValid(oldestMulti)); + + /* + * Make sure to only attempt truncation if there's values to truncate + * away. In normal processing values shouldn't go backwards, but there's + * some corner cases (due to bugs) where that's possible. + */ + if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti)) + { + LWLockRelease(MultiXactTruncationLock); + return; + } + + /* + * Note we can't just plow ahead with the truncation; it's possible that + * there are no segments to truncate, which is a problem because we are + * going to attempt to read the offsets page to determine where to + * truncate the members SLRU. So we first scan the directory to determine + * the earliest offsets page number that we can read without error. + * + * When nextMXact is less than one segment away from multiWrapLimit, + * SlruScanDirCbFindEarliest can find some early segment other than the + * actual earliest. (MultiXactOffsetPagePrecedes(EARLIEST, LATEST) + * returns false, because not all pairs of entries have the same answer.) + * That can also arise when an earlier truncation attempt failed unlink() + * or returned early from this function. The only consequence is + * returning early, which wastes space that we could have liberated. + * + * NB: It's also possible that the page that oldestMulti is on has already + * been truncated away, and we crashed before updating oldestMulti. + */ + trunc.earliestExistingPage = -1; + SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc); + earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE; + if (earliest < FirstMultiXactId) + earliest = FirstMultiXactId; + + /* If there's nothing to remove, we can bail out early. */ + if (MultiXactIdPrecedes(oldestMulti, earliest)) + { + LWLockRelease(MultiXactTruncationLock); + return; + } + + /* + * First, compute the safe truncation point for MultiXactMember. This is + * the starting offset of the oldest multixact. + * + * Hopefully, find_multixact_start will always work here, because we've + * already checked that it doesn't precede the earliest MultiXact on disk. + * But if it fails, don't truncate anything, and log a message. 
+ */ + if (oldestMulti == nextMulti) + { + /* there are NO MultiXacts */ + oldestOffset = nextOffset; + } + else if (!find_multixact_start(oldestMulti, &oldestOffset)) + { + ereport(LOG, + (errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation", + oldestMulti, earliest))); + LWLockRelease(MultiXactTruncationLock); + return; + } + + /* + * Secondly compute up to where to truncate. Lookup the corresponding + * member offset for newOldestMulti for that. + */ + if (newOldestMulti == nextMulti) + { + /* there are NO MultiXacts */ + newOldestOffset = nextOffset; + } + else if (!find_multixact_start(newOldestMulti, &newOldestOffset)) + { + ereport(LOG, + (errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation", + newOldestMulti))); + LWLockRelease(MultiXactTruncationLock); + return; + } + + elog(DEBUG1, "performing multixact truncation: " + "offsets [%u, %u), offsets segments [%x, %x), " + "members [%u, %u), members segments [%x, %x)", + oldestMulti, newOldestMulti, + MultiXactIdToOffsetSegment(oldestMulti), + MultiXactIdToOffsetSegment(newOldestMulti), + oldestOffset, newOldestOffset, + MXOffsetToMemberSegment(oldestOffset), + MXOffsetToMemberSegment(newOldestOffset)); + + /* + * Do truncation, and the WAL logging of the truncation, in a critical + * section. That way offsets/members cannot get out of sync anymore, i.e. + * once consistent the newOldestMulti will always exist in members, even + * if we crashed in the wrong moment. + */ + START_CRIT_SECTION(); + + /* + * Prevent checkpoints from being scheduled concurrently. This is critical + * because otherwise a truncation record might not be replayed after a + * crash/basebackup, even though the state of the data directory would + * require it. + */ + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + + /* WAL log truncation */ + WriteMTruncateXlogRec(newOldestMultiDB, + oldestMulti, newOldestMulti, + oldestOffset, newOldestOffset); + + /* + * Update in-memory limits before performing the truncation, while inside + * the critical section: Have to do it before truncation, to prevent + * concurrent lookups of those values. Has to be inside the critical + * section as otherwise a future call to this function would error out, + * while looking up the oldest member in offsets, if our caller crashes + * before updating the limits. + */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->oldestMultiXactId = newOldestMulti; + MultiXactState->oldestMultiXactDB = newOldestMultiDB; + LWLockRelease(MultiXactGenLock); + + /* First truncate members */ + PerformMembersTruncation(oldestOffset, newOldestOffset); + + /* Then offsets */ + PerformOffsetsTruncation(oldestMulti, newOldestMulti); + + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + + END_CRIT_SECTION(); + LWLockRelease(MultiXactTruncationLock); +} + +/* + * Decide whether a MultiXactOffset page number is "older" for truncation + * purposes. Analogous to CLOGPagePrecedes(). + * + * Offsetting the values is optional, because MultiXactIdPrecedes() has + * translational symmetry. 
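+ *
+ * (Clarifying note: the double comparison below requires a representative
+ * multixact from page1 to precede both ends of page2's range, so "true" is
+ * returned only when every pairing of entries agrees. As the
+ * TruncateMultiXact comments above observe, pages nearly 2^31 multixacts
+ * apart can otherwise give different answers for different entry pairs.)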
+ */ +static bool +MultiXactOffsetPagePrecedes(int page1, int page2) +{ + MultiXactId multi1; + MultiXactId multi2; + + multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE; + multi1 += FirstMultiXactId + 1; + multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE; + multi2 += FirstMultiXactId + 1; + + return (MultiXactIdPrecedes(multi1, multi2) && + MultiXactIdPrecedes(multi1, + multi2 + MULTIXACT_OFFSETS_PER_PAGE - 1)); +} + +/* + * Decide whether a MultiXactMember page number is "older" for truncation + * purposes. There is no "invalid offset number" so use the numbers verbatim. + */ +static bool +MultiXactMemberPagePrecedes(int page1, int page2) +{ + MultiXactOffset offset1; + MultiXactOffset offset2; + + offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE; + offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE; + + return (MultiXactOffsetPrecedes(offset1, offset2) && + MultiXactOffsetPrecedes(offset1, + offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1)); +} + +/* + * Decide which of two MultiXactIds is earlier. + * + * XXX do we need to do something special for InvalidMultiXactId? + * (Doesn't look like it.) + */ +bool +MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2) +{ + int32 diff = (int32) (multi1 - multi2); + + return (diff < 0); +} + +/* + * MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2? + * + * XXX do we need to do something special for InvalidMultiXactId? + * (Doesn't look like it.) + */ +bool +MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2) +{ + int32 diff = (int32) (multi1 - multi2); + + return (diff <= 0); +} + + +/* + * Decide which of two offsets is earlier. + */ +static bool +MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) +{ + int32 diff = (int32) (offset1 - offset2); + + return (diff < 0); +} + +/* + * Write an xlog record reflecting the zeroing of either a MEMBERs or + * OFFSETs page (info shows which) + */ +static void +WriteMZeroPageXlogRec(int pageno, uint8 info) +{ + XLogBeginInsert(); + XLogRegisterData((char *) (&pageno), sizeof(int)); + (void) XLogInsert(RM_MULTIXACT_ID, info); +} + +/* + * Write a TRUNCATE xlog record + * + * We must flush the xlog record to disk before returning --- see notes in + * TruncateCLOG(). 
+ */ +static void +WriteMTruncateXlogRec(Oid oldestMultiDB, + MultiXactId startTruncOff, MultiXactId endTruncOff, + MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb) +{ + XLogRecPtr recptr; + xl_multixact_truncate xlrec; + + xlrec.oldestMultiDB = oldestMultiDB; + + xlrec.startTruncOff = startTruncOff; + xlrec.endTruncOff = endTruncOff; + + xlrec.startTruncMemb = startTruncMemb; + xlrec.endTruncMemb = endTruncMemb; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), SizeOfMultiXactTruncate); + recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID); + XLogFlush(recptr); +} + +/* + * MULTIXACT resource manager's routines + */ +void +multixact_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + /* Backup blocks are not used in multixact records */ + Assert(!XLogRecHasAnyBlockRefs(record)); + + if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) + { + int pageno; + int slotno; + + memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + + slotno = ZeroMultiXactOffsetPage(pageno, false); + SimpleLruWritePage(MultiXactOffsetCtl, slotno); + Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); + + LWLockRelease(MultiXactOffsetSLRULock); + } + else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) + { + int pageno; + int slotno; + + memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + + LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + + slotno = ZeroMultiXactMemberPage(pageno, false); + SimpleLruWritePage(MultiXactMemberCtl, slotno); + Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); + + LWLockRelease(MultiXactMemberSLRULock); + } + else if (info == XLOG_MULTIXACT_CREATE_ID) + { + xl_multixact_create *xlrec = + (xl_multixact_create *) XLogRecGetData(record); + TransactionId max_xid; + int i; + + /* Store the data back into the SLRU files */ + RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers, + xlrec->members); + + /* Make sure nextMXact/nextOffset are beyond what this record has */ + MultiXactAdvanceNextMXact(xlrec->mid + 1, + xlrec->moff + xlrec->nmembers); + + /* + * Make sure nextXid is beyond any XID mentioned in the record. This + * should be unnecessary, since any XID found here ought to have other + * evidence in the XLOG, but let's be safe. + */ + max_xid = XLogRecGetXid(record); + for (i = 0; i < xlrec->nmembers; i++) + { + if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid)) + max_xid = xlrec->members[i].xid; + } + + AdvanceNextFullTransactionIdPastXid(max_xid); + } + else if (info == XLOG_MULTIXACT_TRUNCATE_ID) + { + xl_multixact_truncate xlrec; + int pageno; + + memcpy(&xlrec, XLogRecGetData(record), + SizeOfMultiXactTruncate); + + elog(DEBUG1, "replaying multixact truncation: " + "offsets [%u, %u), offsets segments [%x, %x), " + "members [%u, %u), members segments [%x, %x)", + xlrec.startTruncOff, xlrec.endTruncOff, + MultiXactIdToOffsetSegment(xlrec.startTruncOff), + MultiXactIdToOffsetSegment(xlrec.endTruncOff), + xlrec.startTruncMemb, xlrec.endTruncMemb, + MXOffsetToMemberSegment(xlrec.startTruncMemb), + MXOffsetToMemberSegment(xlrec.endTruncMemb)); + + /* should not be required, but more than cheap enough */ + LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE); + + /* + * Advance the horizon values, so they're current at the end of + * recovery. 
+ */ + SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false); + + PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb); + + /* + * During XLOG replay, latest_page_number isn't necessarily set up + * yet; insert a suitable value to bypass the sanity test in + * SimpleLruTruncate. + */ + pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff); + MultiXactOffsetCtl->shared->latest_page_number = pageno; + PerformOffsetsTruncation(xlrec.startTruncOff, xlrec.endTruncOff); + + LWLockRelease(MultiXactTruncationLock); + } + else + elog(PANIC, "multixact_redo: unknown op code %u", info); +} + +Datum +pg_get_multixact_members(PG_FUNCTION_ARGS) +{ + typedef struct + { + MultiXactMember *members; + int nmembers; + int iter; + } mxact; + MultiXactId mxid = PG_GETARG_TRANSACTIONID(0); + mxact *multi; + FuncCallContext *funccxt; + + if (mxid < FirstMultiXactId) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid MultiXactId: %u", mxid))); + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcxt; + TupleDesc tupdesc; + + funccxt = SRF_FIRSTCALL_INIT(); + oldcxt = MemoryContextSwitchTo(funccxt->multi_call_memory_ctx); + + multi = palloc(sizeof(mxact)); + /* no need to allow for old values here */ + multi->nmembers = GetMultiXactIdMembers(mxid, &multi->members, false, + false); + multi->iter = 0; + + tupdesc = CreateTemplateTupleDesc(2); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "xid", + XIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "mode", + TEXTOID, -1, 0); + + funccxt->attinmeta = TupleDescGetAttInMetadata(tupdesc); + funccxt->user_fctx = multi; + + MemoryContextSwitchTo(oldcxt); + } + + funccxt = SRF_PERCALL_SETUP(); + multi = (mxact *) funccxt->user_fctx; + + while (multi->iter < multi->nmembers) + { + HeapTuple tuple; + char *values[2]; + + values[0] = psprintf("%u", multi->members[multi->iter].xid); + values[1] = mxstatus_to_string(multi->members[multi->iter].status); + + tuple = BuildTupleFromCStrings(funccxt->attinmeta, values); + + multi->iter++; + pfree(values[0]); + SRF_RETURN_NEXT(funccxt, HeapTupleGetDatum(tuple)); + } + + SRF_RETURN_DONE(funccxt); +} + +/* + * Entrypoint for sync.c to sync offsets files. + */ +int +multixactoffsetssyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path); +} + +/* + * Entrypoint for sync.c to sync members files. 
+ */ +int +multixactmemberssyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(MultiXactMemberCtl, ftag, path); +} diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c new file mode 100644 index 0000000..df0cd77 --- /dev/null +++ b/src/backend/access/transam/parallel.c @@ -0,0 +1,1597 @@ +/*------------------------------------------------------------------------- + * + * parallel.c + * Infrastructure for launching parallel workers + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/transam/parallel.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/parallel.h" +#include "access/session.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "catalog/index.h" +#include "catalog/namespace.h" +#include "catalog/pg_enum.h" +#include "catalog/storage.h" +#include "commands/async.h" +#include "commands/vacuum.h" +#include "executor/execParallel.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "libpq/pqmq.h" +#include "miscadmin.h" +#include "optimizer/optimizer.h" +#include "pgstat.h" +#include "storage/ipc.h" +#include "storage/predicate.h" +#include "storage/sinval.h" +#include "storage/spin.h" +#include "tcop/tcopprot.h" +#include "utils/combocid.h" +#include "utils/guc.h" +#include "utils/inval.h" +#include "utils/memutils.h" +#include "utils/relmapper.h" +#include "utils/snapmgr.h" +#include "utils/typcache.h" + +/* + * We don't want to waste a lot of memory on an error queue which, most of + * the time, will process only a handful of small messages. However, it is + * desirable to make it large enough that a typical ErrorResponse can be sent + * without blocking. That way, a worker that errors out can write the whole + * message into the queue and terminate without waiting for the user backend. + */ +#define PARALLEL_ERROR_QUEUE_SIZE 16384 + +/* Magic number for parallel context TOC. */ +#define PARALLEL_MAGIC 0x50477c7c + +/* + * Magic numbers for per-context parallel state sharing. Higher-level code + * should use smaller values, leaving these very large ones for use by this + * module. + */ +#define PARALLEL_KEY_FIXED UINT64CONST(0xFFFFFFFFFFFF0001) +#define PARALLEL_KEY_ERROR_QUEUE UINT64CONST(0xFFFFFFFFFFFF0002) +#define PARALLEL_KEY_LIBRARY UINT64CONST(0xFFFFFFFFFFFF0003) +#define PARALLEL_KEY_GUC UINT64CONST(0xFFFFFFFFFFFF0004) +#define PARALLEL_KEY_COMBO_CID UINT64CONST(0xFFFFFFFFFFFF0005) +#define PARALLEL_KEY_TRANSACTION_SNAPSHOT UINT64CONST(0xFFFFFFFFFFFF0006) +#define PARALLEL_KEY_ACTIVE_SNAPSHOT UINT64CONST(0xFFFFFFFFFFFF0007) +#define PARALLEL_KEY_TRANSACTION_STATE UINT64CONST(0xFFFFFFFFFFFF0008) +#define PARALLEL_KEY_ENTRYPOINT UINT64CONST(0xFFFFFFFFFFFF0009) +#define PARALLEL_KEY_SESSION_DSM UINT64CONST(0xFFFFFFFFFFFF000A) +#define PARALLEL_KEY_PENDING_SYNCS UINT64CONST(0xFFFFFFFFFFFF000B) +#define PARALLEL_KEY_REINDEX_STATE UINT64CONST(0xFFFFFFFFFFFF000C) +#define PARALLEL_KEY_RELMAPPER_STATE UINT64CONST(0xFFFFFFFFFFFF000D) +#define PARALLEL_KEY_UNCOMMITTEDENUMS UINT64CONST(0xFFFFFFFFFFFF000E) + +/* Fixed-size parallel state. */ +typedef struct FixedParallelState +{ + /* Fixed-size state that workers must restore. 
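+ * (Cross-reference: the leader fills in one copy of this struct in
+ * InitializeParallelDSM() and publishes it under the PARALLEL_KEY_FIXED
+ * TOC key; each worker reads it back in ParallelWorkerMain() via
+ * shm_toc_lookup().)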
*/ + Oid database_id; + Oid authenticated_user_id; + Oid current_user_id; + Oid outer_user_id; + Oid temp_namespace_id; + Oid temp_toast_namespace_id; + int sec_context; + bool is_superuser; + PGPROC *parallel_leader_pgproc; + pid_t parallel_leader_pid; + BackendId parallel_leader_backend_id; + TimestampTz xact_ts; + TimestampTz stmt_ts; + SerializableXactHandle serializable_xact_handle; + + /* Mutex protects remaining fields. */ + slock_t mutex; + + /* Maximum XactLastRecEnd of any worker. */ + XLogRecPtr last_xlog_end; +} FixedParallelState; + +/* + * Our parallel worker number. We initialize this to -1, meaning that we are + * not a parallel worker. In parallel workers, it will be set to a value >= 0 + * and < the number of workers before any user code is invoked; each parallel + * worker will get a different parallel worker number. + */ +int ParallelWorkerNumber = -1; + +/* Is there a parallel message pending which we need to receive? */ +volatile bool ParallelMessagePending = false; + +/* Are we initializing a parallel worker? */ +bool InitializingParallelWorker = false; + +/* Pointer to our fixed parallel state. */ +static FixedParallelState *MyFixedParallelState; + +/* List of active parallel contexts. */ +static dlist_head pcxt_list = DLIST_STATIC_INIT(pcxt_list); + +/* Backend-local copy of data from FixedParallelState. */ +static pid_t ParallelLeaderPid; + +/* + * List of internal parallel worker entry points. We need this for + * reasons explained in LookupParallelWorkerFunction(), below. + */ +static const struct +{ + const char *fn_name; + parallel_worker_main_type fn_addr; +} InternalParallelWorkers[] = + +{ + { + "ParallelQueryMain", ParallelQueryMain + }, + { + "_bt_parallel_build_main", _bt_parallel_build_main + }, + { + "parallel_vacuum_main", parallel_vacuum_main + } +}; + +/* Private functions. */ +static void HandleParallelMessage(ParallelContext *pcxt, int i, StringInfo msg); +static void WaitForParallelWorkersToExit(ParallelContext *pcxt); +static parallel_worker_main_type LookupParallelWorkerFunction(const char *libraryname, const char *funcname); +static void ParallelWorkerShutdown(int code, Datum arg); + + +/* + * Establish a new parallel context. This should be done after entering + * parallel mode, and (unless there is an error) the context should be + * destroyed before exiting the current subtransaction. + */ +ParallelContext * +CreateParallelContext(const char *library_name, const char *function_name, + int nworkers) +{ + MemoryContext oldcontext; + ParallelContext *pcxt; + + /* It is unsafe to create a parallel context if not in parallel mode. */ + Assert(IsInParallelMode()); + + /* Number of workers should be non-negative. */ + Assert(nworkers >= 0); + + /* We might be running in a short-lived memory context. */ + oldcontext = MemoryContextSwitchTo(TopTransactionContext); + + /* Initialize a new ParallelContext. */ + pcxt = palloc0(sizeof(ParallelContext)); + pcxt->subid = GetCurrentSubTransactionId(); + pcxt->nworkers = nworkers; + pcxt->nworkers_to_launch = nworkers; + pcxt->library_name = pstrdup(library_name); + pcxt->function_name = pstrdup(function_name); + pcxt->error_context_stack = error_context_stack; + shm_toc_initialize_estimator(&pcxt->estimator); + dlist_push_head(&pcxt_list, &pcxt->node); + + /* Restore previous memory context. 
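+ *
+ * (Illustrative sketch, not upstream code: a typical caller drives the
+ * whole lifecycle roughly as
+ *
+ *     EnterParallelMode();
+ *     pcxt = CreateParallelContext("postgres", "ParallelQueryMain", 2);
+ *     ... shm_toc_estimate_chunk()/shm_toc_estimate_keys() for caller data ...
+ *     InitializeParallelDSM(pcxt);
+ *     LaunchParallelWorkers(pcxt);
+ *     WaitForParallelWorkersToFinish(pcxt);
+ *     DestroyParallelContext(pcxt);
+ *     ExitParallelMode();
+ *
+ * "ParallelQueryMain" is one of the InternalParallelWorkers entry points
+ * listed above; EnterParallelMode()/ExitParallelMode() live in xact.c.)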
*/ + MemoryContextSwitchTo(oldcontext); + + return pcxt; +} + +/* + * Establish the dynamic shared memory segment for a parallel context and + * copy state and other bookkeeping information that will be needed by + * parallel workers into it. + */ +void +InitializeParallelDSM(ParallelContext *pcxt) +{ + MemoryContext oldcontext; + Size library_len = 0; + Size guc_len = 0; + Size combocidlen = 0; + Size tsnaplen = 0; + Size asnaplen = 0; + Size tstatelen = 0; + Size pendingsyncslen = 0; + Size reindexlen = 0; + Size relmapperlen = 0; + Size uncommittedenumslen = 0; + Size segsize = 0; + int i; + FixedParallelState *fps; + dsm_handle session_dsm_handle = DSM_HANDLE_INVALID; + Snapshot transaction_snapshot = GetTransactionSnapshot(); + Snapshot active_snapshot = GetActiveSnapshot(); + + /* We might be running in a very short-lived memory context. */ + oldcontext = MemoryContextSwitchTo(TopTransactionContext); + + /* Allow space to store the fixed-size parallel state. */ + shm_toc_estimate_chunk(&pcxt->estimator, sizeof(FixedParallelState)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* + * Normally, the user will have requested at least one worker process, but + * if by chance they have not, we can skip a bunch of things here. + */ + if (pcxt->nworkers > 0) + { + /* Get (or create) the per-session DSM segment's handle. */ + session_dsm_handle = GetSessionDsmHandle(); + + /* + * If we weren't able to create a per-session DSM segment, then we can + * continue but we can't safely launch any workers because their + * record typmods would be incompatible so they couldn't exchange + * tuples. + */ + if (session_dsm_handle == DSM_HANDLE_INVALID) + pcxt->nworkers = 0; + } + + if (pcxt->nworkers > 0) + { + /* Estimate space for various kinds of state sharing. */ + library_len = EstimateLibraryStateSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, library_len); + guc_len = EstimateGUCStateSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, guc_len); + combocidlen = EstimateComboCIDStateSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, combocidlen); + if (IsolationUsesXactSnapshot()) + { + tsnaplen = EstimateSnapshotSpace(transaction_snapshot); + shm_toc_estimate_chunk(&pcxt->estimator, tsnaplen); + } + asnaplen = EstimateSnapshotSpace(active_snapshot); + shm_toc_estimate_chunk(&pcxt->estimator, asnaplen); + tstatelen = EstimateTransactionStateSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, tstatelen); + shm_toc_estimate_chunk(&pcxt->estimator, sizeof(dsm_handle)); + pendingsyncslen = EstimatePendingSyncsSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, pendingsyncslen); + reindexlen = EstimateReindexStateSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, reindexlen); + relmapperlen = EstimateRelationMapSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, relmapperlen); + uncommittedenumslen = EstimateUncommittedEnumsSpace(); + shm_toc_estimate_chunk(&pcxt->estimator, uncommittedenumslen); + /* If you add more chunks here, you probably need to add keys. */ + shm_toc_estimate_keys(&pcxt->estimator, 11); + + /* Estimate space need for error queues. */ + StaticAssertStmt(BUFFERALIGN(PARALLEL_ERROR_QUEUE_SIZE) == + PARALLEL_ERROR_QUEUE_SIZE, + "parallel error queue size not buffer-aligned"); + shm_toc_estimate_chunk(&pcxt->estimator, + mul_size(PARALLEL_ERROR_QUEUE_SIZE, + pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + /* Estimate how much we'll need for the entrypoint info. 
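+ *
+ * The "+ 2" below leaves room for the two terminating NUL bytes: the
+ * library name and the function name are stored back to back, matching the
+ * layout used when the entrypoint state is serialized later in this
+ * function and decoded again in ParallelWorkerMain().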
*/ + shm_toc_estimate_chunk(&pcxt->estimator, strlen(pcxt->library_name) + + strlen(pcxt->function_name) + 2); + shm_toc_estimate_keys(&pcxt->estimator, 1); + } + + /* + * Create DSM and initialize with new table of contents. But if the user + * didn't request any workers, then don't bother creating a dynamic shared + * memory segment; instead, just use backend-private memory. + * + * Also, if we can't create a dynamic shared memory segment because the + * maximum number of segments have already been created, then fall back to + * backend-private memory, and plan not to use any workers. We hope this + * won't happen very often, but it's better to abandon the use of + * parallelism than to fail outright. + */ + segsize = shm_toc_estimate(&pcxt->estimator); + if (pcxt->nworkers > 0) + pcxt->seg = dsm_create(segsize, DSM_CREATE_NULL_IF_MAXSEGMENTS); + if (pcxt->seg != NULL) + pcxt->toc = shm_toc_create(PARALLEL_MAGIC, + dsm_segment_address(pcxt->seg), + segsize); + else + { + pcxt->nworkers = 0; + pcxt->private_memory = MemoryContextAlloc(TopMemoryContext, segsize); + pcxt->toc = shm_toc_create(PARALLEL_MAGIC, pcxt->private_memory, + segsize); + } + + /* Initialize fixed-size state in shared memory. */ + fps = (FixedParallelState *) + shm_toc_allocate(pcxt->toc, sizeof(FixedParallelState)); + fps->database_id = MyDatabaseId; + fps->authenticated_user_id = GetAuthenticatedUserId(); + fps->outer_user_id = GetCurrentRoleId(); + fps->is_superuser = session_auth_is_superuser; + GetUserIdAndSecContext(&fps->current_user_id, &fps->sec_context); + GetTempNamespaceState(&fps->temp_namespace_id, + &fps->temp_toast_namespace_id); + fps->parallel_leader_pgproc = MyProc; + fps->parallel_leader_pid = MyProcPid; + fps->parallel_leader_backend_id = MyBackendId; + fps->xact_ts = GetCurrentTransactionStartTimestamp(); + fps->stmt_ts = GetCurrentStatementStartTimestamp(); + fps->serializable_xact_handle = ShareSerializableXact(); + SpinLockInit(&fps->mutex); + fps->last_xlog_end = 0; + shm_toc_insert(pcxt->toc, PARALLEL_KEY_FIXED, fps); + + /* We can skip the rest of this if we're not budgeting for any workers. */ + if (pcxt->nworkers > 0) + { + char *libraryspace; + char *gucspace; + char *combocidspace; + char *tsnapspace; + char *asnapspace; + char *tstatespace; + char *pendingsyncsspace; + char *reindexspace; + char *relmapperspace; + char *error_queue_space; + char *session_dsm_handle_space; + char *entrypointstate; + char *uncommittedenumsspace; + Size lnamelen; + + /* Serialize shared libraries we have loaded. */ + libraryspace = shm_toc_allocate(pcxt->toc, library_len); + SerializeLibraryState(library_len, libraryspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_LIBRARY, libraryspace); + + /* Serialize GUC settings. */ + gucspace = shm_toc_allocate(pcxt->toc, guc_len); + SerializeGUCState(guc_len, gucspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_GUC, gucspace); + + /* Serialize combo CID state. */ + combocidspace = shm_toc_allocate(pcxt->toc, combocidlen); + SerializeComboCIDState(combocidlen, combocidspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_COMBO_CID, combocidspace); + + /* + * Serialize the transaction snapshot if the transaction + * isolation-level uses a transaction snapshot. + */ + if (IsolationUsesXactSnapshot()) + { + tsnapspace = shm_toc_allocate(pcxt->toc, tsnaplen); + SerializeSnapshot(transaction_snapshot, tsnapspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_TRANSACTION_SNAPSHOT, + tsnapspace); + } + + /* Serialize the active snapshot. 
*/ + asnapspace = shm_toc_allocate(pcxt->toc, asnaplen); + SerializeSnapshot(active_snapshot, asnapspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_ACTIVE_SNAPSHOT, asnapspace); + + /* Provide the handle for per-session segment. */ + session_dsm_handle_space = shm_toc_allocate(pcxt->toc, + sizeof(dsm_handle)); + *(dsm_handle *) session_dsm_handle_space = session_dsm_handle; + shm_toc_insert(pcxt->toc, PARALLEL_KEY_SESSION_DSM, + session_dsm_handle_space); + + /* Serialize transaction state. */ + tstatespace = shm_toc_allocate(pcxt->toc, tstatelen); + SerializeTransactionState(tstatelen, tstatespace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_TRANSACTION_STATE, tstatespace); + + /* Serialize pending syncs. */ + pendingsyncsspace = shm_toc_allocate(pcxt->toc, pendingsyncslen); + SerializePendingSyncs(pendingsyncslen, pendingsyncsspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_PENDING_SYNCS, + pendingsyncsspace); + + /* Serialize reindex state. */ + reindexspace = shm_toc_allocate(pcxt->toc, reindexlen); + SerializeReindexState(reindexlen, reindexspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_REINDEX_STATE, reindexspace); + + /* Serialize relmapper state. */ + relmapperspace = shm_toc_allocate(pcxt->toc, relmapperlen); + SerializeRelationMap(relmapperlen, relmapperspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_RELMAPPER_STATE, + relmapperspace); + + /* Serialize uncommitted enum state. */ + uncommittedenumsspace = shm_toc_allocate(pcxt->toc, + uncommittedenumslen); + SerializeUncommittedEnums(uncommittedenumsspace, uncommittedenumslen); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_UNCOMMITTEDENUMS, + uncommittedenumsspace); + + /* Allocate space for worker information. */ + pcxt->worker = palloc0(sizeof(ParallelWorkerInfo) * pcxt->nworkers); + + /* + * Establish error queues in dynamic shared memory. + * + * These queues should be used only for transmitting ErrorResponse, + * NoticeResponse, and NotifyResponse protocol messages. Tuple data + * should be transmitted via separate (possibly larger?) queues. + */ + error_queue_space = + shm_toc_allocate(pcxt->toc, + mul_size(PARALLEL_ERROR_QUEUE_SIZE, + pcxt->nworkers)); + for (i = 0; i < pcxt->nworkers; ++i) + { + char *start; + shm_mq *mq; + + start = error_queue_space + i * PARALLEL_ERROR_QUEUE_SIZE; + mq = shm_mq_create(start, PARALLEL_ERROR_QUEUE_SIZE); + shm_mq_set_receiver(mq, MyProc); + pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL); + } + shm_toc_insert(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE, error_queue_space); + + /* + * Serialize entrypoint information. It's unsafe to pass function + * pointers across processes, as the function pointer may be different + * in each process in EXEC_BACKEND builds, so we always pass library + * and function name. (We use library name "postgres" for functions + * in the core backend.) + */ + lnamelen = strlen(pcxt->library_name); + entrypointstate = shm_toc_allocate(pcxt->toc, lnamelen + + strlen(pcxt->function_name) + 2); + strcpy(entrypointstate, pcxt->library_name); + strcpy(entrypointstate + lnamelen + 1, pcxt->function_name); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_ENTRYPOINT, entrypointstate); + } + + /* Restore previous memory context. */ + MemoryContextSwitchTo(oldcontext); +} + +/* + * Reinitialize the dynamic shared memory segment for a parallel context such + * that we could launch workers for it again. + */ +void +ReinitializeParallelDSM(ParallelContext *pcxt) +{ + FixedParallelState *fps; + + /* Wait for any old workers to exit. 
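+ *
+ * Both waits below matter: as the header comment of
+ * WaitForParallelWorkersToExit() explains, WaitForParallelWorkersToFinish()
+ * only ensures the last message from each worker has been received, while
+ * WaitForParallelWorkersToExit() ensures the worker processes have actually
+ * shut down before their error queues are recreated.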
*/ + if (pcxt->nworkers_launched > 0) + { + WaitForParallelWorkersToFinish(pcxt); + WaitForParallelWorkersToExit(pcxt); + pcxt->nworkers_launched = 0; + if (pcxt->known_attached_workers) + { + pfree(pcxt->known_attached_workers); + pcxt->known_attached_workers = NULL; + pcxt->nknown_attached_workers = 0; + } + } + + /* Reset a few bits of fixed parallel state to a clean state. */ + fps = shm_toc_lookup(pcxt->toc, PARALLEL_KEY_FIXED, false); + fps->last_xlog_end = 0; + + /* Recreate error queues (if they exist). */ + if (pcxt->nworkers > 0) + { + char *error_queue_space; + int i; + + error_queue_space = + shm_toc_lookup(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE, false); + for (i = 0; i < pcxt->nworkers; ++i) + { + char *start; + shm_mq *mq; + + start = error_queue_space + i * PARALLEL_ERROR_QUEUE_SIZE; + mq = shm_mq_create(start, PARALLEL_ERROR_QUEUE_SIZE); + shm_mq_set_receiver(mq, MyProc); + pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL); + } + } +} + +/* + * Reinitialize parallel workers for a parallel context such that we could + * launch a different number of workers. This is required for cases where + * we need to reuse the same DSM segment, but the number of workers can + * vary from run-to-run. + */ +void +ReinitializeParallelWorkers(ParallelContext *pcxt, int nworkers_to_launch) +{ + /* + * The number of workers that need to be launched must be less than the + * number of workers with which the parallel context is initialized. + */ + Assert(pcxt->nworkers >= nworkers_to_launch); + pcxt->nworkers_to_launch = nworkers_to_launch; +} + +/* + * Launch parallel workers. + */ +void +LaunchParallelWorkers(ParallelContext *pcxt) +{ + MemoryContext oldcontext; + BackgroundWorker worker; + int i; + bool any_registrations_failed = false; + + /* Skip this if we have no workers. */ + if (pcxt->nworkers == 0 || pcxt->nworkers_to_launch == 0) + return; + + /* We need to be a lock group leader. */ + BecomeLockGroupLeader(); + + /* If we do have workers, we'd better have a DSM segment. */ + Assert(pcxt->seg != NULL); + + /* We might be running in a short-lived memory context. */ + oldcontext = MemoryContextSwitchTo(TopTransactionContext); + + /* Configure a worker. */ + memset(&worker, 0, sizeof(worker)); + snprintf(worker.bgw_name, BGW_MAXLEN, "parallel worker for PID %d", + MyProcPid); + snprintf(worker.bgw_type, BGW_MAXLEN, "parallel worker"); + worker.bgw_flags = + BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION + | BGWORKER_CLASS_PARALLEL; + worker.bgw_start_time = BgWorkerStart_ConsistentState; + worker.bgw_restart_time = BGW_NEVER_RESTART; + sprintf(worker.bgw_library_name, "postgres"); + sprintf(worker.bgw_function_name, "ParallelWorkerMain"); + worker.bgw_main_arg = UInt32GetDatum(dsm_segment_handle(pcxt->seg)); + worker.bgw_notify_pid = MyProcPid; + + /* + * Start workers. + * + * The caller must be able to tolerate ending up with fewer workers than + * expected, so there is no need to throw an error here if registration + * fails. It wouldn't help much anyway, because registering the worker in + * no way guarantees that it will start up and initialize successfully. 
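+ *
+ * (Illustrative sketch, not upstream code: a caller typically inspects
+ * pcxt->nworkers_launched after this function returns and falls back to
+ * doing the work in the leader when it is zero, e.g.
+ *
+ *     LaunchParallelWorkers(pcxt);
+ *     if (pcxt->nworkers_launched == 0)
+ *         ... run the serial code path instead ...
+ * )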
+ */ + for (i = 0; i < pcxt->nworkers_to_launch; ++i) + { + memcpy(worker.bgw_extra, &i, sizeof(int)); + if (!any_registrations_failed && + RegisterDynamicBackgroundWorker(&worker, + &pcxt->worker[i].bgwhandle)) + { + shm_mq_set_handle(pcxt->worker[i].error_mqh, + pcxt->worker[i].bgwhandle); + pcxt->nworkers_launched++; + } + else + { + /* + * If we weren't able to register the worker, then we've bumped up + * against the max_worker_processes limit, and future + * registrations will probably fail too, so arrange to skip them. + * But we still have to execute this code for the remaining slots + * to make sure that we forget about the error queues we budgeted + * for those workers. Otherwise, we'll wait for them to start, + * but they never will. + */ + any_registrations_failed = true; + pcxt->worker[i].bgwhandle = NULL; + shm_mq_detach(pcxt->worker[i].error_mqh); + pcxt->worker[i].error_mqh = NULL; + } + } + + /* + * Now that nworkers_launched has taken its final value, we can initialize + * known_attached_workers. + */ + if (pcxt->nworkers_launched > 0) + { + pcxt->known_attached_workers = + palloc0(sizeof(bool) * pcxt->nworkers_launched); + pcxt->nknown_attached_workers = 0; + } + + /* Restore previous memory context. */ + MemoryContextSwitchTo(oldcontext); +} + +/* + * Wait for all workers to attach to their error queues, and throw an error if + * any worker fails to do this. + * + * Callers can assume that if this function returns successfully, then the + * number of workers given by pcxt->nworkers_launched have initialized and + * attached to their error queues. Whether or not these workers are guaranteed + * to still be running depends on what code the caller asked them to run; + * this function does not guarantee that they have not exited. However, it + * does guarantee that any workers which exited must have done so cleanly and + * after successfully performing the work with which they were tasked. + * + * If this function is not called, then some of the workers that were launched + * may not have been started due to a fork() failure, or may have exited during + * early startup prior to attaching to the error queue, so nworkers_launched + * cannot be viewed as completely reliable. It will never be less than the + * number of workers which actually started, but it might be more. Any workers + * that failed to start will still be discovered by + * WaitForParallelWorkersToFinish and an error will be thrown at that time, + * provided that function is eventually reached. + * + * In general, the leader process should do as much work as possible before + * calling this function. fork() failures and other early-startup failures + * are very uncommon, and having the leader sit idle when it could be doing + * useful work is undesirable. However, if the leader needs to wait for + * all of its workers or for a specific worker, it may want to call this + * function before doing so. If not, it must make some other provision for + * the failure-to-start case, lest it wait forever. On the other hand, a + * leader which never waits for a worker that might not be started yet, or + * at least never does so prior to WaitForParallelWorkersToFinish(), need not + * call this function at all. + */ +void +WaitForParallelWorkersToAttach(ParallelContext *pcxt) +{ + int i; + + /* Skip this if we have no launched workers. */ + if (pcxt->nworkers_launched == 0) + return; + + for (;;) + { + /* + * This will process any parallel messages that are pending and it may + * also throw an error propagated from a worker. 
+ */ + CHECK_FOR_INTERRUPTS(); + + for (i = 0; i < pcxt->nworkers_launched; ++i) + { + BgwHandleStatus status; + shm_mq *mq; + int rc; + pid_t pid; + + if (pcxt->known_attached_workers[i]) + continue; + + /* + * If error_mqh is NULL, then the worker has already exited + * cleanly. + */ + if (pcxt->worker[i].error_mqh == NULL) + { + pcxt->known_attached_workers[i] = true; + ++pcxt->nknown_attached_workers; + continue; + } + + status = GetBackgroundWorkerPid(pcxt->worker[i].bgwhandle, &pid); + if (status == BGWH_STARTED) + { + /* Has the worker attached to the error queue? */ + mq = shm_mq_get_queue(pcxt->worker[i].error_mqh); + if (shm_mq_get_sender(mq) != NULL) + { + /* Yes, so it is known to be attached. */ + pcxt->known_attached_workers[i] = true; + ++pcxt->nknown_attached_workers; + } + } + else if (status == BGWH_STOPPED) + { + /* + * If the worker stopped without attaching to the error queue, + * throw an error. + */ + mq = shm_mq_get_queue(pcxt->worker[i].error_mqh); + if (shm_mq_get_sender(mq) == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("parallel worker failed to initialize"), + errhint("More details may be available in the server log."))); + + pcxt->known_attached_workers[i] = true; + ++pcxt->nknown_attached_workers; + } + else + { + /* + * Worker not yet started, so we must wait. The postmaster + * will notify us if the worker's state changes. Our latch + * might also get set for some other reason, but if so we'll + * just end up waiting for the same worker again. + */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, + -1, WAIT_EVENT_BGWORKER_STARTUP); + + if (rc & WL_LATCH_SET) + ResetLatch(MyLatch); + } + } + + /* If all workers are known to have started, we're done. */ + if (pcxt->nknown_attached_workers >= pcxt->nworkers_launched) + { + Assert(pcxt->nknown_attached_workers == pcxt->nworkers_launched); + break; + } + } +} + +/* + * Wait for all workers to finish computing. + * + * Even if the parallel operation seems to have completed successfully, it's + * important to call this function afterwards. We must not miss any errors + * the workers may have thrown during the parallel operation, or any that they + * may yet throw while shutting down. + * + * Also, we want to update our notion of XactLastRecEnd based on worker + * feedback. + */ +void +WaitForParallelWorkersToFinish(ParallelContext *pcxt) +{ + for (;;) + { + bool anyone_alive = false; + int nfinished = 0; + int i; + + /* + * This will process any parallel messages that are pending, which may + * change the outcome of the loop that follows. It may also throw an + * error propagated from a worker. + */ + CHECK_FOR_INTERRUPTS(); + + for (i = 0; i < pcxt->nworkers_launched; ++i) + { + /* + * If error_mqh is NULL, then the worker has already exited + * cleanly. If we have received a message through error_mqh from + * the worker, we know it started up cleanly, and therefore we're + * certain to be notified when it exits. + */ + if (pcxt->worker[i].error_mqh == NULL) + ++nfinished; + else if (pcxt->known_attached_workers[i]) + { + anyone_alive = true; + break; + } + } + + if (!anyone_alive) + { + /* If all workers are known to have finished, we're done. */ + if (nfinished >= pcxt->nworkers_launched) + { + Assert(nfinished == pcxt->nworkers_launched); + break; + } + + /* + * We didn't detect any living workers, but not all workers are + * known to have exited cleanly. 
Either not all workers have + * launched yet, or maybe some of them failed to start or + * terminated abnormally. + */ + for (i = 0; i < pcxt->nworkers_launched; ++i) + { + pid_t pid; + shm_mq *mq; + + /* + * If the worker is BGWH_NOT_YET_STARTED or BGWH_STARTED, we + * should just keep waiting. If it is BGWH_STOPPED, then + * further investigation is needed. + */ + if (pcxt->worker[i].error_mqh == NULL || + pcxt->worker[i].bgwhandle == NULL || + GetBackgroundWorkerPid(pcxt->worker[i].bgwhandle, + &pid) != BGWH_STOPPED) + continue; + + /* + * Check whether the worker ended up stopped without ever + * attaching to the error queue. If so, the postmaster was + * unable to fork the worker or it exited without initializing + * properly. We must throw an error, since the caller may + * have been expecting the worker to do some work before + * exiting. + */ + mq = shm_mq_get_queue(pcxt->worker[i].error_mqh); + if (shm_mq_get_sender(mq) == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("parallel worker failed to initialize"), + errhint("More details may be available in the server log."))); + + /* + * The worker is stopped, but is attached to the error queue. + * Unless there's a bug somewhere, this will only happen when + * the worker writes messages and terminates after the + * CHECK_FOR_INTERRUPTS() near the top of this function and + * before the call to GetBackgroundWorkerPid(). In that case, + * or latch should have been set as well and the right things + * will happen on the next pass through the loop. + */ + } + } + + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, -1, + WAIT_EVENT_PARALLEL_FINISH); + ResetLatch(MyLatch); + } + + if (pcxt->toc != NULL) + { + FixedParallelState *fps; + + fps = shm_toc_lookup(pcxt->toc, PARALLEL_KEY_FIXED, false); + if (fps->last_xlog_end > XactLastRecEnd) + XactLastRecEnd = fps->last_xlog_end; + } +} + +/* + * Wait for all workers to exit. + * + * This function ensures that workers have been completely shutdown. The + * difference between WaitForParallelWorkersToFinish and this function is + * that the former just ensures that last message sent by a worker backend is + * received by the leader backend whereas this ensures the complete shutdown. + */ +static void +WaitForParallelWorkersToExit(ParallelContext *pcxt) +{ + int i; + + /* Wait until the workers actually die. */ + for (i = 0; i < pcxt->nworkers_launched; ++i) + { + BgwHandleStatus status; + + if (pcxt->worker == NULL || pcxt->worker[i].bgwhandle == NULL) + continue; + + status = WaitForBackgroundWorkerShutdown(pcxt->worker[i].bgwhandle); + + /* + * If the postmaster kicked the bucket, we have no chance of cleaning + * up safely -- we won't be able to tell when our workers are actually + * dead. This doesn't necessitate a PANIC since they will all abort + * eventually, but we can't safely continue this session. + */ + if (status == BGWH_POSTMASTER_DIED) + ereport(FATAL, + (errcode(ERRCODE_ADMIN_SHUTDOWN), + errmsg("postmaster exited during a parallel transaction"))); + + /* Release memory. */ + pfree(pcxt->worker[i].bgwhandle); + pcxt->worker[i].bgwhandle = NULL; + } +} + +/* + * Destroy a parallel context. + * + * If expecting a clean exit, you should use WaitForParallelWorkersToFinish() + * first, before calling this function. When this function is invoked, any + * remaining workers are forcibly killed; the dynamic shared memory segment + * is unmapped; and we then wait (uninterruptibly) for the workers to exit. 
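+ *
+ * (Cross-reference: this is also the cleanup path taken by
+ * AtEOXact_Parallel() and AtEOSubXact_Parallel() below when a transaction
+ * or subtransaction ends while parallel contexts remain in pcxt_list.)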
+ */ +void +DestroyParallelContext(ParallelContext *pcxt) +{ + int i; + + /* + * Be careful about order of operations here! We remove the parallel + * context from the list before we do anything else; otherwise, if an + * error occurs during a subsequent step, we might try to nuke it again + * from AtEOXact_Parallel or AtEOSubXact_Parallel. + */ + dlist_delete(&pcxt->node); + + /* Kill each worker in turn, and forget their error queues. */ + if (pcxt->worker != NULL) + { + for (i = 0; i < pcxt->nworkers_launched; ++i) + { + if (pcxt->worker[i].error_mqh != NULL) + { + TerminateBackgroundWorker(pcxt->worker[i].bgwhandle); + + shm_mq_detach(pcxt->worker[i].error_mqh); + pcxt->worker[i].error_mqh = NULL; + } + } + } + + /* + * If we have allocated a shared memory segment, detach it. This will + * implicitly detach the error queues, and any other shared memory queues, + * stored there. + */ + if (pcxt->seg != NULL) + { + dsm_detach(pcxt->seg); + pcxt->seg = NULL; + } + + /* + * If this parallel context is actually in backend-private memory rather + * than shared memory, free that memory instead. + */ + if (pcxt->private_memory != NULL) + { + pfree(pcxt->private_memory); + pcxt->private_memory = NULL; + } + + /* + * We can't finish transaction commit or abort until all of the workers + * have exited. This means, in particular, that we can't respond to + * interrupts at this stage. + */ + HOLD_INTERRUPTS(); + WaitForParallelWorkersToExit(pcxt); + RESUME_INTERRUPTS(); + + /* Free the worker array itself. */ + if (pcxt->worker != NULL) + { + pfree(pcxt->worker); + pcxt->worker = NULL; + } + + /* Free memory. */ + pfree(pcxt->library_name); + pfree(pcxt->function_name); + pfree(pcxt); +} + +/* + * Are there any parallel contexts currently active? + */ +bool +ParallelContextActive(void) +{ + return !dlist_is_empty(&pcxt_list); +} + +/* + * Handle receipt of an interrupt indicating a parallel worker message. + * + * Note: this is called within a signal handler! All we can do is set + * a flag that will cause the next CHECK_FOR_INTERRUPTS() to invoke + * HandleParallelMessages(). + */ +void +HandleParallelMessageInterrupt(void) +{ + InterruptPending = true; + ParallelMessagePending = true; + SetLatch(MyLatch); +} + +/* + * Handle any queued protocol messages received from parallel workers. + */ +void +HandleParallelMessages(void) +{ + dlist_iter iter; + MemoryContext oldcontext; + + static MemoryContext hpm_context = NULL; + + /* + * This is invoked from ProcessInterrupts(), and since some of the + * functions it calls contain CHECK_FOR_INTERRUPTS(), there is a potential + * for recursive calls if more signals are received while this runs. It's + * unclear that recursive entry would be safe, and it doesn't seem useful + * even if it is safe, so let's block interrupts until done. + */ + HOLD_INTERRUPTS(); + + /* + * Moreover, CurrentMemoryContext might be pointing almost anywhere. We + * don't want to risk leaking data into long-lived contexts, so let's do + * our work here in a private context that we can reset on each use. + */ + if (hpm_context == NULL) /* first time through? */ + hpm_context = AllocSetContextCreate(TopMemoryContext, + "HandleParallelMessages", + ALLOCSET_DEFAULT_SIZES); + else + MemoryContextReset(hpm_context); + + oldcontext = MemoryContextSwitchTo(hpm_context); + + /* OK to process messages. Reset the flag saying there are more to do. 
*/ + ParallelMessagePending = false; + + dlist_foreach(iter, &pcxt_list) + { + ParallelContext *pcxt; + int i; + + pcxt = dlist_container(ParallelContext, node, iter.cur); + if (pcxt->worker == NULL) + continue; + + for (i = 0; i < pcxt->nworkers_launched; ++i) + { + /* + * Read as many messages as we can from each worker, but stop when + * either (1) the worker's error queue goes away, which can happen + * if we receive a Terminate message from the worker; or (2) no + * more messages can be read from the worker without blocking. + */ + while (pcxt->worker[i].error_mqh != NULL) + { + shm_mq_result res; + Size nbytes; + void *data; + + res = shm_mq_receive(pcxt->worker[i].error_mqh, &nbytes, + &data, true); + if (res == SHM_MQ_WOULD_BLOCK) + break; + else if (res == SHM_MQ_SUCCESS) + { + StringInfoData msg; + + initStringInfo(&msg); + appendBinaryStringInfo(&msg, data, nbytes); + HandleParallelMessage(pcxt, i, &msg); + pfree(msg.data); + } + else + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("lost connection to parallel worker"))); + } + } + } + + MemoryContextSwitchTo(oldcontext); + + /* Might as well clear the context on our way out */ + MemoryContextReset(hpm_context); + + RESUME_INTERRUPTS(); +} + +/* + * Handle a single protocol message received from a single parallel worker. + */ +static void +HandleParallelMessage(ParallelContext *pcxt, int i, StringInfo msg) +{ + char msgtype; + + if (pcxt->known_attached_workers != NULL && + !pcxt->known_attached_workers[i]) + { + pcxt->known_attached_workers[i] = true; + pcxt->nknown_attached_workers++; + } + + msgtype = pq_getmsgbyte(msg); + + switch (msgtype) + { + case 'K': /* BackendKeyData */ + { + int32 pid = pq_getmsgint(msg, 4); + + (void) pq_getmsgint(msg, 4); /* discard cancel key */ + (void) pq_getmsgend(msg); + pcxt->worker[i].pid = pid; + break; + } + + case 'E': /* ErrorResponse */ + case 'N': /* NoticeResponse */ + { + ErrorData edata; + ErrorContextCallback *save_error_context_stack; + + /* Parse ErrorResponse or NoticeResponse. */ + pq_parse_errornotice(msg, &edata); + + /* Death of a worker isn't enough justification for suicide. */ + edata.elevel = Min(edata.elevel, ERROR); + + /* + * If desired, add a context line to show that this is a + * message propagated from a parallel worker. Otherwise, it + * can sometimes be confusing to understand what actually + * happened. (We don't do this in FORCE_PARALLEL_REGRESS mode + * because it causes test-result instability depending on + * whether a parallel worker is actually used or not.) + */ + if (force_parallel_mode != FORCE_PARALLEL_REGRESS) + { + if (edata.context) + edata.context = psprintf("%s\n%s", edata.context, + _("parallel worker")); + else + edata.context = pstrdup(_("parallel worker")); + } + + /* + * Context beyond that should use the error context callbacks + * that were in effect when the ParallelContext was created, + * not the current ones. + */ + save_error_context_stack = error_context_stack; + error_context_stack = pcxt->error_context_stack; + + /* Rethrow error or print notice. */ + ThrowErrorData(&edata); + + /* Not an error, so restore previous context stack. */ + error_context_stack = save_error_context_stack; + + break; + } + + case 'A': /* NotifyResponse */ + { + /* Propagate NotifyResponse. 
*/ + int32 pid; + const char *channel; + const char *payload; + + pid = pq_getmsgint(msg, 4); + channel = pq_getmsgrawstring(msg); + payload = pq_getmsgrawstring(msg); + pq_endmessage(msg); + + NotifyMyFrontEnd(channel, payload, pid); + + break; + } + + case 'X': /* Terminate, indicating clean exit */ + { + shm_mq_detach(pcxt->worker[i].error_mqh); + pcxt->worker[i].error_mqh = NULL; + break; + } + + default: + { + elog(ERROR, "unrecognized message type received from parallel worker: %c (message length %d bytes)", + msgtype, msg->len); + } + } +} + +/* + * End-of-subtransaction cleanup for parallel contexts. + * + * Currently, it's forbidden to enter or leave a subtransaction while + * parallel mode is in effect, so we could just blow away everything. But + * we may want to relax that restriction in the future, so this code + * contemplates that there may be multiple subtransaction IDs in pcxt_list. + */ +void +AtEOSubXact_Parallel(bool isCommit, SubTransactionId mySubId) +{ + while (!dlist_is_empty(&pcxt_list)) + { + ParallelContext *pcxt; + + pcxt = dlist_head_element(ParallelContext, node, &pcxt_list); + if (pcxt->subid != mySubId) + break; + if (isCommit) + elog(WARNING, "leaked parallel context"); + DestroyParallelContext(pcxt); + } +} + +/* + * End-of-transaction cleanup for parallel contexts. + */ +void +AtEOXact_Parallel(bool isCommit) +{ + while (!dlist_is_empty(&pcxt_list)) + { + ParallelContext *pcxt; + + pcxt = dlist_head_element(ParallelContext, node, &pcxt_list); + if (isCommit) + elog(WARNING, "leaked parallel context"); + DestroyParallelContext(pcxt); + } +} + +/* + * Main entrypoint for parallel workers. + */ +void +ParallelWorkerMain(Datum main_arg) +{ + dsm_segment *seg; + shm_toc *toc; + FixedParallelState *fps; + char *error_queue_space; + shm_mq *mq; + shm_mq_handle *mqh; + char *libraryspace; + char *entrypointstate; + char *library_name; + char *function_name; + parallel_worker_main_type entrypt; + char *gucspace; + char *combocidspace; + char *tsnapspace; + char *asnapspace; + char *tstatespace; + char *pendingsyncsspace; + char *reindexspace; + char *relmapperspace; + char *uncommittedenumsspace; + StringInfoData msgbuf; + char *session_dsm_handle_space; + Snapshot tsnapshot; + Snapshot asnapshot; + + /* Set flag to indicate that we're initializing a parallel worker. */ + InitializingParallelWorker = true; + + /* Establish signal handlers. */ + pqsignal(SIGTERM, die); + BackgroundWorkerUnblockSignals(); + + /* Determine and set our parallel worker number. */ + Assert(ParallelWorkerNumber == -1); + memcpy(&ParallelWorkerNumber, MyBgworkerEntry->bgw_extra, sizeof(int)); + + /* Set up a memory context to work in, just for cleanliness. */ + CurrentMemoryContext = AllocSetContextCreate(TopMemoryContext, + "Parallel worker", + ALLOCSET_DEFAULT_SIZES); + + /* + * Attach to the dynamic shared memory segment for the parallel query, and + * find its table of contents. + * + * Note: at this point, we have not created any ResourceOwner in this + * process. This will result in our DSM mapping surviving until process + * exit, which is fine. If there were a ResourceOwner, it would acquire + * ownership of the mapping, but we have no need for that. 
+ */ + seg = dsm_attach(DatumGetUInt32(main_arg)); + if (seg == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not map dynamic shared memory segment"))); + toc = shm_toc_attach(PARALLEL_MAGIC, dsm_segment_address(seg)); + if (toc == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid magic number in dynamic shared memory segment"))); + + /* Look up fixed parallel state. */ + fps = shm_toc_lookup(toc, PARALLEL_KEY_FIXED, false); + MyFixedParallelState = fps; + + /* Arrange to signal the leader if we exit. */ + ParallelLeaderPid = fps->parallel_leader_pid; + ParallelLeaderBackendId = fps->parallel_leader_backend_id; + before_shmem_exit(ParallelWorkerShutdown, PointerGetDatum(seg)); + + /* + * Now we can find and attach to the error queue provided for us. That's + * good, because until we do that, any errors that happen here will not be + * reported back to the process that requested that this worker be + * launched. + */ + error_queue_space = shm_toc_lookup(toc, PARALLEL_KEY_ERROR_QUEUE, false); + mq = (shm_mq *) (error_queue_space + + ParallelWorkerNumber * PARALLEL_ERROR_QUEUE_SIZE); + shm_mq_set_sender(mq, MyProc); + mqh = shm_mq_attach(mq, seg, NULL); + pq_redirect_to_shm_mq(seg, mqh); + pq_set_parallel_leader(fps->parallel_leader_pid, + fps->parallel_leader_backend_id); + + /* + * Send a BackendKeyData message to the process that initiated parallelism + * so that it has access to our PID before it receives any other messages + * from us. Our cancel key is sent, too, since that's the way the + * protocol message is defined, but it won't actually be used for anything + * in this case. + */ + pq_beginmessage(&msgbuf, 'K'); + pq_sendint32(&msgbuf, (int32) MyProcPid); + pq_sendint32(&msgbuf, (int32) MyCancelKey); + pq_endmessage(&msgbuf); + + /* + * Hooray! Primary initialization is complete. Now, we need to set up our + * backend-local state to match the original backend. + */ + + /* + * Join locking group. We must do this before anything that could try to + * acquire a heavyweight lock, because any heavyweight locks acquired to + * this point could block either directly against the parallel group + * leader or against some process which in turn waits for a lock that + * conflicts with the parallel group leader, causing an undetected + * deadlock. (If we can't join the lock group, the leader has gone away, + * so just exit quietly.) + */ + if (!BecomeLockGroupMember(fps->parallel_leader_pgproc, + fps->parallel_leader_pid)) + return; + + /* + * Restore transaction and statement start-time timestamps. This must + * happen before anything that would start a transaction, else asserts in + * xact.c will fire. + */ + SetParallelStartTimestamps(fps->xact_ts, fps->stmt_ts); + + /* + * Identify the entry point to be called. In theory this could result in + * loading an additional library, though most likely the entry point is in + * the core backend or in a library we just loaded. + */ + entrypointstate = shm_toc_lookup(toc, PARALLEL_KEY_ENTRYPOINT, false); + library_name = entrypointstate; + function_name = entrypointstate + strlen(library_name) + 1; + + entrypt = LookupParallelWorkerFunction(library_name, function_name); + + /* Restore database connection. */ + BackgroundWorkerInitializeConnectionByOid(fps->database_id, + fps->authenticated_user_id, + 0); + + /* + * Set the client encoding to the database encoding, since that is what + * the leader will expect. 
+ */ + SetClientEncoding(GetDatabaseEncoding()); + + /* + * Load libraries that were loaded by original backend. We want to do + * this before restoring GUCs, because the libraries might define custom + * variables. + */ + libraryspace = shm_toc_lookup(toc, PARALLEL_KEY_LIBRARY, false); + StartTransactionCommand(); + RestoreLibraryState(libraryspace); + + /* Restore GUC values from launching backend. */ + gucspace = shm_toc_lookup(toc, PARALLEL_KEY_GUC, false); + RestoreGUCState(gucspace); + CommitTransactionCommand(); + + /* Crank up a transaction state appropriate to a parallel worker. */ + tstatespace = shm_toc_lookup(toc, PARALLEL_KEY_TRANSACTION_STATE, false); + StartParallelWorkerTransaction(tstatespace); + + /* Restore combo CID state. */ + combocidspace = shm_toc_lookup(toc, PARALLEL_KEY_COMBO_CID, false); + RestoreComboCIDState(combocidspace); + + /* Attach to the per-session DSM segment and contained objects. */ + session_dsm_handle_space = + shm_toc_lookup(toc, PARALLEL_KEY_SESSION_DSM, false); + AttachSession(*(dsm_handle *) session_dsm_handle_space); + + /* + * If the transaction isolation level is REPEATABLE READ or SERIALIZABLE, + * the leader has serialized the transaction snapshot and we must restore + * it. At lower isolation levels, there is no transaction-lifetime + * snapshot, but we need TransactionXmin to get set to a value which is + * less than or equal to the xmin of every snapshot that will be used by + * this worker. The easiest way to accomplish that is to install the + * active snapshot as the transaction snapshot. Code running in this + * parallel worker might take new snapshots via GetTransactionSnapshot() + * or GetLatestSnapshot(), but it shouldn't have any way of acquiring a + * snapshot older than the active snapshot. + */ + asnapspace = shm_toc_lookup(toc, PARALLEL_KEY_ACTIVE_SNAPSHOT, false); + tsnapspace = shm_toc_lookup(toc, PARALLEL_KEY_TRANSACTION_SNAPSHOT, true); + asnapshot = RestoreSnapshot(asnapspace); + tsnapshot = tsnapspace ? RestoreSnapshot(tsnapspace) : asnapshot; + RestoreTransactionSnapshot(tsnapshot, + fps->parallel_leader_pgproc); + PushActiveSnapshot(asnapshot); + + /* + * We've changed which tuples we can see, and must therefore invalidate + * system caches. + */ + InvalidateSystemCaches(); + + /* + * Restore current role id. Skip verifying whether session user is + * allowed to become this role and blindly restore the leader's state for + * current role. + */ + SetCurrentRoleId(fps->outer_user_id, fps->is_superuser); + + /* Restore user ID and security context. */ + SetUserIdAndSecContext(fps->current_user_id, fps->sec_context); + + /* Restore temp-namespace state to ensure search path matches leader's. */ + SetTempNamespaceState(fps->temp_namespace_id, + fps->temp_toast_namespace_id); + + /* Restore pending syncs. */ + pendingsyncsspace = shm_toc_lookup(toc, PARALLEL_KEY_PENDING_SYNCS, + false); + RestorePendingSyncs(pendingsyncsspace); + + /* Restore reindex state. */ + reindexspace = shm_toc_lookup(toc, PARALLEL_KEY_REINDEX_STATE, false); + RestoreReindexState(reindexspace); + + /* Restore relmapper state. */ + relmapperspace = shm_toc_lookup(toc, PARALLEL_KEY_RELMAPPER_STATE, false); + RestoreRelationMap(relmapperspace); + + /* Restore uncommitted enums. */ + uncommittedenumsspace = shm_toc_lookup(toc, PARALLEL_KEY_UNCOMMITTEDENUMS, + false); + RestoreUncommittedEnums(uncommittedenumsspace); + + /* Attach to the leader's serializable transaction, if SERIALIZABLE. 
*/ + AttachSerializableXact(fps->serializable_xact_handle); + + /* + * We've initialized all of our state now; nothing should change + * hereafter. + */ + InitializingParallelWorker = false; + EnterParallelMode(); + + /* + * Time to do the real work: invoke the caller-supplied code. + */ + entrypt(seg, toc); + + /* Must exit parallel mode to pop active snapshot. */ + ExitParallelMode(); + + /* Must pop active snapshot so snapmgr.c doesn't complain. */ + PopActiveSnapshot(); + + /* Shut down the parallel-worker transaction. */ + EndParallelWorkerTransaction(); + + /* Detach from the per-session DSM segment. */ + DetachSession(); + + /* Report success. */ + pq_putmessage('X', NULL, 0); +} + +/* + * Update shared memory with the ending location of the last WAL record we + * wrote, if it's greater than the value already stored there. + */ +void +ParallelWorkerReportLastRecEnd(XLogRecPtr last_xlog_end) +{ + FixedParallelState *fps = MyFixedParallelState; + + Assert(fps != NULL); + SpinLockAcquire(&fps->mutex); + if (fps->last_xlog_end < last_xlog_end) + fps->last_xlog_end = last_xlog_end; + SpinLockRelease(&fps->mutex); +} + +/* + * Make sure the leader tries to read from our error queue one more time. + * This guards against the case where we exit uncleanly without sending an + * ErrorResponse to the leader, for example because some code calls proc_exit + * directly. + * + * Also explicitly detach from dsm segment so that subsystems using + * on_dsm_detach() have a chance to send stats before the stats subsystem is + * shut down as part of a before_shmem_exit() hook. + * + * One might think this could instead be solved by carefully ordering the + * attaching to dsm segments, so that the pgstats segments get detached from + * later than the parallel query one. That turns out to not work because the + * stats hash might need to grow which can cause new segments to be allocated, + * which then will be detached from earlier. + */ +static void +ParallelWorkerShutdown(int code, Datum arg) +{ + SendProcSignal(ParallelLeaderPid, + PROCSIG_PARALLEL_MESSAGE, + ParallelLeaderBackendId); + + dsm_detach((dsm_segment *) DatumGetPointer(arg)); +} + +/* + * Look up (and possibly load) a parallel worker entry point function. + * + * For functions contained in the core code, we use library name "postgres" + * and consult the InternalParallelWorkers array. External functions are + * looked up, and loaded if necessary, using load_external_function(). + * + * The point of this is to pass function names as strings across process + * boundaries. We can't pass actual function addresses because of the + * possibility that the function has been loaded at a different address + * in a different process. This is obviously a hazard for functions in + * loadable libraries, but it can happen even for functions in the core code + * on platforms using EXEC_BACKEND (e.g., Windows). + * + * At some point it might be worthwhile to get rid of InternalParallelWorkers[] + * in favor of applying load_external_function() for core functions too; + * but that raises portability issues that are not worth addressing now. + */ +static parallel_worker_main_type +LookupParallelWorkerFunction(const char *libraryname, const char *funcname) +{ + /* + * If the function is to be loaded from postgres itself, search the + * InternalParallelWorkers array. 
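As the comment above explains, the leader identifies the worker entrypoint by a (library name, function name) string pair, and the function itself must match parallel_worker_main_type. Below is a hedged sketch of such an entrypoint; the function name, the TOC key, and the shared-state layout are hypothetical placeholders, not part of parallel.c.

#include "postgres.h"

#include "access/parallel.h"
#include "storage/dsm.h"
#include "storage/shm_toc.h"

/* Hypothetical TOC key; real callers define their own 64-bit keys. */
#define MY_SKETCH_TOC_KEY	UINT64CONST(0xF000000000000001)

/*
 * Sketch of an extension entrypoint that LookupParallelWorkerFunction()
 * would resolve from CreateParallelContext("myextension", "my_worker_main", n).
 */
void
my_worker_main(dsm_segment *seg, shm_toc *toc)
{
	char	   *shared_state;

	/* Fetch whatever state the leader published in the table of contents. */
	shared_state = shm_toc_lookup(toc, MY_SKETCH_TOC_KEY, false);

	/* ... perform this worker's share of the job using shared_state ... */
	(void) shared_state;
}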
+ */ + if (strcmp(libraryname, "postgres") == 0) + { + int i; + + for (i = 0; i < lengthof(InternalParallelWorkers); i++) + { + if (strcmp(InternalParallelWorkers[i].fn_name, funcname) == 0) + return InternalParallelWorkers[i].fn_addr; + } + + /* We can only reach this by programming error. */ + elog(ERROR, "internal function \"%s\" not found", funcname); + } + + /* Otherwise load from external library. */ + return (parallel_worker_main_type) + load_external_function(libraryname, funcname, true, NULL); +} diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c new file mode 100644 index 0000000..6bb4de3 --- /dev/null +++ b/src/backend/access/transam/rmgr.c @@ -0,0 +1,161 @@ +/* + * rmgr.c + * + * Resource managers definition + * + * src/backend/access/transam/rmgr.c + */ +#include "postgres.h" + +#include "access/brin_xlog.h" +#include "access/clog.h" +#include "access/commit_ts.h" +#include "access/generic_xlog.h" +#include "access/ginxlog.h" +#include "access/gistxlog.h" +#include "access/hash_xlog.h" +#include "access/heapam_xlog.h" +#include "access/multixact.h" +#include "access/nbtxlog.h" +#include "access/spgxlog.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "catalog/storage_xlog.h" +#include "commands/dbcommands_xlog.h" +#include "commands/sequence.h" +#include "commands/tablespace.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "replication/decode.h" +#include "replication/message.h" +#include "replication/origin.h" +#include "storage/standby.h" +#include "utils/builtins.h" +#include "utils/relmapper.h" + +/* must be kept in sync with RmgrData definition in xlog_internal.h */ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode) \ + { name, redo, desc, identify, startup, cleanup, mask, decode }, + +RmgrData RmgrTable[RM_MAX_ID + 1] = { +#include "access/rmgrlist.h" +}; + +/* + * Start up all resource managers. + */ +void +RmgrStartup(void) +{ + for (int rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (!RmgrIdExists(rmid)) + continue; + + if (RmgrTable[rmid].rm_startup != NULL) + RmgrTable[rmid].rm_startup(); + } +} + +/* + * Clean up all resource managers. + */ +void +RmgrCleanup(void) +{ + for (int rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (!RmgrIdExists(rmid)) + continue; + + if (RmgrTable[rmid].rm_cleanup != NULL) + RmgrTable[rmid].rm_cleanup(); + } +} + +/* + * Emit ERROR when we encounter a record with an RmgrId we don't + * recognize. + */ +void +RmgrNotFound(RmgrId rmid) +{ + ereport(ERROR, (errmsg("resource manager with ID %d not registered", rmid), + errhint("Include the extension module that implements this resource manager in shared_preload_libraries."))); +} + +/* + * Register a new custom WAL resource manager. + * + * Resource manager IDs must be globally unique across all extensions. Refer + * to https://wiki.postgresql.org/wiki/CustomWALResourceManagers to reserve a + * unique RmgrId for your extension, to avoid conflicts with other extension + * developers. During development, use RM_EXPERIMENTAL_ID to avoid needlessly + * reserving a new ID. 
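A hedged sketch of the registration described above, as a module loaded via shared_preload_libraries might issue it from _PG_init(): the resource manager name is a placeholder and the callbacks are stubs, not a working resource manager.

#include "postgres.h"

#include "access/rmgr.h"
#include "access/xlog_internal.h"
#include "access/xlogreader.h"
#include "fmgr.h"
#include "lib/stringinfo.h"

PG_MODULE_MAGIC;

/* Stub callbacks; a real module would replay and describe its WAL records. */
static void
my_rmgr_redo(XLogReaderState *record)
{
}

static void
my_rmgr_desc(StringInfo buf, XLogReaderState *record)
{
}

static const char *
my_rmgr_identify(uint8 info)
{
	return NULL;
}

static RmgrData my_rmgr = {
	.rm_name = "my_custom_rmgr",
	.rm_redo = my_rmgr_redo,
	.rm_desc = my_rmgr_desc,
	.rm_identify = my_rmgr_identify,
};

void
_PG_init(void)
{
	/* Only legal while shared_preload_libraries are being processed. */
	RegisterCustomRmgr(RM_EXPERIMENTAL_ID, &my_rmgr);
}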
+ */ +void +RegisterCustomRmgr(RmgrId rmid, RmgrData *rmgr) +{ + if (rmgr->rm_name == NULL || strlen(rmgr->rm_name) == 0) + ereport(ERROR, (errmsg("custom resource manager name is invalid"), + errhint("Provide a non-empty name for the custom resource manager."))); + + if (!RmgrIdIsCustom(rmid)) + ereport(ERROR, (errmsg("custom resource manager ID %d is out of range", rmid), + errhint("Provide a custom resource manager ID between %d and %d.", + RM_MIN_CUSTOM_ID, RM_MAX_CUSTOM_ID))); + + if (!process_shared_preload_libraries_in_progress) + ereport(ERROR, + (errmsg("failed to register custom resource manager \"%s\" with ID %d", rmgr->rm_name, rmid), + errdetail("Custom resource manager must be registered while initializing modules in shared_preload_libraries."))); + + if (RmgrTable[rmid].rm_name != NULL) + ereport(ERROR, + (errmsg("failed to register custom resource manager \"%s\" with ID %d", rmgr->rm_name, rmid), + errdetail("Custom resource manager \"%s\" already registered with the same ID.", + RmgrTable[rmid].rm_name))); + + /* check for existing rmgr with the same name */ + for (int existing_rmid = 0; existing_rmid <= RM_MAX_ID; existing_rmid++) + { + if (!RmgrIdExists(existing_rmid)) + continue; + + if (!pg_strcasecmp(RmgrTable[existing_rmid].rm_name, rmgr->rm_name)) + ereport(ERROR, + (errmsg("failed to register custom resource manager \"%s\" with ID %d", rmgr->rm_name, rmid), + errdetail("Existing resource manager with ID %d has the same name.", existing_rmid))); + } + + /* register it */ + RmgrTable[rmid] = *rmgr; + ereport(LOG, + (errmsg("registered custom resource manager \"%s\" with ID %d", + rmgr->rm_name, rmid))); +} + +/* SQL SRF showing loaded resource managers */ +Datum +pg_get_wal_resource_managers(PG_FUNCTION_ARGS) +{ +#define PG_GET_RESOURCE_MANAGERS_COLS 3 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + Datum values[PG_GET_RESOURCE_MANAGERS_COLS]; + bool nulls[PG_GET_RESOURCE_MANAGERS_COLS] = {0}; + + InitMaterializedSRF(fcinfo, 0); + + for (int rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (!RmgrIdExists(rmid)) + continue; + values[0] = Int32GetDatum(rmid); + values[1] = CStringGetTextDatum(GetRmgr(rmid).rm_name); + values[2] = BoolGetDatum(RmgrIdIsBuiltin(rmid)); + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + } + + return (Datum) 0; +} diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c new file mode 100644 index 0000000..af57fe9 --- /dev/null +++ b/src/backend/access/transam/slru.c @@ -0,0 +1,1615 @@ +/*------------------------------------------------------------------------- + * + * slru.c + * Simple LRU buffering for transaction status logfiles + * + * We use a simple least-recently-used scheme to manage a pool of page + * buffers. Under ordinary circumstances we expect that write + * traffic will occur mostly to the latest page (and to the just-prior + * page, soon after a page transition). Read traffic will probably touch + * a larger span of pages, but in any case a fairly small number of page + * buffers should be sufficient. So, we just search the buffers using plain + * linear search; there's no need for a hashtable or anything fancy. + * The management algorithm is straight LRU except that we will never swap + * out the latest page (since we know it's going to be hit again eventually). + * + * We use a control LWLock to protect the shared data structures, plus + * per-buffer LWLocks that synchronize I/O for each buffer. 
The control lock + * must be held to examine or modify any shared state. A process that is + * reading in or writing out a page buffer does not hold the control lock, + * only the per-buffer lock for the buffer it is working on. + * + * "Holding the control lock" means exclusive lock in all cases except for + * SimpleLruReadPage_ReadOnly(); see comments for SlruRecentlyUsed() for + * the implications of that. + * + * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively + * before releasing the control lock. The per-buffer lock is released after + * completing the I/O, re-acquiring the control lock, and updating the shared + * state. (Deadlock is not possible here, because we never try to initiate + * I/O when someone else is already doing I/O on the same buffer.) + * To wait for I/O to complete, release the control lock, acquire the + * per-buffer lock in shared mode, immediately release the per-buffer lock, + * reacquire the control lock, and then recheck state (since arbitrary things + * could have happened while we didn't have the lock). + * + * As with the regular buffer manager, it is possible for another process + * to re-dirty a page that is currently being written out. This is handled + * by re-setting the page's page_dirty flag. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/slru.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <fcntl.h> +#include <sys/stat.h> +#include <unistd.h> + +#include "access/slru.h" +#include "access/transam.h" +#include "access/xlog.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/fd.h" +#include "storage/shmem.h" + +#define SlruFileName(ctl, path, seg) \ + snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg) + +/* + * During SimpleLruWriteAll(), we will usually not need to write more than one + * or two physical files, but we may need to write several pages per file. We + * can consolidate the I/O requests by leaving files open until control returns + * to SimpleLruWriteAll(). This data structure remembers which files are open. + */ +#define MAX_WRITEALL_BUFFERS 16 + +typedef struct SlruWriteAllData +{ + int num_files; /* # files actually open */ + int fd[MAX_WRITEALL_BUFFERS]; /* their FD's */ + int segno[MAX_WRITEALL_BUFFERS]; /* their log seg#s */ +} SlruWriteAllData; + +typedef struct SlruWriteAllData *SlruWriteAll; + +/* + * Populate a file tag describing a segment file. We only use the segment + * number, since we can derive everything else we need by having separate + * sync handler functions for clog, multixact etc. + */ +#define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \ +( \ + memset(&(a), 0, sizeof(FileTag)), \ + (a).handler = (xx_handler), \ + (a).segno = (xx_segno) \ +) + +/* + * Macro to mark a buffer slot "most recently used". Note multiple evaluation + * of arguments! + * + * The reason for the if-test is that there are often many consecutive + * accesses to the same page (particularly the latest page). By suppressing + * useless increments of cur_lru_count, we reduce the probability that old + * pages' counts will "wrap around" and make them appear recently used. + * + * We allow this code to be executed concurrently by multiple processes within + * SimpleLruReadPage_ReadOnly(). 
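For concreteness, the SlruFileName() macro above together with the per-segment page count determines where a logical page lives on disk; the sketch below mirrors the arithmetic used later by SlruPhysicalReadPage() and SlruPhysicalWritePage(). It is an illustration only, and the helper name is hypothetical.

#include "postgres.h"

#include "access/slru.h"

/*
 * Sketch: map a logical SLRU page number to its segment file and byte offset.
 * Each segment holds SLRU_PAGES_PER_SEGMENT (32) pages of BLCKSZ bytes.
 */
static void
locate_slru_page(SlruCtl ctl, int pageno, char *path, off_t *offset)
{
	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;

	/* Same layout as the SlruFileName() macro: e.g. "pg_xact/0000". */
	snprintf(path, MAXPGPATH, "%s/%04X", ctl->Dir, segno);
	*offset = (off_t) rpageno * BLCKSZ;
}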
As long as int reads and writes are atomic, + * this should not cause any completely-bogus values to enter the computation. + * However, it is possible for either cur_lru_count or individual + * page_lru_count entries to be "reset" to lower values than they should have, + * in case a process is delayed while it executes this macro. With care in + * SlruSelectLRUPage(), this does little harm, and in any case the absolute + * worst possible consequence is a nonoptimal choice of page to evict. The + * gain from allowing concurrent reads of SLRU pages seems worth it. + */ +#define SlruRecentlyUsed(shared, slotno) \ + do { \ + int new_lru_count = (shared)->cur_lru_count; \ + if (new_lru_count != (shared)->page_lru_count[slotno]) { \ + (shared)->cur_lru_count = ++new_lru_count; \ + (shared)->page_lru_count[slotno] = new_lru_count; \ + } \ + } while (0) + +/* Saved info for SlruReportIOError */ +typedef enum +{ + SLRU_OPEN_FAILED, + SLRU_SEEK_FAILED, + SLRU_READ_FAILED, + SLRU_WRITE_FAILED, + SLRU_FSYNC_FAILED, + SLRU_CLOSE_FAILED +} SlruErrorCause; + +static SlruErrorCause slru_errcause; +static int slru_errno; + + +static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno); +static void SimpleLruWaitIO(SlruCtl ctl, int slotno); +static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata); +static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno); +static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, + SlruWriteAll fdata); +static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid); +static int SlruSelectLRUPage(SlruCtl ctl, int pageno); + +static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, + int segpage, void *data); +static void SlruInternalDeleteSegment(SlruCtl ctl, int segno); + +/* + * Initialization of shared memory + */ + +Size +SimpleLruShmemSize(int nslots, int nlsns) +{ + Size sz; + + /* we assume nslots isn't so large as to risk overflow */ + sz = MAXALIGN(sizeof(SlruSharedData)); + sz += MAXALIGN(nslots * sizeof(char *)); /* page_buffer[] */ + sz += MAXALIGN(nslots * sizeof(SlruPageStatus)); /* page_status[] */ + sz += MAXALIGN(nslots * sizeof(bool)); /* page_dirty[] */ + sz += MAXALIGN(nslots * sizeof(int)); /* page_number[] */ + sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */ + sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */ + + if (nlsns > 0) + sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */ + + return BUFFERALIGN(sz) + BLCKSZ * nslots; +} + +/* + * Initialize, or attach to, a simple LRU cache in shared memory. + * + * ctl: address of local (unshared) control structure. + * name: name of SLRU. (This is user-visible, pick with care!) + * nslots: number of page slots to use. + * nlsns: number of LSN groups per page (set to zero if not relevant). + * ctllock: LWLock to use to control access to the shared control structure. + * subdir: PGDATA-relative subdirectory that will contain the files. + * tranche_id: LWLock tranche ID to use for the SLRU's per-buffer LWLocks. 
+ * sync_handler: which set of functions to use to handle sync requests + */ +void +SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, + LWLock *ctllock, const char *subdir, int tranche_id, + SyncRequestHandler sync_handler) +{ + SlruShared shared; + bool found; + + shared = (SlruShared) ShmemInitStruct(name, + SimpleLruShmemSize(nslots, nlsns), + &found); + + if (!IsUnderPostmaster) + { + /* Initialize locks and shared memory area */ + char *ptr; + Size offset; + int slotno; + + Assert(!found); + + memset(shared, 0, sizeof(SlruSharedData)); + + shared->ControlLock = ctllock; + + shared->num_slots = nslots; + shared->lsn_groups_per_page = nlsns; + + shared->cur_lru_count = 0; + + /* shared->latest_page_number will be set later */ + + shared->slru_stats_idx = pgstat_get_slru_index(name); + + ptr = (char *) shared; + offset = MAXALIGN(sizeof(SlruSharedData)); + shared->page_buffer = (char **) (ptr + offset); + offset += MAXALIGN(nslots * sizeof(char *)); + shared->page_status = (SlruPageStatus *) (ptr + offset); + offset += MAXALIGN(nslots * sizeof(SlruPageStatus)); + shared->page_dirty = (bool *) (ptr + offset); + offset += MAXALIGN(nslots * sizeof(bool)); + shared->page_number = (int *) (ptr + offset); + offset += MAXALIGN(nslots * sizeof(int)); + shared->page_lru_count = (int *) (ptr + offset); + offset += MAXALIGN(nslots * sizeof(int)); + + /* Initialize LWLocks */ + shared->buffer_locks = (LWLockPadded *) (ptr + offset); + offset += MAXALIGN(nslots * sizeof(LWLockPadded)); + + if (nlsns > 0) + { + shared->group_lsn = (XLogRecPtr *) (ptr + offset); + offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); + } + + ptr += BUFFERALIGN(offset); + for (slotno = 0; slotno < nslots; slotno++) + { + LWLockInitialize(&shared->buffer_locks[slotno].lock, + tranche_id); + + shared->page_buffer[slotno] = ptr; + shared->page_status[slotno] = SLRU_PAGE_EMPTY; + shared->page_dirty[slotno] = false; + shared->page_lru_count[slotno] = 0; + ptr += BLCKSZ; + } + + /* Should fit to estimated shmem size */ + Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns)); + } + else + Assert(found); + + /* + * Initialize the unshared control struct, including directory path. We + * assume caller set PagePrecedes. + */ + ctl->shared = shared; + ctl->sync_handler = sync_handler; + strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir)); +} + +/* + * Initialize (or reinitialize) a page to zeroes. + * + * The page is not actually written, just set up in shared memory. + * The slot number of the new page is returned. + * + * Control lock must be held at entry, and will be held at exit. 
+ */ +int +SimpleLruZeroPage(SlruCtl ctl, int pageno) +{ + SlruShared shared = ctl->shared; + int slotno; + + /* Find a suitable buffer slot for the page */ + slotno = SlruSelectLRUPage(ctl, pageno); + Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY || + (shared->page_status[slotno] == SLRU_PAGE_VALID && + !shared->page_dirty[slotno]) || + shared->page_number[slotno] == pageno); + + /* Mark the slot as containing this page */ + shared->page_number[slotno] = pageno; + shared->page_status[slotno] = SLRU_PAGE_VALID; + shared->page_dirty[slotno] = true; + SlruRecentlyUsed(shared, slotno); + + /* Set the buffer to zeroes */ + MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + + /* Set the LSNs for this new page to zero */ + SimpleLruZeroLSNs(ctl, slotno); + + /* Assume this page is now the latest active page */ + shared->latest_page_number = pageno; + + /* update the stats counter of zeroed pages */ + pgstat_count_slru_page_zeroed(shared->slru_stats_idx); + + return slotno; +} + +/* + * Zero all the LSNs we store for this slru page. + * + * This should be called each time we create a new page, and each time we read + * in a page from disk into an existing buffer. (Such an old page cannot + * have any interesting LSNs, since we'd have flushed them before writing + * the page in the first place.) + * + * This assumes that InvalidXLogRecPtr is bitwise-all-0. + */ +static void +SimpleLruZeroLSNs(SlruCtl ctl, int slotno) +{ + SlruShared shared = ctl->shared; + + if (shared->lsn_groups_per_page > 0) + MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0, + shared->lsn_groups_per_page * sizeof(XLogRecPtr)); +} + +/* + * Wait for any active I/O on a page slot to finish. (This does not + * guarantee that new I/O hasn't been started before we return, though. + * In fact the slot might not even contain the same page anymore.) + * + * Control lock must be held at entry, and will be held at exit. + */ +static void +SimpleLruWaitIO(SlruCtl ctl, int slotno) +{ + SlruShared shared = ctl->shared; + + /* See notes at top of file */ + LWLockRelease(shared->ControlLock); + LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED); + LWLockRelease(&shared->buffer_locks[slotno].lock); + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + + /* + * If the slot is still in an io-in-progress state, then either someone + * already started a new I/O on the slot, or a previous I/O failed and + * neglected to reset the page state. That shouldn't happen, really, but + * it seems worth a few extra cycles to check and recover from it. We can + * cheaply test for failure by seeing if the buffer lock is still held (we + * assume that transaction abort would release the lock). + */ + if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS || + shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS) + { + if (LWLockConditionalAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED)) + { + /* indeed, the I/O must have failed */ + if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS) + shared->page_status[slotno] = SLRU_PAGE_EMPTY; + else /* write_in_progress */ + { + shared->page_status[slotno] = SLRU_PAGE_VALID; + shared->page_dirty[slotno] = true; + } + LWLockRelease(&shared->buffer_locks[slotno].lock); + } + } +} + +/* + * Find a page in a shared buffer, reading it in if necessary. + * The page number must correspond to an already-initialized page. 
+ * + * If write_ok is true then it is OK to return a page that is in + * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure + * that modification of the page is safe. If write_ok is false then we + * will not return the page until it is not undergoing active I/O. + * + * The passed-in xid is used only for error reporting, and may be + * InvalidTransactionId if no specific xid is associated with the action. + * + * Return value is the shared-buffer slot number now holding the page. + * The buffer's LRU access info is updated. + * + * Control lock must be held at entry, and will be held at exit. + */ +int +SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, + TransactionId xid) +{ + SlruShared shared = ctl->shared; + + /* Outer loop handles restart if we must wait for someone else's I/O */ + for (;;) + { + int slotno; + bool ok; + + /* See if page already is in memory; if not, pick victim slot */ + slotno = SlruSelectLRUPage(ctl, pageno); + + /* Did we find the page in memory? */ + if (shared->page_number[slotno] == pageno && + shared->page_status[slotno] != SLRU_PAGE_EMPTY) + { + /* + * If page is still being read in, we must wait for I/O. Likewise + * if the page is being written and the caller said that's not OK. + */ + if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS || + (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS && + !write_ok)) + { + SimpleLruWaitIO(ctl, slotno); + /* Now we must recheck state from the top */ + continue; + } + /* Otherwise, it's ready to use */ + SlruRecentlyUsed(shared, slotno); + + /* update the stats counter of pages found in the SLRU */ + pgstat_count_slru_page_hit(shared->slru_stats_idx); + + return slotno; + } + + /* We found no match; assert we selected a freeable slot */ + Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY || + (shared->page_status[slotno] == SLRU_PAGE_VALID && + !shared->page_dirty[slotno])); + + /* Mark the slot read-busy */ + shared->page_number[slotno] = pageno; + shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS; + shared->page_dirty[slotno] = false; + + /* Acquire per-buffer lock (cannot deadlock, see notes at top) */ + LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE); + + /* Release control lock while doing I/O */ + LWLockRelease(shared->ControlLock); + + /* Do the read */ + ok = SlruPhysicalReadPage(ctl, pageno, slotno); + + /* Set the LSNs for this newly read-in page to zero */ + SimpleLruZeroLSNs(ctl, slotno); + + /* Re-acquire control lock and update page state */ + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + + Assert(shared->page_number[slotno] == pageno && + shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS && + !shared->page_dirty[slotno]); + + shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY; + + LWLockRelease(&shared->buffer_locks[slotno].lock); + + /* Now it's okay to ereport if we failed */ + if (!ok) + SlruReportIOError(ctl, pageno, xid); + + SlruRecentlyUsed(shared, slotno); + + /* update the stats counter of pages not found in SLRU */ + pgstat_count_slru_page_read(shared->slru_stats_idx); + + return slotno; + } +} + +/* + * Find a page in a shared buffer, reading it in if necessary. + * The page number must correspond to an already-initialized page. + * The caller must intend only read-only access to the page. + * + * The passed-in xid is used only for error reporting, and may be + * InvalidTransactionId if no specific xid is associated with the action. 
+ * + * Return value is the shared-buffer slot number now holding the page. + * The buffer's LRU access info is updated. + * + * Control lock must NOT be held at entry, but will be held at exit. + * It is unspecified whether the lock will be shared or exclusive. + */ +int +SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid) +{ + SlruShared shared = ctl->shared; + int slotno; + + /* Try to find the page while holding only shared lock */ + LWLockAcquire(shared->ControlLock, LW_SHARED); + + /* See if page is already in a buffer */ + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + if (shared->page_number[slotno] == pageno && + shared->page_status[slotno] != SLRU_PAGE_EMPTY && + shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS) + { + /* See comments for SlruRecentlyUsed macro */ + SlruRecentlyUsed(shared, slotno); + + /* update the stats counter of pages found in the SLRU */ + pgstat_count_slru_page_hit(shared->slru_stats_idx); + + return slotno; + } + } + + /* No luck, so switch to normal exclusive lock and do regular read */ + LWLockRelease(shared->ControlLock); + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + + return SimpleLruReadPage(ctl, pageno, true, xid); +} + +/* + * Write a page from a shared buffer, if necessary. + * Does nothing if the specified slot is not dirty. + * + * NOTE: only one write attempt is made here. Hence, it is possible that + * the page is still dirty at exit (if someone else re-dirtied it during + * the write). However, we *do* attempt a fresh write even if the page + * is already being written; this is for checkpoints. + * + * Control lock must be held at entry, and will be held at exit. + */ +static void +SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata) +{ + SlruShared shared = ctl->shared; + int pageno = shared->page_number[slotno]; + bool ok; + + /* If a write is in progress, wait for it to finish */ + while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS && + shared->page_number[slotno] == pageno) + { + SimpleLruWaitIO(ctl, slotno); + } + + /* + * Do nothing if page is not dirty, or if buffer no longer contains the + * same page we were called for. + */ + if (!shared->page_dirty[slotno] || + shared->page_status[slotno] != SLRU_PAGE_VALID || + shared->page_number[slotno] != pageno) + return; + + /* + * Mark the slot write-busy, and clear the dirtybit. After this point, a + * transaction status update on this page will mark it dirty again. 
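A hedged sketch of the usual read-only caller pattern for SimpleLruReadPage_ReadOnly() above, modeled loosely on how clog.c fetches a transaction's status byte: the function acquires the control lock itself and leaves it held, so the caller reads from the returned slot's buffer and then releases the lock. The helper name and the page/byte arithmetic are placeholders.

#include "postgres.h"

#include "access/slru.h"
#include "storage/lwlock.h"

/*
 * Sketch: read one byte from an SLRU page without holding the control lock
 * beforehand.  SimpleLruReadPage_ReadOnly() returns with the control lock
 * held (shared or exclusive), so it must be released afterwards.
 */
static int
read_slru_byte(SlruCtl ctl, int pageno, int byteno, TransactionId xid)
{
	int			slotno;
	int			value;

	slotno = SimpleLruReadPage_ReadOnly(ctl, pageno, xid);
	value = ctl->shared->page_buffer[slotno][byteno];

	LWLockRelease(ctl->shared->ControlLock);

	return value;
}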
+ */ + shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS; + shared->page_dirty[slotno] = false; + + /* Acquire per-buffer lock (cannot deadlock, see notes at top) */ + LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE); + + /* Release control lock while doing I/O */ + LWLockRelease(shared->ControlLock); + + /* Do the write */ + ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata); + + /* If we failed, and we're in a flush, better close the files */ + if (!ok && fdata) + { + int i; + + for (i = 0; i < fdata->num_files; i++) + CloseTransientFile(fdata->fd[i]); + } + + /* Re-acquire control lock and update page state */ + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + + Assert(shared->page_number[slotno] == pageno && + shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS); + + /* If we failed to write, mark the page dirty again */ + if (!ok) + shared->page_dirty[slotno] = true; + + shared->page_status[slotno] = SLRU_PAGE_VALID; + + LWLockRelease(&shared->buffer_locks[slotno].lock); + + /* Now it's okay to ereport if we failed */ + if (!ok) + SlruReportIOError(ctl, pageno, InvalidTransactionId); + + /* If part of a checkpoint, count this as a buffer written. */ + if (fdata) + CheckpointStats.ckpt_bufs_written++; +} + +/* + * Wrapper of SlruInternalWritePage, for external callers. + * fdata is always passed a NULL here. + */ +void +SimpleLruWritePage(SlruCtl ctl, int slotno) +{ + SlruInternalWritePage(ctl, slotno, NULL); +} + +/* + * Return whether the given page exists on disk. + * + * A false return means that either the file does not exist, or that it's not + * large enough to contain the given page. + */ +bool +SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno) +{ + int segno = pageno / SLRU_PAGES_PER_SEGMENT; + int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; + int offset = rpageno * BLCKSZ; + char path[MAXPGPATH]; + int fd; + bool result; + off_t endpos; + + /* update the stats counter of checked pages */ + pgstat_count_slru_page_exists(ctl->shared->slru_stats_idx); + + SlruFileName(ctl, path, segno); + + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + { + /* expected: file doesn't exist */ + if (errno == ENOENT) + return false; + + /* report error normally */ + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + SlruReportIOError(ctl, pageno, 0); + } + + if ((endpos = lseek(fd, 0, SEEK_END)) < 0) + { + slru_errcause = SLRU_SEEK_FAILED; + slru_errno = errno; + SlruReportIOError(ctl, pageno, 0); + } + + result = endpos >= (off_t) (offset + BLCKSZ); + + if (CloseTransientFile(fd) != 0) + { + slru_errcause = SLRU_CLOSE_FAILED; + slru_errno = errno; + return false; + } + + return result; +} + +/* + * Physical read of a (previously existing) page into a buffer slot + * + * On failure, we cannot just ereport(ERROR) since caller has put state in + * shared memory that must be undone. So, we return false and save enough + * info in static variables to let SlruReportIOError make the report. + * + * For now, assume it's not worth keeping a file pointer open across + * read/write operations. We could cache one virtual file pointer ... 
+ */ +static bool +SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) +{ + SlruShared shared = ctl->shared; + int segno = pageno / SLRU_PAGES_PER_SEGMENT; + int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; + off_t offset = rpageno * BLCKSZ; + char path[MAXPGPATH]; + int fd; + + SlruFileName(ctl, path, segno); + + /* + * In a crash-and-restart situation, it's possible for us to receive + * commands to set the commit status of transactions whose bits are in + * already-truncated segments of the commit log (see notes in + * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case + * where the file doesn't exist, and return zeroes instead. + */ + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + { + if (errno != ENOENT || !InRecovery) + { + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + return false; + } + + ereport(LOG, + (errmsg("file \"%s\" doesn't exist, reading as zeroes", + path))); + MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + return true; + } + + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_SLRU_READ); + if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ) + { + pgstat_report_wait_end(); + slru_errcause = SLRU_READ_FAILED; + slru_errno = errno; + CloseTransientFile(fd); + return false; + } + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + { + slru_errcause = SLRU_CLOSE_FAILED; + slru_errno = errno; + return false; + } + + return true; +} + +/* + * Physical write of a page from a buffer slot + * + * On failure, we cannot just ereport(ERROR) since caller has put state in + * shared memory that must be undone. So, we return false and save enough + * info in static variables to let SlruReportIOError make the report. + * + * For now, assume it's not worth keeping a file pointer open across + * independent read/write operations. We do batch operations during + * SimpleLruWriteAll, though. + * + * fdata is NULL for a standalone write, pointer to open-file info during + * SimpleLruWriteAll. + */ +static bool +SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruWriteAll fdata) +{ + SlruShared shared = ctl->shared; + int segno = pageno / SLRU_PAGES_PER_SEGMENT; + int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; + off_t offset = rpageno * BLCKSZ; + char path[MAXPGPATH]; + int fd = -1; + + /* update the stats counter of written pages */ + pgstat_count_slru_page_written(shared->slru_stats_idx); + + /* + * Honor the write-WAL-before-data rule, if appropriate, so that we do not + * write out data before associated WAL records. This is the same action + * performed during FlushBuffer() in the main buffer manager. + */ + if (shared->group_lsn != NULL) + { + /* + * We must determine the largest async-commit LSN for the page. This + * is a bit tedious, but since this entire function is a slow path + * anyway, it seems better to do this here than to maintain a per-page + * LSN variable (which'd need an extra comparison in the + * transaction-commit path). + */ + XLogRecPtr max_lsn; + int lsnindex, + lsnoff; + + lsnindex = slotno * shared->lsn_groups_per_page; + max_lsn = shared->group_lsn[lsnindex++]; + for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++) + { + XLogRecPtr this_lsn = shared->group_lsn[lsnindex++]; + + if (max_lsn < this_lsn) + max_lsn = this_lsn; + } + + if (!XLogRecPtrIsInvalid(max_lsn)) + { + /* + * As noted above, elog(ERROR) is not acceptable here, so if + * XLogFlush were to fail, we must PANIC. 
This isn't much of a + * restriction because XLogFlush is just about all critical + * section anyway, but let's make sure. + */ + START_CRIT_SECTION(); + XLogFlush(max_lsn); + END_CRIT_SECTION(); + } + } + + /* + * During a WriteAll, we may already have the desired file open. + */ + if (fdata) + { + int i; + + for (i = 0; i < fdata->num_files; i++) + { + if (fdata->segno[i] == segno) + { + fd = fdata->fd[i]; + break; + } + } + } + + if (fd < 0) + { + /* + * If the file doesn't already exist, we should create it. It is + * possible for this to need to happen when writing a page that's not + * first in its segment; we assume the OS can cope with that. (Note: + * it might seem that it'd be okay to create files only when + * SimpleLruZeroPage is called for the first page of a segment. + * However, if after a crash and restart the REDO logic elects to + * replay the log from a checkpoint before the latest one, then it's + * possible that we will get commands to set transaction status of + * transactions that have already been truncated from the commit log. + * Easiest way to deal with that is to accept references to + * nonexistent files here and in SlruPhysicalReadPage.) + * + * Note: it is possible for more than one backend to be executing this + * code simultaneously for different pages of the same file. Hence, + * don't use O_EXCL or O_TRUNC or anything like that. + */ + SlruFileName(ctl, path, segno); + fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); + if (fd < 0) + { + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + return false; + } + + if (fdata) + { + if (fdata->num_files < MAX_WRITEALL_BUFFERS) + { + fdata->fd[fdata->num_files] = fd; + fdata->segno[fdata->num_files] = segno; + fdata->num_files++; + } + else + { + /* + * In the unlikely event that we exceed MAX_FLUSH_BUFFERS, + * fall back to treating it as a standalone write. + */ + fdata = NULL; + } + } + } + + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE); + if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ) + { + pgstat_report_wait_end(); + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + slru_errcause = SLRU_WRITE_FAILED; + slru_errno = errno; + if (!fdata) + CloseTransientFile(fd); + return false; + } + pgstat_report_wait_end(); + + /* Queue up a sync request for the checkpointer. */ + if (ctl->sync_handler != SYNC_HANDLER_NONE) + { + FileTag tag; + + INIT_SLRUFILETAG(tag, ctl->sync_handler, segno); + if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false)) + { + /* No space to enqueue sync request. Do it synchronously. */ + pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC); + if (pg_fsync(fd) != 0) + { + pgstat_report_wait_end(); + slru_errcause = SLRU_FSYNC_FAILED; + slru_errno = errno; + CloseTransientFile(fd); + return false; + } + pgstat_report_wait_end(); + } + } + + /* Close file, unless part of flush request. */ + if (!fdata) + { + if (CloseTransientFile(fd) != 0) + { + slru_errcause = SLRU_CLOSE_FAILED; + slru_errno = errno; + return false; + } + } + + return true; +} + +/* + * Issue the error message after failure of SlruPhysicalReadPage or + * SlruPhysicalWritePage. Call this after cleaning up shared-memory state. 
+ */ +static void +SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid) +{ + int segno = pageno / SLRU_PAGES_PER_SEGMENT; + int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; + int offset = rpageno * BLCKSZ; + char path[MAXPGPATH]; + + SlruFileName(ctl, path, segno); + errno = slru_errno; + switch (slru_errcause) + { + case SLRU_OPEN_FAILED: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access status of transaction %u", xid), + errdetail("Could not open file \"%s\": %m.", path))); + break; + case SLRU_SEEK_FAILED: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access status of transaction %u", xid), + errdetail("Could not seek in file \"%s\" to offset %d: %m.", + path, offset))); + break; + case SLRU_READ_FAILED: + if (errno) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access status of transaction %u", xid), + errdetail("Could not read from file \"%s\" at offset %d: %m.", + path, offset))); + else + ereport(ERROR, + (errmsg("could not access status of transaction %u", xid), + errdetail("Could not read from file \"%s\" at offset %d: read too few bytes.", path, offset))); + break; + case SLRU_WRITE_FAILED: + if (errno) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access status of transaction %u", xid), + errdetail("Could not write to file \"%s\" at offset %d: %m.", + path, offset))); + else + ereport(ERROR, + (errmsg("could not access status of transaction %u", xid), + errdetail("Could not write to file \"%s\" at offset %d: wrote too few bytes.", + path, offset))); + break; + case SLRU_FSYNC_FAILED: + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not access status of transaction %u", xid), + errdetail("Could not fsync file \"%s\": %m.", + path))); + break; + case SLRU_CLOSE_FAILED: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access status of transaction %u", xid), + errdetail("Could not close file \"%s\": %m.", + path))); + break; + default: + /* can't get here, we trust */ + elog(ERROR, "unrecognized SimpleLru error cause: %d", + (int) slru_errcause); + break; + } +} + +/* + * Select the slot to re-use when we need a free slot. + * + * The target page number is passed because we need to consider the + * possibility that some other process reads in the target page while + * we are doing I/O to free a slot. Hence, check or recheck to see if + * any slot already holds the target page, and return that slot if so. + * Thus, the returned slot is *either* a slot already holding the pageno + * (could be any state except EMPTY), *or* a freeable slot (state EMPTY + * or CLEAN). + * + * Control lock must be held at entry, and will be held at exit. + */ +static int +SlruSelectLRUPage(SlruCtl ctl, int pageno) +{ + SlruShared shared = ctl->shared; + + /* Outer loop handles restart after I/O */ + for (;;) + { + int slotno; + int cur_count; + int bestvalidslot = 0; /* keep compiler quiet */ + int best_valid_delta = -1; + int best_valid_page_number = 0; /* keep compiler quiet */ + int bestinvalidslot = 0; /* keep compiler quiet */ + int best_invalid_delta = -1; + int best_invalid_page_number = 0; /* keep compiler quiet */ + + /* See if page already has a buffer assigned */ + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + if (shared->page_number[slotno] == pageno && + shared->page_status[slotno] != SLRU_PAGE_EMPTY) + return slotno; + } + + /* + * If we find any EMPTY slot, just select that one. Else choose a + * victim page to replace. 
We normally take the least recently used + * valid page, but we will never take the slot containing + * latest_page_number, even if it appears least recently used. We + * will select a slot that is already I/O busy only if there is no + * other choice: a read-busy slot will not be least recently used once + * the read finishes, and waiting for an I/O on a write-busy slot is + * inferior to just picking some other slot. Testing shows the slot + * we pick instead will often be clean, allowing us to begin a read at + * once. + * + * Normally the page_lru_count values will all be different and so + * there will be a well-defined LRU page. But since we allow + * concurrent execution of SlruRecentlyUsed() within + * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages + * acquire the same lru_count values. In that case we break ties by + * choosing the furthest-back page. + * + * Notice that this next line forcibly advances cur_lru_count to a + * value that is certainly beyond any value that will be in the + * page_lru_count array after the loop finishes. This ensures that + * the next execution of SlruRecentlyUsed will mark the page newly + * used, even if it's for a page that has the current counter value. + * That gets us back on the path to having good data when there are + * multiple pages with the same lru_count. + */ + cur_count = (shared->cur_lru_count)++; + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + int this_delta; + int this_page_number; + + if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) + return slotno; + this_delta = cur_count - shared->page_lru_count[slotno]; + if (this_delta < 0) + { + /* + * Clean up in case shared updates have caused cur_count + * increments to get "lost". We back off the page counts, + * rather than trying to increase cur_count, to avoid any + * question of infinite loops or failure in the presence of + * wrapped-around counts. + */ + shared->page_lru_count[slotno] = cur_count; + this_delta = 0; + } + this_page_number = shared->page_number[slotno]; + if (this_page_number == shared->latest_page_number) + continue; + if (shared->page_status[slotno] == SLRU_PAGE_VALID) + { + if (this_delta > best_valid_delta || + (this_delta == best_valid_delta && + ctl->PagePrecedes(this_page_number, + best_valid_page_number))) + { + bestvalidslot = slotno; + best_valid_delta = this_delta; + best_valid_page_number = this_page_number; + } + } + else + { + if (this_delta > best_invalid_delta || + (this_delta == best_invalid_delta && + ctl->PagePrecedes(this_page_number, + best_invalid_page_number))) + { + bestinvalidslot = slotno; + best_invalid_delta = this_delta; + best_invalid_page_number = this_page_number; + } + } + } + + /* + * If all pages (except possibly the latest one) are I/O busy, we'll + * have to wait for an I/O to complete and then retry. In that + * unhappy case, we choose to wait for the I/O on the least recently + * used slot, on the assumption that it was likely initiated first of + * all the I/Os in progress and may therefore finish first. + */ + if (best_valid_delta < 0) + { + SimpleLruWaitIO(ctl, bestinvalidslot); + continue; + } + + /* + * If the selected page is clean, we're set. + */ + if (!shared->page_dirty[bestvalidslot]) + return bestvalidslot; + + /* + * Write the page. + */ + SlruInternalWritePage(ctl, bestvalidslot, NULL); + + /* + * Now loop back and try again. This is the easiest way of dealing + * with corner cases such as the victim page being re-dirtied while we + * wrote it. 
+ */ + } +} + +/* + * Write dirty pages to disk during checkpoint or database shutdown. Flushing + * is deferred until the next call to ProcessSyncRequests(), though we do fsync + * the containing directory here to make sure that newly created directory + * entries are on disk. + */ +void +SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied) +{ + SlruShared shared = ctl->shared; + SlruWriteAllData fdata; + int slotno; + int pageno = 0; + int i; + bool ok; + + /* update the stats counter of flushes */ + pgstat_count_slru_flush(shared->slru_stats_idx); + + /* + * Find and write dirty pages + */ + fdata.num_files = 0; + + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + SlruInternalWritePage(ctl, slotno, &fdata); + + /* + * In some places (e.g. checkpoints), we cannot assert that the slot + * is clean now, since another process might have re-dirtied it + * already. That's okay. + */ + Assert(allow_redirtied || + shared->page_status[slotno] == SLRU_PAGE_EMPTY || + (shared->page_status[slotno] == SLRU_PAGE_VALID && + !shared->page_dirty[slotno])); + } + + LWLockRelease(shared->ControlLock); + + /* + * Now close any files that were open + */ + ok = true; + for (i = 0; i < fdata.num_files; i++) + { + if (CloseTransientFile(fdata.fd[i]) != 0) + { + slru_errcause = SLRU_CLOSE_FAILED; + slru_errno = errno; + pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT; + ok = false; + } + } + if (!ok) + SlruReportIOError(ctl, pageno, InvalidTransactionId); + + /* Ensure that directory entries for new files are on disk. */ + if (ctl->sync_handler != SYNC_HANDLER_NONE) + fsync_fname(ctl->Dir, true); +} + +/* + * Remove all segments before the one holding the passed page number + * + * All SLRUs prevent concurrent calls to this function, either with an LWLock + * or by calling it only as part of a checkpoint. Mutual exclusion must begin + * before computing cutoffPage. Mutual exclusion must end after any limit + * update that would permit other backends to write fresh data into the + * segment immediately preceding the one containing cutoffPage. Otherwise, + * when the SLRU is quite full, SimpleLruTruncate() might delete that segment + * after it has accrued freshly-written data. + */ +void +SimpleLruTruncate(SlruCtl ctl, int cutoffPage) +{ + SlruShared shared = ctl->shared; + int slotno; + + /* update the stats counter of truncates */ + pgstat_count_slru_truncate(shared->slru_stats_idx); + + /* + * Scan shared memory and remove any pages preceding the cutoff page, to + * ensure we won't rewrite them later. (Since this is normally called in + * or just after a checkpoint, any dirty pages should have been flushed + * already ... we're just being extra careful here.) + */ + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + +restart:; + + /* + * While we are holding the lock, make an important safety check: the + * current endpoint page must not be eligible for removal. + */ + if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage)) + { + LWLockRelease(shared->ControlLock); + ereport(LOG, + (errmsg("could not truncate directory \"%s\": apparent wraparound", + ctl->Dir))); + return; + } + + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) + continue; + if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage)) + continue; + + /* + * If page is clean, just change state to EMPTY (expected case). 
+ */ + if (shared->page_status[slotno] == SLRU_PAGE_VALID && + !shared->page_dirty[slotno]) + { + shared->page_status[slotno] = SLRU_PAGE_EMPTY; + continue; + } + + /* + * Hmm, we have (or may have) I/O operations acting on the page, so + * we've got to wait for them to finish and then start again. This is + * the same logic as in SlruSelectLRUPage. (XXX if page is dirty, + * wouldn't it be OK to just discard it without writing it? + * SlruMayDeleteSegment() uses a stricter qualification, so we might + * not delete this page in the end; even if we don't delete it, we + * won't have cause to read its data again. For now, keep the logic + * the same as it was.) + */ + if (shared->page_status[slotno] == SLRU_PAGE_VALID) + SlruInternalWritePage(ctl, slotno, NULL); + else + SimpleLruWaitIO(ctl, slotno); + goto restart; + } + + LWLockRelease(shared->ControlLock); + + /* Now we can remove the old segment(s) */ + (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage); +} + +/* + * Delete an individual SLRU segment. + * + * NB: This does not touch the SLRU buffers themselves, callers have to ensure + * they either can't yet contain anything, or have already been cleaned out. + */ +static void +SlruInternalDeleteSegment(SlruCtl ctl, int segno) +{ + char path[MAXPGPATH]; + + /* Forget any fsync requests queued for this segment. */ + if (ctl->sync_handler != SYNC_HANDLER_NONE) + { + FileTag tag; + + INIT_SLRUFILETAG(tag, ctl->sync_handler, segno); + RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true); + } + + /* Unlink the file. */ + SlruFileName(ctl, path, segno); + ereport(DEBUG2, (errmsg_internal("removing file \"%s\"", path))); + unlink(path); +} + +/* + * Delete an individual SLRU segment, identified by the segment number. + */ +void +SlruDeleteSegment(SlruCtl ctl, int segno) +{ + SlruShared shared = ctl->shared; + int slotno; + bool did_write; + + /* Clean out any possibly existing references to the segment. */ + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); +restart: + did_write = false; + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + int pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT; + + if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) + continue; + + /* not the segment we're looking for */ + if (pagesegno != segno) + continue; + + /* If page is clean, just change state to EMPTY (expected case). */ + if (shared->page_status[slotno] == SLRU_PAGE_VALID && + !shared->page_dirty[slotno]) + { + shared->page_status[slotno] = SLRU_PAGE_EMPTY; + continue; + } + + /* Same logic as SimpleLruTruncate() */ + if (shared->page_status[slotno] == SLRU_PAGE_VALID) + SlruInternalWritePage(ctl, slotno, NULL); + else + SimpleLruWaitIO(ctl, slotno); + + did_write = true; + } + + /* + * Be extra careful and re-check. The IO functions release the control + * lock, so new pages could have been read in. + */ + if (did_write) + goto restart; + + SlruInternalDeleteSegment(ctl, segno); + + LWLockRelease(shared->ControlLock); +} + +/* + * Determine whether a segment is okay to delete. + * + * segpage is the first page of the segment, and cutoffPage is the oldest (in + * PagePrecedes order) page in the SLRU containing still-useful data. 
Since + * every core PagePrecedes callback implements "wrap around", check the + * segment's first and last pages: + * + * first<cutoff && last<cutoff: yes + * first<cutoff && last>=cutoff: no; cutoff falls inside this segment + * first>=cutoff && last<cutoff: no; wrap point falls inside this segment + * first>=cutoff && last>=cutoff: no; every page of this segment is too young + */ +static bool +SlruMayDeleteSegment(SlruCtl ctl, int segpage, int cutoffPage) +{ + int seg_last_page = segpage + SLRU_PAGES_PER_SEGMENT - 1; + + Assert(segpage % SLRU_PAGES_PER_SEGMENT == 0); + + return (ctl->PagePrecedes(segpage, cutoffPage) && + ctl->PagePrecedes(seg_last_page, cutoffPage)); +} + +#ifdef USE_ASSERT_CHECKING +static void +SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset) +{ + TransactionId lhs, + rhs; + int newestPage, + oldestPage; + TransactionId newestXact, + oldestXact; + + /* + * Compare an XID pair having undefined order (see RFC 1982), a pair at + * "opposite ends" of the XID space. TransactionIdPrecedes() treats each + * as preceding the other. If RHS is oldestXact, LHS is the first XID we + * must not assign. + */ + lhs = per_page + offset; /* skip first page to avoid non-normal XIDs */ + rhs = lhs + (1U << 31); + Assert(TransactionIdPrecedes(lhs, rhs)); + Assert(TransactionIdPrecedes(rhs, lhs)); + Assert(!TransactionIdPrecedes(lhs - 1, rhs)); + Assert(TransactionIdPrecedes(rhs, lhs - 1)); + Assert(TransactionIdPrecedes(lhs + 1, rhs)); + Assert(!TransactionIdPrecedes(rhs, lhs + 1)); + Assert(!TransactionIdFollowsOrEquals(lhs, rhs)); + Assert(!TransactionIdFollowsOrEquals(rhs, lhs)); + Assert(!ctl->PagePrecedes(lhs / per_page, lhs / per_page)); + Assert(!ctl->PagePrecedes(lhs / per_page, rhs / per_page)); + Assert(!ctl->PagePrecedes(rhs / per_page, lhs / per_page)); + Assert(!ctl->PagePrecedes((lhs - per_page) / per_page, rhs / per_page)); + Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page)); + Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page)); + Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page) + || (1U << 31) % per_page != 0); /* See CommitTsPagePrecedes() */ + Assert(ctl->PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page) + || (1U << 31) % per_page != 0); + Assert(ctl->PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page)); + Assert(ctl->PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page)); + Assert(!ctl->PagePrecedes(rhs / per_page, (lhs + per_page) / per_page)); + + /* + * GetNewTransactionId() has assigned the last XID it can safely use, and + * that XID is in the *LAST* page of the second segment. We must not + * delete that segment. + */ + newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1; + newestXact = newestPage * per_page + offset; + Assert(newestXact / per_page == newestPage); + oldestXact = newestXact + 1; + oldestXact -= 1U << 31; + oldestPage = oldestXact / per_page; + Assert(!SlruMayDeleteSegment(ctl, + (newestPage - + newestPage % SLRU_PAGES_PER_SEGMENT), + oldestPage)); + + /* + * GetNewTransactionId() has assigned the last XID it can safely use, and + * that XID is in the *FIRST* page of the second segment. We must not + * delete that segment. 
+ */ + newestPage = SLRU_PAGES_PER_SEGMENT; + newestXact = newestPage * per_page + offset; + Assert(newestXact / per_page == newestPage); + oldestXact = newestXact + 1; + oldestXact -= 1U << 31; + oldestPage = oldestXact / per_page; + Assert(!SlruMayDeleteSegment(ctl, + (newestPage - + newestPage % SLRU_PAGES_PER_SEGMENT), + oldestPage)); +} + +/* + * Unit-test a PagePrecedes function. + * + * This assumes every uint32 >= FirstNormalTransactionId is a valid key. It + * assumes each value occupies a contiguous, fixed-size region of SLRU bytes. + * (MultiXactMemberCtl separates flags from XIDs. AsyncCtl has + * variable-length entries, no keys, and no random access. These unit tests + * do not apply to them.) + */ +void +SlruPagePrecedesUnitTests(SlruCtl ctl, int per_page) +{ + /* Test first, middle and last entries of a page. */ + SlruPagePrecedesTestOffset(ctl, per_page, 0); + SlruPagePrecedesTestOffset(ctl, per_page, per_page / 2); + SlruPagePrecedesTestOffset(ctl, per_page, per_page - 1); +} +#endif + +/* + * SlruScanDirectory callback + * This callback reports true if there's any segment wholly prior to the + * one containing the page passed as "data". + */ +bool +SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int segpage, void *data) +{ + int cutoffPage = *(int *) data; + + if (SlruMayDeleteSegment(ctl, segpage, cutoffPage)) + return true; /* found one; don't iterate any more */ + + return false; /* keep going */ +} + +/* + * SlruScanDirectory callback. + * This callback deletes segments prior to the one passed in as "data". + */ +static bool +SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data) +{ + int cutoffPage = *(int *) data; + + if (SlruMayDeleteSegment(ctl, segpage, cutoffPage)) + SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT); + + return false; /* keep going */ +} + +/* + * SlruScanDirectory callback. + * This callback deletes all segments. + */ +bool +SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, void *data) +{ + SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT); + + return false; /* keep going */ +} + +/* + * Scan the SimpleLru directory and apply a callback to each file found in it. + * + * If the callback returns true, the scan is stopped. The last return value + * from the callback is returned. + * + * The callback receives the following arguments: 1. the SlruCtl struct for the + * slru being truncated; 2. the filename being considered; 3. the page number + * for the first page of that file; 4. a pointer to the opaque data given to us + * by the caller. + * + * Note that the ordering in which the directory is scanned is not guaranteed. + * + * Note that no locking is applied. + */ +bool +SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data) +{ + bool retval = false; + DIR *cldir; + struct dirent *clde; + int segno; + int segpage; + + cldir = AllocateDir(ctl->Dir); + while ((clde = ReadDir(cldir, ctl->Dir)) != NULL) + { + size_t len; + + len = strlen(clde->d_name); + + if ((len == 4 || len == 5 || len == 6) && + strspn(clde->d_name, "0123456789ABCDEF") == len) + { + segno = (int) strtol(clde->d_name, NULL, 16); + segpage = segno * SLRU_PAGES_PER_SEGMENT; + + elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s", + ctl->Dir, clde->d_name); + retval = callback(ctl, clde->d_name, segpage, data); + if (retval) + break; + } + } + FreeDir(cldir); + + return retval; +} + +/* + * Individual SLRUs (clog, ...) 
have to provide a sync.c handler function so + * that they can provide the correct "SlruCtl" (otherwise we don't know how to + * build the path), but they just forward to this common implementation that + * performs the fsync. + */ +int +SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path) +{ + int fd; + int save_errno; + int result; + + SlruFileName(ctl, path, ftag->segno); + + fd = OpenTransientFile(path, O_RDWR | PG_BINARY); + if (fd < 0) + return -1; + + pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC); + result = pg_fsync(fd); + pgstat_report_wait_end(); + save_errno = errno; + + CloseTransientFile(fd); + + errno = save_errno; + return result; +} diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c new file mode 100644 index 0000000..66d3548 --- /dev/null +++ b/src/backend/access/transam/subtrans.c @@ -0,0 +1,374 @@ +/*------------------------------------------------------------------------- + * + * subtrans.c + * PostgreSQL subtransaction-log manager + * + * The pg_subtrans manager is a pg_xact-like manager that stores the parent + * transaction Id for each transaction. It is a fundamental part of the + * nested transactions implementation. A main transaction has a parent + * of InvalidTransactionId, and each subtransaction has its immediate parent. + * The tree can easily be walked from child to parent, but not in the + * opposite direction. + * + * This code is based on xact.c, but the robustness requirements + * are completely different from pg_xact, because we only need to remember + * pg_subtrans information for currently-open transactions. Thus, there is + * no need to preserve data over a crash and restart. + * + * There are no XLOG interactions since we do not care about preserving + * data across crashes. During database startup, we simply force the + * currently-active page of SUBTRANS to zeroes. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/subtrans.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/slru.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "pg_trace.h" +#include "utils/snapmgr.h" + + +/* + * Defines for SubTrans page sizes. A page is the same BLCKSZ as is used + * everywhere else in Postgres. + * + * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, + * SubTrans page numbering also wraps around at + * 0xFFFFFFFF/SUBTRANS_XACTS_PER_PAGE, and segment numbering at + * 0xFFFFFFFF/SUBTRANS_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no + * explicit notice of that fact in this module, except when comparing segment + * and page numbers in TruncateSUBTRANS (see SubTransPagePrecedes) and zeroing + * them in StartupSUBTRANS. + */ + +/* We need four bytes per xact */ +#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId)) + +#define TransactionIdToPage(xid) ((xid) / (TransactionId) SUBTRANS_XACTS_PER_PAGE) +#define TransactionIdToEntry(xid) ((xid) % (TransactionId) SUBTRANS_XACTS_PER_PAGE) + + +/* + * Link to shared-memory data structures for SUBTRANS control + */ +static SlruCtlData SubTransCtlData; + +#define SubTransCtl (&SubTransCtlData) + + +static int ZeroSUBTRANSPage(int pageno); +static bool SubTransPagePrecedes(int page1, int page2); + + +/* + * Record the parent of a subtransaction in the subtrans log. 
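+ *
+ * As a worked illustration of the macros above (assuming the default
+ * BLCKSZ of 8192, so SUBTRANS_XACTS_PER_PAGE is 8192 / 4 = 2048): the
+ * parent of xid 100000 is kept on SUBTRANS page 100000 / 2048 = 48, in
+ * entry 100000 % 2048 = 1696, one 4-byte TransactionId slot per xid.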
+ */ +void +SubTransSetParent(TransactionId xid, TransactionId parent) +{ + int pageno = TransactionIdToPage(xid); + int entryno = TransactionIdToEntry(xid); + int slotno; + TransactionId *ptr; + + Assert(TransactionIdIsValid(parent)); + Assert(TransactionIdFollows(xid, parent)); + + LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); + + slotno = SimpleLruReadPage(SubTransCtl, pageno, true, xid); + ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno]; + ptr += entryno; + + /* + * It's possible we'll try to set the parent xid multiple times but we + * shouldn't ever be changing the xid from one valid xid to another valid + * xid, which would corrupt the data structure. + */ + if (*ptr != parent) + { + Assert(*ptr == InvalidTransactionId); + *ptr = parent; + SubTransCtl->shared->page_dirty[slotno] = true; + } + + LWLockRelease(SubtransSLRULock); +} + +/* + * Interrogate the parent of a transaction in the subtrans log. + */ +TransactionId +SubTransGetParent(TransactionId xid) +{ + int pageno = TransactionIdToPage(xid); + int entryno = TransactionIdToEntry(xid); + int slotno; + TransactionId *ptr; + TransactionId parent; + + /* Can't ask about stuff that might not be around anymore */ + Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); + + /* Bootstrap and frozen XIDs have no parent */ + if (!TransactionIdIsNormal(xid)) + return InvalidTransactionId; + + /* lock is acquired by SimpleLruReadPage_ReadOnly */ + + slotno = SimpleLruReadPage_ReadOnly(SubTransCtl, pageno, xid); + ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno]; + ptr += entryno; + + parent = *ptr; + + LWLockRelease(SubtransSLRULock); + + return parent; +} + +/* + * SubTransGetTopmostTransaction + * + * Returns the topmost transaction of the given transaction id. + * + * Because we cannot look back further than TransactionXmin, it is possible + * that this function will lie and return an intermediate subtransaction ID + * instead of the true topmost parent ID. This is OK, because in practice + * we only care about detecting whether the topmost parent is still running + * or is part of a current snapshot's list of still-running transactions. + * Therefore, any XID before TransactionXmin is as good as any other. + */ +TransactionId +SubTransGetTopmostTransaction(TransactionId xid) +{ + TransactionId parentXid = xid, + previousXid = xid; + + /* Can't ask about stuff that might not be around anymore */ + Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); + + while (TransactionIdIsValid(parentXid)) + { + previousXid = parentXid; + if (TransactionIdPrecedes(parentXid, TransactionXmin)) + break; + parentXid = SubTransGetParent(parentXid); + + /* + * By convention the parent xid gets allocated first, so should always + * precede the child xid. Anything else points to a corrupted data + * structure that could lead to an infinite loop, so exit. 
+ */ + if (!TransactionIdPrecedes(parentXid, previousXid)) + elog(ERROR, "pg_subtrans contains invalid entry: xid %u points to parent xid %u", + previousXid, parentXid); + } + + Assert(TransactionIdIsValid(previousXid)); + + return previousXid; +} + + +/* + * Initialization of shared memory for SUBTRANS + */ +Size +SUBTRANSShmemSize(void) +{ + return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS, 0); +} + +void +SUBTRANSShmemInit(void) +{ + SubTransCtl->PagePrecedes = SubTransPagePrecedes; + SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0, + SubtransSLRULock, "pg_subtrans", + LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE); + SlruPagePrecedesUnitTests(SubTransCtl, SUBTRANS_XACTS_PER_PAGE); +} + +/* + * This func must be called ONCE on system install. It creates + * the initial SUBTRANS segment. (The SUBTRANS directory is assumed to + * have been created by the initdb shell script, and SUBTRANSShmemInit + * must have been called already.) + * + * Note: it's not really necessary to create the initial segment now, + * since slru.c would create it on first write anyway. But we may as well + * do it to be sure the directory is set up correctly. + */ +void +BootStrapSUBTRANS(void) +{ + int slotno; + + LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); + + /* Create and zero the first page of the subtrans log */ + slotno = ZeroSUBTRANSPage(0); + + /* Make sure it's written out */ + SimpleLruWritePage(SubTransCtl, slotno); + Assert(!SubTransCtl->shared->page_dirty[slotno]); + + LWLockRelease(SubtransSLRULock); +} + +/* + * Initialize (or reinitialize) a page of SUBTRANS to zeroes. + * + * The page is not actually written, just set up in shared memory. + * The slot number of the new page is returned. + * + * Control lock must be held at entry, and will be held at exit. + */ +static int +ZeroSUBTRANSPage(int pageno) +{ + return SimpleLruZeroPage(SubTransCtl, pageno); +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup, + * after StartupXLOG has initialized ShmemVariableCache->nextXid. + * + * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid + * if there are none. + */ +void +StartupSUBTRANS(TransactionId oldestActiveXID) +{ + FullTransactionId nextXid; + int startPage; + int endPage; + + /* + * Since we don't expect pg_subtrans to be valid across crashes, we + * initialize the currently-active page(s) to zeroes during startup. + * Whenever we advance into a new page, ExtendSUBTRANS will likewise zero + * the new page without regard to whatever was previously on disk. + */ + LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); + + startPage = TransactionIdToPage(oldestActiveXID); + nextXid = ShmemVariableCache->nextXid; + endPage = TransactionIdToPage(XidFromFullTransactionId(nextXid)); + + while (startPage != endPage) + { + (void) ZeroSUBTRANSPage(startPage); + startPage++; + /* must account for wraparound */ + if (startPage > TransactionIdToPage(MaxTransactionId)) + startPage = 0; + } + (void) ZeroSUBTRANSPage(startPage); + + LWLockRelease(SubtransSLRULock); +} + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + */ +void +CheckPointSUBTRANS(void) +{ + /* + * Write dirty SUBTRANS pages to disk + * + * This is not actually necessary from a correctness point of view. We do + * it merely to improve the odds that writing of dirty pages is done by + * the checkpoint process and not by backends. 
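+	 *
+	 * (pg_subtrans is registered with SYNC_HANDLER_NONE, so unlike pg_xact
+	 * these writes are never followed by an fsync; that is acceptable
+	 * because StartupSUBTRANS re-zeroes the active pages after a crash.)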
+ */ + TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(true); + SimpleLruWriteAll(SubTransCtl, true); + TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true); +} + + +/* + * Make sure that SUBTRANS has room for a newly-allocated XID. + * + * NB: this is called while holding XidGenLock. We want it to be very fast + * most of the time; even when it's not so fast, no actual I/O need happen + * unless we're forced to write out a dirty subtrans page to make room + * in shared memory. + */ +void +ExtendSUBTRANS(TransactionId newestXact) +{ + int pageno; + + /* + * No work except at first XID of a page. But beware: just after + * wraparound, the first XID of page zero is FirstNormalTransactionId. + */ + if (TransactionIdToEntry(newestXact) != 0 && + !TransactionIdEquals(newestXact, FirstNormalTransactionId)) + return; + + pageno = TransactionIdToPage(newestXact); + + LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); + + /* Zero the page */ + ZeroSUBTRANSPage(pageno); + + LWLockRelease(SubtransSLRULock); +} + + +/* + * Remove all SUBTRANS segments before the one holding the passed transaction ID + * + * oldestXact is the oldest TransactionXmin of any running transaction. This + * is called only during checkpoint. + */ +void +TruncateSUBTRANS(TransactionId oldestXact) +{ + int cutoffPage; + + /* + * The cutoff point is the start of the segment containing oldestXact. We + * pass the *page* containing oldestXact to SimpleLruTruncate. We step + * back one transaction to avoid passing a cutoff page that hasn't been + * created yet in the rare case that oldestXact would be the first item on + * a page and oldestXact == next XID. In that case, if we didn't subtract + * one, we'd trigger SimpleLruTruncate's wraparound detection. + */ + TransactionIdRetreat(oldestXact); + cutoffPage = TransactionIdToPage(oldestXact); + + SimpleLruTruncate(SubTransCtl, cutoffPage); +} + + +/* + * Decide whether a SUBTRANS page number is "older" for truncation purposes. + * Analogous to CLOGPagePrecedes(). + */ +static bool +SubTransPagePrecedes(int page1, int page2) +{ + TransactionId xid1; + TransactionId xid2; + + xid1 = ((TransactionId) page1) * SUBTRANS_XACTS_PER_PAGE; + xid1 += FirstNormalTransactionId + 1; + xid2 = ((TransactionId) page2) * SUBTRANS_XACTS_PER_PAGE; + xid2 += FirstNormalTransactionId + 1; + + return (TransactionIdPrecedes(xid1, xid2) && + TransactionIdPrecedes(xid1, xid2 + SUBTRANS_XACTS_PER_PAGE - 1)); +} diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c new file mode 100644 index 0000000..be21968 --- /dev/null +++ b/src/backend/access/transam/timeline.c @@ -0,0 +1,600 @@ +/*------------------------------------------------------------------------- + * + * timeline.c + * Functions for reading and writing timeline history files. + * + * A timeline history file lists the timeline changes of the timeline, in + * a simple text format. They are archived along with the WAL segments. + * + * The files are named like "<tli>.history". For example, if the database + * starts up and switches to timeline 5, the timeline history file would be + * called "00000005.history". + * + * Each line in the file represents a timeline switch: + * + * <parentTLI> <switchpoint> <reason> + * + * parentTLI ID of the parent timeline + * switchpoint XLogRecPtr of the WAL location where the switch happened + * reason human-readable explanation of why the timeline was changed + * + * The fields are separated by tabs. Lines beginning with # are comments, and + * are ignored. Empty lines are also ignored. 
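+ *
+ * A purely illustrative example (hypothetical LSNs and reason strings):
+ * the file 00000003.history on a server whose timeline ancestry is
+ * 1 -> 2 -> 3 might contain
+ *
+ *	1	0/9561F18	no recovery target specified
+ *	2	0/B3C1E28	before 2021-05-12 10:00:00+00
+ *
+ * meaning timeline 2 branched off timeline 1 at 0/9561F18 and timeline 3
+ * branched off timeline 2 at 0/B3C1E28.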
+ * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/timeline.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <sys/stat.h> +#include <unistd.h> + +#include "access/timeline.h" +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogarchive.h" +#include "access/xlogdefs.h" +#include "pgstat.h" +#include "storage/fd.h" + +/* + * Copies all timeline history files with id's between 'begin' and 'end' + * from archive to pg_wal. + */ +void +restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end) +{ + char path[MAXPGPATH]; + char histfname[MAXFNAMELEN]; + TimeLineID tli; + + for (tli = begin; tli < end; tli++) + { + if (tli == 1) + continue; + + TLHistoryFileName(histfname, tli); + if (RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false)) + KeepFileRestoredFromArchive(path, histfname); + } +} + +/* + * Try to read a timeline's history file. + * + * If successful, return the list of component TLIs (the given TLI followed by + * its ancestor TLIs). If we can't find the history file, assume that the + * timeline has no parents, and return a list of just the specified timeline + * ID. + */ +List * +readTimeLineHistory(TimeLineID targetTLI) +{ + List *result; + char path[MAXPGPATH]; + char histfname[MAXFNAMELEN]; + FILE *fd; + TimeLineHistoryEntry *entry; + TimeLineID lasttli = 0; + XLogRecPtr prevend; + bool fromArchive = false; + + /* Timeline 1 does not have a history file, so no need to check */ + if (targetTLI == 1) + { + entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry->tli = targetTLI; + entry->begin = entry->end = InvalidXLogRecPtr; + return list_make1(entry); + } + + if (ArchiveRecoveryRequested) + { + TLHistoryFileName(histfname, targetTLI); + fromArchive = + RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false); + } + else + TLHistoryFilePath(path, targetTLI); + + fd = AllocateFile(path, "r"); + if (fd == NULL) + { + if (errno != ENOENT) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + /* Not there, so assume no parents */ + entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry->tli = targetTLI; + entry->begin = entry->end = InvalidXLogRecPtr; + return list_make1(entry); + } + + result = NIL; + + /* + * Parse the file... 
+ */ + prevend = InvalidXLogRecPtr; + for (;;) + { + char fline[MAXPGPATH]; + char *res; + char *ptr; + TimeLineID tli; + uint32 switchpoint_hi; + uint32 switchpoint_lo; + int nfields; + + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_READ); + res = fgets(fline, sizeof(fline), fd); + pgstat_report_wait_end(); + if (res == NULL) + { + if (ferror(fd)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", path))); + + break; + } + + /* skip leading whitespace and check for # comment */ + for (ptr = fline; *ptr; ptr++) + { + if (!isspace((unsigned char) *ptr)) + break; + } + if (*ptr == '\0' || *ptr == '#') + continue; + + nfields = sscanf(fline, "%u\t%X/%X", &tli, &switchpoint_hi, &switchpoint_lo); + + if (nfields < 1) + { + /* expect a numeric timeline ID as first field of line */ + ereport(FATAL, + (errmsg("syntax error in history file: %s", fline), + errhint("Expected a numeric timeline ID."))); + } + if (nfields != 3) + ereport(FATAL, + (errmsg("syntax error in history file: %s", fline), + errhint("Expected a write-ahead log switchpoint location."))); + + if (result && tli <= lasttli) + ereport(FATAL, + (errmsg("invalid data in history file: %s", fline), + errhint("Timeline IDs must be in increasing sequence."))); + + lasttli = tli; + + entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry->tli = tli; + entry->begin = prevend; + entry->end = ((uint64) (switchpoint_hi)) << 32 | (uint64) switchpoint_lo; + prevend = entry->end; + + /* Build list with newest item first */ + result = lcons(entry, result); + + /* we ignore the remainder of each line */ + } + + FreeFile(fd); + + if (result && targetTLI <= lasttli) + ereport(FATAL, + (errmsg("invalid data in history file \"%s\"", path), + errhint("Timeline IDs must be less than child timeline's ID."))); + + /* + * Create one more entry for the "tip" of the timeline, which has no entry + * in the history file. + */ + entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry)); + entry->tli = targetTLI; + entry->begin = prevend; + entry->end = InvalidXLogRecPtr; + + result = lcons(entry, result); + + /* + * If the history file was fetched from archive, save it in pg_wal for + * future reference. + */ + if (fromArchive) + KeepFileRestoredFromArchive(path, histfname); + + return result; +} + +/* + * Probe whether a timeline history file exists for the given timeline ID + */ +bool +existsTimeLineHistory(TimeLineID probeTLI) +{ + char path[MAXPGPATH]; + char histfname[MAXFNAMELEN]; + FILE *fd; + + /* Timeline 1 does not have a history file, so no need to check */ + if (probeTLI == 1) + return false; + + if (ArchiveRecoveryRequested) + { + TLHistoryFileName(histfname, probeTLI); + RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false); + } + else + TLHistoryFilePath(path, probeTLI); + + fd = AllocateFile(path, "r"); + if (fd != NULL) + { + FreeFile(fd); + return true; + } + else + { + if (errno != ENOENT) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + return false; + } +} + +/* + * Find the newest existing timeline, assuming that startTLI exists. + * + * Note: while this is somewhat heuristic, it does positively guarantee + * that (result + 1) is not a known timeline, and therefore it should + * be safe to assign that ID to a new timeline. 
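+ *
+ * For example, if probing finds 00000002.history and 00000003.history
+ * but no 00000004.history, the result is 3 and timeline ID 4 is free
+ * for the caller to assign.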
+ */ +TimeLineID +findNewestTimeLine(TimeLineID startTLI) +{ + TimeLineID newestTLI; + TimeLineID probeTLI; + + /* + * The algorithm is just to probe for the existence of timeline history + * files. XXX is it useful to allow gaps in the sequence? + */ + newestTLI = startTLI; + + for (probeTLI = startTLI + 1;; probeTLI++) + { + if (existsTimeLineHistory(probeTLI)) + { + newestTLI = probeTLI; /* probeTLI exists */ + } + else + { + /* doesn't exist, assume we're done */ + break; + } + } + + return newestTLI; +} + +/* + * Create a new timeline history file. + * + * newTLI: ID of the new timeline + * parentTLI: ID of its immediate parent + * switchpoint: WAL location where the system switched to the new timeline + * reason: human-readable explanation of why the timeline was switched + * + * Currently this is only used at the end recovery, and so there are no locking + * considerations. But we should be just as tense as XLogFileInit to avoid + * emplacing a bogus file. + */ +void +writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, + XLogRecPtr switchpoint, char *reason) +{ + char path[MAXPGPATH]; + char tmppath[MAXPGPATH]; + char histfname[MAXFNAMELEN]; + char buffer[BLCKSZ]; + int srcfd; + int fd; + int nbytes; + + Assert(newTLI > parentTLI); /* else bad selection of newTLI */ + + /* + * Write into a temp file name. + */ + snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); + + unlink(tmppath); + + /* do not use get_sync_bit() here --- want to fsync only at end of fill */ + fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", tmppath))); + + /* + * If a history file exists for the parent, copy it verbatim + */ + if (ArchiveRecoveryRequested) + { + TLHistoryFileName(histfname, parentTLI); + RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false); + } + else + TLHistoryFilePath(path, parentTLI); + + srcfd = OpenTransientFile(path, O_RDONLY); + if (srcfd < 0) + { + if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + /* Not there, so assume parent has no parents */ + } + else + { + for (;;) + { + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_READ); + nbytes = (int) read(srcfd, buffer, sizeof(buffer)); + pgstat_report_wait_end(); + if (nbytes < 0 || errno != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", path))); + if (nbytes == 0) + break; + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_WRITE); + if ((int) write(fd, buffer, nbytes) != nbytes) + { + int save_errno = errno; + + /* + * If we fail to make the file, delete it to release disk + * space + */ + unlink(tmppath); + + /* + * if write didn't set errno, assume problem is no disk space + */ + errno = save_errno ? save_errno : ENOSPC; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + } + + if (CloseTransientFile(srcfd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + } + + /* + * Append one line with the details of this timeline split. + * + * If we did have a parent file, insert an extra newline just in case the + * parent file failed to end with one. + */ + snprintf(buffer, sizeof(buffer), + "%s%u\t%X/%X\t%s\n", + (srcfd < 0) ? 
"" : "\n", + parentTLI, + LSN_FORMAT_ARGS(switchpoint), + reason); + + nbytes = strlen(buffer); + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_WRITE); + if ((int) write(fd, buffer, nbytes) != nbytes) + { + int save_errno = errno; + + /* + * If we fail to make the file, delete it to release disk space + */ + unlink(tmppath); + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? save_errno : ENOSPC; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_SYNC); + if (pg_fsync(fd) != 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", tmppath))); + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", tmppath))); + + /* + * Now move the completed history file into place with its final name. + */ + TLHistoryFilePath(path, newTLI); + + /* + * Perform the rename using link if available, paranoidly trying to avoid + * overwriting an existing file (there shouldn't be one). + */ + durable_rename_excl(tmppath, path, ERROR); + + /* The history file can be archived immediately. */ + if (XLogArchivingActive()) + { + TLHistoryFileName(histfname, newTLI); + XLogArchiveNotify(histfname); + } +} + +/* + * Writes a history file for given timeline and contents. + * + * Currently this is only used in the walreceiver process, and so there are + * no locking considerations. But we should be just as tense as XLogFileInit + * to avoid emplacing a bogus file. + */ +void +writeTimeLineHistoryFile(TimeLineID tli, char *content, int size) +{ + char path[MAXPGPATH]; + char tmppath[MAXPGPATH]; + int fd; + + /* + * Write into a temp file name. + */ + snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); + + unlink(tmppath); + + /* do not use get_sync_bit() here --- want to fsync only at end of fill */ + fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", tmppath))); + + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_FILE_WRITE); + if ((int) write(fd, content, size) != size) + { + int save_errno = errno; + + /* + * If we fail to make the file, delete it to release disk space + */ + unlink(tmppath); + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? save_errno : ENOSPC; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + + pgstat_report_wait_start(WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC); + if (pg_fsync(fd) != 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", tmppath))); + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", tmppath))); + + /* + * Now move the completed history file into place with its final name. + */ + TLHistoryFilePath(path, tli); + + /* + * Perform the rename using link if available, paranoidly trying to avoid + * overwriting an existing file (there shouldn't be one). 
+ */ + durable_rename_excl(tmppath, path, ERROR); +} + +/* + * Returns true if 'expectedTLEs' contains a timeline with id 'tli' + */ +bool +tliInHistory(TimeLineID tli, List *expectedTLEs) +{ + ListCell *cell; + + foreach(cell, expectedTLEs) + { + if (((TimeLineHistoryEntry *) lfirst(cell))->tli == tli) + return true; + } + + return false; +} + +/* + * Returns the ID of the timeline in use at a particular point in time, in + * the given timeline history. + */ +TimeLineID +tliOfPointInHistory(XLogRecPtr ptr, List *history) +{ + ListCell *cell; + + foreach(cell, history) + { + TimeLineHistoryEntry *tle = (TimeLineHistoryEntry *) lfirst(cell); + + if ((XLogRecPtrIsInvalid(tle->begin) || tle->begin <= ptr) && + (XLogRecPtrIsInvalid(tle->end) || ptr < tle->end)) + { + /* found it */ + return tle->tli; + } + } + + /* shouldn't happen. */ + elog(ERROR, "timeline history was not contiguous"); + return 0; /* keep compiler quiet */ +} + +/* + * Returns the point in history where we branched off the given timeline, + * and the timeline we branched to (*nextTLI). Returns InvalidXLogRecPtr if + * the timeline is current, ie. we have not branched off from it, and throws + * an error if the timeline is not part of this server's history. + */ +XLogRecPtr +tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI) +{ + ListCell *cell; + + if (nextTLI) + *nextTLI = 0; + foreach(cell, history) + { + TimeLineHistoryEntry *tle = (TimeLineHistoryEntry *) lfirst(cell); + + if (tle->tli == tli) + return tle->end; + if (nextTLI) + *nextTLI = tle->tli; + } + + ereport(ERROR, + (errmsg("requested timeline %u is not in this server's history", + tli))); + return InvalidXLogRecPtr; /* keep compiler quiet */ +} diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c new file mode 100644 index 0000000..5865810 --- /dev/null +++ b/src/backend/access/transam/transam.c @@ -0,0 +1,398 @@ +/*------------------------------------------------------------------------- + * + * transam.c + * postgres transaction (commit) log interface routines + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/transam/transam.c + * + * NOTES + * This file contains the high level access-method interface to the + * transaction system. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/clog.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "utils/snapmgr.h" + +/* + * Single-item cache for results of TransactionLogFetch. It's worth having + * such a cache because we frequently find ourselves repeatedly checking the + * same XID, for example when scanning a table just after a bulk insert, + * update, or delete. 
+ */ +static TransactionId cachedFetchXid = InvalidTransactionId; +static XidStatus cachedFetchXidStatus; +static XLogRecPtr cachedCommitLSN; + +/* Local functions */ +static XidStatus TransactionLogFetch(TransactionId transactionId); + + +/* ---------------------------------------------------------------- + * Postgres log access method interface + * + * TransactionLogFetch + * ---------------------------------------------------------------- + */ + +/* + * TransactionLogFetch --- fetch commit status of specified transaction id + */ +static XidStatus +TransactionLogFetch(TransactionId transactionId) +{ + XidStatus xidstatus; + XLogRecPtr xidlsn; + + /* + * Before going to the commit log manager, check our single item cache to + * see if we didn't just check the transaction status a moment ago. + */ + if (TransactionIdEquals(transactionId, cachedFetchXid)) + return cachedFetchXidStatus; + + /* + * Also, check to see if the transaction ID is a permanent one. + */ + if (!TransactionIdIsNormal(transactionId)) + { + if (TransactionIdEquals(transactionId, BootstrapTransactionId)) + return TRANSACTION_STATUS_COMMITTED; + if (TransactionIdEquals(transactionId, FrozenTransactionId)) + return TRANSACTION_STATUS_COMMITTED; + return TRANSACTION_STATUS_ABORTED; + } + + /* + * Get the transaction status. + */ + xidstatus = TransactionIdGetStatus(transactionId, &xidlsn); + + /* + * Cache it, but DO NOT cache status for unfinished or sub-committed + * transactions! We only cache status that is guaranteed not to change. + */ + if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS && + xidstatus != TRANSACTION_STATUS_SUB_COMMITTED) + { + cachedFetchXid = transactionId; + cachedFetchXidStatus = xidstatus; + cachedCommitLSN = xidlsn; + } + + return xidstatus; +} + +/* ---------------------------------------------------------------- + * Interface functions + * + * TransactionIdDidCommit + * TransactionIdDidAbort + * ======== + * these functions test the transaction status of + * a specified transaction id. + * + * TransactionIdCommitTree + * TransactionIdAsyncCommitTree + * TransactionIdAbortTree + * ======== + * these functions set the transaction status of the specified + * transaction tree. + * + * See also TransactionIdIsInProgress, which once was in this module + * but now lives in procarray.c. + * ---------------------------------------------------------------- + */ + +/* + * TransactionIdDidCommit + * True iff transaction associated with the identifier did commit. + * + * Note: + * Assumes transaction identifier is valid and exists in clog. + */ +bool /* true if given transaction committed */ +TransactionIdDidCommit(TransactionId transactionId) +{ + XidStatus xidstatus; + + xidstatus = TransactionLogFetch(transactionId); + + /* + * If it's marked committed, it's committed. + */ + if (xidstatus == TRANSACTION_STATUS_COMMITTED) + return true; + + /* + * If it's marked subcommitted, we have to check the parent recursively. + * However, if it's older than TransactionXmin, we can't look at + * pg_subtrans; instead assume that the parent crashed without cleaning up + * its children. + * + * Originally we Assert'ed that the result of SubTransGetParent was not + * zero. However with the introduction of prepared transactions, there can + * be a window just after database startup where we do not have complete + * knowledge in pg_subtrans of the transactions after TransactionXmin. + * StartupSUBTRANS() has ensured that any missing information will be + * zeroed. 
Since this case should not happen under normal conditions, it + * seems reasonable to emit a WARNING for it. + */ + if (xidstatus == TRANSACTION_STATUS_SUB_COMMITTED) + { + TransactionId parentXid; + + if (TransactionIdPrecedes(transactionId, TransactionXmin)) + return false; + parentXid = SubTransGetParent(transactionId); + if (!TransactionIdIsValid(parentXid)) + { + elog(WARNING, "no pg_subtrans entry for subcommitted XID %u", + transactionId); + return false; + } + return TransactionIdDidCommit(parentXid); + } + + /* + * It's not committed. + */ + return false; +} + +/* + * TransactionIdDidAbort + * True iff transaction associated with the identifier did abort. + * + * Note: + * Assumes transaction identifier is valid and exists in clog. + */ +bool /* true if given transaction aborted */ +TransactionIdDidAbort(TransactionId transactionId) +{ + XidStatus xidstatus; + + xidstatus = TransactionLogFetch(transactionId); + + /* + * If it's marked aborted, it's aborted. + */ + if (xidstatus == TRANSACTION_STATUS_ABORTED) + return true; + + /* + * If it's marked subcommitted, we have to check the parent recursively. + * However, if it's older than TransactionXmin, we can't look at + * pg_subtrans; instead assume that the parent crashed without cleaning up + * its children. + */ + if (xidstatus == TRANSACTION_STATUS_SUB_COMMITTED) + { + TransactionId parentXid; + + if (TransactionIdPrecedes(transactionId, TransactionXmin)) + return true; + parentXid = SubTransGetParent(transactionId); + if (!TransactionIdIsValid(parentXid)) + { + /* see notes in TransactionIdDidCommit */ + elog(WARNING, "no pg_subtrans entry for subcommitted XID %u", + transactionId); + return true; + } + return TransactionIdDidAbort(parentXid); + } + + /* + * It's not aborted. + */ + return false; +} + +/* + * TransactionIdCommitTree + * Marks the given transaction and children as committed + * + * "xid" is a toplevel transaction commit, and the xids array contains its + * committed subtransactions. + * + * This commit operation is not guaranteed to be atomic, but if not, subxids + * are correctly marked subcommit first. + */ +void +TransactionIdCommitTree(TransactionId xid, int nxids, TransactionId *xids) +{ + TransactionIdSetTreeStatus(xid, nxids, xids, + TRANSACTION_STATUS_COMMITTED, + InvalidXLogRecPtr); +} + +/* + * TransactionIdAsyncCommitTree + * Same as above, but for async commits. The commit record LSN is needed. + */ +void +TransactionIdAsyncCommitTree(TransactionId xid, int nxids, TransactionId *xids, + XLogRecPtr lsn) +{ + TransactionIdSetTreeStatus(xid, nxids, xids, + TRANSACTION_STATUS_COMMITTED, lsn); +} + +/* + * TransactionIdAbortTree + * Marks the given transaction and children as aborted. + * + * "xid" is a toplevel transaction commit, and the xids array contains its + * committed subtransactions. + * + * We don't need to worry about the non-atomic behavior, since any onlookers + * will consider all the xacts as not-yet-committed anyway. + */ +void +TransactionIdAbortTree(TransactionId xid, int nxids, TransactionId *xids) +{ + TransactionIdSetTreeStatus(xid, nxids, xids, + TRANSACTION_STATUS_ABORTED, InvalidXLogRecPtr); +} + +/* + * TransactionIdPrecedes --- is id1 logically < id2? + */ +bool +TransactionIdPrecedes(TransactionId id1, TransactionId id2) +{ + /* + * If either ID is a permanent XID then we can just do unsigned + * comparison. If both are normal, do a modulo-2^32 comparison. 
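+	 *
+	 * Worked example (arbitrary values): with id1 = 4294967290 and
+	 * id2 = 100, id1 - id2 wraps to 4294967190, which as an int32 is
+	 * -106, so id1 "precedes" id2 -- an XID just before the 32-bit
+	 * wraparound point is treated as older than one just after it.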
+ */ + int32 diff; + + if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) + return (id1 < id2); + + diff = (int32) (id1 - id2); + return (diff < 0); +} + +/* + * TransactionIdPrecedesOrEquals --- is id1 logically <= id2? + */ +bool +TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2) +{ + int32 diff; + + if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) + return (id1 <= id2); + + diff = (int32) (id1 - id2); + return (diff <= 0); +} + +/* + * TransactionIdFollows --- is id1 logically > id2? + */ +bool +TransactionIdFollows(TransactionId id1, TransactionId id2) +{ + int32 diff; + + if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) + return (id1 > id2); + + diff = (int32) (id1 - id2); + return (diff > 0); +} + +/* + * TransactionIdFollowsOrEquals --- is id1 logically >= id2? + */ +bool +TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2) +{ + int32 diff; + + if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2)) + return (id1 >= id2); + + diff = (int32) (id1 - id2); + return (diff >= 0); +} + + +/* + * TransactionIdLatest --- get latest XID among a main xact and its children + */ +TransactionId +TransactionIdLatest(TransactionId mainxid, + int nxids, const TransactionId *xids) +{ + TransactionId result; + + /* + * In practice it is highly likely that the xids[] array is sorted, and so + * we could save some cycles by just taking the last child XID, but this + * probably isn't so performance-critical that it's worth depending on + * that assumption. But just to show we're not totally stupid, scan the + * array back-to-front to avoid useless assignments. + */ + result = mainxid; + while (--nxids >= 0) + { + if (TransactionIdPrecedes(result, xids[nxids])) + result = xids[nxids]; + } + return result; +} + + +/* + * TransactionIdGetCommitLSN + * + * This function returns an LSN that is late enough to be able + * to guarantee that if we flush up to the LSN returned then we + * will have flushed the transaction's commit record to disk. + * + * The result is not necessarily the exact LSN of the transaction's + * commit record! For example, for long-past transactions (those whose + * clog pages already migrated to disk), we'll return InvalidXLogRecPtr. + * Also, because we group transactions on the same clog page to conserve + * storage, we might return the LSN of a later transaction that falls into + * the same group. + */ +XLogRecPtr +TransactionIdGetCommitLSN(TransactionId xid) +{ + XLogRecPtr result; + + /* + * Currently, all uses of this function are for xids that were just + * reported to be committed by TransactionLogFetch, so we expect that + * checking TransactionLogFetch's cache will usually succeed and avoid an + * extra trip to shared memory. + */ + if (TransactionIdEquals(xid, cachedFetchXid)) + return cachedCommitLSN; + + /* Special XIDs are always known committed */ + if (!TransactionIdIsNormal(xid)) + return InvalidXLogRecPtr; + + /* + * Get the transaction status. + */ + (void) TransactionIdGetStatus(xid, &result); + + return result; +} diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c new file mode 100644 index 0000000..5293c69 --- /dev/null +++ b/src/backend/access/transam/twophase.c @@ -0,0 +1,2662 @@ +/*------------------------------------------------------------------------- + * + * twophase.c + * Two-phase commit support functions. 
+ * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/transam/twophase.c + * + * NOTES + * Each global transaction is associated with a global transaction + * identifier (GID). The client assigns a GID to a postgres + * transaction with the PREPARE TRANSACTION command. + * + * We keep all active global transactions in a shared memory array. + * When the PREPARE TRANSACTION command is issued, the GID is + * reserved for the transaction in the array. This is done before + * a WAL entry is made, because the reservation checks for duplicate + * GIDs and aborts the transaction if there already is a global + * transaction in prepared state with the same GID. + * + * A global transaction (gxact) also has dummy PGPROC; this is what keeps + * the XID considered running by TransactionIdIsInProgress. It is also + * convenient as a PGPROC to hook the gxact's locks to. + * + * Information to recover prepared transactions in case of crash is + * now stored in WAL for the common case. In some cases there will be + * an extended period between preparing a GXACT and commit/abort, in + * which case we need to separately record prepared transaction data + * in permanent storage. This includes locking information, pending + * notifications etc. All that state information is written to the + * per-transaction state file in the pg_twophase directory. + * All prepared transactions will be written prior to shutdown. + * + * Life track of state data is following: + * + * * On PREPARE TRANSACTION backend writes state data only to the WAL and + * stores pointer to the start of the WAL record in + * gxact->prepare_start_lsn. + * * If COMMIT occurs before checkpoint then backend reads data from WAL + * using prepare_start_lsn. + * * On checkpoint state data copied to files in pg_twophase directory and + * fsynced + * * If COMMIT happens after checkpoint then backend reads state data from + * files + * + * During replay and replication, TwoPhaseState also holds information + * about active prepared transactions that haven't been moved to disk yet. + * + * Replay of twophase records happens by the following rules: + * + * * At the beginning of recovery, pg_twophase is scanned once, filling + * TwoPhaseState with entries marked with gxact->inredo and + * gxact->ondisk. Two-phase file data older than the XID horizon of + * the redo position are discarded. + * * On PREPARE redo, the transaction is added to TwoPhaseState->prepXacts. + * gxact->inredo is set to true for such entries. + * * On Checkpoint we iterate through TwoPhaseState->prepXacts entries + * that have gxact->inredo set and are behind the redo_horizon. We + * save them to disk and then switch gxact->ondisk to true. + * * On COMMIT/ABORT we delete the entry from TwoPhaseState->prepXacts. + * If gxact->ondisk is true, the corresponding entry from the disk + * is additionally deleted. + * * RecoverPreparedTransactions(), StandbyRecoverPreparedTransactions() + * and PrescanPreparedTransactions() have been modified to go through + * gxact->inredo entries that have not made it to disk. 
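+ *
+ * One illustrative scenario combining these rules: a transaction that was
+ * prepared and checkpointed before a crash, and is committed only after
+ * the restart, is first picked up by the initial pg_twophase scan (its
+ * entry has inredo and ondisk set); the eventual COMMIT PREPARED then
+ * removes both the in-memory entry and the on-disk file.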
+ * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <fcntl.h> +#include <sys/stat.h> +#include <time.h> +#include <unistd.h> + +#include "access/commit_ts.h" +#include "access/htup_details.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/twophase_rmgr.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "access/xlogreader.h" +#include "access/xlogrecovery.h" +#include "access/xlogutils.h" +#include "catalog/pg_type.h" +#include "catalog/storage.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "replication/origin.h" +#include "replication/syncrep.h" +#include "replication/walsender.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/md.h" +#include "storage/predicate.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/sinvaladt.h" +#include "storage/smgr.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/timestamp.h" + +/* + * Directory where Two-phase commit files reside within PGDATA + */ +#define TWOPHASE_DIR "pg_twophase" + +/* GUC variable, can't be changed after startup */ +int max_prepared_xacts = 0; + +/* + * This struct describes one global transaction that is in prepared state + * or attempting to become prepared. + * + * The lifecycle of a global transaction is: + * + * 1. After checking that the requested GID is not in use, set up an entry in + * the TwoPhaseState->prepXacts array with the correct GID and valid = false, + * and mark it as locked by my backend. + * + * 2. After successfully completing prepare, set valid = true and enter the + * referenced PGPROC into the global ProcArray. + * + * 3. To begin COMMIT PREPARED or ROLLBACK PREPARED, check that the entry is + * valid and not locked, then mark the entry as locked by storing my current + * backend ID into locking_backend. This prevents concurrent attempts to + * commit or rollback the same prepared xact. + * + * 4. On completion of COMMIT PREPARED or ROLLBACK PREPARED, remove the entry + * from the ProcArray and the TwoPhaseState->prepXacts array and return it to + * the freelist. + * + * Note that if the preparing transaction fails between steps 1 and 2, the + * entry must be removed so that the GID and the GlobalTransaction struct + * can be reused. See AtAbort_Twophase(). + * + * typedef struct GlobalTransactionData *GlobalTransaction appears in + * twophase.h + */ + +typedef struct GlobalTransactionData +{ + GlobalTransaction next; /* list link for free list */ + int pgprocno; /* ID of associated dummy PGPROC */ + BackendId dummyBackendId; /* similar to backend id for backends */ + TimestampTz prepared_at; /* time of preparation */ + + /* + * Note that we need to keep track of two LSNs for each GXACT. We keep + * track of the start LSN because this is the address we must use to read + * state data back from WAL when committing a prepared GXACT. We keep + * track of the end LSN because that is the LSN we need to wait for prior + * to commit. 
+ */ + XLogRecPtr prepare_start_lsn; /* XLOG offset of prepare record start */ + XLogRecPtr prepare_end_lsn; /* XLOG offset of prepare record end */ + TransactionId xid; /* The GXACT id */ + + Oid owner; /* ID of user that executed the xact */ + BackendId locking_backend; /* backend currently working on the xact */ + bool valid; /* true if PGPROC entry is in proc array */ + bool ondisk; /* true if prepare state file is on disk */ + bool inredo; /* true if entry was added via xlog_redo */ + char gid[GIDSIZE]; /* The GID assigned to the prepared xact */ +} GlobalTransactionData; + +/* + * Two Phase Commit shared state. Access to this struct is protected + * by TwoPhaseStateLock. + */ +typedef struct TwoPhaseStateData +{ + /* Head of linked list of free GlobalTransactionData structs */ + GlobalTransaction freeGXacts; + + /* Number of valid prepXacts entries. */ + int numPrepXacts; + + /* There are max_prepared_xacts items in this array */ + GlobalTransaction prepXacts[FLEXIBLE_ARRAY_MEMBER]; +} TwoPhaseStateData; + +static TwoPhaseStateData *TwoPhaseState; + +/* + * Global transaction entry currently locked by us, if any. Note that any + * access to the entry pointed to by this variable must be protected by + * TwoPhaseStateLock, though obviously the pointer itself doesn't need to be + * (since it's just local memory). + */ +static GlobalTransaction MyLockedGxact = NULL; + +static bool twophaseExitRegistered = false; + +static void RecordTransactionCommitPrepared(TransactionId xid, + int nchildren, + TransactionId *children, + int nrels, + RelFileNode *rels, + int nstats, + xl_xact_stats_item *stats, + int ninvalmsgs, + SharedInvalidationMessage *invalmsgs, + bool initfileinval, + const char *gid); +static void RecordTransactionAbortPrepared(TransactionId xid, + int nchildren, + TransactionId *children, + int nrels, + RelFileNode *rels, + int nstats, + xl_xact_stats_item *stats, + const char *gid); +static void ProcessRecords(char *bufptr, TransactionId xid, + const TwoPhaseCallback callbacks[]); +static void RemoveGXact(GlobalTransaction gxact); + +static void XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len); +static char *ProcessTwoPhaseBuffer(TransactionId xid, + XLogRecPtr prepare_start_lsn, + bool fromdisk, bool setParent, bool setNextXid); +static void MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, + const char *gid, TimestampTz prepared_at, Oid owner, + Oid databaseid); +static void RemoveTwoPhaseFile(TransactionId xid, bool giveWarning); +static void RecreateTwoPhaseFile(TransactionId xid, void *content, int len); + +/* + * Initialization of shared memory + */ +Size +TwoPhaseShmemSize(void) +{ + Size size; + + /* Need the fixed struct, the array of pointers, and the GTD structs */ + size = offsetof(TwoPhaseStateData, prepXacts); + size = add_size(size, mul_size(max_prepared_xacts, + sizeof(GlobalTransaction))); + size = MAXALIGN(size); + size = add_size(size, mul_size(max_prepared_xacts, + sizeof(GlobalTransactionData))); + + return size; +} + +void +TwoPhaseShmemInit(void) +{ + bool found; + + TwoPhaseState = ShmemInitStruct("Prepared Transaction Table", + TwoPhaseShmemSize(), + &found); + if (!IsUnderPostmaster) + { + GlobalTransaction gxacts; + int i; + + Assert(!found); + TwoPhaseState->freeGXacts = NULL; + TwoPhaseState->numPrepXacts = 0; + + /* + * Initialize the linked list of free GlobalTransactionData structs + */ + gxacts = (GlobalTransaction) + ((char *) TwoPhaseState + + MAXALIGN(offsetof(TwoPhaseStateData, prepXacts) + + 
sizeof(GlobalTransaction) * max_prepared_xacts)); + for (i = 0; i < max_prepared_xacts; i++) + { + /* insert into linked list */ + gxacts[i].next = TwoPhaseState->freeGXacts; + TwoPhaseState->freeGXacts = &gxacts[i]; + + /* associate it with a PGPROC assigned by InitProcGlobal */ + gxacts[i].pgprocno = PreparedXactProcs[i].pgprocno; + + /* + * Assign a unique ID for each dummy proc, so that the range of + * dummy backend IDs immediately follows the range of normal + * backend IDs. We don't dare to assign a real backend ID to dummy + * procs, because prepared transactions don't take part in cache + * invalidation like a real backend ID would imply, but having a + * unique ID for them is nevertheless handy. This arrangement + * allows you to allocate an array of size (MaxBackends + + * max_prepared_xacts + 1), and have a slot for every backend and + * prepared transaction. Currently multixact.c uses that + * technique. + */ + gxacts[i].dummyBackendId = MaxBackends + 1 + i; + } + } + else + Assert(found); +} + +/* + * Exit hook to unlock the global transaction entry we're working on. + */ +static void +AtProcExit_Twophase(int code, Datum arg) +{ + /* same logic as abort */ + AtAbort_Twophase(); +} + +/* + * Abort hook to unlock the global transaction entry we're working on. + */ +void +AtAbort_Twophase(void) +{ + if (MyLockedGxact == NULL) + return; + + /* + * What to do with the locked global transaction entry? If we were in the + * process of preparing the transaction, but haven't written the WAL + * record and state file yet, the transaction must not be considered as + * prepared. Likewise, if we are in the process of finishing an + * already-prepared transaction, and fail after having already written the + * 2nd phase commit or rollback record to the WAL, the transaction should + * not be considered as prepared anymore. In those cases, just remove the + * entry from shared memory. + * + * Otherwise, the entry must be left in place so that the transaction can + * be finished later, so just unlock it. + * + * If we abort during prepare, after having written the WAL record, we + * might not have transferred all locks and other state to the prepared + * transaction yet. Likewise, if we abort during commit or rollback, + * after having written the WAL record, we might not have released all the + * resources held by the transaction yet. In those cases, the in-memory + * state can be wrong, but it's too late to back out. + */ + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + if (!MyLockedGxact->valid) + RemoveGXact(MyLockedGxact); + else + MyLockedGxact->locking_backend = InvalidBackendId; + LWLockRelease(TwoPhaseStateLock); + + MyLockedGxact = NULL; +} + +/* + * This is called after we have finished transferring state to the prepared + * PGPROC entry. + */ +void +PostPrepare_Twophase(void) +{ + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + MyLockedGxact->locking_backend = InvalidBackendId; + LWLockRelease(TwoPhaseStateLock); + + MyLockedGxact = NULL; +} + + +/* + * MarkAsPreparing + * Reserve the GID for the given transaction. 
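+ *
+ * In rough outline, the normal call sequence during PREPARE TRANSACTION
+ * (driven by PrepareTransaction() in xact.c) looks like this; the argument
+ * names are illustrative only:
+ *
+ *		gxact = MarkAsPreparing(xid, gid, now, owner, dbid);
+ *		StartPrepare(gxact);
+ *		... resource managers add data via RegisterTwoPhaseRecord() ...
+ *		EndPrepare(gxact);
+ *		PostPrepare_Twophase();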
+ */ +GlobalTransaction +MarkAsPreparing(TransactionId xid, const char *gid, + TimestampTz prepared_at, Oid owner, Oid databaseid) +{ + GlobalTransaction gxact; + int i; + + if (strlen(gid) >= GIDSIZE) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("transaction identifier \"%s\" is too long", + gid))); + + /* fail immediately if feature is disabled */ + if (max_prepared_xacts == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("prepared transactions are disabled"), + errhint("Set max_prepared_transactions to a nonzero value."))); + + /* on first call, register the exit hook */ + if (!twophaseExitRegistered) + { + before_shmem_exit(AtProcExit_Twophase, 0); + twophaseExitRegistered = true; + } + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + + /* Check for conflicting GID */ + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + gxact = TwoPhaseState->prepXacts[i]; + if (strcmp(gxact->gid, gid) == 0) + { + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("transaction identifier \"%s\" is already in use", + gid))); + } + } + + /* Get a free gxact from the freelist */ + if (TwoPhaseState->freeGXacts == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("maximum number of prepared transactions reached"), + errhint("Increase max_prepared_transactions (currently %d).", + max_prepared_xacts))); + gxact = TwoPhaseState->freeGXacts; + TwoPhaseState->freeGXacts = gxact->next; + + MarkAsPreparingGuts(gxact, xid, gid, prepared_at, owner, databaseid); + + gxact->ondisk = false; + + /* And insert it into the active array */ + Assert(TwoPhaseState->numPrepXacts < max_prepared_xacts); + TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts++] = gxact; + + LWLockRelease(TwoPhaseStateLock); + + return gxact; +} + +/* + * MarkAsPreparingGuts + * + * This uses a gxact struct and puts it into the active array. + * NOTE: this is also used when reloading a gxact after a crash; so avoid + * assuming that we can use very much backend context. + * + * Note: This function should be called with appropriate locks held. 
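+ *
+ * In practice "appropriate locks" means TwoPhaseStateLock held in exclusive
+ * mode; the Assert at the top of the function checks exactly that.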
+ */ +static void +MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, + TimestampTz prepared_at, Oid owner, Oid databaseid) +{ + PGPROC *proc; + int i; + + Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); + + Assert(gxact != NULL); + proc = &ProcGlobal->allProcs[gxact->pgprocno]; + + /* Initialize the PGPROC entry */ + MemSet(proc, 0, sizeof(PGPROC)); + proc->pgprocno = gxact->pgprocno; + SHMQueueElemInit(&(proc->links)); + proc->waitStatus = PROC_WAIT_STATUS_OK; + if (LocalTransactionIdIsValid(MyProc->lxid)) + { + /* clone VXID, for TwoPhaseGetXidByVirtualXID() to find */ + proc->lxid = MyProc->lxid; + proc->backendId = MyBackendId; + } + else + { + Assert(AmStartupProcess() || !IsPostmasterEnvironment); + /* GetLockConflicts() uses this to specify a wait on the XID */ + proc->lxid = xid; + proc->backendId = InvalidBackendId; + } + proc->xid = xid; + Assert(proc->xmin == InvalidTransactionId); + proc->delayChkptFlags = 0; + proc->statusFlags = 0; + proc->pid = 0; + proc->databaseId = databaseid; + proc->roleId = owner; + proc->tempNamespaceId = InvalidOid; + proc->isBackgroundWorker = false; + proc->lwWaiting = false; + proc->lwWaitMode = 0; + proc->waitLock = NULL; + proc->waitProcLock = NULL; + pg_atomic_init_u64(&proc->waitStart, 0); + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + SHMQueueInit(&(proc->myProcLocks[i])); + /* subxid data must be filled later by GXactLoadSubxactData */ + proc->subxidStatus.overflowed = false; + proc->subxidStatus.count = 0; + + gxact->prepared_at = prepared_at; + gxact->xid = xid; + gxact->owner = owner; + gxact->locking_backend = MyBackendId; + gxact->valid = false; + gxact->inredo = false; + strcpy(gxact->gid, gid); + + /* + * Remember that we have this GlobalTransaction entry locked for us. If we + * abort after this, we must release it. + */ + MyLockedGxact = gxact; +} + +/* + * GXactLoadSubxactData + * + * If the transaction being persisted had any subtransactions, this must + * be called before MarkAsPrepared() to load information into the dummy + * PGPROC. + */ +static void +GXactLoadSubxactData(GlobalTransaction gxact, int nsubxacts, + TransactionId *children) +{ + PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno]; + + /* We need no extra lock since the GXACT isn't valid yet */ + if (nsubxacts > PGPROC_MAX_CACHED_SUBXIDS) + { + proc->subxidStatus.overflowed = true; + nsubxacts = PGPROC_MAX_CACHED_SUBXIDS; + } + if (nsubxacts > 0) + { + memcpy(proc->subxids.xids, children, + nsubxacts * sizeof(TransactionId)); + proc->subxidStatus.count = nsubxacts; + } +} + +/* + * MarkAsPrepared + * Mark the GXACT as fully valid, and enter it into the global ProcArray. + * + * lock_held indicates whether caller already holds TwoPhaseStateLock. + */ +static void +MarkAsPrepared(GlobalTransaction gxact, bool lock_held) +{ + /* Lock here may be overkill, but I'm not convinced of that ... */ + if (!lock_held) + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + Assert(!gxact->valid); + gxact->valid = true; + if (!lock_held) + LWLockRelease(TwoPhaseStateLock); + + /* + * Put it into the global ProcArray so TransactionIdIsInProgress considers + * the XID as still running. + */ + ProcArrayAdd(&ProcGlobal->allProcs[gxact->pgprocno]); +} + +/* + * LockGXact + * Locate the prepared transaction and mark it busy for COMMIT or PREPARE. 
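+ *
+ * On success the entry is marked as locked by this backend and remembered
+ * in MyLockedGxact, so that AtAbort_Twophase() can unlock it again if we
+ * error out before the prepared transaction is finished.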
+ */ +static GlobalTransaction +LockGXact(const char *gid, Oid user) +{ + int i; + + /* on first call, register the exit hook */ + if (!twophaseExitRegistered) + { + before_shmem_exit(AtProcExit_Twophase, 0); + twophaseExitRegistered = true; + } + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno]; + + /* Ignore not-yet-valid GIDs */ + if (!gxact->valid) + continue; + if (strcmp(gxact->gid, gid) != 0) + continue; + + /* Found it, but has someone else got it locked? */ + if (gxact->locking_backend != InvalidBackendId) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("prepared transaction with identifier \"%s\" is busy", + gid))); + + if (user != gxact->owner && !superuser_arg(user)) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to finish prepared transaction"), + errhint("Must be superuser or the user that prepared the transaction."))); + + /* + * Note: it probably would be possible to allow committing from + * another database; but at the moment NOTIFY is known not to work and + * there may be some other issues as well. Hence disallow until + * someone gets motivated to make it work. + */ + if (MyDatabaseId != proc->databaseId) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("prepared transaction belongs to another database"), + errhint("Connect to the database where the transaction was prepared to finish it."))); + + /* OK for me to lock it */ + gxact->locking_backend = MyBackendId; + MyLockedGxact = gxact; + + LWLockRelease(TwoPhaseStateLock); + + return gxact; + } + + LWLockRelease(TwoPhaseStateLock); + + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("prepared transaction with identifier \"%s\" does not exist", + gid))); + + /* NOTREACHED */ + return NULL; +} + +/* + * RemoveGXact + * Remove the prepared transaction from the shared memory array. + * + * NB: caller should have already removed it from ProcArray + */ +static void +RemoveGXact(GlobalTransaction gxact) +{ + int i; + + Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); + + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + if (gxact == TwoPhaseState->prepXacts[i]) + { + /* remove from the active array */ + TwoPhaseState->numPrepXacts--; + TwoPhaseState->prepXacts[i] = TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts]; + + /* and put it back in the freelist */ + gxact->next = TwoPhaseState->freeGXacts; + TwoPhaseState->freeGXacts = gxact; + + return; + } + } + + elog(ERROR, "failed to find %p in GlobalTransaction array", gxact); +} + +/* + * Returns an array of all prepared transactions for the user-level + * function pg_prepared_xact. + * + * The returned array and all its elements are copies of internal data + * structures, to minimize the time we need to hold the TwoPhaseStateLock. + * + * WARNING -- we return even those transactions that are not fully prepared + * yet. The caller should filter them out if he doesn't want them. + * + * The returned array is palloc'd. 
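+ *
+ * Returns the number of entries copied; when there are none, *gxacts is
+ * set to NULL and 0 is returned.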
+ */ +static int +GetPreparedTransactionList(GlobalTransaction *gxacts) +{ + GlobalTransaction array; + int num; + int i; + + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + + if (TwoPhaseState->numPrepXacts == 0) + { + LWLockRelease(TwoPhaseStateLock); + + *gxacts = NULL; + return 0; + } + + num = TwoPhaseState->numPrepXacts; + array = (GlobalTransaction) palloc(sizeof(GlobalTransactionData) * num); + *gxacts = array; + for (i = 0; i < num; i++) + memcpy(array + i, TwoPhaseState->prepXacts[i], + sizeof(GlobalTransactionData)); + + LWLockRelease(TwoPhaseStateLock); + + return num; +} + + +/* Working status for pg_prepared_xact */ +typedef struct +{ + GlobalTransaction array; + int ngxacts; + int currIdx; +} Working_State; + +/* + * pg_prepared_xact + * Produce a view with one row per prepared transaction. + * + * This function is here so we don't have to export the + * GlobalTransactionData struct definition. + */ +Datum +pg_prepared_xact(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + Working_State *status; + + if (SRF_IS_FIRSTCALL()) + { + TupleDesc tupdesc; + MemoryContext oldcontext; + + /* create a function context for cross-call persistence */ + funcctx = SRF_FIRSTCALL_INIT(); + + /* + * Switch to memory context appropriate for multiple function calls + */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* build tupdesc for result tuples */ + /* this had better match pg_prepared_xacts view in system_views.sql */ + tupdesc = CreateTemplateTupleDesc(5); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "transaction", + XIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "gid", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "prepared", + TIMESTAMPTZOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "ownerid", + OIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 5, "dbid", + OIDOID, -1, 0); + + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + /* + * Collect all the 2PC status information that we will format and send + * out as a result set. + */ + status = (Working_State *) palloc(sizeof(Working_State)); + funcctx->user_fctx = (void *) status; + + status->ngxacts = GetPreparedTransactionList(&status->array); + status->currIdx = 0; + + MemoryContextSwitchTo(oldcontext); + } + + funcctx = SRF_PERCALL_SETUP(); + status = (Working_State *) funcctx->user_fctx; + + while (status->array != NULL && status->currIdx < status->ngxacts) + { + GlobalTransaction gxact = &status->array[status->currIdx++]; + PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno]; + Datum values[5]; + bool nulls[5]; + HeapTuple tuple; + Datum result; + + if (!gxact->valid) + continue; + + /* + * Form tuple with appropriate data. + */ + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = TransactionIdGetDatum(proc->xid); + values[1] = CStringGetTextDatum(gxact->gid); + values[2] = TimestampTzGetDatum(gxact->prepared_at); + values[3] = ObjectIdGetDatum(gxact->owner); + values[4] = ObjectIdGetDatum(proc->databaseId); + + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + result = HeapTupleGetDatum(tuple); + SRF_RETURN_NEXT(funcctx, result); + } + + SRF_RETURN_DONE(funcctx); +} + +/* + * TwoPhaseGetGXact + * Get the GlobalTransaction struct for a prepared transaction + * specified by XID + * + * If lock_held is set to true, TwoPhaseStateLock will not be taken, so the + * caller had better hold it. 
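+ *
+ * A one-entry static cache (cached_xid/cached_gxact below) avoids
+ * rescanning the array when we are called repeatedly for the same XID,
+ * as happens during recovery and COMMIT/ROLLBACK PREPARED.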
+ */ +static GlobalTransaction +TwoPhaseGetGXact(TransactionId xid, bool lock_held) +{ + GlobalTransaction result = NULL; + int i; + + static TransactionId cached_xid = InvalidTransactionId; + static GlobalTransaction cached_gxact = NULL; + + Assert(!lock_held || LWLockHeldByMe(TwoPhaseStateLock)); + + /* + * During a recovery, COMMIT PREPARED, or ABORT PREPARED, we'll be called + * repeatedly for the same XID. We can save work with a simple cache. + */ + if (xid == cached_xid) + return cached_gxact; + + if (!lock_held) + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + + if (gxact->xid == xid) + { + result = gxact; + break; + } + } + + if (!lock_held) + LWLockRelease(TwoPhaseStateLock); + + if (result == NULL) /* should not happen */ + elog(ERROR, "failed to find GlobalTransaction for xid %u", xid); + + cached_xid = xid; + cached_gxact = result; + + return result; +} + +/* + * TwoPhaseGetXidByVirtualXID + * Lookup VXID among xacts prepared since last startup. + * + * (This won't find recovered xacts.) If more than one matches, return any + * and set "have_more" to true. To witness multiple matches, a single + * BackendId must consume 2^32 LXIDs, with no intervening database restart. + */ +TransactionId +TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid, + bool *have_more) +{ + int i; + TransactionId result = InvalidTransactionId; + + Assert(VirtualTransactionIdIsValid(vxid)); + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + PGPROC *proc; + VirtualTransactionId proc_vxid; + + if (!gxact->valid) + continue; + proc = &ProcGlobal->allProcs[gxact->pgprocno]; + GET_VXID_FROM_PGPROC(proc_vxid, *proc); + if (VirtualTransactionIdEquals(vxid, proc_vxid)) + { + /* Startup process sets proc->backendId to InvalidBackendId. */ + Assert(!gxact->inredo); + + if (result != InvalidTransactionId) + { + *have_more = true; + break; + } + result = gxact->xid; + } + } + + LWLockRelease(TwoPhaseStateLock); + + return result; +} + +/* + * TwoPhaseGetDummyBackendId + * Get the dummy backend ID for prepared transaction specified by XID + * + * Dummy backend IDs are similar to real backend IDs of real backends. + * They start at MaxBackends + 1, and are unique across all currently active + * real backends and prepared transactions. If lock_held is set to true, + * TwoPhaseStateLock will not be taken, so the caller had better hold it. + */ +BackendId +TwoPhaseGetDummyBackendId(TransactionId xid, bool lock_held) +{ + GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held); + + return gxact->dummyBackendId; +} + +/* + * TwoPhaseGetDummyProc + * Get the PGPROC that represents a prepared transaction specified by XID + * + * If lock_held is set to true, TwoPhaseStateLock will not be taken, so the + * caller had better hold it. + */ +PGPROC * +TwoPhaseGetDummyProc(TransactionId xid, bool lock_held) +{ + GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held); + + return &ProcGlobal->allProcs[gxact->pgprocno]; +} + +/************************************************************************/ +/* State file support */ +/************************************************************************/ + +#define TwoPhaseFilePath(path, xid) \ + snprintf(path, MAXPGPATH, TWOPHASE_DIR "/%08X", xid) + +/* + * 2PC state file format: + * + * 1. TwoPhaseFileHeader + * 2. 
TransactionId[] (subtransactions) + * 3. RelFileNode[] (files to be deleted at commit) + * 4. RelFileNode[] (files to be deleted at abort) + * 5. SharedInvalidationMessage[] (inval messages to be sent at commit) + * 6. TwoPhaseRecordOnDisk + * 7. ... + * 8. TwoPhaseRecordOnDisk (end sentinel, rmid == TWOPHASE_RM_END_ID) + * 9. checksum (CRC-32C) + * + * Each segment except the final checksum is MAXALIGN'd. + */ + +/* + * Header for a 2PC state file + */ +#define TWOPHASE_MAGIC 0x57F94534 /* format identifier */ + +typedef xl_xact_prepare TwoPhaseFileHeader; + +/* + * Header for each record in a state file + * + * NOTE: len counts only the rmgr data, not the TwoPhaseRecordOnDisk header. + * The rmgr data will be stored starting on a MAXALIGN boundary. + */ +typedef struct TwoPhaseRecordOnDisk +{ + uint32 len; /* length of rmgr data */ + TwoPhaseRmgrId rmid; /* resource manager for this record */ + uint16 info; /* flag bits for use by rmgr */ +} TwoPhaseRecordOnDisk; + +/* + * During prepare, the state file is assembled in memory before writing it + * to WAL and the actual state file. We use a chain of StateFileChunk blocks + * for that. + */ +typedef struct StateFileChunk +{ + char *data; + uint32 len; + struct StateFileChunk *next; +} StateFileChunk; + +static struct xllist +{ + StateFileChunk *head; /* first data block in the chain */ + StateFileChunk *tail; /* last block in chain */ + uint32 num_chunks; + uint32 bytes_free; /* free bytes left in tail block */ + uint32 total_len; /* total data bytes in chain */ +} records; + + +/* + * Append a block of data to records data structure. + * + * NB: each block is padded to a MAXALIGN multiple. This must be + * accounted for when the file is later read! + * + * The data is copied, so the caller is free to modify it afterwards. + */ +static void +save_state_data(const void *data, uint32 len) +{ + uint32 padlen = MAXALIGN(len); + + if (padlen > records.bytes_free) + { + records.tail->next = palloc0(sizeof(StateFileChunk)); + records.tail = records.tail->next; + records.tail->len = 0; + records.tail->next = NULL; + records.num_chunks++; + + records.bytes_free = Max(padlen, 512); + records.tail->data = palloc(records.bytes_free); + } + + memcpy(((char *) records.tail->data) + records.tail->len, data, len); + records.tail->len += padlen; + records.bytes_free -= padlen; + records.total_len += padlen; +} + +/* + * Start preparing a state file. + * + * Initializes data structure and inserts the 2PC file header record. 
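+ *
+ * The caller is then expected to append any rmgr-specific records with
+ * RegisterTwoPhaseRecord() and finish with EndPrepare(), which writes the
+ * accumulated chain of chunks to WAL.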
+ */ +void +StartPrepare(GlobalTransaction gxact) +{ + PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno]; + TransactionId xid = gxact->xid; + TwoPhaseFileHeader hdr; + TransactionId *children; + RelFileNode *commitrels; + RelFileNode *abortrels; + xl_xact_stats_item *abortstats = NULL; + xl_xact_stats_item *commitstats = NULL; + SharedInvalidationMessage *invalmsgs; + + /* Initialize linked list */ + records.head = palloc0(sizeof(StateFileChunk)); + records.head->len = 0; + records.head->next = NULL; + + records.bytes_free = Max(sizeof(TwoPhaseFileHeader), 512); + records.head->data = palloc(records.bytes_free); + + records.tail = records.head; + records.num_chunks = 1; + + records.total_len = 0; + + /* Create header */ + hdr.magic = TWOPHASE_MAGIC; + hdr.total_len = 0; /* EndPrepare will fill this in */ + hdr.xid = xid; + hdr.database = proc->databaseId; + hdr.prepared_at = gxact->prepared_at; + hdr.owner = gxact->owner; + hdr.nsubxacts = xactGetCommittedChildren(&children); + hdr.ncommitrels = smgrGetPendingDeletes(true, &commitrels); + hdr.nabortrels = smgrGetPendingDeletes(false, &abortrels); + hdr.ncommitstats = + pgstat_get_transactional_drops(true, &commitstats); + hdr.nabortstats = + pgstat_get_transactional_drops(false, &abortstats); + hdr.ninvalmsgs = xactGetCommittedInvalidationMessages(&invalmsgs, + &hdr.initfileinval); + hdr.gidlen = strlen(gxact->gid) + 1; /* Include '\0' */ + /* EndPrepare will fill the origin data, if necessary */ + hdr.origin_lsn = InvalidXLogRecPtr; + hdr.origin_timestamp = 0; + + save_state_data(&hdr, sizeof(TwoPhaseFileHeader)); + save_state_data(gxact->gid, hdr.gidlen); + + /* + * Add the additional info about subxacts, deletable files and cache + * invalidation messages. + */ + if (hdr.nsubxacts > 0) + { + save_state_data(children, hdr.nsubxacts * sizeof(TransactionId)); + /* While we have the child-xact data, stuff it in the gxact too */ + GXactLoadSubxactData(gxact, hdr.nsubxacts, children); + } + if (hdr.ncommitrels > 0) + { + save_state_data(commitrels, hdr.ncommitrels * sizeof(RelFileNode)); + pfree(commitrels); + } + if (hdr.nabortrels > 0) + { + save_state_data(abortrels, hdr.nabortrels * sizeof(RelFileNode)); + pfree(abortrels); + } + if (hdr.ncommitstats > 0) + { + save_state_data(commitstats, + hdr.ncommitstats * sizeof(xl_xact_stats_item)); + pfree(commitstats); + } + if (hdr.nabortstats > 0) + { + save_state_data(abortstats, + hdr.nabortstats * sizeof(xl_xact_stats_item)); + pfree(abortstats); + } + if (hdr.ninvalmsgs > 0) + { + save_state_data(invalmsgs, + hdr.ninvalmsgs * sizeof(SharedInvalidationMessage)); + pfree(invalmsgs); + } +} + +/* + * Finish preparing state data and writing it to WAL. + */ +void +EndPrepare(GlobalTransaction gxact) +{ + TwoPhaseFileHeader *hdr; + StateFileChunk *record; + bool replorigin; + + /* Add the end sentinel to the list of 2PC records */ + RegisterTwoPhaseRecord(TWOPHASE_RM_END_ID, 0, + NULL, 0); + + /* Go back and fill in total_len in the file header record */ + hdr = (TwoPhaseFileHeader *) records.head->data; + Assert(hdr->magic == TWOPHASE_MAGIC); + hdr->total_len = records.total_len + sizeof(pg_crc32c); + + replorigin = (replorigin_session_origin != InvalidRepOriginId && + replorigin_session_origin != DoNotReplicateId); + + if (replorigin) + { + hdr->origin_lsn = replorigin_session_origin_lsn; + hdr->origin_timestamp = replorigin_session_origin_timestamp; + } + + /* + * If the data size exceeds MaxAllocSize, we won't be able to read it in + * ReadTwoPhaseFile. 
Check for that now, rather than fail in the case + * where we write data to file and then re-read at commit time. + */ + if (hdr->total_len > MaxAllocSize) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("two-phase state file maximum length exceeded"))); + + /* + * Now writing 2PC state data to WAL. We let the WAL's CRC protection + * cover us, so no need to calculate a separate CRC. + * + * We have to set DELAY_CHKPT_START here, too; otherwise a checkpoint + * starting immediately after the WAL record is inserted could complete + * without fsync'ing our state file. (This is essentially the same kind + * of race condition as the COMMIT-to-clog-write case that + * RecordTransactionCommit uses DELAY_CHKPT_START for; see notes there.) + * + * We save the PREPARE record's location in the gxact for later use by + * CheckPointTwoPhase. + */ + XLogEnsureRecordSpace(0, records.num_chunks); + + START_CRIT_SECTION(); + + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + + XLogBeginInsert(); + for (record = records.head; record != NULL; record = record->next) + XLogRegisterData(record->data, record->len); + + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + gxact->prepare_end_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE); + + if (replorigin) + { + /* Move LSNs forward for this replication origin */ + replorigin_session_advance(replorigin_session_origin_lsn, + gxact->prepare_end_lsn); + } + + XLogFlush(gxact->prepare_end_lsn); + + /* If we crash now, we have prepared: WAL replay will fix things */ + + /* Store record's start location to read that later on Commit */ + gxact->prepare_start_lsn = ProcLastRecPtr; + + /* + * Mark the prepared transaction as valid. As soon as xact.c marks MyProc + * as not running our XID (which it will do immediately after this + * function returns), others can commit/rollback the xact. + * + * NB: a side effect of this is to make a dummy ProcArray entry for the + * prepared XID. This must happen before we clear the XID from MyProc / + * ProcGlobal->xids[], else there is a window where the XID is not running + * according to TransactionIdIsInProgress, and onlookers would be entitled + * to assume the xact crashed. Instead we have a window where the same + * XID appears twice in ProcArray, which is OK. + */ + MarkAsPrepared(gxact, false); + + /* + * Now we can mark ourselves as out of the commit critical section: a + * checkpoint starting after this will certainly see the gxact as a + * candidate for fsyncing. + */ + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + + /* + * Remember that we have this GlobalTransaction entry locked for us. If + * we crash after this point, it's too late to abort, but we must unlock + * it so that the prepared transaction can be committed or rolled back. + */ + MyLockedGxact = gxact; + + END_CRIT_SECTION(); + + /* + * Wait for synchronous replication, if required. + * + * Note that at this stage we have marked the prepare, but still show as + * running in the procarray (twice!) and continue to hold locks. + */ + SyncRepWaitForLSN(gxact->prepare_end_lsn, false); + + records.tail = records.head = NULL; + records.num_chunks = 0; +} + +/* + * Register a 2PC record to be written to state file. 
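+ *
+ * For example, the lock manager's AtPrepare_Locks() registers records
+ * describing the locks held under TWOPHASE_RM_LOCK_ID; at COMMIT/ROLLBACK
+ * PREPARED or during recovery, ProcessRecords() dispatches each record to
+ * the matching callback in the tables defined in twophase_rmgr.c.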
+ */ +void +RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info, + const void *data, uint32 len) +{ + TwoPhaseRecordOnDisk record; + + record.rmid = rmid; + record.info = info; + record.len = len; + save_state_data(&record, sizeof(TwoPhaseRecordOnDisk)); + if (len > 0) + save_state_data(data, len); +} + + +/* + * Read and validate the state file for xid. + * + * If it looks OK (has a valid magic number and CRC), return the palloc'd + * contents of the file, issuing an error when finding corrupted data. If + * missing_ok is true, which indicates that missing files can be safely + * ignored, then return NULL. This state can be reached when doing recovery. + */ +static char * +ReadTwoPhaseFile(TransactionId xid, bool missing_ok) +{ + char path[MAXPGPATH]; + char *buf; + TwoPhaseFileHeader *hdr; + int fd; + struct stat stat; + uint32 crc_offset; + pg_crc32c calc_crc, + file_crc; + int r; + + TwoPhaseFilePath(path, xid); + + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + { + if (missing_ok && errno == ENOENT) + return NULL; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + } + + /* + * Check file length. We can determine a lower bound pretty easily. We + * set an upper bound to avoid palloc() failure on a corrupt file, though + * we can't guarantee that we won't get an out of memory error anyway, + * even on a valid file. + */ + if (fstat(fd, &stat)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", path))); + + if (stat.st_size < (MAXALIGN(sizeof(TwoPhaseFileHeader)) + + MAXALIGN(sizeof(TwoPhaseRecordOnDisk)) + + sizeof(pg_crc32c)) || + stat.st_size > MaxAllocSize) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_plural("incorrect size of file \"%s\": %lld byte", + "incorrect size of file \"%s\": %lld bytes", + (long long int) stat.st_size, path, + (long long int) stat.st_size))); + + crc_offset = stat.st_size - sizeof(pg_crc32c); + if (crc_offset != MAXALIGN(crc_offset)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("incorrect alignment of CRC offset for file \"%s\"", + path))); + + /* + * OK, slurp in the file. + */ + buf = (char *) palloc(stat.st_size); + + pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_READ); + r = read(fd, buf, stat.st_size); + if (r != stat.st_size) + { + if (r < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", path))); + else + ereport(ERROR, + (errmsg("could not read file \"%s\": read %d of %lld", + path, r, (long long int) stat.st_size))); + } + + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + + hdr = (TwoPhaseFileHeader *) buf; + if (hdr->magic != TWOPHASE_MAGIC) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid magic number stored in file \"%s\"", + path))); + + if (hdr->total_len != stat.st_size) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid size stored in file \"%s\"", + path))); + + INIT_CRC32C(calc_crc); + COMP_CRC32C(calc_crc, buf, crc_offset); + FIN_CRC32C(calc_crc); + + file_crc = *((pg_crc32c *) (buf + crc_offset)); + + if (!EQ_CRC32C(calc_crc, file_crc)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("calculated CRC checksum does not match value stored in file \"%s\"", + path))); + + return buf; +} + + +/* + * Reads 2PC data from xlog. 
During checkpoint this data will be moved to + * twophase files and ReadTwoPhaseFile should be used instead. + * + * Note clearly that this function can access WAL during normal operation, + * similarly to the way WALSender or Logical Decoding would do. + */ +static void +XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len) +{ + XLogRecord *record; + XLogReaderState *xlogreader; + char *errormsg; + + xlogreader = XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(.page_read = &read_local_xlog_page, + .segment_open = &wal_segment_open, + .segment_close = &wal_segment_close), + NULL); + if (!xlogreader) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed while allocating a WAL reading processor."))); + + XLogBeginRead(xlogreader, lsn); + record = XLogReadRecord(xlogreader, &errormsg); + + if (record == NULL) + { + if (errormsg) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read two-phase state from WAL at %X/%X: %s", + LSN_FORMAT_ARGS(lsn), errormsg))); + else + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read two-phase state from WAL at %X/%X", + LSN_FORMAT_ARGS(lsn)))); + } + + if (XLogRecGetRmid(xlogreader) != RM_XACT_ID || + (XLogRecGetInfo(xlogreader) & XLOG_XACT_OPMASK) != XLOG_XACT_PREPARE) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("expected two-phase state data is not present in WAL at %X/%X", + LSN_FORMAT_ARGS(lsn)))); + + if (len != NULL) + *len = XLogRecGetDataLen(xlogreader); + + *buf = palloc(sizeof(char) * XLogRecGetDataLen(xlogreader)); + memcpy(*buf, XLogRecGetData(xlogreader), sizeof(char) * XLogRecGetDataLen(xlogreader)); + + XLogReaderFree(xlogreader); +} + + +/* + * Confirms an xid is prepared, during recovery + */ +bool +StandbyTransactionIdIsPrepared(TransactionId xid) +{ + char *buf; + TwoPhaseFileHeader *hdr; + bool result; + + Assert(TransactionIdIsValid(xid)); + + if (max_prepared_xacts <= 0) + return false; /* nothing to do */ + + /* Read and validate file */ + buf = ReadTwoPhaseFile(xid, true); + if (buf == NULL) + return false; + + /* Check header also */ + hdr = (TwoPhaseFileHeader *) buf; + result = TransactionIdEquals(hdr->xid, xid); + pfree(buf); + + return result; +} + +/* + * FinishPreparedTransaction: execute COMMIT PREPARED or ROLLBACK PREPARED + */ +void +FinishPreparedTransaction(const char *gid, bool isCommit) +{ + GlobalTransaction gxact; + PGPROC *proc; + TransactionId xid; + char *buf; + char *bufptr; + TwoPhaseFileHeader *hdr; + TransactionId latestXid; + TransactionId *children; + RelFileNode *commitrels; + RelFileNode *abortrels; + RelFileNode *delrels; + int ndelrels; + xl_xact_stats_item *commitstats; + xl_xact_stats_item *abortstats; + SharedInvalidationMessage *invalmsgs; + + /* + * Validate the GID, and lock the GXACT to ensure that two backends do not + * try to commit the same GID at once. + */ + gxact = LockGXact(gid, GetUserId()); + proc = &ProcGlobal->allProcs[gxact->pgprocno]; + xid = gxact->xid; + + /* + * Read and validate 2PC state data. State data will typically be stored + * in WAL files if the LSN is after the last checkpoint record, or moved + * to disk if for some reason they have lived for a long time. 
+ */ + if (gxact->ondisk) + buf = ReadTwoPhaseFile(xid, false); + else + XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL); + + + /* + * Disassemble the header area + */ + hdr = (TwoPhaseFileHeader *) buf; + Assert(TransactionIdEquals(hdr->xid, xid)); + bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader)); + bufptr += MAXALIGN(hdr->gidlen); + children = (TransactionId *) bufptr; + bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId)); + commitrels = (RelFileNode *) bufptr; + bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode)); + abortrels = (RelFileNode *) bufptr; + bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode)); + commitstats = (xl_xact_stats_item *) bufptr; + bufptr += MAXALIGN(hdr->ncommitstats * sizeof(xl_xact_stats_item)); + abortstats = (xl_xact_stats_item *) bufptr; + bufptr += MAXALIGN(hdr->nabortstats * sizeof(xl_xact_stats_item)); + invalmsgs = (SharedInvalidationMessage *) bufptr; + bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage)); + + /* compute latestXid among all children */ + latestXid = TransactionIdLatest(xid, hdr->nsubxacts, children); + + /* Prevent cancel/die interrupt while cleaning up */ + HOLD_INTERRUPTS(); + + /* + * The order of operations here is critical: make the XLOG entry for + * commit or abort, then mark the transaction committed or aborted in + * pg_xact, then remove its PGPROC from the global ProcArray (which means + * TransactionIdIsInProgress will stop saying the prepared xact is in + * progress), then run the post-commit or post-abort callbacks. The + * callbacks will release the locks the transaction held. + */ + if (isCommit) + RecordTransactionCommitPrepared(xid, + hdr->nsubxacts, children, + hdr->ncommitrels, commitrels, + hdr->ncommitstats, + commitstats, + hdr->ninvalmsgs, invalmsgs, + hdr->initfileinval, gid); + else + RecordTransactionAbortPrepared(xid, + hdr->nsubxacts, children, + hdr->nabortrels, abortrels, + hdr->nabortstats, + abortstats, + gid); + + ProcArrayRemove(proc, latestXid); + + /* + * In case we fail while running the callbacks, mark the gxact invalid so + * no one else will try to commit/rollback, and so it will be recycled if + * we fail after this point. It is still locked by our backend so it + * won't go away yet. + * + * (We assume it's safe to do this without taking TwoPhaseStateLock.) + */ + gxact->valid = false; + + /* + * We have to remove any files that were supposed to be dropped. For + * consistency with the regular xact.c code paths, must do this before + * releasing locks, so do it before running the callbacks. + * + * NB: this code knows that we couldn't be dropping any temp rels ... + */ + if (isCommit) + { + delrels = commitrels; + ndelrels = hdr->ncommitrels; + } + else + { + delrels = abortrels; + ndelrels = hdr->nabortrels; + } + + /* Make sure files supposed to be dropped are dropped */ + DropRelationFiles(delrels, ndelrels, false); + + if (isCommit) + pgstat_execute_transactional_drops(hdr->ncommitstats, commitstats, false); + else + pgstat_execute_transactional_drops(hdr->nabortstats, abortstats, false); + + /* + * Handle cache invalidation messages. + * + * Relcache init file invalidation requires processing both before and + * after we send the SI messages, only when committing. See + * AtEOXact_Inval(). 
+ */ + if (isCommit) + { + if (hdr->initfileinval) + RelationCacheInitFilePreInvalidate(); + SendSharedInvalidMessages(invalmsgs, hdr->ninvalmsgs); + if (hdr->initfileinval) + RelationCacheInitFilePostInvalidate(); + } + + /* + * Acquire the two-phase lock. We want to work on the two-phase callbacks + * while holding it to avoid potential conflicts with other transactions + * attempting to use the same GID, so the lock is released once the shared + * memory state is cleared. + */ + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + + /* And now do the callbacks */ + if (isCommit) + ProcessRecords(bufptr, xid, twophase_postcommit_callbacks); + else + ProcessRecords(bufptr, xid, twophase_postabort_callbacks); + + PredicateLockTwoPhaseFinish(xid, isCommit); + + /* Clear shared memory state */ + RemoveGXact(gxact); + + /* + * Release the lock as all callbacks are called and shared memory cleanup + * is done. + */ + LWLockRelease(TwoPhaseStateLock); + + /* Count the prepared xact as committed or aborted */ + AtEOXact_PgStat(isCommit, false); + + /* + * And now we can clean up any files we may have left. + */ + if (gxact->ondisk) + RemoveTwoPhaseFile(xid, true); + + MyLockedGxact = NULL; + + RESUME_INTERRUPTS(); + + pfree(buf); +} + +/* + * Scan 2PC state data in memory and call the indicated callbacks for each 2PC record. + */ +static void +ProcessRecords(char *bufptr, TransactionId xid, + const TwoPhaseCallback callbacks[]) +{ + for (;;) + { + TwoPhaseRecordOnDisk *record = (TwoPhaseRecordOnDisk *) bufptr; + + Assert(record->rmid <= TWOPHASE_RM_MAX_ID); + if (record->rmid == TWOPHASE_RM_END_ID) + break; + + bufptr += MAXALIGN(sizeof(TwoPhaseRecordOnDisk)); + + if (callbacks[record->rmid] != NULL) + callbacks[record->rmid] (xid, record->info, + (void *) bufptr, record->len); + + bufptr += MAXALIGN(record->len); + } +} + +/* + * Remove the 2PC file for the specified XID. + * + * If giveWarning is false, do not complain about file-not-present; + * this is an expected case during WAL replay. + */ +static void +RemoveTwoPhaseFile(TransactionId xid, bool giveWarning) +{ + char path[MAXPGPATH]; + + TwoPhaseFilePath(path, xid); + if (unlink(path)) + if (errno != ENOENT || giveWarning) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", path))); +} + +/* + * Recreates a state file. This is used in WAL replay and during + * checkpoint creation. + * + * Note: content and len don't include CRC. 
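+ *
+ * A CRC-32C is computed over the content and appended here, matching what
+ * ReadTwoPhaseFile() expects to find at the end of the file.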
+ */ +static void +RecreateTwoPhaseFile(TransactionId xid, void *content, int len) +{ + char path[MAXPGPATH]; + pg_crc32c statefile_crc; + int fd; + + /* Recompute CRC */ + INIT_CRC32C(statefile_crc); + COMP_CRC32C(statefile_crc, content, len); + FIN_CRC32C(statefile_crc); + + TwoPhaseFilePath(path, xid); + + fd = OpenTransientFile(path, + O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not recreate file \"%s\": %m", path))); + + /* Write content and CRC */ + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_WRITE); + if (write(fd, content, len) != len) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", path))); + } + if (write(fd, &statefile_crc, sizeof(pg_crc32c)) != sizeof(pg_crc32c)) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", path))); + } + pgstat_report_wait_end(); + + /* + * We must fsync the file because the end-of-replay checkpoint will not do + * so, there being no GXACT in shared memory yet to tell it to. + */ + pgstat_report_wait_start(WAIT_EVENT_TWOPHASE_FILE_SYNC); + if (pg_fsync(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", path))); + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); +} + +/* + * CheckPointTwoPhase -- handle 2PC component of checkpointing. + * + * We must fsync the state file of any GXACT that is valid or has been + * generated during redo and has a PREPARE LSN <= the checkpoint's redo + * horizon. (If the gxact isn't valid yet, has not been generated in + * redo, or has a later LSN, this checkpoint is not responsible for + * fsyncing it.) + * + * This is deliberately run as late as possible in the checkpoint sequence, + * because GXACTs ordinarily have short lifespans, and so it is quite + * possible that GXACTs that were valid at checkpoint start will no longer + * exist if we wait a little bit. With typical checkpoint settings this + * will be about 3 minutes for an online checkpoint, so as a result we + * expect that there will be no GXACTs that need to be copied to disk. + * + * If a GXACT remains valid across multiple checkpoints, it will already + * be on disk so we don't bother to repeat that write. + */ +void +CheckPointTwoPhase(XLogRecPtr redo_horizon) +{ + int i; + int serialized_xacts = 0; + + if (max_prepared_xacts <= 0) + return; /* nothing to do */ + + TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_START(); + + /* + * We are expecting there to be zero GXACTs that need to be copied to + * disk, so we perform all I/O while holding TwoPhaseStateLock for + * simplicity. This prevents any new xacts from preparing while this + * occurs, which shouldn't be a problem since the presence of long-lived + * prepared xacts indicates the transaction manager isn't active. + * + * It's also possible to move I/O out of the lock, but on every error we + * should check whether somebody committed our transaction in different + * backend. Let's leave this optimization for future, if somebody will + * spot that this place cause bottleneck. 
+ * + * Note that it isn't possible for there to be a GXACT with a + * prepare_end_lsn set prior to the last checkpoint yet is marked invalid, + * because of the efforts with delayChkptFlags. + */ + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + /* + * Note that we are using gxact not PGPROC so this works in recovery + * also + */ + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + + if ((gxact->valid || gxact->inredo) && + !gxact->ondisk && + gxact->prepare_end_lsn <= redo_horizon) + { + char *buf; + int len; + + XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, &len); + RecreateTwoPhaseFile(gxact->xid, buf, len); + gxact->ondisk = true; + gxact->prepare_start_lsn = InvalidXLogRecPtr; + gxact->prepare_end_lsn = InvalidXLogRecPtr; + pfree(buf); + serialized_xacts++; + } + } + LWLockRelease(TwoPhaseStateLock); + + /* + * Flush unconditionally the parent directory to make any information + * durable on disk. Two-phase files could have been removed and those + * removals need to be made persistent as well as any files newly created + * previously since the last checkpoint. + */ + fsync_fname(TWOPHASE_DIR, true); + + TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_DONE(); + + if (log_checkpoints && serialized_xacts > 0) + ereport(LOG, + (errmsg_plural("%u two-phase state file was written " + "for a long-running prepared transaction", + "%u two-phase state files were written " + "for long-running prepared transactions", + serialized_xacts, + serialized_xacts))); +} + +/* + * restoreTwoPhaseData + * + * Scan pg_twophase and fill TwoPhaseState depending on the on-disk data. + * This is called once at the beginning of recovery, saving any extra + * lookups in the future. Two-phase files that are newer than the + * minimum XID horizon are discarded on the way. + */ +void +restoreTwoPhaseData(void) +{ + DIR *cldir; + struct dirent *clde; + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + cldir = AllocateDir(TWOPHASE_DIR); + while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL) + { + if (strlen(clde->d_name) == 8 && + strspn(clde->d_name, "0123456789ABCDEF") == 8) + { + TransactionId xid; + char *buf; + + xid = (TransactionId) strtoul(clde->d_name, NULL, 16); + + buf = ProcessTwoPhaseBuffer(xid, InvalidXLogRecPtr, + true, false, false); + if (buf == NULL) + continue; + + PrepareRedoAdd(buf, InvalidXLogRecPtr, + InvalidXLogRecPtr, InvalidRepOriginId); + } + } + LWLockRelease(TwoPhaseStateLock); + FreeDir(cldir); +} + +/* + * PrescanPreparedTransactions + * + * Scan the shared memory entries of TwoPhaseState and determine the range + * of valid XIDs present. This is run during database startup, after we + * have completed reading WAL. ShmemVariableCache->nextXid has been set to + * one more than the highest XID for which evidence exists in WAL. + * + * We throw away any prepared xacts with main XID beyond nextXid --- if any + * are present, it suggests that the DBA has done a PITR recovery to an + * earlier point in time without cleaning out pg_twophase. We dare not + * try to recover such prepared xacts since they likely depend on database + * state that doesn't exist now. + * + * However, we will advance nextXid beyond any subxact XIDs belonging to + * valid prepared xacts. We need to do this since subxact commit doesn't + * write a WAL entry, and so there might be no evidence in WAL of those + * subxact XIDs. + * + * On corrupted two-phase files, fail immediately. 
Keeping around broken + * entries and let replay continue causes harm on the system, and a new + * backup should be rolled in. + * + * Our other responsibility is to determine and return the oldest valid XID + * among the prepared xacts (if none, return ShmemVariableCache->nextXid). + * This is needed to synchronize pg_subtrans startup properly. + * + * If xids_p and nxids_p are not NULL, pointer to a palloc'd array of all + * top-level xids is stored in *xids_p. The number of entries in the array + * is returned in *nxids_p. + */ +TransactionId +PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p) +{ + FullTransactionId nextXid = ShmemVariableCache->nextXid; + TransactionId origNextXid = XidFromFullTransactionId(nextXid); + TransactionId result = origNextXid; + TransactionId *xids = NULL; + int nxids = 0; + int allocsize = 0; + int i; + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + TransactionId xid; + char *buf; + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + + Assert(gxact->inredo); + + xid = gxact->xid; + + buf = ProcessTwoPhaseBuffer(xid, + gxact->prepare_start_lsn, + gxact->ondisk, false, true); + + if (buf == NULL) + continue; + + /* + * OK, we think this file is valid. Incorporate xid into the + * running-minimum result. + */ + if (TransactionIdPrecedes(xid, result)) + result = xid; + + if (xids_p) + { + if (nxids == allocsize) + { + if (nxids == 0) + { + allocsize = 10; + xids = palloc(allocsize * sizeof(TransactionId)); + } + else + { + allocsize = allocsize * 2; + xids = repalloc(xids, allocsize * sizeof(TransactionId)); + } + } + xids[nxids++] = xid; + } + + pfree(buf); + } + LWLockRelease(TwoPhaseStateLock); + + if (xids_p) + { + *xids_p = xids; + *nxids_p = nxids; + } + + return result; +} + +/* + * StandbyRecoverPreparedTransactions + * + * Scan the shared memory entries of TwoPhaseState and setup all the required + * information to allow standby queries to treat prepared transactions as still + * active. + * + * This is never called at the end of recovery - we use + * RecoverPreparedTransactions() at that point. + * + * The lack of calls to SubTransSetParent() calls here is by design; + * those calls are made by RecoverPreparedTransactions() at the end of recovery + * for those xacts that need this. + */ +void +StandbyRecoverPreparedTransactions(void) +{ + int i; + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + TransactionId xid; + char *buf; + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + + Assert(gxact->inredo); + + xid = gxact->xid; + + buf = ProcessTwoPhaseBuffer(xid, + gxact->prepare_start_lsn, + gxact->ondisk, false, false); + if (buf != NULL) + pfree(buf); + } + LWLockRelease(TwoPhaseStateLock); +} + +/* + * RecoverPreparedTransactions + * + * Scan the shared memory entries of TwoPhaseState and reload the state for + * each prepared transaction (reacquire locks, etc). + * + * This is run at the end of recovery, but before we allow backends to write + * WAL. + * + * At the end of recovery the way we take snapshots will change. We now need + * to mark all running transactions with their full SubTransSetParent() info + * to allow normal snapshots to work correctly if snapshots overflow. + * We do this here because by definition prepared transactions are the only + * type of write transaction still running, so this is necessary and + * complete. 
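+ *
+ * For each entry we re-create the dummy PGPROC and GXACT state via
+ * MarkAsPreparingGuts()/MarkAsPrepared(), then let the resource managers
+ * re-acquire locks and other state through twophase_recover_callbacks.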
+ */ +void +RecoverPreparedTransactions(void) +{ + int i; + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + TransactionId xid; + char *buf; + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + char *bufptr; + TwoPhaseFileHeader *hdr; + TransactionId *subxids; + const char *gid; + + xid = gxact->xid; + + /* + * Reconstruct subtrans state for the transaction --- needed because + * pg_subtrans is not preserved over a restart. Note that we are + * linking all the subtransactions directly to the top-level XID; + * there may originally have been a more complex hierarchy, but + * there's no need to restore that exactly. It's possible that + * SubTransSetParent has been set before, if the prepared transaction + * generated xid assignment records. + */ + buf = ProcessTwoPhaseBuffer(xid, + gxact->prepare_start_lsn, + gxact->ondisk, true, false); + if (buf == NULL) + continue; + + ereport(LOG, + (errmsg("recovering prepared transaction %u from shared memory", xid))); + + hdr = (TwoPhaseFileHeader *) buf; + Assert(TransactionIdEquals(hdr->xid, xid)); + bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader)); + gid = (const char *) bufptr; + bufptr += MAXALIGN(hdr->gidlen); + subxids = (TransactionId *) bufptr; + bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId)); + bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode)); + bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode)); + bufptr += MAXALIGN(hdr->ncommitstats * sizeof(xl_xact_stats_item)); + bufptr += MAXALIGN(hdr->nabortstats * sizeof(xl_xact_stats_item)); + bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage)); + + /* + * Recreate its GXACT and dummy PGPROC. But, check whether it was + * added in redo and already has a shmem entry for it. + */ + MarkAsPreparingGuts(gxact, xid, gid, + hdr->prepared_at, + hdr->owner, hdr->database); + + /* recovered, so reset the flag for entries generated by redo */ + gxact->inredo = false; + + GXactLoadSubxactData(gxact, hdr->nsubxacts, subxids); + MarkAsPrepared(gxact, true); + + LWLockRelease(TwoPhaseStateLock); + + /* + * Recover other state (notably locks) using resource managers. + */ + ProcessRecords(bufptr, xid, twophase_recover_callbacks); + + /* + * Release locks held by the standby process after we process each + * prepared transaction. As a result, we don't need too many + * additional locks at any one time. + */ + if (InHotStandby) + StandbyReleaseLockTree(xid, hdr->nsubxacts, subxids); + + /* + * We're done with recovering this transaction. Clear MyLockedGxact, + * like we do in PrepareTransaction() during normal operation. + */ + PostPrepare_Twophase(); + + pfree(buf); + + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + } + + LWLockRelease(TwoPhaseStateLock); +} + +/* + * ProcessTwoPhaseBuffer + * + * Given a transaction id, read it either from disk or read it directly + * via shmem xlog record pointer using the provided "prepare_start_lsn". + * + * If setParent is true, set up subtransaction parent linkages. + * + * If setNextXid is true, set ShmemVariableCache->nextXid to the newest + * value scanned. 
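+ *
+ * Returns the palloc'd state data, or NULL (after discarding the stale
+ * entry) if the transaction has already committed/aborted or its XID is
+ * beyond nextXid.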
+ */ +static char * +ProcessTwoPhaseBuffer(TransactionId xid, + XLogRecPtr prepare_start_lsn, + bool fromdisk, + bool setParent, bool setNextXid) +{ + FullTransactionId nextXid = ShmemVariableCache->nextXid; + TransactionId origNextXid = XidFromFullTransactionId(nextXid); + TransactionId *subxids; + char *buf; + TwoPhaseFileHeader *hdr; + int i; + + Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); + + if (!fromdisk) + Assert(prepare_start_lsn != InvalidXLogRecPtr); + + /* Already processed? */ + if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) + { + if (fromdisk) + { + ereport(WARNING, + (errmsg("removing stale two-phase state file for transaction %u", + xid))); + RemoveTwoPhaseFile(xid, true); + } + else + { + ereport(WARNING, + (errmsg("removing stale two-phase state from memory for transaction %u", + xid))); + PrepareRedoRemove(xid, true); + } + return NULL; + } + + /* Reject XID if too new */ + if (TransactionIdFollowsOrEquals(xid, origNextXid)) + { + if (fromdisk) + { + ereport(WARNING, + (errmsg("removing future two-phase state file for transaction %u", + xid))); + RemoveTwoPhaseFile(xid, true); + } + else + { + ereport(WARNING, + (errmsg("removing future two-phase state from memory for transaction %u", + xid))); + PrepareRedoRemove(xid, true); + } + return NULL; + } + + if (fromdisk) + { + /* Read and validate file */ + buf = ReadTwoPhaseFile(xid, false); + } + else + { + /* Read xlog data */ + XlogReadTwoPhaseData(prepare_start_lsn, &buf, NULL); + } + + /* Deconstruct header */ + hdr = (TwoPhaseFileHeader *) buf; + if (!TransactionIdEquals(hdr->xid, xid)) + { + if (fromdisk) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted two-phase state file for transaction %u", + xid))); + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted two-phase state in memory for transaction %u", + xid))); + } + + /* + * Examine subtransaction XIDs ... they should all follow main XID, and + * they may force us to advance nextXid. + */ + subxids = (TransactionId *) (buf + + MAXALIGN(sizeof(TwoPhaseFileHeader)) + + MAXALIGN(hdr->gidlen)); + for (i = 0; i < hdr->nsubxacts; i++) + { + TransactionId subxid = subxids[i]; + + Assert(TransactionIdFollows(subxid, xid)); + + /* update nextXid if needed */ + if (setNextXid) + AdvanceNextFullTransactionIdPastXid(subxid); + + if (setParent) + SubTransSetParent(subxid, xid); + } + + return buf; +} + + +/* + * RecordTransactionCommitPrepared + * + * This is basically the same as RecordTransactionCommit (q.v. if you change + * this function): in particular, we must set DELAY_CHKPT_START to avoid a + * race condition. + * + * We know the transaction made at least one XLOG entry (its PREPARE), + * so it is never possible to optimize out the commit record. + */ +static void +RecordTransactionCommitPrepared(TransactionId xid, + int nchildren, + TransactionId *children, + int nrels, + RelFileNode *rels, + int nstats, + xl_xact_stats_item *stats, + int ninvalmsgs, + SharedInvalidationMessage *invalmsgs, + bool initfileinval, + const char *gid) +{ + XLogRecPtr recptr; + TimestampTz committs = GetCurrentTimestamp(); + bool replorigin; + + /* + * Are we using the replication origins feature? Or, in other words, are + * we replaying remote actions? 
+ */ + replorigin = (replorigin_session_origin != InvalidRepOriginId && + replorigin_session_origin != DoNotReplicateId); + + START_CRIT_SECTION(); + + /* See notes in RecordTransactionCommit */ + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + + /* + * Emit the XLOG commit record. Note that we mark 2PC commits as + * potentially having AccessExclusiveLocks since we don't know whether or + * not they do. + */ + recptr = XactLogCommitRecord(committs, + nchildren, children, nrels, rels, + nstats, stats, + ninvalmsgs, invalmsgs, + initfileinval, + MyXactFlags | XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK, + xid, gid); + + + if (replorigin) + /* Move LSNs forward for this replication origin */ + replorigin_session_advance(replorigin_session_origin_lsn, + XactLastRecEnd); + + /* + * Record commit timestamp. The value comes from plain commit timestamp + * if replorigin is not enabled, or replorigin already set a value for us + * in replorigin_session_origin_timestamp otherwise. + * + * We don't need to WAL-log anything here, as the commit record written + * above already contains the data. + */ + if (!replorigin || replorigin_session_origin_timestamp == 0) + replorigin_session_origin_timestamp = committs; + + TransactionTreeSetCommitTsData(xid, nchildren, children, + replorigin_session_origin_timestamp, + replorigin_session_origin); + + /* + * We don't currently try to sleep before flush here ... nor is there any + * support for async commit of a prepared xact (the very idea is probably + * a contradiction) + */ + + /* Flush XLOG to disk */ + XLogFlush(recptr); + + /* Mark the transaction committed in pg_xact */ + TransactionIdCommitTree(xid, nchildren, children); + + /* Checkpoint can proceed now */ + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + + END_CRIT_SECTION(); + + /* + * Wait for synchronous replication, if required. + * + * Note that at this stage we have marked clog, but still show as running + * in the procarray and continue to hold locks. + */ + SyncRepWaitForLSN(recptr, true); +} + +/* + * RecordTransactionAbortPrepared + * + * This is basically the same as RecordTransactionAbort. + * + * We know the transaction made at least one XLOG entry (its PREPARE), + * so it is never possible to optimize out the abort record. + */ +static void +RecordTransactionAbortPrepared(TransactionId xid, + int nchildren, + TransactionId *children, + int nrels, + RelFileNode *rels, + int nstats, + xl_xact_stats_item *stats, + const char *gid) +{ + XLogRecPtr recptr; + bool replorigin; + + /* + * Are we using the replication origins feature? Or, in other words, are + * we replaying remote actions? + */ + replorigin = (replorigin_session_origin != InvalidRepOriginId && + replorigin_session_origin != DoNotReplicateId); + + /* + * Catch the scenario where we aborted partway through + * RecordTransactionCommitPrepared ... + */ + if (TransactionIdDidCommit(xid)) + elog(PANIC, "cannot abort transaction %u, it was already committed", + xid); + + START_CRIT_SECTION(); + + /* + * Emit the XLOG commit record. Note that we mark 2PC aborts as + * potentially having AccessExclusiveLocks since we don't know whether or + * not they do. 
+ */ + recptr = XactLogAbortRecord(GetCurrentTimestamp(), + nchildren, children, + nrels, rels, + nstats, stats, + MyXactFlags | XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK, + xid, gid); + + if (replorigin) + /* Move LSNs forward for this replication origin */ + replorigin_session_advance(replorigin_session_origin_lsn, + XactLastRecEnd); + + /* Always flush, since we're about to remove the 2PC state file */ + XLogFlush(recptr); + + /* + * Mark the transaction aborted in clog. This is not absolutely necessary + * but we may as well do it while we are here. + */ + TransactionIdAbortTree(xid, nchildren, children); + + END_CRIT_SECTION(); + + /* + * Wait for synchronous replication, if required. + * + * Note that at this stage we have marked clog, but still show as running + * in the procarray and continue to hold locks. + */ + SyncRepWaitForLSN(recptr, false); +} + +/* + * PrepareRedoAdd + * + * Store pointers to the start/end of the WAL record along with the xid in + * a gxact entry in shared memory TwoPhaseState structure. If caller + * specifies InvalidXLogRecPtr as WAL location to fetch the two-phase + * data, the entry is marked as located on disk. + */ +void +PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, + XLogRecPtr end_lsn, RepOriginId origin_id) +{ + TwoPhaseFileHeader *hdr = (TwoPhaseFileHeader *) buf; + char *bufptr; + const char *gid; + GlobalTransaction gxact; + + Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); + Assert(RecoveryInProgress()); + + bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader)); + gid = (const char *) bufptr; + + /* + * Reserve the GID for the given transaction in the redo code path. + * + * This creates a gxact struct and puts it into the active array. + * + * In redo, this struct is mainly used to track PREPARE/COMMIT entries in + * shared memory. Hence, we only fill up the bare minimum contents here. + * The gxact also gets marked with gxact->inredo set to true to indicate + * that it got added in the redo phase + */ + + /* + * In the event of a crash while a checkpoint was running, it may be + * possible that some two-phase data found its way to disk while its + * corresponding record needs to be replayed in the follow-up recovery. As + * the 2PC data was on disk, it has already been restored at the beginning + * of recovery with restoreTwoPhaseData(), so skip this record to avoid + * duplicates in TwoPhaseState. If a consistent state has been reached, + * the record is added to TwoPhaseState and it should have no + * corresponding file in pg_twophase. + */ + if (!XLogRecPtrIsInvalid(start_lsn)) + { + char path[MAXPGPATH]; + + TwoPhaseFilePath(path, hdr->xid); + + if (access(path, F_OK) == 0) + { + ereport(reachedConsistency ? 
ERROR : WARNING, + (errmsg("could not recover two-phase state file for transaction %u", + hdr->xid), + errdetail("Two-phase state file has been found in WAL record %X/%X, but this transaction has already been restored from disk.", + LSN_FORMAT_ARGS(start_lsn)))); + return; + } + + if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access file \"%s\": %m", path))); + } + + /* Get a free gxact from the freelist */ + if (TwoPhaseState->freeGXacts == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("maximum number of prepared transactions reached"), + errhint("Increase max_prepared_transactions (currently %d).", + max_prepared_xacts))); + gxact = TwoPhaseState->freeGXacts; + TwoPhaseState->freeGXacts = gxact->next; + + gxact->prepared_at = hdr->prepared_at; + gxact->prepare_start_lsn = start_lsn; + gxact->prepare_end_lsn = end_lsn; + gxact->xid = hdr->xid; + gxact->owner = hdr->owner; + gxact->locking_backend = InvalidBackendId; + gxact->valid = false; + gxact->ondisk = XLogRecPtrIsInvalid(start_lsn); + gxact->inredo = true; /* yes, added in redo */ + strcpy(gxact->gid, gid); + + /* And insert it into the active array */ + Assert(TwoPhaseState->numPrepXacts < max_prepared_xacts); + TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts++] = gxact; + + if (origin_id != InvalidRepOriginId) + { + /* recover apply progress */ + replorigin_advance(origin_id, hdr->origin_lsn, end_lsn, + false /* backward */ , false /* WAL */ ); + } + + elog(DEBUG2, "added 2PC data in shared memory for transaction %u", gxact->xid); +} + +/* + * PrepareRedoRemove + * + * Remove the corresponding gxact entry from TwoPhaseState. Also remove + * the 2PC file if a prepared transaction was saved via an earlier checkpoint. + * + * Caller must hold TwoPhaseStateLock in exclusive mode, because TwoPhaseState + * is updated. + */ +void +PrepareRedoRemove(TransactionId xid, bool giveWarning) +{ + GlobalTransaction gxact = NULL; + int i; + bool found = false; + + Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); + Assert(RecoveryInProgress()); + + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + gxact = TwoPhaseState->prepXacts[i]; + + if (gxact->xid == xid) + { + Assert(gxact->inredo); + found = true; + break; + } + } + + /* + * Just leave if there is nothing, this is expected during WAL replay. + */ + if (!found) + return; + + /* + * And now we can clean up any files we may have left. + */ + elog(DEBUG2, "removing 2PC data for transaction %u", xid); + if (gxact->ondisk) + RemoveTwoPhaseFile(xid, giveWarning); + RemoveGXact(gxact); +} + +/* + * LookupGXact + * Check if the prepared transaction with the given GID, lsn and timestamp + * exists. + * + * Note that we always compare with the LSN where prepare ends because that is + * what is stored as origin_lsn in the 2PC file. + * + * This function is primarily used to check if the prepared transaction + * received from the upstream (remote node) already exists. Checking only GID + * is not sufficient because a different prepared xact with the same GID can + * exist on the same node. So, we are ensuring to match origin_lsn and + * origin_timestamp of prepared xact to avoid the possibility of a match of + * prepared xact from two different nodes. 
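+ *
+ * Illustrative match condition (a restatement of the checks performed
+ * below, not additional upstream logic): an entry is treated as the same
+ * prepared transaction only when all of the following hold:
+ *
+ *     strcmp(gxact->gid, gid) == 0
+ *     hdr->origin_lsn == prepare_end_lsn
+ *     hdr->origin_timestamp == origin_prepare_timestamp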
+ */ +bool +LookupGXact(const char *gid, XLogRecPtr prepare_end_lsn, + TimestampTz origin_prepare_timestamp) +{ + int i; + bool found = false; + + LWLockAcquire(TwoPhaseStateLock, LW_SHARED); + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + + /* Ignore not-yet-valid GIDs. */ + if (gxact->valid && strcmp(gxact->gid, gid) == 0) + { + char *buf; + TwoPhaseFileHeader *hdr; + + /* + * We are not expecting collisions of GXACTs (same gid) between + * publisher and subscribers, so we perform all I/O while holding + * TwoPhaseStateLock for simplicity. + * + * To move the I/O out of the lock, we need to ensure that no + * other backend commits the prepared xact in the meantime. We can + * do this optimization if we encounter many collisions in GID + * between publisher and subscriber. + */ + if (gxact->ondisk) + buf = ReadTwoPhaseFile(gxact->xid, false); + else + { + Assert(gxact->prepare_start_lsn); + XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL); + } + + hdr = (TwoPhaseFileHeader *) buf; + + if (hdr->origin_lsn == prepare_end_lsn && + hdr->origin_timestamp == origin_prepare_timestamp) + { + found = true; + pfree(buf); + break; + } + + pfree(buf); + } + } + LWLockRelease(TwoPhaseStateLock); + return found; +} diff --git a/src/backend/access/transam/twophase_rmgr.c b/src/backend/access/transam/twophase_rmgr.c new file mode 100644 index 0000000..35a9b32 --- /dev/null +++ b/src/backend/access/transam/twophase_rmgr.c @@ -0,0 +1,58 @@ +/*------------------------------------------------------------------------- + * + * twophase_rmgr.c + * Two-phase-commit resource managers tables + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/transam/twophase_rmgr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/multixact.h" +#include "access/twophase_rmgr.h" +#include "pgstat.h" +#include "storage/lock.h" +#include "storage/predicate.h" + + +const TwoPhaseCallback twophase_recover_callbacks[TWOPHASE_RM_MAX_ID + 1] = +{ + NULL, /* END ID */ + lock_twophase_recover, /* Lock */ + NULL, /* pgstat */ + multixact_twophase_recover, /* MultiXact */ + predicatelock_twophase_recover /* PredicateLock */ +}; + +const TwoPhaseCallback twophase_postcommit_callbacks[TWOPHASE_RM_MAX_ID + 1] = +{ + NULL, /* END ID */ + lock_twophase_postcommit, /* Lock */ + pgstat_twophase_postcommit, /* pgstat */ + multixact_twophase_postcommit, /* MultiXact */ + NULL /* PredicateLock */ +}; + +const TwoPhaseCallback twophase_postabort_callbacks[TWOPHASE_RM_MAX_ID + 1] = +{ + NULL, /* END ID */ + lock_twophase_postabort, /* Lock */ + pgstat_twophase_postabort, /* pgstat */ + multixact_twophase_postabort, /* MultiXact */ + NULL /* PredicateLock */ +}; + +const TwoPhaseCallback twophase_standby_recover_callbacks[TWOPHASE_RM_MAX_ID + 1] = +{ + NULL, /* END ID */ + lock_twophase_standby_recover, /* Lock */ + NULL, /* pgstat */ + NULL, /* MultiXact */ + NULL /* PredicateLock */ +}; diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c new file mode 100644 index 0000000..748120a --- /dev/null +++ b/src/backend/access/transam/varsup.c @@ -0,0 +1,678 @@ +/*------------------------------------------------------------------------- + * + * varsup.c + * postgres OID & XID variables support routines + * + * Copyright (c) 
2000-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/transam/varsup.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/clog.h" +#include "access/commit_ts.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xlogutils.h" +#include "commands/dbcommands.h" +#include "miscadmin.h" +#include "postmaster/autovacuum.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "utils/syscache.h" + + +/* Number of OIDs to prefetch (preallocate) per XLOG write */ +#define VAR_OID_PREFETCH 8192 + +/* pointer to "variable cache" in shared memory (set up by shmem.c) */ +VariableCache ShmemVariableCache = NULL; + + +/* + * Allocate the next FullTransactionId for a new transaction or + * subtransaction. + * + * The new XID is also stored into MyProc->xid/ProcGlobal->xids[] before + * returning. + * + * Note: when this is called, we are actually already inside a valid + * transaction, since XIDs are now not allocated until the transaction + * does something. So it is safe to do a database lookup if we want to + * issue a warning about XID wrap. + */ +FullTransactionId +GetNewTransactionId(bool isSubXact) +{ + FullTransactionId full_xid; + TransactionId xid; + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for new XIDs after that point. + */ + if (IsInParallelMode()) + elog(ERROR, "cannot assign TransactionIds during a parallel operation"); + + /* + * During bootstrap initialization, we return the special bootstrap + * transaction id. + */ + if (IsBootstrapProcessingMode()) + { + Assert(!isSubXact); + MyProc->xid = BootstrapTransactionId; + ProcGlobal->xids[MyProc->pgxactoff] = BootstrapTransactionId; + return FullTransactionIdFromEpochAndXid(0, BootstrapTransactionId); + } + + /* safety check, we should never get this far in a HS standby */ + if (RecoveryInProgress()) + elog(ERROR, "cannot assign TransactionIds during recovery"); + + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + + full_xid = ShmemVariableCache->nextXid; + xid = XidFromFullTransactionId(full_xid); + + /*---------- + * Check to see if it's safe to assign another XID. This protects against + * catastrophic data loss due to XID wraparound. The basic rules are: + * + * If we're past xidVacLimit, start trying to force autovacuum cycles. + * If we're past xidWarnLimit, start issuing warnings. + * If we're past xidStopLimit, refuse to execute transactions, unless + * we are running in single-user mode (which gives an escape hatch + * to the DBA who somehow got past the earlier defenses). + * + * Note that this coding also appears in GetNewMultiXactId. + *---------- + */ + if (TransactionIdFollowsOrEquals(xid, ShmemVariableCache->xidVacLimit)) + { + /* + * For safety's sake, we release XidGenLock while sending signals, + * warnings, etc. This is not so much because we care about + * preserving concurrency in this situation, as to avoid any + * possibility of deadlock while doing get_database_name(). First, + * copy all the shared values we'll need in this path. 
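+ *
+ * For orientation (a summary of the limits computed in
+ * SetTransactionIdLimit(), not additional upstream logic), the thresholds
+ * are ordered in circular XID space as
+ *
+ *     xidVacLimit < xidWarnLimit < xidStopLimit < xidWrapLimit
+ *
+ * so autovacuum is requested first, warnings come next, and refusing new
+ * XIDs is the last line of defense before wraparound.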
+ */ + TransactionId xidWarnLimit = ShmemVariableCache->xidWarnLimit; + TransactionId xidStopLimit = ShmemVariableCache->xidStopLimit; + TransactionId xidWrapLimit = ShmemVariableCache->xidWrapLimit; + Oid oldest_datoid = ShmemVariableCache->oldestXidDB; + + LWLockRelease(XidGenLock); + + /* + * To avoid swamping the postmaster with signals, we issue the autovac + * request only once per 64K transaction starts. This still gives + * plenty of chances before we get into real trouble. + */ + if (IsUnderPostmaster && (xid % 65536) == 0) + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + if (IsUnderPostmaster && + TransactionIdFollowsOrEquals(xid, xidStopLimit)) + { + char *oldest_datname = get_database_name(oldest_datoid); + + /* complain even if that DB has disappeared */ + if (oldest_datname) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database is not accepting commands to avoid wraparound data loss in database \"%s\"", + oldest_datname), + errhint("Stop the postmaster and vacuum that database in single-user mode.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + else + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database is not accepting commands to avoid wraparound data loss in database with OID %u", + oldest_datoid), + errhint("Stop the postmaster and vacuum that database in single-user mode.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + } + else if (TransactionIdFollowsOrEquals(xid, xidWarnLimit)) + { + char *oldest_datname = get_database_name(oldest_datoid); + + /* complain even if that DB has disappeared */ + if (oldest_datname) + ereport(WARNING, + (errmsg("database \"%s\" must be vacuumed within %u transactions", + oldest_datname, + xidWrapLimit - xid), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + else + ereport(WARNING, + (errmsg("database with OID %u must be vacuumed within %u transactions", + oldest_datoid, + xidWrapLimit - xid), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + } + + /* Re-acquire lock and start over */ + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + full_xid = ShmemVariableCache->nextXid; + xid = XidFromFullTransactionId(full_xid); + } + + /* + * If we are allocating the first XID of a new page of the commit log, + * zero out that commit-log page before returning. We must do this while + * holding XidGenLock, else another xact could acquire and commit a later + * XID before we zero the page. Fortunately, a page of the commit log + * holds 32K or more transactions, so we don't have to do this very often. + * + * Extend pg_subtrans and pg_commit_ts too. + */ + ExtendCLOG(xid); + ExtendCommitTs(xid); + ExtendSUBTRANS(xid); + + /* + * Now advance the nextXid counter. This must not happen until after we + * have successfully completed ExtendCLOG() --- if that routine fails, we + * want the next incoming transaction to try it again. We cannot assign + * more XIDs until there is CLOG space for them. + */ + FullTransactionIdAdvance(&ShmemVariableCache->nextXid); + + /* + * We must store the new XID into the shared ProcArray before releasing + * XidGenLock. 
This ensures that every active XID older than + * latestCompletedXid is present in the ProcArray, which is essential for + * correct OldestXmin tracking; see src/backend/access/transam/README. + * + * Note that readers of ProcGlobal->xids/PGPROC->xid should be careful to + * fetch the value for each proc only once, rather than assume they can + * read a value multiple times and get the same answer each time. Note we + * are assuming that TransactionId and int fetch/store are atomic. + * + * The same comments apply to the subxact xid count and overflow fields. + * + * Use of a write barrier prevents dangerous code rearrangement in this + * function; other backends could otherwise e.g. be examining my subxids + * info concurrently, and we don't want them to see an invalid + * intermediate state, such as an incremented nxids before the array entry + * is filled. + * + * Other processes that read nxids should do so before reading xids + * elements with a pg_read_barrier() in between, so that they can be sure + * not to read an uninitialized array element; see + * src/backend/storage/lmgr/README.barrier. + * + * If there's no room to fit a subtransaction XID into PGPROC, set the + * cache-overflowed flag instead. This forces readers to look in + * pg_subtrans to map subtransaction XIDs up to top-level XIDs. There is a + * race-condition window, in that the new XID will not appear as running + * until its parent link has been placed into pg_subtrans. However, that + * will happen before anyone could possibly have a reason to inquire about + * the status of the XID, so it seems OK. (Snapshots taken during this + * window *will* include the parent XID, so they will deliver the correct + * answer later on when someone does have a reason to inquire.) + */ + if (!isSubXact) + { + Assert(ProcGlobal->subxidStates[MyProc->pgxactoff].count == 0); + Assert(!ProcGlobal->subxidStates[MyProc->pgxactoff].overflowed); + Assert(MyProc->subxidStatus.count == 0); + Assert(!MyProc->subxidStatus.overflowed); + + /* LWLockRelease acts as barrier */ + MyProc->xid = xid; + ProcGlobal->xids[MyProc->pgxactoff] = xid; + } + else + { + XidCacheStatus *substat = &ProcGlobal->subxidStates[MyProc->pgxactoff]; + int nxids = MyProc->subxidStatus.count; + + Assert(substat->count == MyProc->subxidStatus.count); + Assert(substat->overflowed == MyProc->subxidStatus.overflowed); + + if (nxids < PGPROC_MAX_CACHED_SUBXIDS) + { + MyProc->subxids.xids[nxids] = xid; + pg_write_barrier(); + MyProc->subxidStatus.count = substat->count = nxids + 1; + } + else + MyProc->subxidStatus.overflowed = substat->overflowed = true; + } + + LWLockRelease(XidGenLock); + + return full_xid; +} + +/* + * Read nextXid but don't allocate it. + */ +FullTransactionId +ReadNextFullTransactionId(void) +{ + FullTransactionId fullXid; + + LWLockAcquire(XidGenLock, LW_SHARED); + fullXid = ShmemVariableCache->nextXid; + LWLockRelease(XidGenLock); + + return fullXid; +} + +/* + * Advance nextXid to the value after a given xid. The epoch is inferred. + * This must only be called during recovery or from two-phase start-up code. + */ +void +AdvanceNextFullTransactionIdPastXid(TransactionId xid) +{ + FullTransactionId newNextFullXid; + TransactionId next_xid; + uint32 epoch; + + /* + * It is safe to read nextXid without a lock, because this is only called + * from the startup process or single-process mode, meaning that no other + * process can modify it. 
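+ *
+ * Illustrative caller (not additional upstream code): during two-phase
+ * start-up, ProcessTwoPhaseBuffer() calls this for each subtransaction
+ * XID found in a 2PC state file when setNextXid is true, so that nextXid
+ * ends up past all such XIDs.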
+ */ + Assert(AmStartupProcess() || !IsUnderPostmaster); + + /* Fast return if this isn't an xid high enough to move the needle. */ + next_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + if (!TransactionIdFollowsOrEquals(xid, next_xid)) + return; + + /* + * Compute the FullTransactionId that comes after the given xid. To do + * this, we preserve the existing epoch, but detect when we've wrapped + * into a new epoch. This is necessary because WAL records and 2PC state + * currently contain 32 bit xids. The wrap logic is safe in those cases + * because the span of active xids cannot exceed one epoch at any given + * point in the WAL stream. + */ + TransactionIdAdvance(xid); + epoch = EpochFromFullTransactionId(ShmemVariableCache->nextXid); + if (unlikely(xid < next_xid)) + ++epoch; + newNextFullXid = FullTransactionIdFromEpochAndXid(epoch, xid); + + /* + * We still need to take a lock to modify the value when there are + * concurrent readers. + */ + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextXid = newNextFullXid; + LWLockRelease(XidGenLock); +} + +/* + * Advance the cluster-wide value for the oldest valid clog entry. + * + * We must acquire XactTruncationLock to advance the oldestClogXid. It's not + * necessary to hold the lock during the actual clog truncation, only when we + * advance the limit, as code looking up arbitrary xids is required to hold + * XactTruncationLock from when it tests oldestClogXid through to when it + * completes the clog lookup. + */ +void +AdvanceOldestClogXid(TransactionId oldest_datfrozenxid) +{ + LWLockAcquire(XactTruncationLock, LW_EXCLUSIVE); + if (TransactionIdPrecedes(ShmemVariableCache->oldestClogXid, + oldest_datfrozenxid)) + { + ShmemVariableCache->oldestClogXid = oldest_datfrozenxid; + } + LWLockRelease(XactTruncationLock); +} + +/* + * Determine the last safe XID to allocate using the currently oldest + * datfrozenxid (ie, the oldest XID that might exist in any database + * of our cluster), and the OID of the (or a) database with that value. + */ +void +SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid) +{ + TransactionId xidVacLimit; + TransactionId xidWarnLimit; + TransactionId xidStopLimit; + TransactionId xidWrapLimit; + TransactionId curXid; + + Assert(TransactionIdIsNormal(oldest_datfrozenxid)); + + /* + * The place where we actually get into deep trouble is halfway around + * from the oldest potentially-existing XID. (This calculation is + * probably off by one or two counts, because the special XIDs reduce the + * size of the loop a little bit. But we throw in plenty of slop below, + * so it doesn't matter.) + */ + xidWrapLimit = oldest_datfrozenxid + (MaxTransactionId >> 1); + if (xidWrapLimit < FirstNormalTransactionId) + xidWrapLimit += FirstNormalTransactionId; + + /* + * We'll refuse to continue assigning XIDs in interactive mode once we get + * within 3M transactions of data loss. This leaves lots of room for the + * DBA to fool around fixing things in a standalone backend, while not + * being significant compared to total XID space. (VACUUM requires an XID + * if it truncates at wal_level!=minimal. "VACUUM (ANALYZE)", which a DBA + * might do by reflex, assigns an XID. Hence, we had better be sure + * there's lots of XIDs left...) Also, at default BLCKSZ, this leaves two + * completely-idle segments. In the event of edge-case bugs involving + * page or segment arithmetic, idle segments render the bugs unreachable + * outside of single-user mode. 
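+ *
+ * Worked example (hypothetical numbers, for illustration only): if
+ * oldest_datfrozenxid were 1,000,000, the limits computed here would be
+ *
+ *     xidWrapLimit = 1,000,000 + (MaxTransactionId >> 1)   (~2.15 billion)
+ *     xidStopLimit = xidWrapLimit - 3,000,000
+ *     xidWarnLimit = xidWrapLimit - 40,000,000
+ *     xidVacLimit  = 1,000,000 + autovacuum_freeze_max_age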
+ */ + xidStopLimit = xidWrapLimit - 3000000; + if (xidStopLimit < FirstNormalTransactionId) + xidStopLimit -= FirstNormalTransactionId; + + /* + * We'll start complaining loudly when we get within 40M transactions of + * data loss. This is kind of arbitrary, but if you let your gas gauge + * get down to 2% of full, would you be looking for the next gas station? + * We need to be fairly liberal about this number because there are lots + * of scenarios where most transactions are done by automatic clients that + * won't pay attention to warnings. (No, we're not gonna make this + * configurable. If you know enough to configure it, you know enough to + * not get in this kind of trouble in the first place.) + */ + xidWarnLimit = xidWrapLimit - 40000000; + if (xidWarnLimit < FirstNormalTransactionId) + xidWarnLimit -= FirstNormalTransactionId; + + /* + * We'll start trying to force autovacuums when oldest_datfrozenxid gets + * to be more than autovacuum_freeze_max_age transactions old. + * + * Note: guc.c ensures that autovacuum_freeze_max_age is in a sane range, + * so that xidVacLimit will be well before xidWarnLimit. + * + * Note: autovacuum_freeze_max_age is a PGC_POSTMASTER parameter so that + * we don't have to worry about dealing with on-the-fly changes in its + * value. It doesn't look practical to update shared state from a GUC + * assign hook (too many processes would try to execute the hook, + * resulting in race conditions as well as crashes of those not connected + * to shared memory). Perhaps this can be improved someday. See also + * SetMultiXactIdLimit. + */ + xidVacLimit = oldest_datfrozenxid + autovacuum_freeze_max_age; + if (xidVacLimit < FirstNormalTransactionId) + xidVacLimit += FirstNormalTransactionId; + + /* Grab lock for just long enough to set the new limit values */ + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + ShmemVariableCache->oldestXid = oldest_datfrozenxid; + ShmemVariableCache->xidVacLimit = xidVacLimit; + ShmemVariableCache->xidWarnLimit = xidWarnLimit; + ShmemVariableCache->xidStopLimit = xidStopLimit; + ShmemVariableCache->xidWrapLimit = xidWrapLimit; + ShmemVariableCache->oldestXidDB = oldest_datoid; + curXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + LWLockRelease(XidGenLock); + + /* Log the info */ + ereport(DEBUG1, + (errmsg_internal("transaction ID wrap limit is %u, limited by database with OID %u", + xidWrapLimit, oldest_datoid))); + + /* + * If past the autovacuum force point, immediately signal an autovac + * request. The reason for this is that autovac only processes one + * database per invocation. Once it's finished cleaning up the oldest + * database, it'll call here, and we'll signal the postmaster to start + * another iteration immediately if there are still any old databases. + */ + if (TransactionIdFollowsOrEquals(curXid, xidVacLimit) && + IsUnderPostmaster && !InRecovery) + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); + + /* Give an immediate warning if past the wrap warn point */ + if (TransactionIdFollowsOrEquals(curXid, xidWarnLimit) && !InRecovery) + { + char *oldest_datname; + + /* + * We can be called when not inside a transaction, for example during + * StartupXLOG(). In such a case we cannot do database access, so we + * must just report the oldest DB's OID. + * + * Note: it's also possible that get_database_name fails and returns + * NULL, for example because the database just got dropped. We'll + * still warn, even though the warning might now be unnecessary. 
+ */ + if (IsTransactionState()) + oldest_datname = get_database_name(oldest_datoid); + else + oldest_datname = NULL; + + if (oldest_datname) + ereport(WARNING, + (errmsg("database \"%s\" must be vacuumed within %u transactions", + oldest_datname, + xidWrapLimit - curXid), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + else + ereport(WARNING, + (errmsg("database with OID %u must be vacuumed within %u transactions", + oldest_datoid, + xidWrapLimit - curXid), + errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" + "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); + } +} + + +/* + * ForceTransactionIdLimitUpdate -- does the XID wrap-limit data need updating? + * + * We primarily check whether oldestXidDB is valid. The cases we have in + * mind are that that database was dropped, or the field was reset to zero + * by pg_resetwal. In either case we should force recalculation of the + * wrap limit. Also do it if oldestXid is old enough to be forcing + * autovacuums or other actions; this ensures we update our state as soon + * as possible once extra overhead is being incurred. + */ +bool +ForceTransactionIdLimitUpdate(void) +{ + TransactionId nextXid; + TransactionId xidVacLimit; + TransactionId oldestXid; + Oid oldestXidDB; + + /* Locking is probably not really necessary, but let's be careful */ + LWLockAcquire(XidGenLock, LW_SHARED); + nextXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + xidVacLimit = ShmemVariableCache->xidVacLimit; + oldestXid = ShmemVariableCache->oldestXid; + oldestXidDB = ShmemVariableCache->oldestXidDB; + LWLockRelease(XidGenLock); + + if (!TransactionIdIsNormal(oldestXid)) + return true; /* shouldn't happen, but just in case */ + if (!TransactionIdIsValid(xidVacLimit)) + return true; /* this shouldn't happen anymore either */ + if (TransactionIdFollowsOrEquals(nextXid, xidVacLimit)) + return true; /* past xidVacLimit, don't delay updating */ + if (!SearchSysCacheExists1(DATABASEOID, ObjectIdGetDatum(oldestXidDB))) + return true; /* could happen, per comments above */ + return false; +} + + +/* + * GetNewObjectId -- allocate a new OID + * + * OIDs are generated by a cluster-wide counter. Since they are only 32 bits + * wide, counter wraparound will occur eventually, and therefore it is unwise + * to assume they are unique unless precautions are taken to make them so. + * Hence, this routine should generally not be used directly. The only direct + * callers should be GetNewOidWithIndex() and GetNewRelFileNode() in + * catalog/catalog.c. + */ +Oid +GetNewObjectId(void) +{ + Oid result; + + /* safety check, we should never get this far in a HS standby */ + if (RecoveryInProgress()) + elog(ERROR, "cannot assign OIDs during recovery"); + + LWLockAcquire(OidGenLock, LW_EXCLUSIVE); + + /* + * Check for wraparound of the OID counter. We *must* not return 0 + * (InvalidOid), and in normal operation we mustn't return anything below + * FirstNormalObjectId since that range is reserved for initdb (see + * IsCatalogRelationOid()). Note we are relying on unsigned comparison. + * + * During initdb, we start the OID generator at FirstGenbkiObjectId, so we + * only wrap if before that point when in bootstrap or standalone mode. 
+ * The first time through this routine after normal postmaster start, the + * counter will be forced up to FirstNormalObjectId. This mechanism + * leaves the OIDs between FirstGenbkiObjectId and FirstNormalObjectId + * available for automatic assignment during initdb, while ensuring they + * will never conflict with user-assigned OIDs. + */ + if (ShmemVariableCache->nextOid < ((Oid) FirstNormalObjectId)) + { + if (IsPostmasterEnvironment) + { + /* wraparound, or first post-initdb assignment, in normal mode */ + ShmemVariableCache->nextOid = FirstNormalObjectId; + ShmemVariableCache->oidCount = 0; + } + else + { + /* we may be bootstrapping, so don't enforce the full range */ + if (ShmemVariableCache->nextOid < ((Oid) FirstGenbkiObjectId)) + { + /* wraparound in standalone mode (unlikely but possible) */ + ShmemVariableCache->nextOid = FirstNormalObjectId; + ShmemVariableCache->oidCount = 0; + } + } + } + + /* If we run out of logged for use oids then we must log more */ + if (ShmemVariableCache->oidCount == 0) + { + XLogPutNextOid(ShmemVariableCache->nextOid + VAR_OID_PREFETCH); + ShmemVariableCache->oidCount = VAR_OID_PREFETCH; + } + + result = ShmemVariableCache->nextOid; + + (ShmemVariableCache->nextOid)++; + (ShmemVariableCache->oidCount)--; + + LWLockRelease(OidGenLock); + + return result; +} + +/* + * SetNextObjectId + * + * This may only be called during initdb; it advances the OID counter + * to the specified value. + */ +static void +SetNextObjectId(Oid nextOid) +{ + /* Safety check, this is only allowable during initdb */ + if (IsPostmasterEnvironment) + elog(ERROR, "cannot advance OID counter anymore"); + + /* Taking the lock is, therefore, just pro forma; but do it anyway */ + LWLockAcquire(OidGenLock, LW_EXCLUSIVE); + + if (ShmemVariableCache->nextOid > nextOid) + elog(ERROR, "too late to advance OID counter to %u, it is now %u", + nextOid, ShmemVariableCache->nextOid); + + ShmemVariableCache->nextOid = nextOid; + ShmemVariableCache->oidCount = 0; + + LWLockRelease(OidGenLock); +} + +/* + * StopGeneratingPinnedObjectIds + * + * This is called once during initdb to force the OID counter up to + * FirstUnpinnedObjectId. This supports letting initdb's post-bootstrap + * processing create some pinned objects early on. Once it's done doing + * so, it calls this (via pg_stop_making_pinned_objects()) so that the + * remaining objects it makes will be considered un-pinned. + */ +void +StopGeneratingPinnedObjectIds(void) +{ + SetNextObjectId(FirstUnpinnedObjectId); +} + + +#ifdef USE_ASSERT_CHECKING + +/* + * Assert that xid is between [oldestXid, nextXid], which is the range we + * expect XIDs coming from tables etc to be in. + * + * As ShmemVariableCache->oldestXid could change just after this call without + * further precautions, and as a wrapped-around xid could again fall within + * the valid range, this assertion can only detect if something is definitely + * wrong, but not establish correctness. + * + * This intentionally does not expose a return value, to avoid code being + * introduced that depends on the return value. + */ +void +AssertTransactionIdInAllowableRange(TransactionId xid) +{ + TransactionId oldest_xid; + TransactionId next_xid; + + Assert(TransactionIdIsValid(xid)); + + /* we may see bootstrap / frozen */ + if (!TransactionIdIsNormal(xid)) + return; + + /* + * We can't acquire XidGenLock, as this may be called with XidGenLock + * already held (or with other locks that don't allow XidGenLock to be + * nested). 
That's ok for our purposes though, since we already rely on + * 32bit reads to be atomic. While nextXid is 64 bit, we only look at the + * lower 32bit, so a skewed read doesn't hurt. + * + * There's no increased danger of falling outside [oldest, next] by + * accessing them without a lock. xid needs to have been created with + * GetNewTransactionId() in the originating session, and the locks there + * pair with the memory barrier below. We do however accept xid to be <= + * to next_xid, instead of just <, as xid could be from the procarray, + * before we see the updated nextXid value. + */ + pg_memory_barrier(); + oldest_xid = ShmemVariableCache->oldestXid; + next_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + + Assert(TransactionIdFollowsOrEquals(xid, oldest_xid) || + TransactionIdPrecedesOrEquals(xid, next_xid)); +} +#endif diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c new file mode 100644 index 0000000..e0c7ad1 --- /dev/null +++ b/src/backend/access/transam/xact.c @@ -0,0 +1,6249 @@ +/*------------------------------------------------------------------------- + * + * xact.c + * top level transaction system support routines + * + * See src/backend/access/transam/README for more information. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/transam/xact.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <time.h> +#include <unistd.h> + +#include "access/commit_ts.h" +#include "access/multixact.h" +#include "access/parallel.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "access/xlogrecovery.h" +#include "access/xlogutils.h" +#include "catalog/index.h" +#include "catalog/namespace.h" +#include "catalog/pg_enum.h" +#include "catalog/storage.h" +#include "commands/async.h" +#include "commands/tablecmds.h" +#include "commands/trigger.h" +#include "common/pg_prng.h" +#include "executor/spi.h" +#include "libpq/be-fsstubs.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "replication/logical.h" +#include "replication/logicallauncher.h" +#include "replication/origin.h" +#include "replication/snapbuild.h" +#include "replication/syncrep.h" +#include "replication/walsender.h" +#include "storage/condition_variable.h" +#include "storage/fd.h" +#include "storage/lmgr.h" +#include "storage/md.h" +#include "storage/predicate.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/sinvaladt.h" +#include "storage/smgr.h" +#include "utils/builtins.h" +#include "utils/catcache.h" +#include "utils/combocid.h" +#include "utils/guc.h" +#include "utils/inval.h" +#include "utils/memutils.h" +#include "utils/relmapper.h" +#include "utils/snapmgr.h" +#include "utils/timeout.h" +#include "utils/timestamp.h" + +/* + * User-tweakable parameters + */ +int DefaultXactIsoLevel = XACT_READ_COMMITTED; +int XactIsoLevel; + +bool DefaultXactReadOnly = false; +bool XactReadOnly; + +bool DefaultXactDeferrable = false; +bool XactDeferrable; + +int synchronous_commit = SYNCHRONOUS_COMMIT_ON; + +/* + * CheckXidAlive is a xid value pointing to a possibly ongoing (sub) + * transaction. Currently, it is used in logical decoding. 
It's possible + * that such transactions can get aborted while the decoding is ongoing in + * which case we skip decoding that particular transaction. To ensure that we + * check whether the CheckXidAlive is aborted after fetching the tuple from + * system tables. We also ensure that during logical decoding we never + * directly access the tableam or heap APIs because we are checking for the + * concurrent aborts only in systable_* APIs. + */ +TransactionId CheckXidAlive = InvalidTransactionId; +bool bsysscan = false; + +/* + * When running as a parallel worker, we place only a single + * TransactionStateData on the parallel worker's state stack, and the XID + * reflected there will be that of the *innermost* currently-active + * subtransaction in the backend that initiated parallelism. However, + * GetTopTransactionId() and TransactionIdIsCurrentTransactionId() + * need to return the same answers in the parallel worker as they would have + * in the user backend, so we need some additional bookkeeping. + * + * XactTopFullTransactionId stores the XID of our toplevel transaction, which + * will be the same as TopTransactionStateData.fullTransactionId in an + * ordinary backend; but in a parallel backend, which does not have the entire + * transaction state, it will instead be copied from the backend that started + * the parallel operation. + * + * nParallelCurrentXids will be 0 and ParallelCurrentXids NULL in an ordinary + * backend, but in a parallel backend, nParallelCurrentXids will contain the + * number of XIDs that need to be considered current, and ParallelCurrentXids + * will contain the XIDs themselves. This includes all XIDs that were current + * or sub-committed in the parent at the time the parallel operation began. + * The XIDs are stored sorted in numerical order (not logical order) to make + * lookups as fast as possible. + */ +static FullTransactionId XactTopFullTransactionId = {InvalidTransactionId}; +static int nParallelCurrentXids = 0; +static TransactionId *ParallelCurrentXids; + +/* + * Miscellaneous flag bits to record events which occur on the top level + * transaction. These flags are only persisted in MyXactFlags and are intended + * so we remember to do certain things later on in the transaction. This is + * globally accessible, so can be set from anywhere in the code that requires + * recording flags. + */ +int MyXactFlags; + +/* + * transaction states - transaction state from server perspective + */ +typedef enum TransState +{ + TRANS_DEFAULT, /* idle */ + TRANS_START, /* transaction starting */ + TRANS_INPROGRESS, /* inside a valid transaction */ + TRANS_COMMIT, /* commit in progress */ + TRANS_ABORT, /* abort in progress */ + TRANS_PREPARE /* prepare in progress */ +} TransState; + +/* + * transaction block states - transaction state of client queries + * + * Note: the subtransaction states are used only for non-topmost + * transactions; the others appear only in the topmost transaction. 
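+ *
+ * Illustrative lifecycle (a rough sketch, not an exhaustive transition
+ * list): a simple explicit transaction block typically moves through
+ *
+ *     TBLOCK_DEFAULT -> TBLOCK_STARTED -> TBLOCK_BEGIN -> TBLOCK_INPROGRESS
+ *                    -> TBLOCK_END -> TBLOCK_DEFAULT
+ *
+ * as BEGIN, the queries inside the block, and COMMIT are processed.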
+ */ +typedef enum TBlockState +{ + /* not-in-transaction-block states */ + TBLOCK_DEFAULT, /* idle */ + TBLOCK_STARTED, /* running single-query transaction */ + + /* transaction block states */ + TBLOCK_BEGIN, /* starting transaction block */ + TBLOCK_INPROGRESS, /* live transaction */ + TBLOCK_IMPLICIT_INPROGRESS, /* live transaction after implicit BEGIN */ + TBLOCK_PARALLEL_INPROGRESS, /* live transaction inside parallel worker */ + TBLOCK_END, /* COMMIT received */ + TBLOCK_ABORT, /* failed xact, awaiting ROLLBACK */ + TBLOCK_ABORT_END, /* failed xact, ROLLBACK received */ + TBLOCK_ABORT_PENDING, /* live xact, ROLLBACK received */ + TBLOCK_PREPARE, /* live xact, PREPARE received */ + + /* subtransaction states */ + TBLOCK_SUBBEGIN, /* starting a subtransaction */ + TBLOCK_SUBINPROGRESS, /* live subtransaction */ + TBLOCK_SUBRELEASE, /* RELEASE received */ + TBLOCK_SUBCOMMIT, /* COMMIT received while TBLOCK_SUBINPROGRESS */ + TBLOCK_SUBABORT, /* failed subxact, awaiting ROLLBACK */ + TBLOCK_SUBABORT_END, /* failed subxact, ROLLBACK received */ + TBLOCK_SUBABORT_PENDING, /* live subxact, ROLLBACK received */ + TBLOCK_SUBRESTART, /* live subxact, ROLLBACK TO received */ + TBLOCK_SUBABORT_RESTART /* failed subxact, ROLLBACK TO received */ +} TBlockState; + +/* + * transaction state structure + */ +typedef struct TransactionStateData +{ + FullTransactionId fullTransactionId; /* my FullTransactionId */ + SubTransactionId subTransactionId; /* my subxact ID */ + char *name; /* savepoint name, if any */ + int savepointLevel; /* savepoint level */ + TransState state; /* low-level state */ + TBlockState blockState; /* high-level state */ + int nestingLevel; /* transaction nesting depth */ + int gucNestLevel; /* GUC context nesting depth */ + MemoryContext curTransactionContext; /* my xact-lifetime context */ + ResourceOwner curTransactionOwner; /* my query resources */ + TransactionId *childXids; /* subcommitted child XIDs, in XID order */ + int nChildXids; /* # of subcommitted child XIDs */ + int maxChildXids; /* allocated size of childXids[] */ + Oid prevUser; /* previous CurrentUserId setting */ + int prevSecContext; /* previous SecurityRestrictionContext */ + bool prevXactReadOnly; /* entry-time xact r/o state */ + bool startedInRecovery; /* did we start in recovery? */ + bool didLogXid; /* has xid been included in WAL record? */ + int parallelModeLevel; /* Enter/ExitParallelMode counter */ + bool chain; /* start a new block after this one */ + bool topXidLogged; /* for a subxact: is top-level XID logged? */ + struct TransactionStateData *parent; /* back link to parent */ +} TransactionStateData; + +typedef TransactionStateData *TransactionState; + +/* + * Serialized representation used to transmit transaction state to parallel + * workers through shared memory. + */ +typedef struct SerializedTransactionState +{ + int xactIsoLevel; + bool xactDeferrable; + FullTransactionId topFullTransactionId; + FullTransactionId currentFullTransactionId; + CommandId currentCommandId; + int nParallelCurrentXids; + TransactionId parallelCurrentXids[FLEXIBLE_ARRAY_MEMBER]; +} SerializedTransactionState; + +/* The size of SerializedTransactionState, not including the final array. */ +#define SerializedTransactionStateHeaderSize \ + offsetof(SerializedTransactionState, parallelCurrentXids) + +/* + * CurrentTransactionState always points to the current transaction state + * block. It will point to TopTransactionStateData when not in a + * transaction at all, or when in a top-level transaction. 
+ */ +static TransactionStateData TopTransactionStateData = { + .state = TRANS_DEFAULT, + .blockState = TBLOCK_DEFAULT, + .topXidLogged = false, +}; + +/* + * unreportedXids holds XIDs of all subtransactions that have not yet been + * reported in an XLOG_XACT_ASSIGNMENT record. + */ +static int nUnreportedXids; +static TransactionId unreportedXids[PGPROC_MAX_CACHED_SUBXIDS]; + +static TransactionState CurrentTransactionState = &TopTransactionStateData; + +/* + * The subtransaction ID and command ID assignment counters are global + * to a whole transaction, so we do not keep them in the state stack. + */ +static SubTransactionId currentSubTransactionId; +static CommandId currentCommandId; +static bool currentCommandIdUsed; + +/* + * xactStartTimestamp is the value of transaction_timestamp(). + * stmtStartTimestamp is the value of statement_timestamp(). + * xactStopTimestamp is the time at which we log a commit or abort WAL record. + * These do not change as we enter and exit subtransactions, so we don't + * keep them inside the TransactionState stack. + */ +static TimestampTz xactStartTimestamp; +static TimestampTz stmtStartTimestamp; +static TimestampTz xactStopTimestamp; + +/* + * GID to be used for preparing the current transaction. This is also + * global to a whole transaction, so we don't keep it in the state stack. + */ +static char *prepareGID; + +/* + * Some commands want to force synchronous commit. + */ +static bool forceSyncCommit = false; + +/* Flag for logging statements in a transaction. */ +bool xact_is_sampled = false; + +/* + * Private context for transaction-abort work --- we reserve space for this + * at startup to ensure that AbortTransaction and AbortSubTransaction can work + * when we've run out of memory. + */ +static MemoryContext TransactionAbortContext = NULL; + +/* + * List of add-on start- and end-of-xact callbacks + */ +typedef struct XactCallbackItem +{ + struct XactCallbackItem *next; + XactCallback callback; + void *arg; +} XactCallbackItem; + +static XactCallbackItem *Xact_callbacks = NULL; + +/* + * List of add-on start- and end-of-subxact callbacks + */ +typedef struct SubXactCallbackItem +{ + struct SubXactCallbackItem *next; + SubXactCallback callback; + void *arg; +} SubXactCallbackItem; + +static SubXactCallbackItem *SubXact_callbacks = NULL; + + +/* local function prototypes */ +static void AssignTransactionId(TransactionState s); +static void AbortTransaction(void); +static void AtAbort_Memory(void); +static void AtCleanup_Memory(void); +static void AtAbort_ResourceOwner(void); +static void AtCCI_LocalCache(void); +static void AtCommit_Memory(void); +static void AtStart_Cache(void); +static void AtStart_Memory(void); +static void AtStart_ResourceOwner(void); +static void CallXactCallbacks(XactEvent event); +static void CallSubXactCallbacks(SubXactEvent event, + SubTransactionId mySubid, + SubTransactionId parentSubid); +static void CleanupTransaction(void); +static void CheckTransactionBlock(bool isTopLevel, bool throwError, + const char *stmtType); +static void CommitTransaction(void); +static TransactionId RecordTransactionAbort(bool isSubXact); +static void StartTransaction(void); + +static void StartSubTransaction(void); +static void CommitSubTransaction(void); +static void AbortSubTransaction(void); +static void CleanupSubTransaction(void); +static void PushTransaction(void); +static void PopTransaction(void); + +static void AtSubAbort_Memory(void); +static void AtSubCleanup_Memory(void); +static void AtSubAbort_ResourceOwner(void); +static 
void AtSubCommit_Memory(void); +static void AtSubStart_Memory(void); +static void AtSubStart_ResourceOwner(void); + +static void ShowTransactionState(const char *str); +static void ShowTransactionStateRec(const char *str, TransactionState state); +static const char *BlockStateAsString(TBlockState blockState); +static const char *TransStateAsString(TransState state); + + +/* ---------------------------------------------------------------- + * transaction state accessors + * ---------------------------------------------------------------- + */ + +/* + * IsTransactionState + * + * This returns true if we are inside a valid transaction; that is, + * it is safe to initiate database access, take heavyweight locks, etc. + */ +bool +IsTransactionState(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * TRANS_DEFAULT and TRANS_ABORT are obviously unsafe states. However, we + * also reject the startup/shutdown states TRANS_START, TRANS_COMMIT, + * TRANS_PREPARE since it might be too soon or too late within those + * transition states to do anything interesting. Hence, the only "valid" + * state is TRANS_INPROGRESS. + */ + return (s->state == TRANS_INPROGRESS); +} + +/* + * IsAbortedTransactionBlockState + * + * This returns true if we are within an aborted transaction block. + */ +bool +IsAbortedTransactionBlockState(void) +{ + TransactionState s = CurrentTransactionState; + + if (s->blockState == TBLOCK_ABORT || + s->blockState == TBLOCK_SUBABORT) + return true; + + return false; +} + + +/* + * GetTopTransactionId + * + * This will return the XID of the main transaction, assigning one if + * it's not yet set. Be careful to call this only inside a valid xact. + */ +TransactionId +GetTopTransactionId(void) +{ + if (!FullTransactionIdIsValid(XactTopFullTransactionId)) + AssignTransactionId(&TopTransactionStateData); + return XidFromFullTransactionId(XactTopFullTransactionId); +} + +/* + * GetTopTransactionIdIfAny + * + * This will return the XID of the main transaction, if one is assigned. + * It will return InvalidTransactionId if we are not currently inside a + * transaction, or inside a transaction that hasn't yet been assigned an XID. + */ +TransactionId +GetTopTransactionIdIfAny(void) +{ + return XidFromFullTransactionId(XactTopFullTransactionId); +} + +/* + * GetCurrentTransactionId + * + * This will return the XID of the current transaction (main or sub + * transaction), assigning one if it's not yet set. Be careful to call this + * only inside a valid xact. + */ +TransactionId +GetCurrentTransactionId(void) +{ + TransactionState s = CurrentTransactionState; + + if (!FullTransactionIdIsValid(s->fullTransactionId)) + AssignTransactionId(s); + return XidFromFullTransactionId(s->fullTransactionId); +} + +/* + * GetCurrentTransactionIdIfAny + * + * This will return the XID of the current sub xact, if one is assigned. + * It will return InvalidTransactionId if we are not currently inside a + * transaction, or inside a transaction that hasn't been assigned an XID yet. + */ +TransactionId +GetCurrentTransactionIdIfAny(void) +{ + return XidFromFullTransactionId(CurrentTransactionState->fullTransactionId); +} + +/* + * GetTopFullTransactionId + * + * This will return the FullTransactionId of the main transaction, assigning + * one if it's not yet set. Be careful to call this only inside a valid xact. 
+ */ +FullTransactionId +GetTopFullTransactionId(void) +{ + if (!FullTransactionIdIsValid(XactTopFullTransactionId)) + AssignTransactionId(&TopTransactionStateData); + return XactTopFullTransactionId; +} + +/* + * GetTopFullTransactionIdIfAny + * + * This will return the FullTransactionId of the main transaction, if one is + * assigned. It will return InvalidFullTransactionId if we are not currently + * inside a transaction, or inside a transaction that hasn't yet been assigned + * one. + */ +FullTransactionId +GetTopFullTransactionIdIfAny(void) +{ + return XactTopFullTransactionId; +} + +/* + * GetCurrentFullTransactionId + * + * This will return the FullTransactionId of the current transaction (main or + * sub transaction), assigning one if it's not yet set. Be careful to call + * this only inside a valid xact. + */ +FullTransactionId +GetCurrentFullTransactionId(void) +{ + TransactionState s = CurrentTransactionState; + + if (!FullTransactionIdIsValid(s->fullTransactionId)) + AssignTransactionId(s); + return s->fullTransactionId; +} + +/* + * GetCurrentFullTransactionIdIfAny + * + * This will return the FullTransactionId of the current sub xact, if one is + * assigned. It will return InvalidFullTransactionId if we are not currently + * inside a transaction, or inside a transaction that hasn't been assigned one + * yet. + */ +FullTransactionId +GetCurrentFullTransactionIdIfAny(void) +{ + return CurrentTransactionState->fullTransactionId; +} + +/* + * MarkCurrentTransactionIdLoggedIfAny + * + * Remember that the current xid - if it is assigned - now has been wal logged. + */ +void +MarkCurrentTransactionIdLoggedIfAny(void) +{ + if (FullTransactionIdIsValid(CurrentTransactionState->fullTransactionId)) + CurrentTransactionState->didLogXid = true; +} + +/* + * IsSubxactTopXidLogPending + * + * This is used to decide whether we need to WAL log the top-level XID for + * operation in a subtransaction. We require that for logical decoding, see + * LogicalDecodingProcessRecord. + * + * This returns true if wal_level >= logical and we are inside a valid + * subtransaction, for which the assignment was not yet written to any WAL + * record. + */ +bool +IsSubxactTopXidLogPending(void) +{ + /* check whether it is already logged */ + if (CurrentTransactionState->topXidLogged) + return false; + + /* wal_level has to be logical */ + if (!XLogLogicalInfoActive()) + return false; + + /* we need to be in a transaction state */ + if (!IsTransactionState()) + return false; + + /* it has to be a subtransaction */ + if (!IsSubTransaction()) + return false; + + /* the subtransaction has to have a XID assigned */ + if (!TransactionIdIsValid(GetCurrentTransactionIdIfAny())) + return false; + + return true; +} + +/* + * MarkSubxactTopXidLogged + * + * Remember that the top transaction id for the current subtransaction is WAL + * logged now. + */ +void +MarkSubxactTopXidLogged(void) +{ + Assert(IsSubxactTopXidLogPending()); + + CurrentTransactionState->topXidLogged = true; +} + +/* + * GetStableLatestTransactionId + * + * Get the transaction's XID if it has one, else read the next-to-be-assigned + * XID. Once we have a value, return that same value for the remainder of the + * current transaction. This is meant to provide the reference point for the + * age(xid) function, but might be useful for other maintenance tasks as well. 
+ */ +TransactionId +GetStableLatestTransactionId(void) +{ + static LocalTransactionId lxid = InvalidLocalTransactionId; + static TransactionId stablexid = InvalidTransactionId; + + if (lxid != MyProc->lxid) + { + lxid = MyProc->lxid; + stablexid = GetTopTransactionIdIfAny(); + if (!TransactionIdIsValid(stablexid)) + stablexid = ReadNextTransactionId(); + } + + Assert(TransactionIdIsValid(stablexid)); + + return stablexid; +} + +/* + * AssignTransactionId + * + * Assigns a new permanent FullTransactionId to the given TransactionState. + * We do not assign XIDs to transactions until/unless this is called. + * Also, any parent TransactionStates that don't yet have XIDs are assigned + * one; this maintains the invariant that a child transaction has an XID + * following its parent's. + */ +static void +AssignTransactionId(TransactionState s) +{ + bool isSubXact = (s->parent != NULL); + ResourceOwner currentOwner; + bool log_unknown_top = false; + + /* Assert that caller didn't screw up */ + Assert(!FullTransactionIdIsValid(s->fullTransactionId)); + Assert(s->state == TRANS_INPROGRESS); + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for new XIDs at this point. + */ + if (IsInParallelMode() || IsParallelWorker()) + elog(ERROR, "cannot assign XIDs during a parallel operation"); + + /* + * Ensure parent(s) have XIDs, so that a child always has an XID later + * than its parent. Mustn't recurse here, or we might get a stack + * overflow if we're at the bottom of a huge stack of subtransactions none + * of which have XIDs yet. + */ + if (isSubXact && !FullTransactionIdIsValid(s->parent->fullTransactionId)) + { + TransactionState p = s->parent; + TransactionState *parents; + size_t parentOffset = 0; + + parents = palloc(sizeof(TransactionState) * s->nestingLevel); + while (p != NULL && !FullTransactionIdIsValid(p->fullTransactionId)) + { + parents[parentOffset++] = p; + p = p->parent; + } + + /* + * This is technically a recursive call, but the recursion will never + * be more than one layer deep. + */ + while (parentOffset != 0) + AssignTransactionId(parents[--parentOffset]); + + pfree(parents); + } + + /* + * When wal_level=logical, guarantee that a subtransaction's xid can only + * be seen in the WAL stream if its toplevel xid has been logged before. + * If necessary we log an xact_assignment record with fewer than + * PGPROC_MAX_CACHED_SUBXIDS. Note that it is fine if didLogXid isn't set + * for a transaction even though it appears in a WAL record, we just might + * superfluously log something. That can happen when an xid is included + * somewhere inside a wal record, but not in XLogRecord->xl_xid, like in + * xl_standby_locks. + */ + if (isSubXact && XLogLogicalInfoActive() && + !TopTransactionStateData.didLogXid) + log_unknown_top = true; + + /* + * Generate a new FullTransactionId and record its xid in PG_PROC and + * pg_subtrans. + * + * NB: we must make the subtrans entry BEFORE the Xid appears anywhere in + * shared storage other than PG_PROC; because if there's no room for it in + * PG_PROC, the subtrans entry is needed to ensure that other backends see + * the Xid as "running". See GetNewTransactionId. 
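The parent-handling loop above sidesteps deep recursion by first collecting the unassigned ancestors into an array and then assigning them outermost-first, which keeps the invariant that a child's XID follows its parent's. A standalone sketch of that pattern with simplified, made-up types:

#include <stdlib.h>

typedef struct DemoXact
{
    struct DemoXact *parent;
    int         id;             /* 0 means "not assigned yet" */
} DemoXact;

static int  demo_next_id = 1;

static void
demo_assign_parent_first(DemoXact *x, int nesting_level)
{
    DemoXact  **pending = malloc(sizeof(DemoXact *) * nesting_level);
    int         npending = 0;

    /* Collect x and every ancestor that has no id, innermost first. */
    for (DemoXact *p = x; p != NULL && p->id == 0; p = p->parent)
        pending[npending++] = p;

    /* Assign outermost-first, so a parent always gets a smaller id than its child. */
    while (npending > 0)
        pending[--npending]->id = demo_next_id++;

    free(pending);
}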
+ */ + s->fullTransactionId = GetNewTransactionId(isSubXact); + if (!isSubXact) + XactTopFullTransactionId = s->fullTransactionId; + + if (isSubXact) + SubTransSetParent(XidFromFullTransactionId(s->fullTransactionId), + XidFromFullTransactionId(s->parent->fullTransactionId)); + + /* + * If it's a top-level transaction, the predicate locking system needs to + * be told about it too. + */ + if (!isSubXact) + RegisterPredicateLockingXid(XidFromFullTransactionId(s->fullTransactionId)); + + /* + * Acquire lock on the transaction XID. (We assume this cannot block.) We + * have to ensure that the lock is assigned to the transaction's own + * ResourceOwner. + */ + currentOwner = CurrentResourceOwner; + CurrentResourceOwner = s->curTransactionOwner; + + XactLockTableInsert(XidFromFullTransactionId(s->fullTransactionId)); + + CurrentResourceOwner = currentOwner; + + /* + * Every PGPROC_MAX_CACHED_SUBXIDS assigned transaction ids within each + * top-level transaction we issue a WAL record for the assignment. We + * include the top-level xid and all the subxids that have not yet been + * reported using XLOG_XACT_ASSIGNMENT records. + * + * This is required to limit the amount of shared memory required in a hot + * standby server to keep track of in-progress XIDs. See notes for + * RecordKnownAssignedTransactionIds(). + * + * We don't keep track of the immediate parent of each subxid, only the + * top-level transaction that each subxact belongs to. This is correct in + * recovery only because aborted subtransactions are separately WAL + * logged. + * + * This is correct even for the case where several levels above us didn't + * have an xid assigned as we recursed up to them beforehand. + */ + if (isSubXact && XLogStandbyInfoActive()) + { + unreportedXids[nUnreportedXids] = XidFromFullTransactionId(s->fullTransactionId); + nUnreportedXids++; + + /* + * ensure this test matches similar one in + * RecoverPreparedTransactions() + */ + if (nUnreportedXids >= PGPROC_MAX_CACHED_SUBXIDS || + log_unknown_top) + { + xl_xact_assignment xlrec; + + /* + * xtop is always set by now because we recurse up transaction + * stack to the highest unassigned xid and then come back down + */ + xlrec.xtop = GetTopTransactionId(); + Assert(TransactionIdIsValid(xlrec.xtop)); + xlrec.nsubxacts = nUnreportedXids; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, MinSizeOfXactAssignment); + XLogRegisterData((char *) unreportedXids, + nUnreportedXids * sizeof(TransactionId)); + + (void) XLogInsert(RM_XACT_ID, XLOG_XACT_ASSIGNMENT); + + nUnreportedXids = 0; + /* mark top, not current xact as having been logged */ + TopTransactionStateData.didLogXid = true; + } + } +} + +/* + * GetCurrentSubTransactionId + */ +SubTransactionId +GetCurrentSubTransactionId(void) +{ + TransactionState s = CurrentTransactionState; + + return s->subTransactionId; +} + +/* + * SubTransactionIsActive + * + * Test if the specified subxact ID is still active. Note caller is + * responsible for checking whether this ID is relevant to the current xact. + */ +bool +SubTransactionIsActive(SubTransactionId subxid) +{ + TransactionState s; + + for (s = CurrentTransactionState; s != NULL; s = s->parent) + { + if (s->state == TRANS_ABORT) + continue; + if (s->subTransactionId == subxid) + return true; + } + return false; +} + + +/* + * GetCurrentCommandId + * + * "used" must be true if the caller intends to use the command ID to mark + * inserted/updated/deleted tuples. 
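Note how subtransaction XIDs are not reported to a standby one at a time: they accumulate in unreportedXids and a single XLOG_XACT_ASSIGNMENT record is emitted once PGPROC_MAX_CACHED_SUBXIDS of them are pending (or when a top-level xid must be made known for logical decoding). That batching shape, reduced to a standalone sketch with hypothetical names:

#include <stddef.h>

#define DEMO_BATCH_SIZE 64          /* stands in for PGPROC_MAX_CACHED_SUBXIDS */

static unsigned demo_pending[DEMO_BATCH_SIZE];
static size_t   demo_npending = 0;

/* Hypothetical sink standing in for emitting one assignment record. */
extern void demo_flush_batch(const unsigned *items, size_t n);

static void
demo_report_assignment(unsigned subxid)
{
    demo_pending[demo_npending++] = subxid;

    /* Emit one record per full batch instead of one per subtransaction. */
    if (demo_npending >= DEMO_BATCH_SIZE)
    {
        demo_flush_batch(demo_pending, demo_npending);
        demo_npending = 0;
    }
}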
false means the ID is being fetched + * for read-only purposes (ie, as a snapshot validity cutoff). See + * CommandCounterIncrement() for discussion. + */ +CommandId +GetCurrentCommandId(bool used) +{ + /* this is global to a transaction, not subtransaction-local */ + if (used) + { + /* + * Forbid setting currentCommandIdUsed in a parallel worker, because + * we have no provision for communicating this back to the leader. We + * could relax this restriction when currentCommandIdUsed was already + * true at the start of the parallel operation. + */ + Assert(!IsParallelWorker()); + currentCommandIdUsed = true; + } + return currentCommandId; +} + +/* + * SetParallelStartTimestamps + * + * In a parallel worker, we should inherit the parent transaction's + * timestamps rather than setting our own. The parallel worker + * infrastructure must call this to provide those values before + * calling StartTransaction() or SetCurrentStatementStartTimestamp(). + */ +void +SetParallelStartTimestamps(TimestampTz xact_ts, TimestampTz stmt_ts) +{ + Assert(IsParallelWorker()); + xactStartTimestamp = xact_ts; + stmtStartTimestamp = stmt_ts; +} + +/* + * GetCurrentTransactionStartTimestamp + */ +TimestampTz +GetCurrentTransactionStartTimestamp(void) +{ + return xactStartTimestamp; +} + +/* + * GetCurrentStatementStartTimestamp + */ +TimestampTz +GetCurrentStatementStartTimestamp(void) +{ + return stmtStartTimestamp; +} + +/* + * GetCurrentTransactionStopTimestamp + * + * We return current time if the transaction stop time hasn't been set + * (which can happen if we decide we don't need to log an XLOG record). + */ +TimestampTz +GetCurrentTransactionStopTimestamp(void) +{ + if (xactStopTimestamp != 0) + return xactStopTimestamp; + return GetCurrentTimestamp(); +} + +/* + * SetCurrentStatementStartTimestamp + * + * In a parallel worker, this should already have been provided by a call + * to SetParallelStartTimestamps(). + */ +void +SetCurrentStatementStartTimestamp(void) +{ + if (!IsParallelWorker()) + stmtStartTimestamp = GetCurrentTimestamp(); + else + Assert(stmtStartTimestamp != 0); +} + +/* + * SetCurrentTransactionStopTimestamp + */ +static inline void +SetCurrentTransactionStopTimestamp(void) +{ + xactStopTimestamp = GetCurrentTimestamp(); +} + +/* + * GetCurrentTransactionNestLevel + * + * Note: this will return zero when not inside any transaction, one when + * inside a top-level transaction, etc. + */ +int +GetCurrentTransactionNestLevel(void) +{ + TransactionState s = CurrentTransactionState; + + return s->nestingLevel; +} + + +/* + * TransactionIdIsCurrentTransactionId + */ +bool +TransactionIdIsCurrentTransactionId(TransactionId xid) +{ + TransactionState s; + + /* + * We always say that BootstrapTransactionId is "not my transaction ID" + * even when it is (ie, during bootstrap). Along with the fact that + * transam.c always treats BootstrapTransactionId as already committed, + * this causes the heapam_visibility.c routines to see all tuples as + * committed, which is what we need during bootstrap. (Bootstrap mode + * only inserts tuples, it never updates or deletes them, so all tuples + * can be presumed good immediately.) + * + * Likewise, InvalidTransactionId and FrozenTransactionId are certainly + * not my transaction ID, so we can just return "false" immediately for + * any non-normal XID. 
+ */ + if (!TransactionIdIsNormal(xid)) + return false; + + if (TransactionIdEquals(xid, GetTopTransactionIdIfAny())) + return true; + + /* + * In parallel workers, the XIDs we must consider as current are stored in + * ParallelCurrentXids rather than the transaction-state stack. Note that + * the XIDs in this array are sorted numerically rather than according to + * transactionIdPrecedes order. + */ + if (nParallelCurrentXids > 0) + { + int low, + high; + + low = 0; + high = nParallelCurrentXids - 1; + while (low <= high) + { + int middle; + TransactionId probe; + + middle = low + (high - low) / 2; + probe = ParallelCurrentXids[middle]; + if (probe == xid) + return true; + else if (probe < xid) + low = middle + 1; + else + high = middle - 1; + } + return false; + } + + /* + * We will return true for the Xid of the current subtransaction, any of + * its subcommitted children, any of its parents, or any of their + * previously subcommitted children. However, a transaction being aborted + * is no longer "current", even though it may still have an entry on the + * state stack. + */ + for (s = CurrentTransactionState; s != NULL; s = s->parent) + { + int low, + high; + + if (s->state == TRANS_ABORT) + continue; + if (!FullTransactionIdIsValid(s->fullTransactionId)) + continue; /* it can't have any child XIDs either */ + if (TransactionIdEquals(xid, XidFromFullTransactionId(s->fullTransactionId))) + return true; + /* As the childXids array is ordered, we can use binary search */ + low = 0; + high = s->nChildXids - 1; + while (low <= high) + { + int middle; + TransactionId probe; + + middle = low + (high - low) / 2; + probe = s->childXids[middle]; + if (TransactionIdEquals(probe, xid)) + return true; + else if (TransactionIdPrecedes(probe, xid)) + low = middle + 1; + else + high = middle - 1; + } + } + + return false; +} + +/* + * TransactionStartedDuringRecovery + * + * Returns true if the current transaction started while recovery was still + * in progress. Recovery might have ended since so RecoveryInProgress() might + * return false already. + */ +bool +TransactionStartedDuringRecovery(void) +{ + return CurrentTransactionState->startedInRecovery; +} + +/* + * EnterParallelMode + */ +void +EnterParallelMode(void) +{ + TransactionState s = CurrentTransactionState; + + Assert(s->parallelModeLevel >= 0); + + ++s->parallelModeLevel; +} + +/* + * ExitParallelMode + */ +void +ExitParallelMode(void) +{ + TransactionState s = CurrentTransactionState; + + Assert(s->parallelModeLevel > 0); + Assert(s->parallelModeLevel > 1 || !ParallelContextActive()); + + --s->parallelModeLevel; +} + +/* + * IsInParallelMode + * + * Are we in a parallel operation, as either the leader or a worker? Check + * this to prohibit operations that change backend-local state expected to + * match across all workers. Mere caches usually don't require such a + * restriction. State modified in a strict push/pop fashion, such as the + * active snapshot stack, is often fine. + */ +bool +IsInParallelMode(void) +{ + return CurrentTransactionState->parallelModeLevel != 0; +} + +/* + * CommandCounterIncrement + */ +void +CommandCounterIncrement(void) +{ + /* + * If the current value of the command counter hasn't been "used" to mark + * tuples, we need not increment it, since there's no need to distinguish + * a read-only command from others. This helps postpone command counter + * overflow, and keeps no-op CommandCounterIncrement operations cheap. 
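Both membership tests above depend on the XID arrays being kept in sorted order, so each lookup is a plain binary search. Stripped of the transaction-specific types, the search reduces to the following standalone sketch:

#include <stdbool.h>
#include <stddef.h>

static bool
demo_xid_member(const unsigned *xids, size_t n, unsigned key)
{
    size_t  low = 0;

    /* Classic binary search over a sorted array; O(log n) probes. */
    while (low < n)
    {
        size_t      middle = low + (n - low) / 2;
        unsigned    probe = xids[middle];

        if (probe == key)
            return true;
        if (probe < key)
            low = middle + 1;
        else
            n = middle;
    }
    return false;
}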
+ */ + if (currentCommandIdUsed) + { + /* + * Workers synchronize transaction state at the beginning of each + * parallel operation, so we can't account for new commands after that + * point. + */ + if (IsInParallelMode() || IsParallelWorker()) + elog(ERROR, "cannot start commands during a parallel operation"); + + currentCommandId += 1; + if (currentCommandId == InvalidCommandId) + { + currentCommandId -= 1; + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than 2^32-2 commands in a transaction"))); + } + currentCommandIdUsed = false; + + /* Propagate new command ID into static snapshots */ + SnapshotSetCommandId(currentCommandId); + + /* + * Make any catalog changes done by the just-completed command visible + * in the local syscache. We obviously don't need to do this after a + * read-only command. (But see hacks in inval.c to make real sure we + * don't think a command that queued inval messages was read-only.) + */ + AtCCI_LocalCache(); + } +} + +/* + * ForceSyncCommit + * + * Interface routine to allow commands to force a synchronous commit of the + * current top-level transaction. Currently, two-phase commit does not + * persist and restore this variable. So long as all callers use + * PreventInTransactionBlock(), that omission has no consequences. + */ +void +ForceSyncCommit(void) +{ + forceSyncCommit = true; +} + + +/* ---------------------------------------------------------------- + * StartTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * AtStart_Cache + */ +static void +AtStart_Cache(void) +{ + AcceptInvalidationMessages(); +} + +/* + * AtStart_Memory + */ +static void +AtStart_Memory(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * If this is the first time through, create a private context for + * AbortTransaction to work in. By reserving some space now, we can + * insulate AbortTransaction from out-of-memory scenarios. Like + * ErrorContext, we set it up with slow growth rate and a nonzero minimum + * size, so that space will be reserved immediately. + */ + if (TransactionAbortContext == NULL) + TransactionAbortContext = + AllocSetContextCreate(TopMemoryContext, + "TransactionAbortContext", + 32 * 1024, + 32 * 1024, + 32 * 1024); + + /* + * We shouldn't have a transaction context already. + */ + Assert(TopTransactionContext == NULL); + + /* + * Create a toplevel context for the transaction. + */ + TopTransactionContext = + AllocSetContextCreate(TopMemoryContext, + "TopTransactionContext", + ALLOCSET_DEFAULT_SIZES); + + /* + * In a top-level transaction, CurTransactionContext is the same as + * TopTransactionContext. + */ + CurTransactionContext = TopTransactionContext; + s->curTransactionContext = CurTransactionContext; + + /* Make the CurTransactionContext active. */ + MemoryContextSwitchTo(CurTransactionContext); +} + +/* + * AtStart_ResourceOwner + */ +static void +AtStart_ResourceOwner(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * We shouldn't have a transaction resource owner already. + */ + Assert(TopTransactionResourceOwner == NULL); + + /* + * Create a toplevel resource owner for the transaction. 
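The practical upshot of CommandCounterIncrement() is that a command's own changes become visible to later steps in the same transaction only after the counter has been bumped. A hedged sketch of a command relying on that; both helper calls are placeholders, not real PostgreSQL functions:

#include "postgres.h"
#include "access/xact.h"

/* Hypothetical placeholders, not real PostgreSQL functions. */
extern void make_a_catalog_change(void);
extern void read_back_that_change(void);

/*
 * Hedged sketch: a command that modifies the catalogs and then needs its own
 * next step to see that modification.
 */
static void
demo_two_step_command(void)
{
    make_a_catalog_change();    /* the current command id is now "used" */

    CommandCounterIncrement();  /* make the change visible to what follows */

    read_back_that_change();    /* later steps now see it */
}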
+ */ + s->curTransactionOwner = ResourceOwnerCreate(NULL, "TopTransaction"); + + TopTransactionResourceOwner = s->curTransactionOwner; + CurTransactionResourceOwner = s->curTransactionOwner; + CurrentResourceOwner = s->curTransactionOwner; +} + +/* ---------------------------------------------------------------- + * StartSubTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * AtSubStart_Memory + */ +static void +AtSubStart_Memory(void) +{ + TransactionState s = CurrentTransactionState; + + Assert(CurTransactionContext != NULL); + + /* + * Create a CurTransactionContext, which will be used to hold data that + * survives subtransaction commit but disappears on subtransaction abort. + * We make it a child of the immediate parent's CurTransactionContext. + */ + CurTransactionContext = AllocSetContextCreate(CurTransactionContext, + "CurTransactionContext", + ALLOCSET_DEFAULT_SIZES); + s->curTransactionContext = CurTransactionContext; + + /* Make the CurTransactionContext active. */ + MemoryContextSwitchTo(CurTransactionContext); +} + +/* + * AtSubStart_ResourceOwner + */ +static void +AtSubStart_ResourceOwner(void) +{ + TransactionState s = CurrentTransactionState; + + Assert(s->parent != NULL); + + /* + * Create a resource owner for the subtransaction. We make it a child of + * the immediate parent's resource owner. + */ + s->curTransactionOwner = + ResourceOwnerCreate(s->parent->curTransactionOwner, + "SubTransaction"); + + CurTransactionResourceOwner = s->curTransactionOwner; + CurrentResourceOwner = s->curTransactionOwner; +} + +/* ---------------------------------------------------------------- + * CommitTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * RecordTransactionCommit + * + * Returns latest XID among xact and its children, or InvalidTransactionId + * if the xact has no XID. (We compute that here just because it's easier.) + * + * If you change this function, see RecordTransactionCommitPrepared also. + */ +static TransactionId +RecordTransactionCommit(void) +{ + TransactionId xid = GetTopTransactionIdIfAny(); + bool markXidCommitted = TransactionIdIsValid(xid); + TransactionId latestXid = InvalidTransactionId; + int nrels; + RelFileNode *rels; + int nchildren; + TransactionId *children; + int ndroppedstats = 0; + xl_xact_stats_item *droppedstats = NULL; + int nmsgs = 0; + SharedInvalidationMessage *invalMessages = NULL; + bool RelcacheInitFileInval = false; + bool wrote_xlog; + + /* + * Log pending invalidations for logical decoding of in-progress + * transactions. Normally for DDLs, we log this at each command end, + * however, for certain cases where we directly update the system table + * without a transaction block, the invalidations are not logged till this + * time. + */ + if (XLogLogicalInfoActive()) + LogLogicalInvalidations(); + + /* Get data needed for commit record */ + nrels = smgrGetPendingDeletes(true, &rels); + nchildren = xactGetCommittedChildren(&children); + ndroppedstats = pgstat_get_transactional_drops(true, &droppedstats); + if (XLogStandbyInfoActive()) + nmsgs = xactGetCommittedInvalidationMessages(&invalMessages, + &RelcacheInitFileInval); + wrote_xlog = (XactLastRecEnd != 0); + + /* + * If we haven't been assigned an XID yet, we neither can, nor do we want + * to write a COMMIT record. 
+ */ + if (!markXidCommitted) + { + /* + * We expect that every RelationDropStorage is followed by a catalog + * update, and hence XID assignment, so we shouldn't get here with any + * pending deletes. Same is true for dropping stats. + * + * Use a real test not just an Assert to check this, since it's a bit + * fragile. + */ + if (nrels != 0 || ndroppedstats != 0) + elog(ERROR, "cannot commit a transaction that deleted files but has no xid"); + + /* Can't have child XIDs either; AssignTransactionId enforces this */ + Assert(nchildren == 0); + + /* + * Transactions without an assigned xid can contain invalidation + * messages (e.g. explicit relcache invalidations or catcache + * invalidations for inplace updates); standbys need to process those. + * We can't emit a commit record without an xid, and we don't want to + * force assigning an xid, because that'd be problematic for e.g. + * vacuum. Hence we emit a bespoke record for the invalidations. We + * don't want to use that in case a commit record is emitted, so they + * happen synchronously with commits (besides not wanting to emit more + * WAL records). + */ + if (nmsgs != 0) + { + LogStandbyInvalidations(nmsgs, invalMessages, + RelcacheInitFileInval); + wrote_xlog = true; /* not strictly necessary */ + } + + /* + * If we didn't create XLOG entries, we're done here; otherwise we + * should trigger flushing those entries the same as a commit record + * would. This will primarily happen for HOT pruning and the like; we + * want these to be flushed to disk in due time. + */ + if (!wrote_xlog) + goto cleanup; + } + else + { + bool replorigin; + + /* + * Are we using the replication origins feature? Or, in other words, + * are we replaying remote actions? + */ + replorigin = (replorigin_session_origin != InvalidRepOriginId && + replorigin_session_origin != DoNotReplicateId); + + /* + * Begin commit critical section and insert the commit XLOG record. + */ + /* Tell bufmgr and smgr to prepare for commit */ + BufmgrCommit(); + + /* + * Mark ourselves as within our "commit critical section". This + * forces any concurrent checkpoint to wait until we've updated + * pg_xact. Without this, it is possible for the checkpoint to set + * REDO after the XLOG record but fail to flush the pg_xact update to + * disk, leading to loss of the transaction commit if the system + * crashes a little later. + * + * Note: we could, but don't bother to, set this flag in + * RecordTransactionAbort. That's because loss of a transaction abort + * is noncritical; the presumption would be that it aborted, anyway. + * + * It's safe to change the delayChkptFlags flag of our own backend + * without holding the ProcArrayLock, since we're the only one + * modifying it. This makes checkpoint's determination of which xacts + * are delaying the checkpoint a bit fuzzy, but it doesn't matter. + */ + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); + START_CRIT_SECTION(); + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + + SetCurrentTransactionStopTimestamp(); + + XactLogCommitRecord(xactStopTimestamp, + nchildren, children, nrels, rels, + ndroppedstats, droppedstats, + nmsgs, invalMessages, + RelcacheInitFileInval, + MyXactFlags, + InvalidTransactionId, NULL /* plain commit */ ); + + if (replorigin) + /* Move LSNs forward for this replication origin */ + replorigin_session_advance(replorigin_session_origin_lsn, + XactLastRecEnd); + + /* + * Record commit timestamp. 
The value comes from plain commit + * timestamp if there's no replication origin; otherwise, the + * timestamp was already set in replorigin_session_origin_timestamp by + * replication. + * + * We don't need to WAL-log anything here, as the commit record + * written above already contains the data. + */ + + if (!replorigin || replorigin_session_origin_timestamp == 0) + replorigin_session_origin_timestamp = xactStopTimestamp; + + TransactionTreeSetCommitTsData(xid, nchildren, children, + replorigin_session_origin_timestamp, + replorigin_session_origin); + } + + /* + * Check if we want to commit asynchronously. We can allow the XLOG flush + * to happen asynchronously if synchronous_commit=off, or if the current + * transaction has not performed any WAL-logged operation or didn't assign + * an xid. The transaction can end up not writing any WAL, even if it has + * an xid, if it only wrote to temporary and/or unlogged tables. It can + * end up having written WAL without an xid if it did HOT pruning. In + * case of a crash, the loss of such a transaction will be irrelevant; + * temp tables will be lost anyway, unlogged tables will be truncated and + * HOT pruning will be done again later. (Given the foregoing, you might + * think that it would be unnecessary to emit the XLOG record at all in + * this case, but we don't currently try to do that. It would certainly + * cause problems at least in Hot Standby mode, where the + * KnownAssignedXids machinery requires tracking every XID assignment. It + * might be OK to skip it only when wal_level < replica, but for now we + * don't.) + * + * However, if we're doing cleanup of any non-temp rels or committing any + * command that wanted to force sync commit, then we must flush XLOG + * immediately. (We must not allow asynchronous commit if there are any + * non-temp tables to be deleted, because we might delete the files before + * the COMMIT record is flushed to disk. We do allow asynchronous commit + * if all to-be-deleted tables are temporary though, since they are lost + * anyway if we crash.) + */ + if ((wrote_xlog && markXidCommitted && + synchronous_commit > SYNCHRONOUS_COMMIT_OFF) || + forceSyncCommit || nrels > 0) + { + XLogFlush(XactLastRecEnd); + + /* + * Now we may update the CLOG, if we wrote a COMMIT record above + */ + if (markXidCommitted) + TransactionIdCommitTree(xid, nchildren, children); + } + else + { + /* + * Asynchronous commit case: + * + * This enables possible committed transaction loss in the case of a + * postmaster crash because WAL buffers are left unwritten. Ideally we + * could issue the WAL write without the fsync, but some + * wal_sync_methods do not allow separate write/fsync. + * + * Report the latest async commit LSN, so that the WAL writer knows to + * flush this commit. + */ + XLogSetAsyncXactLSN(XactLastRecEnd); + + /* + * We must not immediately update the CLOG, since we didn't flush the + * XLOG. Instead, we store the LSN up to which the XLOG must be + * flushed before the CLOG may be updated. + */ + if (markXidCommitted) + TransactionIdAsyncCommitTree(xid, nchildren, children, XactLastRecEnd); + } + + /* + * If we entered a commit critical section, leave it now, and let + * checkpoints proceed. + */ + if (markXidCommitted) + { + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + END_CRIT_SECTION(); + } + + /* Compute latestXid while we have the child XIDs handy */ + latestXid = TransactionIdLatest(xid, nchildren, children); + + /* + * Wait for synchronous replication, if required. 
Similar to the decision + * above about using committing asynchronously we only want to wait if + * this backend assigned an xid and wrote WAL. No need to wait if an xid + * was assigned due to temporary/unlogged tables or due to HOT pruning. + * + * Note that at this stage we have marked clog, but still show as running + * in the procarray and continue to hold locks. + */ + if (wrote_xlog && markXidCommitted) + SyncRepWaitForLSN(XactLastRecEnd, true); + + /* remember end of last commit record */ + XactLastCommitEnd = XactLastRecEnd; + + /* Reset XactLastRecEnd until the next transaction writes something */ + XactLastRecEnd = 0; +cleanup: + /* Clean up local data */ + if (rels) + pfree(rels); + if (ndroppedstats) + pfree(droppedstats); + + return latestXid; +} + + +/* + * AtCCI_LocalCache + */ +static void +AtCCI_LocalCache(void) +{ + /* + * Make any pending relation map changes visible. We must do this before + * processing local sinval messages, so that the map changes will get + * reflected into the relcache when relcache invals are processed. + */ + AtCCI_RelationMap(); + + /* + * Make catalog changes visible to me for the next command. + */ + CommandEndInvalidationMessages(); +} + +/* + * AtCommit_Memory + */ +static void +AtCommit_Memory(void) +{ + /* + * Now that we're "out" of a transaction, have the system allocate things + * in the top memory context instead of per-transaction contexts. + */ + MemoryContextSwitchTo(TopMemoryContext); + + /* + * Release all transaction-local memory. + */ + Assert(TopTransactionContext != NULL); + MemoryContextDelete(TopTransactionContext); + TopTransactionContext = NULL; + CurTransactionContext = NULL; + CurrentTransactionState->curTransactionContext = NULL; +} + +/* ---------------------------------------------------------------- + * CommitSubTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * AtSubCommit_Memory + */ +static void +AtSubCommit_Memory(void) +{ + TransactionState s = CurrentTransactionState; + + Assert(s->parent != NULL); + + /* Return to parent transaction level's memory context. */ + CurTransactionContext = s->parent->curTransactionContext; + MemoryContextSwitchTo(CurTransactionContext); + + /* + * Ordinarily we cannot throw away the child's CurTransactionContext, + * since the data it contains will be needed at upper commit. However, if + * there isn't actually anything in it, we can throw it away. This avoids + * a small memory leak in the common case of "trivial" subxacts. + */ + if (MemoryContextIsEmpty(s->curTransactionContext)) + { + MemoryContextDelete(s->curTransactionContext); + s->curTransactionContext = NULL; + } +} + +/* + * AtSubCommit_childXids + * + * Pass my own XID and my child XIDs up to my parent as committed children. + */ +static void +AtSubCommit_childXids(void) +{ + TransactionState s = CurrentTransactionState; + int new_nChildXids; + + Assert(s->parent != NULL); + + /* + * The parent childXids array will need to hold my XID and all my + * childXids, in addition to the XIDs already there. + */ + new_nChildXids = s->parent->nChildXids + s->nChildXids + 1; + + /* Allocate or enlarge the parent array if necessary */ + if (s->parent->maxChildXids < new_nChildXids) + { + int new_maxChildXids; + TransactionId *new_childXids; + + /* + * Make it 2x what's needed right now, to avoid having to enlarge it + * repeatedly. But we can't go above MaxAllocSize. 
(The latter limit + * is what ensures that we don't need to worry about integer overflow + * here or in the calculation of new_nChildXids.) + */ + new_maxChildXids = Min(new_nChildXids * 2, + (int) (MaxAllocSize / sizeof(TransactionId))); + + if (new_maxChildXids < new_nChildXids) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("maximum number of committed subtransactions (%d) exceeded", + (int) (MaxAllocSize / sizeof(TransactionId))))); + + /* + * We keep the child-XID arrays in TopTransactionContext; this avoids + * setting up child-transaction contexts for what might be just a few + * bytes of grandchild XIDs. + */ + if (s->parent->childXids == NULL) + new_childXids = + MemoryContextAlloc(TopTransactionContext, + new_maxChildXids * sizeof(TransactionId)); + else + new_childXids = repalloc(s->parent->childXids, + new_maxChildXids * sizeof(TransactionId)); + + s->parent->childXids = new_childXids; + s->parent->maxChildXids = new_maxChildXids; + } + + /* + * Copy all my XIDs to parent's array. + * + * Note: We rely on the fact that the XID of a child always follows that + * of its parent. By copying the XID of this subtransaction before the + * XIDs of its children, we ensure that the array stays ordered. Likewise, + * all XIDs already in the array belong to subtransactions started and + * subcommitted before us, so their XIDs must precede ours. + */ + s->parent->childXids[s->parent->nChildXids] = XidFromFullTransactionId(s->fullTransactionId); + + if (s->nChildXids > 0) + memcpy(&s->parent->childXids[s->parent->nChildXids + 1], + s->childXids, + s->nChildXids * sizeof(TransactionId)); + + s->parent->nChildXids = new_nChildXids; + + /* Release child's array to avoid leakage */ + if (s->childXids != NULL) + pfree(s->childXids); + /* We must reset these to avoid double-free if fail later in commit */ + s->childXids = NULL; + s->nChildXids = 0; + s->maxChildXids = 0; +} + +/* ---------------------------------------------------------------- + * AbortTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * RecordTransactionAbort + * + * Returns latest XID among xact and its children, or InvalidTransactionId + * if the xact has no XID. (We compute that here just because it's easier.) + */ +static TransactionId +RecordTransactionAbort(bool isSubXact) +{ + TransactionId xid = GetCurrentTransactionIdIfAny(); + TransactionId latestXid; + int nrels; + RelFileNode *rels; + int ndroppedstats = 0; + xl_xact_stats_item *droppedstats = NULL; + int nchildren; + TransactionId *children; + TimestampTz xact_time; + + /* + * If we haven't been assigned an XID, nobody will care whether we aborted + * or not. Hence, we're done in that case. It does not matter if we have + * rels to delete (note that this routine is not responsible for actually + * deleting 'em). We cannot have any child XIDs, either. + */ + if (!TransactionIdIsValid(xid)) + { + /* Reset XactLastRecEnd until the next transaction writes something */ + if (!isSubXact) + XactLastRecEnd = 0; + return InvalidTransactionId; + } + + /* + * We have a valid XID, so we should write an ABORT record for it. + * + * We do not flush XLOG to disk here, since the default assumption after a + * crash would be that we aborted, anyway. For the same reason, we don't + * need to worry about interlocking against checkpoint start. + */ + + /* + * Check that we haven't aborted halfway through RecordTransactionCommit. 
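The enlargement above is ordinary amortized doubling, clamped so the array can never exceed MaxAllocSize worth of TransactionIds; if even the clamped size is too small, the "maximum number of committed subtransactions" error is raised. The arithmetic in isolation, as a standalone sketch with a made-up cap:

#include <stdio.h>

#define DEMO_MAX_ELEMS 1024     /* stands in for MaxAllocSize / sizeof(TransactionId) */

/* Return the new capacity for an array that must now hold "needed" entries. */
static int
demo_new_capacity(int needed)
{
    int     cap = needed * 2;   /* double, to amortize repeated enlargements */

    if (cap > DEMO_MAX_ELEMS)
        cap = DEMO_MAX_ELEMS;   /* but never beyond the allocation limit */
    return cap;                 /* caller errors out if cap < needed */
}

int
main(void)
{
    printf("%d\n", demo_new_capacity(300)); /* 600 */
    printf("%d\n", demo_new_capacity(700)); /* clamped to 1024 */
    return 0;
}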
+ */ + if (TransactionIdDidCommit(xid)) + elog(PANIC, "cannot abort transaction %u, it was already committed", + xid); + + /* Fetch the data we need for the abort record */ + nrels = smgrGetPendingDeletes(false, &rels); + nchildren = xactGetCommittedChildren(&children); + ndroppedstats = pgstat_get_transactional_drops(false, &droppedstats); + + /* XXX do we really need a critical section here? */ + START_CRIT_SECTION(); + + /* Write the ABORT record */ + if (isSubXact) + xact_time = GetCurrentTimestamp(); + else + { + SetCurrentTransactionStopTimestamp(); + xact_time = xactStopTimestamp; + } + + XactLogAbortRecord(xact_time, + nchildren, children, + nrels, rels, + ndroppedstats, droppedstats, + MyXactFlags, InvalidTransactionId, + NULL); + + /* + * Report the latest async abort LSN, so that the WAL writer knows to + * flush this abort. There's nothing to be gained by delaying this, since + * WALWriter may as well do this when it can. This is important with + * streaming replication because if we don't flush WAL regularly we will + * find that large aborts leave us with a long backlog for when commits + * occur after the abort, increasing our window of data loss should + * problems occur at that point. + */ + if (!isSubXact) + XLogSetAsyncXactLSN(XactLastRecEnd); + + /* + * Mark the transaction aborted in clog. This is not absolutely necessary + * but we may as well do it while we are here; also, in the subxact case + * it is helpful because XactLockTableWait makes use of it to avoid + * waiting for already-aborted subtransactions. It is OK to do it without + * having flushed the ABORT record to disk, because in event of a crash + * we'd be assumed to have aborted anyway. + */ + TransactionIdAbortTree(xid, nchildren, children); + + END_CRIT_SECTION(); + + /* Compute latestXid while we have the child XIDs handy */ + latestXid = TransactionIdLatest(xid, nchildren, children); + + /* + * If we're aborting a subtransaction, we can immediately remove failed + * XIDs from PGPROC's cache of running child XIDs. We do that here for + * subxacts, because we already have the child XID array at hand. For + * main xacts, the equivalent happens just after this function returns. + */ + if (isSubXact) + XidCacheRemoveRunningXids(xid, nchildren, children, latestXid); + + /* Reset XactLastRecEnd until the next transaction writes something */ + if (!isSubXact) + XactLastRecEnd = 0; + + /* And clean up local data */ + if (rels) + pfree(rels); + if (ndroppedstats) + pfree(droppedstats); + + return latestXid; +} + +/* + * AtAbort_Memory + */ +static void +AtAbort_Memory(void) +{ + /* + * Switch into TransactionAbortContext, which should have some free space + * even if nothing else does. We'll work in this context until we've + * finished cleaning up. + * + * It is barely possible to get here when we've not been able to create + * TransactionAbortContext yet; if so use TopMemoryContext. 
+ */ + if (TransactionAbortContext != NULL) + MemoryContextSwitchTo(TransactionAbortContext); + else + MemoryContextSwitchTo(TopMemoryContext); +} + +/* + * AtSubAbort_Memory + */ +static void +AtSubAbort_Memory(void) +{ + Assert(TransactionAbortContext != NULL); + + MemoryContextSwitchTo(TransactionAbortContext); +} + + +/* + * AtAbort_ResourceOwner + */ +static void +AtAbort_ResourceOwner(void) +{ + /* + * Make sure we have a valid ResourceOwner, if possible (else it will be + * NULL, which is OK) + */ + CurrentResourceOwner = TopTransactionResourceOwner; +} + +/* + * AtSubAbort_ResourceOwner + */ +static void +AtSubAbort_ResourceOwner(void) +{ + TransactionState s = CurrentTransactionState; + + /* Make sure we have a valid ResourceOwner */ + CurrentResourceOwner = s->curTransactionOwner; +} + + +/* + * AtSubAbort_childXids + */ +static void +AtSubAbort_childXids(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * We keep the child-XID arrays in TopTransactionContext (see + * AtSubCommit_childXids). This means we'd better free the array + * explicitly at abort to avoid leakage. + */ + if (s->childXids != NULL) + pfree(s->childXids); + s->childXids = NULL; + s->nChildXids = 0; + s->maxChildXids = 0; + + /* + * We could prune the unreportedXids array here. But we don't bother. That + * would potentially reduce number of XLOG_XACT_ASSIGNMENT records but it + * would likely introduce more CPU time into the more common paths, so we + * choose not to do that. + */ +} + +/* ---------------------------------------------------------------- + * CleanupTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * AtCleanup_Memory + */ +static void +AtCleanup_Memory(void) +{ + Assert(CurrentTransactionState->parent == NULL); + + /* + * Now that we're "out" of a transaction, have the system allocate things + * in the top memory context instead of per-transaction contexts. + */ + MemoryContextSwitchTo(TopMemoryContext); + + /* + * Clear the special abort context for next time. + */ + if (TransactionAbortContext != NULL) + MemoryContextResetAndDeleteChildren(TransactionAbortContext); + + /* + * Release all transaction-local memory. + */ + if (TopTransactionContext != NULL) + MemoryContextDelete(TopTransactionContext); + TopTransactionContext = NULL; + CurTransactionContext = NULL; + CurrentTransactionState->curTransactionContext = NULL; +} + + +/* ---------------------------------------------------------------- + * CleanupSubTransaction stuff + * ---------------------------------------------------------------- + */ + +/* + * AtSubCleanup_Memory + */ +static void +AtSubCleanup_Memory(void) +{ + TransactionState s = CurrentTransactionState; + + Assert(s->parent != NULL); + + /* Make sure we're not in an about-to-be-deleted context */ + MemoryContextSwitchTo(s->parent->curTransactionContext); + CurTransactionContext = s->parent->curTransactionContext; + + /* + * Clear the special abort context for next time. + */ + if (TransactionAbortContext != NULL) + MemoryContextResetAndDeleteChildren(TransactionAbortContext); + + /* + * Delete the subxact local memory contexts. Its CurTransactionContext can + * go too (note this also kills CurTransactionContexts from any children + * of the subxact). 
+ */ + if (s->curTransactionContext) + MemoryContextDelete(s->curTransactionContext); + s->curTransactionContext = NULL; +} + +/* ---------------------------------------------------------------- + * interface routines + * ---------------------------------------------------------------- + */ + +/* + * StartTransaction + */ +static void +StartTransaction(void) +{ + TransactionState s; + VirtualTransactionId vxid; + + /* + * Let's just make sure the state stack is empty + */ + s = &TopTransactionStateData; + CurrentTransactionState = s; + + Assert(!FullTransactionIdIsValid(XactTopFullTransactionId)); + + /* check the current transaction state */ + Assert(s->state == TRANS_DEFAULT); + + /* + * Set the current transaction state information appropriately during + * start processing. Note that once the transaction status is switched + * this process cannot fail until the user ID and the security context + * flags are fetched below. + */ + s->state = TRANS_START; + s->fullTransactionId = InvalidFullTransactionId; /* until assigned */ + + /* Determine if statements are logged in this transaction */ + xact_is_sampled = log_xact_sample_rate != 0 && + (log_xact_sample_rate == 1 || + pg_prng_double(&pg_global_prng_state) <= log_xact_sample_rate); + + /* + * initialize current transaction state fields + * + * note: prevXactReadOnly is not used at the outermost level + */ + s->nestingLevel = 1; + s->gucNestLevel = 1; + s->childXids = NULL; + s->nChildXids = 0; + s->maxChildXids = 0; + + /* + * Once the current user ID and the security context flags are fetched, + * both will be properly reset even if transaction startup fails. + */ + GetUserIdAndSecContext(&s->prevUser, &s->prevSecContext); + + /* SecurityRestrictionContext should never be set outside a transaction */ + Assert(s->prevSecContext == 0); + + /* + * Make sure we've reset xact state variables + * + * If recovery is still in progress, mark this transaction as read-only. + * We have lower level defences in XLogInsert and elsewhere to stop us + * from modifying data during recovery, but this gives the normal + * indication to the user that the transaction is read-only. + */ + if (RecoveryInProgress()) + { + s->startedInRecovery = true; + XactReadOnly = true; + } + else + { + s->startedInRecovery = false; + XactReadOnly = DefaultXactReadOnly; + } + XactDeferrable = DefaultXactDeferrable; + XactIsoLevel = DefaultXactIsoLevel; + forceSyncCommit = false; + MyXactFlags = 0; + + /* + * reinitialize within-transaction counters + */ + s->subTransactionId = TopSubTransactionId; + currentSubTransactionId = TopSubTransactionId; + currentCommandId = FirstCommandId; + currentCommandIdUsed = false; + + /* + * initialize reported xid accounting + */ + nUnreportedXids = 0; + s->didLogXid = false; + + /* + * must initialize resource-management stuff first + */ + AtStart_Memory(); + AtStart_ResourceOwner(); + + /* + * Assign a new LocalTransactionId, and combine it with the backendId to + * form a virtual transaction id. + */ + vxid.backendId = MyBackendId; + vxid.localTransactionId = GetNextLocalTransactionId(); + + /* + * Lock the virtual transaction id before we announce it in the proc array + */ + VirtualXactLockTableInsert(vxid); + + /* + * Advertise it in the proc array. We assume assignment of + * localTransactionId is atomic, and the backendId should be set already. 
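The xact_is_sampled computation in StartTransaction above is a three-way decision: a log_xact_sample_rate of 0 never samples, 1 always samples, and anything in between compares one uniform random draw against the rate. The same decision in standalone form, using rand() rather than PostgreSQL's PRNG:

#include <stdbool.h>
#include <stdlib.h>

static bool
demo_sample_this_xact(double rate)
{
    if (rate == 0)
        return false;           /* never log */
    if (rate == 1)
        return true;            /* always log, no random draw needed */

    /* Uniform draw in [0,1]; log the transaction with probability "rate". */
    return ((double) rand() / RAND_MAX) <= rate;
}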
+ */ + Assert(MyProc->backendId == vxid.backendId); + MyProc->lxid = vxid.localTransactionId; + + TRACE_POSTGRESQL_TRANSACTION_START(vxid.localTransactionId); + + /* + * set transaction_timestamp() (a/k/a now()). Normally, we want this to + * be the same as the first command's statement_timestamp(), so don't do a + * fresh GetCurrentTimestamp() call (which'd be expensive anyway). But + * for transactions started inside procedures (i.e., nonatomic SPI + * contexts), we do need to advance the timestamp. Also, in a parallel + * worker, the timestamp should already have been provided by a call to + * SetParallelStartTimestamps(). + */ + if (!IsParallelWorker()) + { + if (!SPI_inside_nonatomic_context()) + xactStartTimestamp = stmtStartTimestamp; + else + xactStartTimestamp = GetCurrentTimestamp(); + } + else + Assert(xactStartTimestamp != 0); + pgstat_report_xact_timestamp(xactStartTimestamp); + /* Mark xactStopTimestamp as unset. */ + xactStopTimestamp = 0; + + /* + * initialize other subsystems for new transaction + */ + AtStart_GUC(); + AtStart_Cache(); + AfterTriggerBeginXact(); + + /* + * done with start processing, set current transaction state to "in + * progress" + */ + s->state = TRANS_INPROGRESS; + + ShowTransactionState("StartTransaction"); +} + + +/* + * CommitTransaction + * + * NB: if you change this routine, better look at PrepareTransaction too! + */ +static void +CommitTransaction(void) +{ + TransactionState s = CurrentTransactionState; + TransactionId latestXid; + bool is_parallel_worker; + + is_parallel_worker = (s->blockState == TBLOCK_PARALLEL_INPROGRESS); + + /* Enforce parallel mode restrictions during parallel worker commit. */ + if (is_parallel_worker) + EnterParallelMode(); + + ShowTransactionState("CommitTransaction"); + + /* + * check the current transaction state + */ + if (s->state != TRANS_INPROGRESS) + elog(WARNING, "CommitTransaction while in %s state", + TransStateAsString(s->state)); + Assert(s->parent == NULL); + + /* + * Do pre-commit processing that involves calling user-defined code, such + * as triggers. SECURITY_RESTRICTED_OPERATION contexts must not queue an + * action that would run here, because that would bypass the sandbox. + * Since closing cursors could queue trigger actions, triggers could open + * cursors, etc, we have to keep looping until there's nothing left to do. + */ + for (;;) + { + /* + * Fire all currently pending deferred triggers. + */ + AfterTriggerFireDeferred(); + + /* + * Close open portals (converting holdable ones into static portals). + * If there weren't any, we are done ... otherwise loop back to check + * if they queued deferred triggers. Lather, rinse, repeat. + */ + if (!PreCommit_Portals(false)) + break; + } + + /* + * The remaining actions cannot call any user-defined code, so it's safe + * to start shutting down within-transaction services. But note that most + * of this stuff could still throw an error, which would switch us into + * the transaction-abort path. + */ + + CallXactCallbacks(is_parallel_worker ? XACT_EVENT_PARALLEL_PRE_COMMIT + : XACT_EVENT_PRE_COMMIT); + + /* If we might have parallel workers, clean them up now. */ + if (IsInParallelMode()) + AtEOXact_Parallel(true); + + /* Shut down the deferred-trigger manager */ + AfterTriggerEndXact(true); + + /* + * Let ON COMMIT management do its thing (must happen after closing + * cursors, to avoid dangling-reference problems) + */ + PreCommit_on_commit_actions(); + + /* + * Synchronize files that are created and not WAL-logged during this + * transaction. 
This must happen before AtEOXact_RelationMap(), so that we + * don't see committed-but-broken files after a crash. + */ + smgrDoPendingSyncs(true, is_parallel_worker); + + /* close large objects before lower-level cleanup */ + AtEOXact_LargeObject(true); + + /* + * Insert notifications sent by NOTIFY commands into the queue. This + * should be late in the pre-commit sequence to minimize time spent + * holding the notify-insertion lock. However, this could result in + * creating a snapshot, so we must do it before serializable cleanup. + */ + PreCommit_Notify(); + + /* + * Mark serializable transaction as complete for predicate locking + * purposes. This should be done as late as we can put it and still allow + * errors to be raised for failure patterns found at commit. This is not + * appropriate in a parallel worker however, because we aren't committing + * the leader's transaction and its serializable state will live on. + */ + if (!is_parallel_worker) + PreCommit_CheckForSerializationFailure(); + + /* Prevent cancel/die interrupt while cleaning up */ + HOLD_INTERRUPTS(); + + /* Commit updates to the relation map --- do this as late as possible */ + AtEOXact_RelationMap(true, is_parallel_worker); + + /* + * set the current transaction state information appropriately during + * commit processing + */ + s->state = TRANS_COMMIT; + s->parallelModeLevel = 0; + + if (!is_parallel_worker) + { + /* + * We need to mark our XIDs as committed in pg_xact. This is where we + * durably commit. + */ + latestXid = RecordTransactionCommit(); + } + else + { + /* + * We must not mark our XID committed; the parallel leader is + * responsible for that. + */ + latestXid = InvalidTransactionId; + + /* + * Make sure the leader will know about any WAL we wrote before it + * commits. + */ + ParallelWorkerReportLastRecEnd(XactLastRecEnd); + } + + TRACE_POSTGRESQL_TRANSACTION_COMMIT(MyProc->lxid); + + /* + * Let others know about no transaction in progress by me. Note that this + * must be done _before_ releasing locks we hold and _after_ + * RecordTransactionCommit. + */ + ProcArrayEndTransaction(MyProc, latestXid); + + /* + * This is all post-commit cleanup. Note that if an error is raised here, + * it's too late to abort the transaction. This should be just + * noncritical resource releasing. + * + * The ordering of operations is not entirely random. The idea is: + * release resources visible to other backends (eg, files, buffer pins); + * then release locks; then release backend-local resources. We want to + * release locks at the point where any backend waiting for us will see + * our transaction as being fully cleaned up. + * + * Resources that can be associated with individual queries are handled by + * the ResourceOwner mechanism. The other calls here are for backend-wide + * state. + */ + + CallXactCallbacks(is_parallel_worker ? XACT_EVENT_PARALLEL_COMMIT + : XACT_EVENT_COMMIT); + + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_BEFORE_LOCKS, + true, true); + + /* Check we've released all buffer pins */ + AtEOXact_Buffers(true); + + /* Clean up the relation cache */ + AtEOXact_RelationCache(true); + + /* + * Make catalog changes visible to all backends. This has to happen after + * relcache references are dropped (see comments for + * AtEOXact_RelationCache), but before locks are released (if anyone is + * waiting for lock on a relation we've modified, we want them to know + * about the catalog change before they start using the relation). 
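Back near the top of CommitTransaction, deferred triggers and open portals are processed in a loop because each pass can create work for the other: firing triggers may open portals, and closing portals may queue more triggers. The control flow is a simple fixpoint loop, sketched standalone below with hypothetical step functions:

#include <stdbool.h>

extern void demo_fire_pending_triggers(void);   /* hypothetical */
extern bool demo_close_open_portals(void);      /* hypothetical: true if any were closed */

static void
demo_precommit_fixpoint(void)
{
    for (;;)
    {
        demo_fire_pending_triggers();

        /* Stop once a pass closes nothing, i.e. no new work can have been queued. */
        if (!demo_close_open_portals())
            break;
    }
}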
+ */ + AtEOXact_Inval(true); + + AtEOXact_MultiXact(); + + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_LOCKS, + true, true); + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_AFTER_LOCKS, + true, true); + + /* + * Likewise, dropping of files deleted during the transaction is best done + * after releasing relcache and buffer pins. (This is not strictly + * necessary during commit, since such pins should have been released + * already, but this ordering is definitely critical during abort.) Since + * this may take many seconds, also delay until after releasing locks. + * Other backends will observe the attendant catalog changes and not + * attempt to access affected files. + */ + smgrDoPendingDeletes(true); + + /* + * Send out notification signals to other backends (and do other + * post-commit NOTIFY cleanup). This must not happen until after our + * transaction is fully done from the viewpoint of other backends. + */ + AtCommit_Notify(); + + /* + * Everything after this should be purely internal-to-this-backend + * cleanup. + */ + AtEOXact_GUC(true, 1); + AtEOXact_SPI(true); + AtEOXact_Enum(); + AtEOXact_on_commit_actions(true); + AtEOXact_Namespace(true, is_parallel_worker); + AtEOXact_SMgr(); + AtEOXact_Files(true); + AtEOXact_ComboCid(); + AtEOXact_HashTables(true); + AtEOXact_PgStat(true, is_parallel_worker); + AtEOXact_Snapshot(true, false); + AtEOXact_ApplyLauncher(true); + pgstat_report_xact_timestamp(0); + + CurrentResourceOwner = NULL; + ResourceOwnerDelete(TopTransactionResourceOwner); + s->curTransactionOwner = NULL; + CurTransactionResourceOwner = NULL; + TopTransactionResourceOwner = NULL; + + AtCommit_Memory(); + + s->fullTransactionId = InvalidFullTransactionId; + s->subTransactionId = InvalidSubTransactionId; + s->nestingLevel = 0; + s->gucNestLevel = 0; + s->childXids = NULL; + s->nChildXids = 0; + s->maxChildXids = 0; + + XactTopFullTransactionId = InvalidFullTransactionId; + nParallelCurrentXids = 0; + + /* + * done with commit processing, set current transaction state back to + * default + */ + s->state = TRANS_DEFAULT; + + RESUME_INTERRUPTS(); +} + + +/* + * PrepareTransaction + * + * NB: if you change this routine, better look at CommitTransaction too! + */ +static void +PrepareTransaction(void) +{ + TransactionState s = CurrentTransactionState; + TransactionId xid = GetCurrentTransactionId(); + GlobalTransaction gxact; + TimestampTz prepared_at; + + Assert(!IsInParallelMode()); + + ShowTransactionState("PrepareTransaction"); + + /* + * check the current transaction state + */ + if (s->state != TRANS_INPROGRESS) + elog(WARNING, "PrepareTransaction while in %s state", + TransStateAsString(s->state)); + Assert(s->parent == NULL); + + /* + * Do pre-commit processing that involves calling user-defined code, such + * as triggers. Since closing cursors could queue trigger actions, + * triggers could open cursors, etc, we have to keep looping until there's + * nothing left to do. + */ + for (;;) + { + /* + * Fire all currently pending deferred triggers. + */ + AfterTriggerFireDeferred(); + + /* + * Close open portals (converting holdable ones into static portals). + * If there weren't any, we are done ... otherwise loop back to check + * if they queued deferred triggers. Lather, rinse, repeat. + */ + if (!PreCommit_Portals(true)) + break; + } + + CallXactCallbacks(XACT_EVENT_PRE_PREPARE); + + /* + * The remaining actions cannot call any user-defined code, so it's safe + * to start shutting down within-transaction services. 
But note that most + * of this stuff could still throw an error, which would switch us into + * the transaction-abort path. + */ + + /* Shut down the deferred-trigger manager */ + AfterTriggerEndXact(true); + + /* + * Let ON COMMIT management do its thing (must happen after closing + * cursors, to avoid dangling-reference problems) + */ + PreCommit_on_commit_actions(); + + /* + * Synchronize files that are created and not WAL-logged during this + * transaction. This must happen before EndPrepare(), so that we don't see + * committed-but-broken files after a crash and COMMIT PREPARED. + */ + smgrDoPendingSyncs(true, false); + + /* close large objects before lower-level cleanup */ + AtEOXact_LargeObject(true); + + /* NOTIFY requires no work at this point */ + + /* + * Mark serializable transaction as complete for predicate locking + * purposes. This should be done as late as we can put it and still allow + * errors to be raised for failure patterns found at commit. + */ + PreCommit_CheckForSerializationFailure(); + + /* + * Don't allow PREPARE TRANSACTION if we've accessed a temporary table in + * this transaction. Having the prepared xact hold locks on another + * backend's temp table seems a bad idea --- for instance it would prevent + * the backend from exiting. There are other problems too, such as how to + * clean up the source backend's local buffers and ON COMMIT state if the + * prepared xact includes a DROP of a temp table. + * + * Other objects types, like functions, operators or extensions, share the + * same restriction as they should not be created, locked or dropped as + * this can mess up with this session or even a follow-up session trying + * to use the same temporary namespace. + * + * We must check this after executing any ON COMMIT actions, because they + * might still access a temp relation. + * + * XXX In principle this could be relaxed to allow some useful special + * cases, such as a temp table created and dropped all within the + * transaction. That seems to require much more bookkeeping though. + */ + if ((MyXactFlags & XACT_FLAGS_ACCESSEDTEMPNAMESPACE)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot PREPARE a transaction that has operated on temporary objects"))); + + /* + * Likewise, don't allow PREPARE after pg_export_snapshot. This could be + * supported if we added cleanup logic to twophase.c, but for now it + * doesn't seem worth the trouble. + */ + if (XactHasExportedSnapshots()) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot PREPARE a transaction that has exported snapshots"))); + + /* Prevent cancel/die interrupt while cleaning up */ + HOLD_INTERRUPTS(); + + /* + * set the current transaction state information appropriately during + * prepare processing + */ + s->state = TRANS_PREPARE; + + prepared_at = GetCurrentTimestamp(); + + /* Tell bufmgr and smgr to prepare for commit */ + BufmgrCommit(); + + /* + * Reserve the GID for this transaction. This could fail if the requested + * GID is invalid or already in use. + */ + gxact = MarkAsPreparing(xid, prepareGID, prepared_at, + GetUserId(), MyDatabaseId); + prepareGID = NULL; + + /* + * Collect data for the 2PC state file. Note that in general, no actual + * state change should happen in the called modules during this step, + * since it's still possible to fail before commit, and in that case we + * want transaction abort to be able to clean up. (In particular, the + * AtPrepare routines may error out if they find cases they cannot + * handle.) 
State cleanup should happen in the PostPrepare routines + * below. However, some modules can go ahead and clear state here because + * they wouldn't do anything with it during abort anyway. + * + * Note: because the 2PC state file records will be replayed in the same + * order they are made, the order of these calls has to match the order in + * which we want things to happen during COMMIT PREPARED or ROLLBACK + * PREPARED; in particular, pay attention to whether things should happen + * before or after releasing the transaction's locks. + */ + StartPrepare(gxact); + + AtPrepare_Notify(); + AtPrepare_Locks(); + AtPrepare_PredicateLocks(); + AtPrepare_PgStat(); + AtPrepare_MultiXact(); + AtPrepare_RelationMap(); + + /* + * Here is where we really truly prepare. + * + * We have to record transaction prepares even if we didn't make any + * updates, because the transaction manager might get confused if we lose + * a global transaction. + */ + EndPrepare(gxact); + + /* + * Now we clean up backend-internal state and release internal resources. + */ + + /* Reset XactLastRecEnd until the next transaction writes something */ + XactLastRecEnd = 0; + + /* + * Transfer our locks to a dummy PGPROC. This has to be done before + * ProcArrayClearTransaction(). Otherwise, a GetLockConflicts() would + * conclude "xact already committed or aborted" for our locks. + */ + PostPrepare_Locks(xid); + + /* + * Let others know about no transaction in progress by me. This has to be + * done *after* the prepared transaction has been marked valid, else + * someone may think it is unlocked and recyclable. + */ + ProcArrayClearTransaction(MyProc); + + /* + * In normal commit-processing, this is all non-critical post-transaction + * cleanup. When the transaction is prepared, however, it's important + * that the locks and other per-backend resources are transferred to the + * prepared transaction's PGPROC entry. Note that if an error is raised + * here, it's too late to abort the transaction. XXX: This probably should + * be in a critical section, to force a PANIC if any of this fails, but + * that cure could be worse than the disease. + */ + + CallXactCallbacks(XACT_EVENT_PREPARE); + + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_BEFORE_LOCKS, + true, true); + + /* Check we've released all buffer pins */ + AtEOXact_Buffers(true); + + /* Clean up the relation cache */ + AtEOXact_RelationCache(true); + + /* notify doesn't need a postprepare call */ + + PostPrepare_PgStat(); + + PostPrepare_Inval(); + + PostPrepare_smgr(); + + PostPrepare_MultiXact(xid); + + PostPrepare_PredicateLocks(xid); + + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_LOCKS, + true, true); + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_AFTER_LOCKS, + true, true); + + /* + * Allow another backend to finish the transaction. After + * PostPrepare_Twophase(), the transaction is completely detached from our + * backend. The rest is just non-critical cleanup of backend-local state. 
+ */ + PostPrepare_Twophase(); + + /* PREPARE acts the same as COMMIT as far as GUC is concerned */ + AtEOXact_GUC(true, 1); + AtEOXact_SPI(true); + AtEOXact_Enum(); + AtEOXact_on_commit_actions(true); + AtEOXact_Namespace(true, false); + AtEOXact_SMgr(); + AtEOXact_Files(true); + AtEOXact_ComboCid(); + AtEOXact_HashTables(true); + /* don't call AtEOXact_PgStat here; we fixed pgstat state above */ + AtEOXact_Snapshot(true, true); + pgstat_report_xact_timestamp(0); + + CurrentResourceOwner = NULL; + ResourceOwnerDelete(TopTransactionResourceOwner); + s->curTransactionOwner = NULL; + CurTransactionResourceOwner = NULL; + TopTransactionResourceOwner = NULL; + + AtCommit_Memory(); + + s->fullTransactionId = InvalidFullTransactionId; + s->subTransactionId = InvalidSubTransactionId; + s->nestingLevel = 0; + s->gucNestLevel = 0; + s->childXids = NULL; + s->nChildXids = 0; + s->maxChildXids = 0; + + XactTopFullTransactionId = InvalidFullTransactionId; + nParallelCurrentXids = 0; + + /* + * done with 1st phase commit processing, set current transaction state + * back to default + */ + s->state = TRANS_DEFAULT; + + RESUME_INTERRUPTS(); +} + + +/* + * AbortTransaction + */ +static void +AbortTransaction(void) +{ + TransactionState s = CurrentTransactionState; + TransactionId latestXid; + bool is_parallel_worker; + + /* Prevent cancel/die interrupt while cleaning up */ + HOLD_INTERRUPTS(); + + /* Make sure we have a valid memory context and resource owner */ + AtAbort_Memory(); + AtAbort_ResourceOwner(); + + /* + * Release any LW locks we might be holding as quickly as possible. + * (Regular locks, however, must be held till we finish aborting.) + * Releasing LW locks is critical since we might try to grab them again + * while cleaning up! + */ + LWLockReleaseAll(); + + /* Clear wait information and command progress indicator */ + pgstat_report_wait_end(); + pgstat_progress_end_command(); + + /* Clean up buffer I/O and buffer context locks, too */ + AbortBufferIO(); + UnlockBuffers(); + + /* Reset WAL record construction state */ + XLogResetInsertion(); + + /* Cancel condition variable sleep */ + ConditionVariableCancelSleep(); + + /* + * Also clean up any open wait for lock, since the lock manager will choke + * if we try to wait for another lock before doing this. + */ + LockErrorCleanup(); + + /* + * If any timeout events are still active, make sure the timeout interrupt + * is scheduled. This covers possible loss of a timeout interrupt due to + * longjmp'ing out of the SIGINT handler (see notes in handle_sig_alarm). + * We delay this till after LockErrorCleanup so that we don't uselessly + * reschedule lock or deadlock check timeouts. + */ + reschedule_timeouts(); + + /* + * Re-enable signals, in case we got here by longjmp'ing out of a signal + * handler. We do this fairly early in the sequence so that the timeout + * infrastructure will be functional if needed while aborting. + */ + PG_SETMASK(&UnBlockSig); + + /* + * check the current transaction state + */ + is_parallel_worker = (s->blockState == TBLOCK_PARALLEL_INPROGRESS); + if (s->state != TRANS_INPROGRESS && s->state != TRANS_PREPARE) + elog(WARNING, "AbortTransaction while in %s state", + TransStateAsString(s->state)); + Assert(s->parent == NULL); + + /* + * set the current transaction state information appropriately during the + * abort processing + */ + s->state = TRANS_ABORT; + + /* + * Reset user ID which might have been changed transiently. 
We need this + * to clean up in case control escaped out of a SECURITY DEFINER function + * or other local change of CurrentUserId; therefore, the prior value of + * SecurityRestrictionContext also needs to be restored. + * + * (Note: it is not necessary to restore session authorization or role + * settings here because those can only be changed via GUC, and GUC will + * take care of rolling them back if need be.) + */ + SetUserIdAndSecContext(s->prevUser, s->prevSecContext); + + /* Forget about any active REINDEX. */ + ResetReindexState(s->nestingLevel); + + /* Reset logical streaming state. */ + ResetLogicalStreamingState(); + + /* Reset snapshot export state. */ + SnapBuildResetExportedSnapshotState(); + + /* If in parallel mode, clean up workers and exit parallel mode. */ + if (IsInParallelMode()) + { + AtEOXact_Parallel(false); + s->parallelModeLevel = 0; + } + + /* + * do abort processing + */ + AfterTriggerEndXact(false); /* 'false' means it's abort */ + AtAbort_Portals(); + smgrDoPendingSyncs(false, is_parallel_worker); + AtEOXact_LargeObject(false); + AtAbort_Notify(); + AtEOXact_RelationMap(false, is_parallel_worker); + AtAbort_Twophase(); + + /* + * Advertise the fact that we aborted in pg_xact (assuming that we got as + * far as assigning an XID to advertise). But if we're inside a parallel + * worker, skip this; the user backend must be the one to write the abort + * record. + */ + if (!is_parallel_worker) + latestXid = RecordTransactionAbort(false); + else + { + latestXid = InvalidTransactionId; + + /* + * Since the parallel leader won't get our value of XactLastRecEnd in + * this case, we nudge WAL-writer ourselves in this case. See related + * comments in RecordTransactionAbort for why this matters. + */ + XLogSetAsyncXactLSN(XactLastRecEnd); + } + + TRACE_POSTGRESQL_TRANSACTION_ABORT(MyProc->lxid); + + /* + * Let others know about no transaction in progress by me. Note that this + * must be done _before_ releasing locks we hold and _after_ + * RecordTransactionAbort. + */ + ProcArrayEndTransaction(MyProc, latestXid); + + /* + * Post-abort cleanup. See notes in CommitTransaction() concerning + * ordering. We can skip all of it if the transaction failed before + * creating a resource owner. + */ + if (TopTransactionResourceOwner != NULL) + { + if (is_parallel_worker) + CallXactCallbacks(XACT_EVENT_PARALLEL_ABORT); + else + CallXactCallbacks(XACT_EVENT_ABORT); + + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_BEFORE_LOCKS, + false, true); + AtEOXact_Buffers(false); + AtEOXact_RelationCache(false); + AtEOXact_Inval(false); + AtEOXact_MultiXact(); + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_LOCKS, + false, true); + ResourceOwnerRelease(TopTransactionResourceOwner, + RESOURCE_RELEASE_AFTER_LOCKS, + false, true); + smgrDoPendingDeletes(false); + + AtEOXact_GUC(false, 1); + AtEOXact_SPI(false); + AtEOXact_Enum(); + AtEOXact_on_commit_actions(false); + AtEOXact_Namespace(false, is_parallel_worker); + AtEOXact_SMgr(); + AtEOXact_Files(false); + AtEOXact_ComboCid(); + AtEOXact_HashTables(false); + AtEOXact_PgStat(false, is_parallel_worker); + AtEOXact_ApplyLauncher(false); + pgstat_report_xact_timestamp(0); + } + + /* + * State remains TRANS_ABORT until CleanupTransaction(). + */ + RESUME_INTERRUPTS(); +} + +/* + * CleanupTransaction + */ +static void +CleanupTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * State should still be TRANS_ABORT from AbortTransaction(). 
+ */ + if (s->state != TRANS_ABORT) + elog(FATAL, "CleanupTransaction: unexpected state %s", + TransStateAsString(s->state)); + + /* + * do abort cleanup processing + */ + AtCleanup_Portals(); /* now safe to release portal memory */ + AtEOXact_Snapshot(false, true); /* and release the transaction's snapshots */ + + CurrentResourceOwner = NULL; /* and resource owner */ + if (TopTransactionResourceOwner) + ResourceOwnerDelete(TopTransactionResourceOwner); + s->curTransactionOwner = NULL; + CurTransactionResourceOwner = NULL; + TopTransactionResourceOwner = NULL; + + AtCleanup_Memory(); /* and transaction memory */ + + s->fullTransactionId = InvalidFullTransactionId; + s->subTransactionId = InvalidSubTransactionId; + s->nestingLevel = 0; + s->gucNestLevel = 0; + s->childXids = NULL; + s->nChildXids = 0; + s->maxChildXids = 0; + s->parallelModeLevel = 0; + + XactTopFullTransactionId = InvalidFullTransactionId; + nParallelCurrentXids = 0; + + /* + * done with abort processing, set current transaction state back to + * default + */ + s->state = TRANS_DEFAULT; +} + +/* + * StartTransactionCommand + */ +void +StartTransactionCommand(void) +{ + TransactionState s = CurrentTransactionState; + + switch (s->blockState) + { + /* + * if we aren't in a transaction block, we just do our usual start + * transaction. + */ + case TBLOCK_DEFAULT: + StartTransaction(); + s->blockState = TBLOCK_STARTED; + break; + + /* + * We are somewhere in a transaction block or subtransaction and + * about to start a new command. For now we do nothing, but + * someday we may do command-local resource initialization. (Note + * that any needed CommandCounterIncrement was done by the + * previous CommitTransactionCommand.) + */ + case TBLOCK_INPROGRESS: + case TBLOCK_IMPLICIT_INPROGRESS: + case TBLOCK_SUBINPROGRESS: + break; + + /* + * Here we are in a failed transaction block (one of the commands + * caused an abort) so we do nothing but remain in the abort + * state. Eventually we will get a ROLLBACK command which will + * get us out of this state. (It is up to other code to ensure + * that no commands other than ROLLBACK will be processed in these + * states.) + */ + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + break; + + /* These cases are invalid. */ + case TBLOCK_STARTED: + case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(ERROR, "StartTransactionCommand: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + /* + * We must switch to CurTransactionContext before returning. This is + * already done if we called StartTransaction, otherwise not. + */ + Assert(CurTransactionContext != NULL); + MemoryContextSwitchTo(CurTransactionContext); +} + + +/* + * Simple system for saving and restoring transaction characteristics + * (isolation level, read only, deferrable). We need this for transaction + * chaining, so that we can set the characteristics of the new transaction to + * be the same as the previous one. (We need something like this because the + * GUC system resets the characteristics at transaction end, so for example + * just skipping the reset in StartTransaction() won't work.) 
+ */ +void +SaveTransactionCharacteristics(SavedTransactionCharacteristics *s) +{ + s->save_XactIsoLevel = XactIsoLevel; + s->save_XactReadOnly = XactReadOnly; + s->save_XactDeferrable = XactDeferrable; +} + +void +RestoreTransactionCharacteristics(const SavedTransactionCharacteristics *s) +{ + XactIsoLevel = s->save_XactIsoLevel; + XactReadOnly = s->save_XactReadOnly; + XactDeferrable = s->save_XactDeferrable; +} + + +/* + * CommitTransactionCommand + */ +void +CommitTransactionCommand(void) +{ + TransactionState s = CurrentTransactionState; + SavedTransactionCharacteristics savetc; + + /* Must save in case we need to restore below */ + SaveTransactionCharacteristics(&savetc); + + switch (s->blockState) + { + /* + * These shouldn't happen. TBLOCK_DEFAULT means the previous + * StartTransactionCommand didn't set the STARTED state + * appropriately, while TBLOCK_PARALLEL_INPROGRESS should be ended + * by EndParallelWorkerTransaction(), not this function. + */ + case TBLOCK_DEFAULT: + case TBLOCK_PARALLEL_INPROGRESS: + elog(FATAL, "CommitTransactionCommand: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + + /* + * If we aren't in a transaction block, just do our usual + * transaction commit, and return to the idle state. + */ + case TBLOCK_STARTED: + CommitTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * We are completing a "BEGIN TRANSACTION" command, so we change + * to the "transaction block in progress" state and return. (We + * assume the BEGIN did nothing to the database, so we need no + * CommandCounterIncrement.) + */ + case TBLOCK_BEGIN: + s->blockState = TBLOCK_INPROGRESS; + break; + + /* + * This is the case when we have finished executing a command + * someplace within a transaction block. We increment the command + * counter and return. + */ + case TBLOCK_INPROGRESS: + case TBLOCK_IMPLICIT_INPROGRESS: + case TBLOCK_SUBINPROGRESS: + CommandCounterIncrement(); + break; + + /* + * We are completing a "COMMIT" command. Do it and return to the + * idle state. + */ + case TBLOCK_END: + CommitTransaction(); + s->blockState = TBLOCK_DEFAULT; + if (s->chain) + { + StartTransaction(); + s->blockState = TBLOCK_INPROGRESS; + s->chain = false; + RestoreTransactionCharacteristics(&savetc); + } + break; + + /* + * Here we are in the middle of a transaction block but one of the + * commands caused an abort so we do nothing but remain in the + * abort state. Eventually we will get a ROLLBACK command. + */ + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + break; + + /* + * Here we were in an aborted transaction block and we just got + * the ROLLBACK command from the user, so clean up the + * already-aborted transaction and return to the idle state. + */ + case TBLOCK_ABORT_END: + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + if (s->chain) + { + StartTransaction(); + s->blockState = TBLOCK_INPROGRESS; + s->chain = false; + RestoreTransactionCharacteristics(&savetc); + } + break; + + /* + * Here we were in a perfectly good transaction block but the user + * told us to ROLLBACK anyway. We have to abort the transaction + * and then clean up. + */ + case TBLOCK_ABORT_PENDING: + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + if (s->chain) + { + StartTransaction(); + s->blockState = TBLOCK_INPROGRESS; + s->chain = false; + RestoreTransactionCharacteristics(&savetc); + } + break; + + /* + * We are completing a "PREPARE TRANSACTION" command. Do it and + * return to the idle state. 
+ */ + case TBLOCK_PREPARE: + PrepareTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * We were just issued a SAVEPOINT inside a transaction block. + * Start a subtransaction. (DefineSavepoint already did + * PushTransaction, so as to have someplace to put the SUBBEGIN + * state.) + */ + case TBLOCK_SUBBEGIN: + StartSubTransaction(); + s->blockState = TBLOCK_SUBINPROGRESS; + break; + + /* + * We were issued a RELEASE command, so we end the current + * subtransaction and return to the parent transaction. The parent + * might be ended too, so repeat till we find an INPROGRESS + * transaction or subtransaction. + */ + case TBLOCK_SUBRELEASE: + do + { + CommitSubTransaction(); + s = CurrentTransactionState; /* changed by pop */ + } while (s->blockState == TBLOCK_SUBRELEASE); + + Assert(s->blockState == TBLOCK_INPROGRESS || + s->blockState == TBLOCK_SUBINPROGRESS); + break; + + /* + * We were issued a COMMIT, so we end the current subtransaction + * hierarchy and perform final commit. We do this by rolling up + * any subtransactions into their parent, which leads to O(N^2) + * operations with respect to resource owners - this isn't that + * bad until we approach a thousands of savepoints but is + * necessary for correctness should after triggers create new + * resource owners. + */ + case TBLOCK_SUBCOMMIT: + do + { + CommitSubTransaction(); + s = CurrentTransactionState; /* changed by pop */ + } while (s->blockState == TBLOCK_SUBCOMMIT); + /* If we had a COMMIT command, finish off the main xact too */ + if (s->blockState == TBLOCK_END) + { + Assert(s->parent == NULL); + CommitTransaction(); + s->blockState = TBLOCK_DEFAULT; + if (s->chain) + { + StartTransaction(); + s->blockState = TBLOCK_INPROGRESS; + s->chain = false; + RestoreTransactionCharacteristics(&savetc); + } + } + else if (s->blockState == TBLOCK_PREPARE) + { + Assert(s->parent == NULL); + PrepareTransaction(); + s->blockState = TBLOCK_DEFAULT; + } + else + elog(ERROR, "CommitTransactionCommand: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + + /* + * The current already-failed subtransaction is ending due to a + * ROLLBACK or ROLLBACK TO command, so pop it and recursively + * examine the parent (which could be in any of several states). + */ + case TBLOCK_SUBABORT_END: + CleanupSubTransaction(); + CommitTransactionCommand(); + break; + + /* + * As above, but it's not dead yet, so abort first. + */ + case TBLOCK_SUBABORT_PENDING: + AbortSubTransaction(); + CleanupSubTransaction(); + CommitTransactionCommand(); + break; + + /* + * The current subtransaction is the target of a ROLLBACK TO + * command. Abort and pop it, then start a new subtransaction + * with the same name. + */ + case TBLOCK_SUBRESTART: + { + char *name; + int savepointLevel; + + /* save name and keep Cleanup from freeing it */ + name = s->name; + s->name = NULL; + savepointLevel = s->savepointLevel; + + AbortSubTransaction(); + CleanupSubTransaction(); + + DefineSavepoint(NULL); + s = CurrentTransactionState; /* changed by push */ + s->name = name; + s->savepointLevel = savepointLevel; + + /* This is the same as TBLOCK_SUBBEGIN case */ + AssertState(s->blockState == TBLOCK_SUBBEGIN); + StartSubTransaction(); + s->blockState = TBLOCK_SUBINPROGRESS; + } + break; + + /* + * Same as above, but the subtransaction had already failed, so we + * don't need AbortSubTransaction. 
+ */ + case TBLOCK_SUBABORT_RESTART: + { + char *name; + int savepointLevel; + + /* save name and keep Cleanup from freeing it */ + name = s->name; + s->name = NULL; + savepointLevel = s->savepointLevel; + + CleanupSubTransaction(); + + DefineSavepoint(NULL); + s = CurrentTransactionState; /* changed by push */ + s->name = name; + s->savepointLevel = savepointLevel; + + /* This is the same as TBLOCK_SUBBEGIN case */ + AssertState(s->blockState == TBLOCK_SUBBEGIN); + StartSubTransaction(); + s->blockState = TBLOCK_SUBINPROGRESS; + } + break; + } +} + +/* + * AbortCurrentTransaction + */ +void +AbortCurrentTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + switch (s->blockState) + { + case TBLOCK_DEFAULT: + if (s->state == TRANS_DEFAULT) + { + /* we are idle, so nothing to do */ + } + else + { + /* + * We can get here after an error during transaction start + * (state will be TRANS_START). Need to clean up the + * incompletely started transaction. First, adjust the + * low-level state to suppress warning message from + * AbortTransaction. + */ + if (s->state == TRANS_START) + s->state = TRANS_INPROGRESS; + AbortTransaction(); + CleanupTransaction(); + } + break; + + /* + * If we aren't in a transaction block, we just do the basic abort + * & cleanup transaction. For this purpose, we treat an implicit + * transaction block as if it were a simple statement. + */ + case TBLOCK_STARTED: + case TBLOCK_IMPLICIT_INPROGRESS: + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * If we are in TBLOCK_BEGIN it means something screwed up right + * after reading "BEGIN TRANSACTION". We assume that the user + * will interpret the error as meaning the BEGIN failed to get him + * into a transaction block, so we should abort and return to idle + * state. + */ + case TBLOCK_BEGIN: + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * We are somewhere in a transaction block and we've gotten a + * failure, so we abort the transaction and set up the persistent + * ABORT state. We will stay in ABORT until we get a ROLLBACK. + */ + case TBLOCK_INPROGRESS: + case TBLOCK_PARALLEL_INPROGRESS: + AbortTransaction(); + s->blockState = TBLOCK_ABORT; + /* CleanupTransaction happens when we exit TBLOCK_ABORT_END */ + break; + + /* + * Here, we failed while trying to COMMIT. Clean up the + * transaction and return to idle state (we do not want to stay in + * the transaction). + */ + case TBLOCK_END: + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * Here, we are already in an aborted transaction state and are + * waiting for a ROLLBACK, but for some reason we failed again! So + * we just remain in the abort state. + */ + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + break; + + /* + * We are in a failed transaction and we got the ROLLBACK command. + * We have already aborted, we just need to cleanup and go to idle + * state. + */ + case TBLOCK_ABORT_END: + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * We are in a live transaction and we got a ROLLBACK command. + * Abort, cleanup, go to idle state. + */ + case TBLOCK_ABORT_PENDING: + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * Here, we failed while trying to PREPARE. Clean up the + * transaction and return to idle state (we do not want to stay in + * the transaction). 
+ */ + case TBLOCK_PREPARE: + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * We got an error inside a subtransaction. Abort just the + * subtransaction, and go to the persistent SUBABORT state until + * we get ROLLBACK. + */ + case TBLOCK_SUBINPROGRESS: + AbortSubTransaction(); + s->blockState = TBLOCK_SUBABORT; + break; + + /* + * If we failed while trying to create a subtransaction, clean up + * the broken subtransaction and abort the parent. The same + * applies if we get a failure while ending a subtransaction. + */ + case TBLOCK_SUBBEGIN: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + AbortSubTransaction(); + CleanupSubTransaction(); + AbortCurrentTransaction(); + break; + + /* + * Same as above, except the Abort() was already done. + */ + case TBLOCK_SUBABORT_END: + case TBLOCK_SUBABORT_RESTART: + CleanupSubTransaction(); + AbortCurrentTransaction(); + break; + } +} + +/* + * PreventInTransactionBlock + * + * This routine is to be called by statements that must not run inside + * a transaction block, typically because they have non-rollback-able + * side effects or do internal commits. + * + * If this routine completes successfully, then the calling statement is + * guaranteed that if it completes without error, its results will be + * committed immediately. + * + * If we have already started a transaction block, issue an error; also issue + * an error if we appear to be running inside a user-defined function (which + * could issue more commands and possibly cause a failure after the statement + * completes). Subtransactions are verboten too. + * + * We must also set XACT_FLAGS_NEEDIMMEDIATECOMMIT in MyXactFlags, to ensure + * that postgres.c follows through by committing after the statement is done. + * + * isTopLevel: passed down from ProcessUtility to determine whether we are + * inside a function. (We will always fail if this is false, but it's + * convenient to centralize the check here instead of making callers do it.) + * stmtType: statement type name, for error messages. + */ +void +PreventInTransactionBlock(bool isTopLevel, const char *stmtType) +{ + /* + * xact block already started? + */ + if (IsTransactionBlock()) + ereport(ERROR, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s cannot run inside a transaction block", + stmtType))); + + /* + * subtransaction? + */ + if (IsSubTransaction()) + ereport(ERROR, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s cannot run inside a subtransaction", + stmtType))); + + /* + * inside a pipeline that has started an implicit transaction? + */ + if (MyXactFlags & XACT_FLAGS_PIPELINING) + ereport(ERROR, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s cannot be executed within a pipeline", + stmtType))); + + /* + * inside a function call? + */ + if (!isTopLevel) + ereport(ERROR, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s cannot be executed from a function", stmtType))); + + /* If we got past IsTransactionBlock test, should be in default state */ + if (CurrentTransactionState->blockState != TBLOCK_DEFAULT && + CurrentTransactionState->blockState != TBLOCK_STARTED) + elog(FATAL, "cannot prevent transaction chain"); + + /* All okay. 
Set the flag to make sure the right thing happens later. */ + MyXactFlags |= XACT_FLAGS_NEEDIMMEDIATECOMMIT; +} + +/* + * WarnNoTransactionBlock + * RequireTransactionBlock + * + * These two functions allow for warnings or errors if a command is executed + * outside of a transaction block. This is useful for commands that have no + * effects that persist past transaction end (and so calling them outside a + * transaction block is presumably an error). DECLARE CURSOR is an example. + * While top-level transaction control commands (BEGIN/COMMIT/ABORT) and SET + * that have no effect issue warnings, all other no-effect commands generate + * errors. + * + * If we appear to be running inside a user-defined function, we do not + * issue anything, since the function could issue more commands that make + * use of the current statement's results. Likewise subtransactions. + * Thus these are inverses for PreventInTransactionBlock. + * + * isTopLevel: passed down from ProcessUtility to determine whether we are + * inside a function. + * stmtType: statement type name, for warning or error messages. + */ +void +WarnNoTransactionBlock(bool isTopLevel, const char *stmtType) +{ + CheckTransactionBlock(isTopLevel, false, stmtType); +} + +void +RequireTransactionBlock(bool isTopLevel, const char *stmtType) +{ + CheckTransactionBlock(isTopLevel, true, stmtType); +} + +/* + * This is the implementation of the above two. + */ +static void +CheckTransactionBlock(bool isTopLevel, bool throwError, const char *stmtType) +{ + /* + * xact block already started? + */ + if (IsTransactionBlock()) + return; + + /* + * subtransaction? + */ + if (IsSubTransaction()) + return; + + /* + * inside a function call? + */ + if (!isTopLevel) + return; + + ereport(throwError ? ERROR : WARNING, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + stmtType))); +} + +/* + * IsInTransactionBlock + * + * This routine is for statements that need to behave differently inside + * a transaction block than when running as single commands. ANALYZE is + * currently the only example. + * + * If this routine returns "false", then the calling statement is allowed + * to perform internal transaction-commit-and-start cycles; there is not a + * risk of messing up any transaction already in progress. (Note that this + * is not the identical guarantee provided by PreventInTransactionBlock, + * since we will not force a post-statement commit.) + * + * isTopLevel: passed down from ProcessUtility to determine whether we are + * inside a function. + */ +bool +IsInTransactionBlock(bool isTopLevel) +{ + /* + * Return true on same conditions that would make + * PreventInTransactionBlock error out + */ + if (IsTransactionBlock()) + return true; + + if (IsSubTransaction()) + return true; + + if (MyXactFlags & XACT_FLAGS_PIPELINING) + return true; + + if (!isTopLevel) + return true; + + if (CurrentTransactionState->blockState != TBLOCK_DEFAULT && + CurrentTransactionState->blockState != TBLOCK_STARTED) + return true; + + return false; +} + + +/* + * Register or deregister callback functions for start- and end-of-xact + * operations. + * + * These functions are intended for use by dynamically loaded modules. + * For built-in modules we generally just hardwire the appropriate calls + * (mainly because it's easier to control the order that way, where needed). 
+ * + * At transaction end, the callback occurs post-commit or post-abort, so the + * callback functions can only do noncritical cleanup. + */ +void +RegisterXactCallback(XactCallback callback, void *arg) +{ + XactCallbackItem *item; + + item = (XactCallbackItem *) + MemoryContextAlloc(TopMemoryContext, sizeof(XactCallbackItem)); + item->callback = callback; + item->arg = arg; + item->next = Xact_callbacks; + Xact_callbacks = item; +} + +void +UnregisterXactCallback(XactCallback callback, void *arg) +{ + XactCallbackItem *item; + XactCallbackItem *prev; + + prev = NULL; + for (item = Xact_callbacks; item; prev = item, item = item->next) + { + if (item->callback == callback && item->arg == arg) + { + if (prev) + prev->next = item->next; + else + Xact_callbacks = item->next; + pfree(item); + break; + } + } +} + +static void +CallXactCallbacks(XactEvent event) +{ + XactCallbackItem *item; + + for (item = Xact_callbacks; item; item = item->next) + item->callback(event, item->arg); +} + + +/* + * Register or deregister callback functions for start- and end-of-subxact + * operations. + * + * Pretty much same as above, but for subtransaction events. + * + * At subtransaction end, the callback occurs post-subcommit or post-subabort, + * so the callback functions can only do noncritical cleanup. At + * subtransaction start, the callback is called when the subtransaction has + * finished initializing. + */ +void +RegisterSubXactCallback(SubXactCallback callback, void *arg) +{ + SubXactCallbackItem *item; + + item = (SubXactCallbackItem *) + MemoryContextAlloc(TopMemoryContext, sizeof(SubXactCallbackItem)); + item->callback = callback; + item->arg = arg; + item->next = SubXact_callbacks; + SubXact_callbacks = item; +} + +void +UnregisterSubXactCallback(SubXactCallback callback, void *arg) +{ + SubXactCallbackItem *item; + SubXactCallbackItem *prev; + + prev = NULL; + for (item = SubXact_callbacks; item; prev = item, item = item->next) + { + if (item->callback == callback && item->arg == arg) + { + if (prev) + prev->next = item->next; + else + SubXact_callbacks = item->next; + pfree(item); + break; + } + } +} + +static void +CallSubXactCallbacks(SubXactEvent event, + SubTransactionId mySubid, + SubTransactionId parentSubid) +{ + SubXactCallbackItem *item; + + for (item = SubXact_callbacks; item; item = item->next) + item->callback(event, mySubid, parentSubid, item->arg); +} + + +/* ---------------------------------------------------------------- + * transaction block support + * ---------------------------------------------------------------- + */ + +/* + * BeginTransactionBlock + * This executes a BEGIN command. + */ +void +BeginTransactionBlock(void) +{ + TransactionState s = CurrentTransactionState; + + switch (s->blockState) + { + /* + * We are not inside a transaction block, so allow one to begin. + */ + case TBLOCK_STARTED: + s->blockState = TBLOCK_BEGIN; + break; + + /* + * BEGIN converts an implicit transaction block to a regular one. + * (Note that we allow this even if we've already done some + * commands, which is a bit odd but matches historical practice.) + */ + case TBLOCK_IMPLICIT_INPROGRESS: + s->blockState = TBLOCK_BEGIN; + break; + + /* + * Already a transaction block in progress. + */ + case TBLOCK_INPROGRESS: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBINPROGRESS: + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + ereport(WARNING, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + errmsg("there is already a transaction in progress"))); + break; + + /* These cases are invalid. 
*/ + case TBLOCK_DEFAULT: + case TBLOCK_BEGIN: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "BeginTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } +} + +/* + * PrepareTransactionBlock + * This executes a PREPARE command. + * + * Since PREPARE may actually do a ROLLBACK, the result indicates what + * happened: true for PREPARE, false for ROLLBACK. + * + * Note that we don't actually do anything here except change blockState. + * The real work will be done in the upcoming PrepareTransaction(). + * We do it this way because it's not convenient to change memory context, + * resource owner, etc while executing inside a Portal. + */ +bool +PrepareTransactionBlock(const char *gid) +{ + TransactionState s; + bool result; + + /* Set up to commit the current transaction */ + result = EndTransactionBlock(false); + + /* If successful, change outer tblock state to PREPARE */ + if (result) + { + s = CurrentTransactionState; + + while (s->parent != NULL) + s = s->parent; + + if (s->blockState == TBLOCK_END) + { + /* Save GID where PrepareTransaction can find it again */ + prepareGID = MemoryContextStrdup(TopTransactionContext, gid); + + s->blockState = TBLOCK_PREPARE; + } + else + { + /* + * ignore case where we are not in a transaction; + * EndTransactionBlock already issued a warning. + */ + Assert(s->blockState == TBLOCK_STARTED || + s->blockState == TBLOCK_IMPLICIT_INPROGRESS); + /* Don't send back a PREPARE result tag... */ + result = false; + } + } + + return result; +} + +/* + * EndTransactionBlock + * This executes a COMMIT command. + * + * Since COMMIT may actually do a ROLLBACK, the result indicates what + * happened: true for COMMIT, false for ROLLBACK. + * + * Note that we don't actually do anything here except change blockState. + * The real work will be done in the upcoming CommitTransactionCommand(). + * We do it this way because it's not convenient to change memory context, + * resource owner, etc while executing inside a Portal. + */ +bool +EndTransactionBlock(bool chain) +{ + TransactionState s = CurrentTransactionState; + bool result = false; + + switch (s->blockState) + { + /* + * We are in a transaction block, so tell CommitTransactionCommand + * to COMMIT. + */ + case TBLOCK_INPROGRESS: + s->blockState = TBLOCK_END; + result = true; + break; + + /* + * We are in an implicit transaction block. If AND CHAIN was + * specified, error. Otherwise commit, but issue a warning + * because there was no explicit BEGIN before this. + */ + case TBLOCK_IMPLICIT_INPROGRESS: + if (chain) + ereport(ERROR, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + "COMMIT AND CHAIN"))); + else + ereport(WARNING, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + errmsg("there is no transaction in progress"))); + s->blockState = TBLOCK_END; + result = true; + break; + + /* + * We are in a failed transaction block. Tell + * CommitTransactionCommand it's time to exit the block. + */ + case TBLOCK_ABORT: + s->blockState = TBLOCK_ABORT_END; + break; + + /* + * We are in a live subtransaction block. Set up to subcommit all + * open subtransactions and then commit the main transaction. 
+ */ + case TBLOCK_SUBINPROGRESS: + while (s->parent != NULL) + { + if (s->blockState == TBLOCK_SUBINPROGRESS) + s->blockState = TBLOCK_SUBCOMMIT; + else + elog(FATAL, "EndTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + s = s->parent; + } + if (s->blockState == TBLOCK_INPROGRESS) + s->blockState = TBLOCK_END; + else + elog(FATAL, "EndTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + result = true; + break; + + /* + * Here we are inside an aborted subtransaction. Treat the COMMIT + * as ROLLBACK: set up to abort everything and exit the main + * transaction. + */ + case TBLOCK_SUBABORT: + while (s->parent != NULL) + { + if (s->blockState == TBLOCK_SUBINPROGRESS) + s->blockState = TBLOCK_SUBABORT_PENDING; + else if (s->blockState == TBLOCK_SUBABORT) + s->blockState = TBLOCK_SUBABORT_END; + else + elog(FATAL, "EndTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + s = s->parent; + } + if (s->blockState == TBLOCK_INPROGRESS) + s->blockState = TBLOCK_ABORT_PENDING; + else if (s->blockState == TBLOCK_ABORT) + s->blockState = TBLOCK_ABORT_END; + else + elog(FATAL, "EndTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + + /* + * The user issued COMMIT when not inside a transaction. For + * COMMIT without CHAIN, issue a WARNING, staying in + * TBLOCK_STARTED state. The upcoming call to + * CommitTransactionCommand() will then close the transaction and + * put us back into the default state. For COMMIT AND CHAIN, + * error. + */ + case TBLOCK_STARTED: + if (chain) + ereport(ERROR, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + "COMMIT AND CHAIN"))); + else + ereport(WARNING, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + errmsg("there is no transaction in progress"))); + result = true; + break; + + /* + * The user issued a COMMIT that somehow ran inside a parallel + * worker. We can't cope with that. + */ + case TBLOCK_PARALLEL_INPROGRESS: + ereport(FATAL, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot commit during a parallel operation"))); + break; + + /* These cases are invalid. */ + case TBLOCK_DEFAULT: + case TBLOCK_BEGIN: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "EndTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + Assert(s->blockState == TBLOCK_STARTED || + s->blockState == TBLOCK_END || + s->blockState == TBLOCK_ABORT_END || + s->blockState == TBLOCK_ABORT_PENDING); + + s->chain = chain; + + return result; +} + +/* + * UserAbortTransactionBlock + * This executes a ROLLBACK command. + * + * As above, we don't actually do anything here except change blockState. + */ +void +UserAbortTransactionBlock(bool chain) +{ + TransactionState s = CurrentTransactionState; + + switch (s->blockState) + { + /* + * We are inside a transaction block and we got a ROLLBACK command + * from the user, so tell CommitTransactionCommand to abort and + * exit the transaction block. + */ + case TBLOCK_INPROGRESS: + s->blockState = TBLOCK_ABORT_PENDING; + break; + + /* + * We are inside a failed transaction block and we got a ROLLBACK + * command from the user. 
Abort processing is already done, so + * CommitTransactionCommand just has to cleanup and go back to + * idle state. + */ + case TBLOCK_ABORT: + s->blockState = TBLOCK_ABORT_END; + break; + + /* + * We are inside a subtransaction. Mark everything up to top + * level as exitable. + */ + case TBLOCK_SUBINPROGRESS: + case TBLOCK_SUBABORT: + while (s->parent != NULL) + { + if (s->blockState == TBLOCK_SUBINPROGRESS) + s->blockState = TBLOCK_SUBABORT_PENDING; + else if (s->blockState == TBLOCK_SUBABORT) + s->blockState = TBLOCK_SUBABORT_END; + else + elog(FATAL, "UserAbortTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + s = s->parent; + } + if (s->blockState == TBLOCK_INPROGRESS) + s->blockState = TBLOCK_ABORT_PENDING; + else if (s->blockState == TBLOCK_ABORT) + s->blockState = TBLOCK_ABORT_END; + else + elog(FATAL, "UserAbortTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + + /* + * The user issued ABORT when not inside a transaction. For + * ROLLBACK without CHAIN, issue a WARNING and go to abort state. + * The upcoming call to CommitTransactionCommand() will then put + * us back into the default state. For ROLLBACK AND CHAIN, error. + * + * We do the same thing with ABORT inside an implicit transaction, + * although in this case we might be rolling back actual database + * state changes. (It's debatable whether we should issue a + * WARNING in this case, but we have done so historically.) + */ + case TBLOCK_STARTED: + case TBLOCK_IMPLICIT_INPROGRESS: + if (chain) + ereport(ERROR, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + "ROLLBACK AND CHAIN"))); + else + ereport(WARNING, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + errmsg("there is no transaction in progress"))); + s->blockState = TBLOCK_ABORT_PENDING; + break; + + /* + * The user issued an ABORT that somehow ran inside a parallel + * worker. We can't cope with that. + */ + case TBLOCK_PARALLEL_INPROGRESS: + ereport(FATAL, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot abort during a parallel operation"))); + break; + + /* These cases are invalid. */ + case TBLOCK_DEFAULT: + case TBLOCK_BEGIN: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "UserAbortTransactionBlock: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + Assert(s->blockState == TBLOCK_ABORT_END || + s->blockState == TBLOCK_ABORT_PENDING); + + s->chain = chain; +} + +/* + * BeginImplicitTransactionBlock + * Start an implicit transaction block if we're not already in one. + * + * Unlike BeginTransactionBlock, this is called directly from the main loop + * in postgres.c, not within a Portal. So we can just change blockState + * without a lot of ceremony. We do not expect caller to do + * CommitTransactionCommand/StartTransactionCommand. + */ +void +BeginImplicitTransactionBlock(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * If we are in STARTED state (that is, no transaction block is open), + * switch to IMPLICIT_INPROGRESS state, creating an implicit transaction + * block. 
+ * + * For caller convenience, we consider all other transaction states as + * legal here; otherwise the caller would need its own state check, which + * seems rather pointless. + */ + if (s->blockState == TBLOCK_STARTED) + s->blockState = TBLOCK_IMPLICIT_INPROGRESS; +} + +/* + * EndImplicitTransactionBlock + * End an implicit transaction block, if we're in one. + * + * Like EndTransactionBlock, we just make any needed blockState change here. + * The real work will be done in the upcoming CommitTransactionCommand(). + */ +void +EndImplicitTransactionBlock(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * If we are in IMPLICIT_INPROGRESS state, switch back to STARTED state, + * allowing CommitTransactionCommand to commit whatever happened during + * the implicit transaction block as though it were a single statement. + * + * For caller convenience, we consider all other transaction states as + * legal here; otherwise the caller would need its own state check, which + * seems rather pointless. + */ + if (s->blockState == TBLOCK_IMPLICIT_INPROGRESS) + s->blockState = TBLOCK_STARTED; +} + +/* + * DefineSavepoint + * This executes a SAVEPOINT command. + */ +void +DefineSavepoint(const char *name) +{ + TransactionState s = CurrentTransactionState; + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for new subtransactions after that + * point. (Note that this check will certainly error out if s->blockState + * is TBLOCK_PARALLEL_INPROGRESS, so we can treat that as an invalid case + * below.) + */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot define savepoints during a parallel operation"))); + + switch (s->blockState) + { + case TBLOCK_INPROGRESS: + case TBLOCK_SUBINPROGRESS: + /* Normal subtransaction start */ + PushTransaction(); + s = CurrentTransactionState; /* changed by push */ + + /* + * Savepoint names, like the TransactionState block itself, live + * in TopTransactionContext. + */ + if (name) + s->name = MemoryContextStrdup(TopTransactionContext, name); + break; + + /* + * We disallow savepoint commands in implicit transaction blocks. + * There would be no great difficulty in allowing them so far as + * this module is concerned, but a savepoint seems inconsistent + * with exec_simple_query's behavior of abandoning the whole query + * string upon error. Also, the point of an implicit transaction + * block (as opposed to a regular one) is to automatically close + * after an error, so it's hard to see how a savepoint would fit + * into that. + * + * The error messages for this are phrased as if there were no + * active transaction block at all, which is historical but + * perhaps could be improved. + */ + case TBLOCK_IMPLICIT_INPROGRESS: + ereport(ERROR, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + "SAVEPOINT"))); + break; + + /* These cases are invalid. 
*/ + case TBLOCK_DEFAULT: + case TBLOCK_STARTED: + case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "DefineSavepoint: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } +} + +/* + * ReleaseSavepoint + * This executes a RELEASE command. + * + * As above, we don't actually do anything here except change blockState. + */ +void +ReleaseSavepoint(const char *name) +{ + TransactionState s = CurrentTransactionState; + TransactionState target, + xact; + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for transaction state change after that + * point. (Note that this check will certainly error out if s->blockState + * is TBLOCK_PARALLEL_INPROGRESS, so we can treat that as an invalid case + * below.) + */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot release savepoints during a parallel operation"))); + + switch (s->blockState) + { + /* + * We can't release a savepoint if there is no savepoint defined. + */ + case TBLOCK_INPROGRESS: + ereport(ERROR, + (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), + errmsg("savepoint \"%s\" does not exist", name))); + break; + + case TBLOCK_IMPLICIT_INPROGRESS: + /* See comment about implicit transactions in DefineSavepoint */ + ereport(ERROR, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + "RELEASE SAVEPOINT"))); + break; + + /* + * We are in a non-aborted subtransaction. This is the only valid + * case. + */ + case TBLOCK_SUBINPROGRESS: + break; + + /* These cases are invalid. */ + case TBLOCK_DEFAULT: + case TBLOCK_STARTED: + case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "ReleaseSavepoint: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + for (target = s; PointerIsValid(target); target = target->parent) + { + if (PointerIsValid(target->name) && strcmp(target->name, name) == 0) + break; + } + + if (!PointerIsValid(target)) + ereport(ERROR, + (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), + errmsg("savepoint \"%s\" does not exist", name))); + + /* disallow crossing savepoint level boundaries */ + if (target->savepointLevel != s->savepointLevel) + ereport(ERROR, + (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), + errmsg("savepoint \"%s\" does not exist within current savepoint level", name))); + + /* + * Mark "commit pending" all subtransactions up to the target + * subtransaction. The actual commits will happen when control gets to + * CommitTransactionCommand. 
+ */ + xact = CurrentTransactionState; + for (;;) + { + Assert(xact->blockState == TBLOCK_SUBINPROGRESS); + xact->blockState = TBLOCK_SUBRELEASE; + if (xact == target) + break; + xact = xact->parent; + Assert(PointerIsValid(xact)); + } +} + +/* + * RollbackToSavepoint + * This executes a ROLLBACK TO <savepoint> command. + * + * As above, we don't actually do anything here except change blockState. + */ +void +RollbackToSavepoint(const char *name) +{ + TransactionState s = CurrentTransactionState; + TransactionState target, + xact; + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for transaction state change after that + * point. (Note that this check will certainly error out if s->blockState + * is TBLOCK_PARALLEL_INPROGRESS, so we can treat that as an invalid case + * below.) + */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot rollback to savepoints during a parallel operation"))); + + switch (s->blockState) + { + /* + * We can't rollback to a savepoint if there is no savepoint + * defined. + */ + case TBLOCK_INPROGRESS: + case TBLOCK_ABORT: + ereport(ERROR, + (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), + errmsg("savepoint \"%s\" does not exist", name))); + break; + + case TBLOCK_IMPLICIT_INPROGRESS: + /* See comment about implicit transactions in DefineSavepoint */ + ereport(ERROR, + (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION), + /* translator: %s represents an SQL statement name */ + errmsg("%s can only be used in transaction blocks", + "ROLLBACK TO SAVEPOINT"))); + break; + + /* + * There is at least one savepoint, so proceed. + */ + case TBLOCK_SUBINPROGRESS: + case TBLOCK_SUBABORT: + break; + + /* These cases are invalid. */ + case TBLOCK_DEFAULT: + case TBLOCK_STARTED: + case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBBEGIN: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "RollbackToSavepoint: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + for (target = s; PointerIsValid(target); target = target->parent) + { + if (PointerIsValid(target->name) && strcmp(target->name, name) == 0) + break; + } + + if (!PointerIsValid(target)) + ereport(ERROR, + (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), + errmsg("savepoint \"%s\" does not exist", name))); + + /* disallow crossing savepoint level boundaries */ + if (target->savepointLevel != s->savepointLevel) + ereport(ERROR, + (errcode(ERRCODE_S_E_INVALID_SPECIFICATION), + errmsg("savepoint \"%s\" does not exist within current savepoint level", name))); + + /* + * Mark "abort pending" all subtransactions up to the target + * subtransaction. The actual aborts will happen when control gets to + * CommitTransactionCommand. 
+ */ + xact = CurrentTransactionState; + for (;;) + { + if (xact == target) + break; + if (xact->blockState == TBLOCK_SUBINPROGRESS) + xact->blockState = TBLOCK_SUBABORT_PENDING; + else if (xact->blockState == TBLOCK_SUBABORT) + xact->blockState = TBLOCK_SUBABORT_END; + else + elog(FATAL, "RollbackToSavepoint: unexpected state %s", + BlockStateAsString(xact->blockState)); + xact = xact->parent; + Assert(PointerIsValid(xact)); + } + + /* And mark the target as "restart pending" */ + if (xact->blockState == TBLOCK_SUBINPROGRESS) + xact->blockState = TBLOCK_SUBRESTART; + else if (xact->blockState == TBLOCK_SUBABORT) + xact->blockState = TBLOCK_SUBABORT_RESTART; + else + elog(FATAL, "RollbackToSavepoint: unexpected state %s", + BlockStateAsString(xact->blockState)); +} + +/* + * BeginInternalSubTransaction + * This is the same as DefineSavepoint except it allows TBLOCK_STARTED, + * TBLOCK_IMPLICIT_INPROGRESS, TBLOCK_END, and TBLOCK_PREPARE states, + * and therefore it can safely be used in functions that might be called + * when not inside a BEGIN block or when running deferred triggers at + * COMMIT/PREPARE time. Also, it automatically does + * CommitTransactionCommand/StartTransactionCommand instead of expecting + * the caller to do it. + */ +void +BeginInternalSubTransaction(const char *name) +{ + TransactionState s = CurrentTransactionState; + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for new subtransactions after that + * point. We might be able to make an exception for the type of + * subtransaction established by this function, which is typically used in + * contexts where we're going to release or roll back the subtransaction + * before proceeding further, so that no enduring change to the + * transaction state occurs. For now, however, we prohibit this case along + * with all the others. + */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot start subtransactions during a parallel operation"))); + + switch (s->blockState) + { + case TBLOCK_STARTED: + case TBLOCK_INPROGRESS: + case TBLOCK_IMPLICIT_INPROGRESS: + case TBLOCK_END: + case TBLOCK_PREPARE: + case TBLOCK_SUBINPROGRESS: + /* Normal subtransaction start */ + PushTransaction(); + s = CurrentTransactionState; /* changed by push */ + + /* + * Savepoint names, like the TransactionState block itself, live + * in TopTransactionContext. + */ + if (name) + s->name = MemoryContextStrdup(TopTransactionContext, name); + break; + + /* These cases are invalid. */ + case TBLOCK_DEFAULT: + case TBLOCK_BEGIN: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBBEGIN: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + elog(FATAL, "BeginInternalSubTransaction: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + CommitTransactionCommand(); + StartTransactionCommand(); +} + +/* + * ReleaseCurrentSubTransaction + * + * RELEASE (ie, commit) the innermost subtransaction, regardless of its + * savepoint name (if any). + * NB: do NOT use CommitTransactionCommand/StartTransactionCommand with this. 
+ */ +void +ReleaseCurrentSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * Workers synchronize transaction state at the beginning of each parallel + * operation, so we can't account for commit of subtransactions after that + * point. This should not happen anyway. Code calling this would + * typically have called BeginInternalSubTransaction() first, failing + * there. + */ + if (IsInParallelMode()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot commit subtransactions during a parallel operation"))); + + if (s->blockState != TBLOCK_SUBINPROGRESS) + elog(ERROR, "ReleaseCurrentSubTransaction: unexpected state %s", + BlockStateAsString(s->blockState)); + Assert(s->state == TRANS_INPROGRESS); + MemoryContextSwitchTo(CurTransactionContext); + CommitSubTransaction(); + s = CurrentTransactionState; /* changed by pop */ + Assert(s->state == TRANS_INPROGRESS); +} + +/* + * RollbackAndReleaseCurrentSubTransaction + * + * ROLLBACK and RELEASE (ie, abort) the innermost subtransaction, regardless + * of its savepoint name (if any). + * NB: do NOT use CommitTransactionCommand/StartTransactionCommand with this. + */ +void +RollbackAndReleaseCurrentSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + /* + * Unlike ReleaseCurrentSubTransaction(), this is nominally permitted + * during parallel operations. That's because we may be in the leader, + * recovering from an error thrown while we were in parallel mode. We + * won't reach here in a worker, because BeginInternalSubTransaction() + * will have failed. + */ + + switch (s->blockState) + { + /* Must be in a subtransaction */ + case TBLOCK_SUBINPROGRESS: + case TBLOCK_SUBABORT: + break; + + /* These cases are invalid. */ + case TBLOCK_DEFAULT: + case TBLOCK_STARTED: + case TBLOCK_BEGIN: + case TBLOCK_IMPLICIT_INPROGRESS: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBBEGIN: + case TBLOCK_INPROGRESS: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_ABORT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + case TBLOCK_PREPARE: + elog(FATAL, "RollbackAndReleaseCurrentSubTransaction: unexpected state %s", + BlockStateAsString(s->blockState)); + break; + } + + /* + * Abort the current subtransaction, if needed. + */ + if (s->blockState == TBLOCK_SUBINPROGRESS) + AbortSubTransaction(); + + /* And clean it up, too */ + CleanupSubTransaction(); + + s = CurrentTransactionState; /* changed by pop */ + AssertState(s->blockState == TBLOCK_SUBINPROGRESS || + s->blockState == TBLOCK_INPROGRESS || + s->blockState == TBLOCK_IMPLICIT_INPROGRESS || + s->blockState == TBLOCK_STARTED); +} + +/* + * AbortOutOfAnyTransaction + * + * This routine is provided for error recovery purposes. It aborts any + * active transaction or transaction block, leaving the system in a known + * idle state. + */ +void +AbortOutOfAnyTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + /* Ensure we're not running in a doomed memory context */ + AtAbort_Memory(); + + /* + * Get out of any transaction or nested transaction + */ + do + { + switch (s->blockState) + { + case TBLOCK_DEFAULT: + if (s->state == TRANS_DEFAULT) + { + /* Not in a transaction, do nothing */ + } + else + { + /* + * We can get here after an error during transaction start + * (state will be TRANS_START). 
Need to clean up the + * incompletely started transaction. First, adjust the + * low-level state to suppress warning message from + * AbortTransaction. + */ + if (s->state == TRANS_START) + s->state = TRANS_INPROGRESS; + AbortTransaction(); + CleanupTransaction(); + } + break; + case TBLOCK_STARTED: + case TBLOCK_BEGIN: + case TBLOCK_INPROGRESS: + case TBLOCK_IMPLICIT_INPROGRESS: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_PREPARE: + /* In a transaction, so clean up */ + AbortTransaction(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + case TBLOCK_ABORT: + case TBLOCK_ABORT_END: + + /* + * AbortTransaction is already done, still need Cleanup. + * However, if we failed partway through running ROLLBACK, + * there will be an active portal running that command, which + * we need to shut down before doing CleanupTransaction. + */ + AtAbort_Portals(); + CleanupTransaction(); + s->blockState = TBLOCK_DEFAULT; + break; + + /* + * In a subtransaction, so clean it up and abort parent too + */ + case TBLOCK_SUBBEGIN: + case TBLOCK_SUBINPROGRESS: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + AbortSubTransaction(); + CleanupSubTransaction(); + s = CurrentTransactionState; /* changed by pop */ + break; + + case TBLOCK_SUBABORT: + case TBLOCK_SUBABORT_END: + case TBLOCK_SUBABORT_RESTART: + /* As above, but AbortSubTransaction already done */ + if (s->curTransactionOwner) + { + /* As in TBLOCK_ABORT, might have a live portal to zap */ + AtSubAbort_Portals(s->subTransactionId, + s->parent->subTransactionId, + s->curTransactionOwner, + s->parent->curTransactionOwner); + } + CleanupSubTransaction(); + s = CurrentTransactionState; /* changed by pop */ + break; + } + } while (s->blockState != TBLOCK_DEFAULT); + + /* Should be out of all subxacts now */ + Assert(s->parent == NULL); + + /* If we didn't actually have anything to do, revert to TopMemoryContext */ + AtCleanup_Memory(); +} + +/* + * IsTransactionBlock --- are we within a transaction block? + */ +bool +IsTransactionBlock(void) +{ + TransactionState s = CurrentTransactionState; + + if (s->blockState == TBLOCK_DEFAULT || s->blockState == TBLOCK_STARTED) + return false; + + return true; +} + +/* + * IsTransactionOrTransactionBlock --- are we within either a transaction + * or a transaction block? (The backend is only really "idle" when this + * returns false.) + * + * This should match up with IsTransactionBlock and IsTransactionState. 
+ */ +bool +IsTransactionOrTransactionBlock(void) +{ + TransactionState s = CurrentTransactionState; + + if (s->blockState == TBLOCK_DEFAULT) + return false; + + return true; +} + +/* + * TransactionBlockStatusCode - return status code to send in ReadyForQuery + */ +char +TransactionBlockStatusCode(void) +{ + TransactionState s = CurrentTransactionState; + + switch (s->blockState) + { + case TBLOCK_DEFAULT: + case TBLOCK_STARTED: + return 'I'; /* idle --- not in transaction */ + case TBLOCK_BEGIN: + case TBLOCK_SUBBEGIN: + case TBLOCK_INPROGRESS: + case TBLOCK_IMPLICIT_INPROGRESS: + case TBLOCK_PARALLEL_INPROGRESS: + case TBLOCK_SUBINPROGRESS: + case TBLOCK_END: + case TBLOCK_SUBRELEASE: + case TBLOCK_SUBCOMMIT: + case TBLOCK_PREPARE: + return 'T'; /* in transaction */ + case TBLOCK_ABORT: + case TBLOCK_SUBABORT: + case TBLOCK_ABORT_END: + case TBLOCK_SUBABORT_END: + case TBLOCK_ABORT_PENDING: + case TBLOCK_SUBABORT_PENDING: + case TBLOCK_SUBRESTART: + case TBLOCK_SUBABORT_RESTART: + return 'E'; /* in failed transaction */ + } + + /* should never get here */ + elog(FATAL, "invalid transaction block state: %s", + BlockStateAsString(s->blockState)); + return 0; /* keep compiler quiet */ +} + +/* + * IsSubTransaction + */ +bool +IsSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + if (s->nestingLevel >= 2) + return true; + + return false; +} + +/* + * StartSubTransaction + * + * If you're wondering why this is separate from PushTransaction: it's because + * we can't conveniently do this stuff right inside DefineSavepoint. The + * SAVEPOINT utility command will be executed inside a Portal, and if we + * muck with CurrentMemoryContext or CurrentResourceOwner then exit from + * the Portal will undo those settings. So we make DefineSavepoint just + * push a dummy transaction block, and when control returns to the main + * idle loop, CommitTransactionCommand will be called, and we'll come here + * to finish starting the subtransaction. + */ +static void +StartSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + if (s->state != TRANS_DEFAULT) + elog(WARNING, "StartSubTransaction while in %s state", + TransStateAsString(s->state)); + + s->state = TRANS_START; + + /* + * Initialize subsystems for new subtransaction + * + * must initialize resource-management stuff first + */ + AtSubStart_Memory(); + AtSubStart_ResourceOwner(); + AfterTriggerBeginSubXact(); + + s->state = TRANS_INPROGRESS; + + /* + * Call start-of-subxact callbacks + */ + CallSubXactCallbacks(SUBXACT_EVENT_START_SUB, s->subTransactionId, + s->parent->subTransactionId); + + ShowTransactionState("StartSubTransaction"); +} + +/* + * CommitSubTransaction + * + * The caller has to make sure to always reassign CurrentTransactionState + * if it has a local pointer to it after calling this function. + */ +static void +CommitSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + ShowTransactionState("CommitSubTransaction"); + + if (s->state != TRANS_INPROGRESS) + elog(WARNING, "CommitSubTransaction while in %s state", + TransStateAsString(s->state)); + + /* Pre-commit processing goes here */ + + CallSubXactCallbacks(SUBXACT_EVENT_PRE_COMMIT_SUB, s->subTransactionId, + s->parent->subTransactionId); + + /* If in parallel mode, clean up workers and exit parallel mode. 
*/ + if (IsInParallelMode()) + { + AtEOSubXact_Parallel(true, s->subTransactionId); + s->parallelModeLevel = 0; + } + + /* Do the actual "commit", such as it is */ + s->state = TRANS_COMMIT; + + /* Must CCI to ensure commands of subtransaction are seen as done */ + CommandCounterIncrement(); + + /* + * Prior to 8.4 we marked subcommit in clog at this point. We now only + * perform that step, if required, as part of the atomic update of the + * whole transaction tree at top level commit or abort. + */ + + /* Post-commit cleanup */ + if (FullTransactionIdIsValid(s->fullTransactionId)) + AtSubCommit_childXids(); + AfterTriggerEndSubXact(true); + AtSubCommit_Portals(s->subTransactionId, + s->parent->subTransactionId, + s->parent->nestingLevel, + s->parent->curTransactionOwner); + AtEOSubXact_LargeObject(true, s->subTransactionId, + s->parent->subTransactionId); + AtSubCommit_Notify(); + + CallSubXactCallbacks(SUBXACT_EVENT_COMMIT_SUB, s->subTransactionId, + s->parent->subTransactionId); + + ResourceOwnerRelease(s->curTransactionOwner, + RESOURCE_RELEASE_BEFORE_LOCKS, + true, false); + AtEOSubXact_RelationCache(true, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_Inval(true); + AtSubCommit_smgr(); + + /* + * The only lock we actually release here is the subtransaction XID lock. + */ + CurrentResourceOwner = s->curTransactionOwner; + if (FullTransactionIdIsValid(s->fullTransactionId)) + XactLockTableDelete(XidFromFullTransactionId(s->fullTransactionId)); + + /* + * Other locks should get transferred to their parent resource owner. + */ + ResourceOwnerRelease(s->curTransactionOwner, + RESOURCE_RELEASE_LOCKS, + true, false); + ResourceOwnerRelease(s->curTransactionOwner, + RESOURCE_RELEASE_AFTER_LOCKS, + true, false); + + AtEOXact_GUC(true, s->gucNestLevel); + AtEOSubXact_SPI(true, s->subTransactionId); + AtEOSubXact_on_commit_actions(true, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_Namespace(true, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_Files(true, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_HashTables(true, s->nestingLevel); + AtEOSubXact_PgStat(true, s->nestingLevel); + AtSubCommit_Snapshot(s->nestingLevel); + + /* + * We need to restore the upper transaction's read-only state, in case the + * upper is read-write while the child is read-only; GUC will incorrectly + * think it should leave the child state in place. + */ + XactReadOnly = s->prevXactReadOnly; + + CurrentResourceOwner = s->parent->curTransactionOwner; + CurTransactionResourceOwner = s->parent->curTransactionOwner; + ResourceOwnerDelete(s->curTransactionOwner); + s->curTransactionOwner = NULL; + + AtSubCommit_Memory(); + + s->state = TRANS_DEFAULT; + + PopTransaction(); +} + +/* + * AbortSubTransaction + */ +static void +AbortSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + /* Prevent cancel/die interrupt while cleaning up */ + HOLD_INTERRUPTS(); + + /* Make sure we have a valid memory context and resource owner */ + AtSubAbort_Memory(); + AtSubAbort_ResourceOwner(); + + /* + * Release any LW locks we might be holding as quickly as possible. + * (Regular locks, however, must be held till we finish aborting.) + * Releasing LW locks is critical since we might try to grab them again + * while cleaning up! + * + * FIXME This may be incorrect --- Are there some locks we should keep? + * Buffer locks, for example? I don't think so but I'm not sure. 
+ */ + LWLockReleaseAll(); + + pgstat_report_wait_end(); + pgstat_progress_end_command(); + AbortBufferIO(); + UnlockBuffers(); + + /* Reset WAL record construction state */ + XLogResetInsertion(); + + /* Cancel condition variable sleep */ + ConditionVariableCancelSleep(); + + /* + * Also clean up any open wait for lock, since the lock manager will choke + * if we try to wait for another lock before doing this. + */ + LockErrorCleanup(); + + /* + * If any timeout events are still active, make sure the timeout interrupt + * is scheduled. This covers possible loss of a timeout interrupt due to + * longjmp'ing out of the SIGINT handler (see notes in handle_sig_alarm). + * We delay this till after LockErrorCleanup so that we don't uselessly + * reschedule lock or deadlock check timeouts. + */ + reschedule_timeouts(); + + /* + * Re-enable signals, in case we got here by longjmp'ing out of a signal + * handler. We do this fairly early in the sequence so that the timeout + * infrastructure will be functional if needed while aborting. + */ + PG_SETMASK(&UnBlockSig); + + /* + * check the current transaction state + */ + ShowTransactionState("AbortSubTransaction"); + + if (s->state != TRANS_INPROGRESS) + elog(WARNING, "AbortSubTransaction while in %s state", + TransStateAsString(s->state)); + + s->state = TRANS_ABORT; + + /* + * Reset user ID which might have been changed transiently. (See notes in + * AbortTransaction.) + */ + SetUserIdAndSecContext(s->prevUser, s->prevSecContext); + + /* Forget about any active REINDEX. */ + ResetReindexState(s->nestingLevel); + + /* Reset logical streaming state. */ + ResetLogicalStreamingState(); + + /* + * No need for SnapBuildResetExportedSnapshotState() here, snapshot + * exports are not supported in subtransactions. + */ + + /* Exit from parallel mode, if necessary. */ + if (IsInParallelMode()) + { + AtEOSubXact_Parallel(false, s->subTransactionId); + s->parallelModeLevel = 0; + } + + /* + * We can skip all this stuff if the subxact failed before creating a + * ResourceOwner... + */ + if (s->curTransactionOwner) + { + AfterTriggerEndSubXact(false); + AtSubAbort_Portals(s->subTransactionId, + s->parent->subTransactionId, + s->curTransactionOwner, + s->parent->curTransactionOwner); + AtEOSubXact_LargeObject(false, s->subTransactionId, + s->parent->subTransactionId); + AtSubAbort_Notify(); + + /* Advertise the fact that we aborted in pg_xact. 
*/ + (void) RecordTransactionAbort(true); + + /* Post-abort cleanup */ + if (FullTransactionIdIsValid(s->fullTransactionId)) + AtSubAbort_childXids(); + + CallSubXactCallbacks(SUBXACT_EVENT_ABORT_SUB, s->subTransactionId, + s->parent->subTransactionId); + + ResourceOwnerRelease(s->curTransactionOwner, + RESOURCE_RELEASE_BEFORE_LOCKS, + false, false); + AtEOSubXact_RelationCache(false, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_Inval(false); + ResourceOwnerRelease(s->curTransactionOwner, + RESOURCE_RELEASE_LOCKS, + false, false); + ResourceOwnerRelease(s->curTransactionOwner, + RESOURCE_RELEASE_AFTER_LOCKS, + false, false); + AtSubAbort_smgr(); + + AtEOXact_GUC(false, s->gucNestLevel); + AtEOSubXact_SPI(false, s->subTransactionId); + AtEOSubXact_on_commit_actions(false, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_Namespace(false, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_Files(false, s->subTransactionId, + s->parent->subTransactionId); + AtEOSubXact_HashTables(false, s->nestingLevel); + AtEOSubXact_PgStat(false, s->nestingLevel); + AtSubAbort_Snapshot(s->nestingLevel); + } + + /* + * Restore the upper transaction's read-only state, too. This should be + * redundant with GUC's cleanup but we may as well do it for consistency + * with the commit case. + */ + XactReadOnly = s->prevXactReadOnly; + + RESUME_INTERRUPTS(); +} + +/* + * CleanupSubTransaction + * + * The caller has to make sure to always reassign CurrentTransactionState + * if it has a local pointer to it after calling this function. + */ +static void +CleanupSubTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + ShowTransactionState("CleanupSubTransaction"); + + if (s->state != TRANS_ABORT) + elog(WARNING, "CleanupSubTransaction while in %s state", + TransStateAsString(s->state)); + + AtSubCleanup_Portals(s->subTransactionId); + + CurrentResourceOwner = s->parent->curTransactionOwner; + CurTransactionResourceOwner = s->parent->curTransactionOwner; + if (s->curTransactionOwner) + ResourceOwnerDelete(s->curTransactionOwner); + s->curTransactionOwner = NULL; + + AtSubCleanup_Memory(); + + s->state = TRANS_DEFAULT; + + PopTransaction(); +} + +/* + * PushTransaction + * Create transaction state stack entry for a subtransaction + * + * The caller has to make sure to always reassign CurrentTransactionState + * if it has a local pointer to it after calling this function. + */ +static void +PushTransaction(void) +{ + TransactionState p = CurrentTransactionState; + TransactionState s; + + /* + * We keep subtransaction state nodes in TopTransactionContext. + */ + s = (TransactionState) + MemoryContextAllocZero(TopTransactionContext, + sizeof(TransactionStateData)); + + /* + * Assign a subtransaction ID, watching out for counter wraparound. + */ + currentSubTransactionId += 1; + if (currentSubTransactionId == InvalidSubTransactionId) + { + currentSubTransactionId -= 1; + pfree(s); + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot have more than 2^32-1 subtransactions in a transaction"))); + } + + /* + * We can now stack a minimally valid subtransaction without fear of + * failure. 
+ */ + s->fullTransactionId = InvalidFullTransactionId; /* until assigned */ + s->subTransactionId = currentSubTransactionId; + s->parent = p; + s->nestingLevel = p->nestingLevel + 1; + s->gucNestLevel = NewGUCNestLevel(); + s->savepointLevel = p->savepointLevel; + s->state = TRANS_DEFAULT; + s->blockState = TBLOCK_SUBBEGIN; + GetUserIdAndSecContext(&s->prevUser, &s->prevSecContext); + s->prevXactReadOnly = XactReadOnly; + s->parallelModeLevel = 0; + s->topXidLogged = false; + + CurrentTransactionState = s; + + /* + * AbortSubTransaction and CleanupSubTransaction have to be able to cope + * with the subtransaction from here on out; in particular they should not + * assume that it necessarily has a transaction context, resource owner, + * or XID. + */ +} + +/* + * PopTransaction + * Pop back to parent transaction state + * + * The caller has to make sure to always reassign CurrentTransactionState + * if it has a local pointer to it after calling this function. + */ +static void +PopTransaction(void) +{ + TransactionState s = CurrentTransactionState; + + if (s->state != TRANS_DEFAULT) + elog(WARNING, "PopTransaction while in %s state", + TransStateAsString(s->state)); + + if (s->parent == NULL) + elog(FATAL, "PopTransaction with no parent"); + + CurrentTransactionState = s->parent; + + /* Let's just make sure CurTransactionContext is good */ + CurTransactionContext = s->parent->curTransactionContext; + MemoryContextSwitchTo(CurTransactionContext); + + /* Ditto for ResourceOwner links */ + CurTransactionResourceOwner = s->parent->curTransactionOwner; + CurrentResourceOwner = s->parent->curTransactionOwner; + + /* Free the old child structure */ + if (s->name) + pfree(s->name); + pfree(s); +} + +/* + * EstimateTransactionStateSpace + * Estimate the amount of space that will be needed by + * SerializeTransactionState. It would be OK to overestimate slightly, + * but it's simple for us to work out the precise value, so we do. + */ +Size +EstimateTransactionStateSpace(void) +{ + TransactionState s; + Size nxids = 0; + Size size = SerializedTransactionStateHeaderSize; + + for (s = CurrentTransactionState; s != NULL; s = s->parent) + { + if (FullTransactionIdIsValid(s->fullTransactionId)) + nxids = add_size(nxids, 1); + nxids = add_size(nxids, s->nChildXids); + } + + return add_size(size, mul_size(sizeof(TransactionId), nxids)); +} + +/* + * SerializeTransactionState + * Write out relevant details of our transaction state that will be + * needed by a parallel worker. + * + * We need to save and restore XactDeferrable, XactIsoLevel, and the XIDs + * associated with this transaction. These are serialized into a + * caller-supplied buffer big enough to hold the number of bytes reported by + * EstimateTransactionStateSpace(). We emit the XIDs in sorted order for the + * convenience of the receiving process. + */ +void +SerializeTransactionState(Size maxsize, char *start_address) +{ + TransactionState s; + Size nxids = 0; + Size i = 0; + TransactionId *workspace; + SerializedTransactionState *result; + + result = (SerializedTransactionState *) start_address; + + result->xactIsoLevel = XactIsoLevel; + result->xactDeferrable = XactDeferrable; + result->topFullTransactionId = XactTopFullTransactionId; + result->currentFullTransactionId = + CurrentTransactionState->fullTransactionId; + result->currentCommandId = currentCommandId; + + /* + * If we're running in a parallel worker and launching a parallel worker + * of our own, we can just pass along the information that was passed to + * us. 
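+ * Either way, the result has the same shape: a SerializedTransactionState
+ * header followed by nParallelCurrentXids TransactionIds in sorted order,
+ * so that workers can binary-search the array (see
+ * TransactionIdIsCurrentTransactionId).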
+ */ + if (nParallelCurrentXids > 0) + { + result->nParallelCurrentXids = nParallelCurrentXids; + memcpy(&result->parallelCurrentXids[0], ParallelCurrentXids, + nParallelCurrentXids * sizeof(TransactionId)); + return; + } + + /* + * OK, we need to generate a sorted list of XIDs that our workers should + * view as current. First, figure out how many there are. + */ + for (s = CurrentTransactionState; s != NULL; s = s->parent) + { + if (FullTransactionIdIsValid(s->fullTransactionId)) + nxids = add_size(nxids, 1); + nxids = add_size(nxids, s->nChildXids); + } + Assert(SerializedTransactionStateHeaderSize + nxids * sizeof(TransactionId) + <= maxsize); + + /* Copy them to our scratch space. */ + workspace = palloc(nxids * sizeof(TransactionId)); + for (s = CurrentTransactionState; s != NULL; s = s->parent) + { + if (FullTransactionIdIsValid(s->fullTransactionId)) + workspace[i++] = XidFromFullTransactionId(s->fullTransactionId); + if (s->nChildXids > 0) + memcpy(&workspace[i], s->childXids, + s->nChildXids * sizeof(TransactionId)); + i += s->nChildXids; + } + Assert(i == nxids); + + /* Sort them. */ + qsort(workspace, nxids, sizeof(TransactionId), xidComparator); + + /* Copy data into output area. */ + result->nParallelCurrentXids = nxids; + memcpy(&result->parallelCurrentXids[0], workspace, + nxids * sizeof(TransactionId)); +} + +/* + * StartParallelWorkerTransaction + * Start a parallel worker transaction, restoring the relevant + * transaction state serialized by SerializeTransactionState. + */ +void +StartParallelWorkerTransaction(char *tstatespace) +{ + SerializedTransactionState *tstate; + + Assert(CurrentTransactionState->blockState == TBLOCK_DEFAULT); + StartTransaction(); + + tstate = (SerializedTransactionState *) tstatespace; + XactIsoLevel = tstate->xactIsoLevel; + XactDeferrable = tstate->xactDeferrable; + XactTopFullTransactionId = tstate->topFullTransactionId; + CurrentTransactionState->fullTransactionId = + tstate->currentFullTransactionId; + currentCommandId = tstate->currentCommandId; + nParallelCurrentXids = tstate->nParallelCurrentXids; + ParallelCurrentXids = &tstate->parallelCurrentXids[0]; + + CurrentTransactionState->blockState = TBLOCK_PARALLEL_INPROGRESS; +} + +/* + * EndParallelWorkerTransaction + * End a parallel worker transaction. + */ +void +EndParallelWorkerTransaction(void) +{ + Assert(CurrentTransactionState->blockState == TBLOCK_PARALLEL_INPROGRESS); + CommitTransaction(); + CurrentTransactionState->blockState = TBLOCK_DEFAULT; +} + +/* + * ShowTransactionState + * Debug support + */ +static void +ShowTransactionState(const char *str) +{ + /* skip work if message will definitely not be printed */ + if (message_level_is_interesting(DEBUG5)) + ShowTransactionStateRec(str, CurrentTransactionState); +} + +/* + * ShowTransactionStateRec + * Recursive subroutine for ShowTransactionState + */ +static void +ShowTransactionStateRec(const char *str, TransactionState s) +{ + StringInfoData buf; + + initStringInfo(&buf); + + if (s->nChildXids > 0) + { + int i; + + appendStringInfo(&buf, ", children: %u", s->childXids[0]); + for (i = 1; i < s->nChildXids; i++) + appendStringInfo(&buf, " %u", s->childXids[i]); + } + + if (s->parent) + ShowTransactionStateRec(str, s->parent); + + ereport(DEBUG5, + (errmsg_internal("%s(%d) name: %s; blockState: %s; state: %s, xid/subid/cid: %u/%u/%u%s%s", + str, s->nestingLevel, + PointerIsValid(s->name) ? 
s->name : "unnamed", + BlockStateAsString(s->blockState), + TransStateAsString(s->state), + (unsigned int) XidFromFullTransactionId(s->fullTransactionId), + (unsigned int) s->subTransactionId, + (unsigned int) currentCommandId, + currentCommandIdUsed ? " (used)" : "", + buf.data))); + + pfree(buf.data); +} + +/* + * BlockStateAsString + * Debug support + */ +static const char * +BlockStateAsString(TBlockState blockState) +{ + switch (blockState) + { + case TBLOCK_DEFAULT: + return "DEFAULT"; + case TBLOCK_STARTED: + return "STARTED"; + case TBLOCK_BEGIN: + return "BEGIN"; + case TBLOCK_INPROGRESS: + return "INPROGRESS"; + case TBLOCK_IMPLICIT_INPROGRESS: + return "IMPLICIT_INPROGRESS"; + case TBLOCK_PARALLEL_INPROGRESS: + return "PARALLEL_INPROGRESS"; + case TBLOCK_END: + return "END"; + case TBLOCK_ABORT: + return "ABORT"; + case TBLOCK_ABORT_END: + return "ABORT_END"; + case TBLOCK_ABORT_PENDING: + return "ABORT_PENDING"; + case TBLOCK_PREPARE: + return "PREPARE"; + case TBLOCK_SUBBEGIN: + return "SUBBEGIN"; + case TBLOCK_SUBINPROGRESS: + return "SUBINPROGRESS"; + case TBLOCK_SUBRELEASE: + return "SUBRELEASE"; + case TBLOCK_SUBCOMMIT: + return "SUBCOMMIT"; + case TBLOCK_SUBABORT: + return "SUBABORT"; + case TBLOCK_SUBABORT_END: + return "SUBABORT_END"; + case TBLOCK_SUBABORT_PENDING: + return "SUBABORT_PENDING"; + case TBLOCK_SUBRESTART: + return "SUBRESTART"; + case TBLOCK_SUBABORT_RESTART: + return "SUBABORT_RESTART"; + } + return "UNRECOGNIZED"; +} + +/* + * TransStateAsString + * Debug support + */ +static const char * +TransStateAsString(TransState state) +{ + switch (state) + { + case TRANS_DEFAULT: + return "DEFAULT"; + case TRANS_START: + return "START"; + case TRANS_INPROGRESS: + return "INPROGRESS"; + case TRANS_COMMIT: + return "COMMIT"; + case TRANS_ABORT: + return "ABORT"; + case TRANS_PREPARE: + return "PREPARE"; + } + return "UNRECOGNIZED"; +} + +/* + * xactGetCommittedChildren + * + * Gets the list of committed children of the current transaction. The return + * value is the number of child transactions. *ptr is set to point to an + * array of TransactionIds. The array is allocated in TopTransactionContext; + * the caller should *not* pfree() it (this is a change from pre-8.4 code!). + * If there are no subxacts, *ptr is set to NULL. + */ +int +xactGetCommittedChildren(TransactionId **ptr) +{ + TransactionState s = CurrentTransactionState; + + if (s->nChildXids == 0) + *ptr = NULL; + else + *ptr = s->childXids; + + return s->nChildXids; +} + +/* + * XLOG support routines + */ + + +/* + * Log the commit record for a plain or twophase transaction commit. + * + * A 2pc commit will be emitted when twophase_xid is valid, a plain one + * otherwise. 
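+ *
+ * The record starts with a fixed xl_xact_commit struct; everything else is
+ * optional.  When any optional data is present, XLOG_XACT_HAS_INFO is set
+ * and an xl_xact_xinfo word follows, whose XACT_XINFO_HAS_* flags say which
+ * of the remaining chunks (dbinfo, subxacts, relfilenodes, dropped stats,
+ * invalidation messages, twophase xid/GID, origin) were appended.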
+ */ +XLogRecPtr +XactLogCommitRecord(TimestampTz commit_time, + int nsubxacts, TransactionId *subxacts, + int nrels, RelFileNode *rels, + int ndroppedstats, xl_xact_stats_item *droppedstats, + int nmsgs, SharedInvalidationMessage *msgs, + bool relcacheInval, + int xactflags, TransactionId twophase_xid, + const char *twophase_gid) +{ + xl_xact_commit xlrec; + xl_xact_xinfo xl_xinfo; + xl_xact_dbinfo xl_dbinfo; + xl_xact_subxacts xl_subxacts; + xl_xact_relfilenodes xl_relfilenodes; + xl_xact_stats_items xl_dropped_stats; + xl_xact_invals xl_invals; + xl_xact_twophase xl_twophase; + xl_xact_origin xl_origin; + uint8 info; + + Assert(CritSectionCount > 0); + + xl_xinfo.xinfo = 0; + + /* decide between a plain and 2pc commit */ + if (!TransactionIdIsValid(twophase_xid)) + info = XLOG_XACT_COMMIT; + else + info = XLOG_XACT_COMMIT_PREPARED; + + /* First figure out and collect all the information needed */ + + xlrec.xact_time = commit_time; + + if (relcacheInval) + xl_xinfo.xinfo |= XACT_COMPLETION_UPDATE_RELCACHE_FILE; + if (forceSyncCommit) + xl_xinfo.xinfo |= XACT_COMPLETION_FORCE_SYNC_COMMIT; + if ((xactflags & XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK)) + xl_xinfo.xinfo |= XACT_XINFO_HAS_AE_LOCKS; + + /* + * Check if the caller would like to ask standbys for immediate feedback + * once this commit is applied. + */ + if (synchronous_commit >= SYNCHRONOUS_COMMIT_REMOTE_APPLY) + xl_xinfo.xinfo |= XACT_COMPLETION_APPLY_FEEDBACK; + + /* + * Relcache invalidations requires information about the current database + * and so does logical decoding. + */ + if (nmsgs > 0 || XLogLogicalInfoActive()) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_DBINFO; + xl_dbinfo.dbId = MyDatabaseId; + xl_dbinfo.tsId = MyDatabaseTableSpace; + } + + if (nsubxacts > 0) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_SUBXACTS; + xl_subxacts.nsubxacts = nsubxacts; + } + + if (nrels > 0) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_RELFILENODES; + xl_relfilenodes.nrels = nrels; + info |= XLR_SPECIAL_REL_UPDATE; + } + + if (ndroppedstats > 0) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_DROPPED_STATS; + xl_dropped_stats.nitems = ndroppedstats; + } + + if (nmsgs > 0) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_INVALS; + xl_invals.nmsgs = nmsgs; + } + + if (TransactionIdIsValid(twophase_xid)) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_TWOPHASE; + xl_twophase.xid = twophase_xid; + Assert(twophase_gid != NULL); + + if (XLogLogicalInfoActive()) + xl_xinfo.xinfo |= XACT_XINFO_HAS_GID; + } + + /* dump transaction origin information */ + if (replorigin_session_origin != InvalidRepOriginId) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_ORIGIN; + + xl_origin.origin_lsn = replorigin_session_origin_lsn; + xl_origin.origin_timestamp = replorigin_session_origin_timestamp; + } + + if (xl_xinfo.xinfo != 0) + info |= XLOG_XACT_HAS_INFO; + + /* Then include all the collected data into the commit record. 
*/ + + XLogBeginInsert(); + + XLogRegisterData((char *) (&xlrec), sizeof(xl_xact_commit)); + + if (xl_xinfo.xinfo != 0) + XLogRegisterData((char *) (&xl_xinfo.xinfo), sizeof(xl_xinfo.xinfo)); + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_DBINFO) + XLogRegisterData((char *) (&xl_dbinfo), sizeof(xl_dbinfo)); + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_SUBXACTS) + { + XLogRegisterData((char *) (&xl_subxacts), + MinSizeOfXactSubxacts); + XLogRegisterData((char *) subxacts, + nsubxacts * sizeof(TransactionId)); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_RELFILENODES) + { + XLogRegisterData((char *) (&xl_relfilenodes), + MinSizeOfXactRelfilenodes); + XLogRegisterData((char *) rels, + nrels * sizeof(RelFileNode)); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_DROPPED_STATS) + { + XLogRegisterData((char *) (&xl_dropped_stats), + MinSizeOfXactStatsItems); + XLogRegisterData((char *) droppedstats, + ndroppedstats * sizeof(xl_xact_stats_item)); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_INVALS) + { + XLogRegisterData((char *) (&xl_invals), MinSizeOfXactInvals); + XLogRegisterData((char *) msgs, + nmsgs * sizeof(SharedInvalidationMessage)); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_TWOPHASE) + { + XLogRegisterData((char *) (&xl_twophase), sizeof(xl_xact_twophase)); + if (xl_xinfo.xinfo & XACT_XINFO_HAS_GID) + XLogRegisterData(unconstify(char *, twophase_gid), strlen(twophase_gid) + 1); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_ORIGIN) + XLogRegisterData((char *) (&xl_origin), sizeof(xl_xact_origin)); + + /* we allow filtering by xacts */ + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + return XLogInsert(RM_XACT_ID, info); +} + +/* + * Log the commit record for a plain or twophase transaction abort. + * + * A 2pc abort will be emitted when twophase_xid is valid, a plain one + * otherwise. 
+ */ +XLogRecPtr +XactLogAbortRecord(TimestampTz abort_time, + int nsubxacts, TransactionId *subxacts, + int nrels, RelFileNode *rels, + int ndroppedstats, xl_xact_stats_item *droppedstats, + int xactflags, TransactionId twophase_xid, + const char *twophase_gid) +{ + xl_xact_abort xlrec; + xl_xact_xinfo xl_xinfo; + xl_xact_subxacts xl_subxacts; + xl_xact_relfilenodes xl_relfilenodes; + xl_xact_stats_items xl_dropped_stats; + xl_xact_twophase xl_twophase; + xl_xact_dbinfo xl_dbinfo; + xl_xact_origin xl_origin; + + uint8 info; + + Assert(CritSectionCount > 0); + + xl_xinfo.xinfo = 0; + + /* decide between a plain and 2pc abort */ + if (!TransactionIdIsValid(twophase_xid)) + info = XLOG_XACT_ABORT; + else + info = XLOG_XACT_ABORT_PREPARED; + + + /* First figure out and collect all the information needed */ + + xlrec.xact_time = abort_time; + + if ((xactflags & XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK)) + xl_xinfo.xinfo |= XACT_XINFO_HAS_AE_LOCKS; + + if (nsubxacts > 0) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_SUBXACTS; + xl_subxacts.nsubxacts = nsubxacts; + } + + if (nrels > 0) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_RELFILENODES; + xl_relfilenodes.nrels = nrels; + info |= XLR_SPECIAL_REL_UPDATE; + } + + if (ndroppedstats > 0) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_DROPPED_STATS; + xl_dropped_stats.nitems = ndroppedstats; + } + + if (TransactionIdIsValid(twophase_xid)) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_TWOPHASE; + xl_twophase.xid = twophase_xid; + Assert(twophase_gid != NULL); + + if (XLogLogicalInfoActive()) + xl_xinfo.xinfo |= XACT_XINFO_HAS_GID; + } + + if (TransactionIdIsValid(twophase_xid) && XLogLogicalInfoActive()) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_DBINFO; + xl_dbinfo.dbId = MyDatabaseId; + xl_dbinfo.tsId = MyDatabaseTableSpace; + } + + /* + * Dump transaction origin information only for abort prepared. We need + * this during recovery to update the replication origin progress. + */ + if ((replorigin_session_origin != InvalidRepOriginId) && + TransactionIdIsValid(twophase_xid)) + { + xl_xinfo.xinfo |= XACT_XINFO_HAS_ORIGIN; + + xl_origin.origin_lsn = replorigin_session_origin_lsn; + xl_origin.origin_timestamp = replorigin_session_origin_timestamp; + } + + if (xl_xinfo.xinfo != 0) + info |= XLOG_XACT_HAS_INFO; + + /* Then include all the collected data into the abort record. 
*/ + + XLogBeginInsert(); + + XLogRegisterData((char *) (&xlrec), MinSizeOfXactAbort); + + if (xl_xinfo.xinfo != 0) + XLogRegisterData((char *) (&xl_xinfo), sizeof(xl_xinfo)); + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_DBINFO) + XLogRegisterData((char *) (&xl_dbinfo), sizeof(xl_dbinfo)); + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_SUBXACTS) + { + XLogRegisterData((char *) (&xl_subxacts), + MinSizeOfXactSubxacts); + XLogRegisterData((char *) subxacts, + nsubxacts * sizeof(TransactionId)); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_RELFILENODES) + { + XLogRegisterData((char *) (&xl_relfilenodes), + MinSizeOfXactRelfilenodes); + XLogRegisterData((char *) rels, + nrels * sizeof(RelFileNode)); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_DROPPED_STATS) + { + XLogRegisterData((char *) (&xl_dropped_stats), + MinSizeOfXactStatsItems); + XLogRegisterData((char *) droppedstats, + ndroppedstats * sizeof(xl_xact_stats_item)); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_TWOPHASE) + { + XLogRegisterData((char *) (&xl_twophase), sizeof(xl_xact_twophase)); + if (xl_xinfo.xinfo & XACT_XINFO_HAS_GID) + XLogRegisterData(unconstify(char *, twophase_gid), strlen(twophase_gid) + 1); + } + + if (xl_xinfo.xinfo & XACT_XINFO_HAS_ORIGIN) + XLogRegisterData((char *) (&xl_origin), sizeof(xl_xact_origin)); + + if (TransactionIdIsValid(twophase_xid)) + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + return XLogInsert(RM_XACT_ID, info); +} + +/* + * Before 9.0 this was a fairly short function, but now it performs many + * actions for which the order of execution is critical. + */ +static void +xact_redo_commit(xl_xact_parsed_commit *parsed, + TransactionId xid, + XLogRecPtr lsn, + RepOriginId origin_id) +{ + TransactionId max_xid; + TimestampTz commit_time; + + Assert(TransactionIdIsValid(xid)); + + max_xid = TransactionIdLatest(xid, parsed->nsubxacts, parsed->subxacts); + + /* Make sure nextXid is beyond any XID mentioned in the record. */ + AdvanceNextFullTransactionIdPastXid(max_xid); + + Assert(((parsed->xinfo & XACT_XINFO_HAS_ORIGIN) == 0) == + (origin_id == InvalidRepOriginId)); + + if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN) + commit_time = parsed->origin_timestamp; + else + commit_time = parsed->xact_time; + + /* Set the transaction commit timestamp and metadata */ + TransactionTreeSetCommitTsData(xid, parsed->nsubxacts, parsed->subxacts, + commit_time, origin_id); + + if (standbyState == STANDBY_DISABLED) + { + /* + * Mark the transaction committed in pg_xact. + */ + TransactionIdCommitTree(xid, parsed->nsubxacts, parsed->subxacts); + } + else + { + /* + * If a transaction completion record arrives that has as-yet + * unobserved subtransactions then this will not have been fully + * handled by the call to RecordKnownAssignedTransactionIds() in the + * main recovery loop in xlog.c. So we need to do bookkeeping again to + * cover that case. This is confusing and it is easy to think this + * call is irrelevant, which has happened three times in development + * already. Leave it in. + */ + RecordKnownAssignedTransactionIds(max_xid); + + /* + * Mark the transaction committed in pg_xact. We use async commit + * protocol during recovery to provide information on database + * consistency for when users try to set hint bits. It is important + * that we do not set hint bits until the minRecoveryPoint is past + * this commit record. This ensures that if we crash we don't see hint + * bits set on changes made by transactions that haven't yet + * recovered. It's unlikely but it's good to be safe. 
+ */ + TransactionIdAsyncCommitTree(xid, parsed->nsubxacts, parsed->subxacts, lsn); + + /* + * We must mark clog before we update the ProcArray. + */ + ExpireTreeKnownAssignedTransactionIds(xid, parsed->nsubxacts, parsed->subxacts, max_xid); + + /* + * Send any cache invalidations attached to the commit. We must + * maintain the same order of invalidation then release locks as + * occurs in CommitTransaction(). + */ + ProcessCommittedInvalidationMessages(parsed->msgs, parsed->nmsgs, + XactCompletionRelcacheInitFileInval(parsed->xinfo), + parsed->dbId, parsed->tsId); + + /* + * Release locks, if any. We do this for both two phase and normal one + * phase transactions. In effect we are ignoring the prepare phase and + * just going straight to lock release. + */ + if (parsed->xinfo & XACT_XINFO_HAS_AE_LOCKS) + StandbyReleaseLockTree(xid, parsed->nsubxacts, parsed->subxacts); + } + + if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN) + { + /* recover apply progress */ + replorigin_advance(origin_id, parsed->origin_lsn, lsn, + false /* backward */ , false /* WAL */ ); + } + + /* Make sure files supposed to be dropped are dropped */ + if (parsed->nrels > 0) + { + /* + * First update minimum recovery point to cover this WAL record. Once + * a relation is deleted, there's no going back. The buffer manager + * enforces the WAL-first rule for normal updates to relation files, + * so that the minimum recovery point is always updated before the + * corresponding change in the data file is flushed to disk, but we + * have to do the same here since we're bypassing the buffer manager. + * + * Doing this before deleting the files means that if a deletion fails + * for some reason, you cannot start up the system even after restart, + * until you fix the underlying situation so that the deletion will + * succeed. Alternatively, we could update the minimum recovery point + * after deletion, but that would leave a small window where the + * WAL-first rule would be violated. + */ + XLogFlush(lsn); + + /* Make sure files supposed to be dropped are dropped */ + DropRelationFiles(parsed->xnodes, parsed->nrels, true); + } + + if (parsed->nstats > 0) + { + /* see equivalent call for relations above */ + XLogFlush(lsn); + + pgstat_execute_transactional_drops(parsed->nstats, parsed->stats, true); + } + + /* + * We issue an XLogFlush() for the same reason we emit ForceSyncCommit() + * in normal operation. For example, in CREATE DATABASE, we copy all files + * from the template database, and then commit the transaction. If we + * crash after all the files have been copied but before the commit, you + * have files in the data directory without an entry in pg_database. To + * minimize the window for that, we use ForceSyncCommit() to rush the + * commit record to disk as quick as possible. We have the same window + * during recovery, and forcing an XLogFlush() (which updates + * minRecoveryPoint during recovery) helps to reduce that problem window, + * for any user that requested ForceSyncCommit(). + */ + if (XactCompletionForceSyncCommit(parsed->xinfo)) + XLogFlush(lsn); + + /* + * If asked by the primary (because someone is waiting for a synchronous + * commit = remote_apply), we will need to ask walreceiver to send a reply + * immediately. + */ + if (XactCompletionApplyFeedback(parsed->xinfo)) + XLogRequestWalReceiverReply(); +} + +/* + * Be careful with the order of execution, as with xact_redo_commit(). + * The two functions are similar but differ in key places. 
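+ * In particular, an abort sets no commit timestamps, sends no invalidation
+ * messages, and has no equivalent of the ForceSyncCommit or remote_apply
+ * feedback handling done for commits.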
+ * + * Note also that an abort can be for a subtransaction and its children, + * not just for a top level abort. That means we have to consider + * topxid != xid, whereas in commit we would find topxid == xid always + * because subtransaction commit is never WAL logged. + */ +static void +xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid, + XLogRecPtr lsn, RepOriginId origin_id) +{ + TransactionId max_xid; + + Assert(TransactionIdIsValid(xid)); + + /* Make sure nextXid is beyond any XID mentioned in the record. */ + max_xid = TransactionIdLatest(xid, + parsed->nsubxacts, + parsed->subxacts); + AdvanceNextFullTransactionIdPastXid(max_xid); + + if (standbyState == STANDBY_DISABLED) + { + /* Mark the transaction aborted in pg_xact, no need for async stuff */ + TransactionIdAbortTree(xid, parsed->nsubxacts, parsed->subxacts); + } + else + { + /* + * If a transaction completion record arrives that has as-yet + * unobserved subtransactions then this will not have been fully + * handled by the call to RecordKnownAssignedTransactionIds() in the + * main recovery loop in xlog.c. So we need to do bookkeeping again to + * cover that case. This is confusing and it is easy to think this + * call is irrelevant, which has happened three times in development + * already. Leave it in. + */ + RecordKnownAssignedTransactionIds(max_xid); + + /* Mark the transaction aborted in pg_xact, no need for async stuff */ + TransactionIdAbortTree(xid, parsed->nsubxacts, parsed->subxacts); + + /* + * We must update the ProcArray after we have marked clog. + */ + ExpireTreeKnownAssignedTransactionIds(xid, parsed->nsubxacts, parsed->subxacts, max_xid); + + /* + * There are no invalidation messages to send or undo. + */ + + /* + * Release locks, if any. There are no invalidations to send. + */ + if (parsed->xinfo & XACT_XINFO_HAS_AE_LOCKS) + StandbyReleaseLockTree(xid, parsed->nsubxacts, parsed->subxacts); + } + + if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN) + { + /* recover apply progress */ + replorigin_advance(origin_id, parsed->origin_lsn, lsn, + false /* backward */ , false /* WAL */ ); + } + + /* Make sure files supposed to be dropped are dropped */ + if (parsed->nrels > 0) + { + /* + * See comments about update of minimum recovery point on truncation, + * in xact_redo_commit(). + */ + XLogFlush(lsn); + + DropRelationFiles(parsed->xnodes, parsed->nrels, true); + } + + if (parsed->nstats > 0) + { + /* see equivalent call for relations above */ + XLogFlush(lsn); + + pgstat_execute_transactional_drops(parsed->nstats, parsed->stats, true); + } +} + +void +xact_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; + + /* Backup blocks are not used in xact records */ + Assert(!XLogRecHasAnyBlockRefs(record)); + + if (info == XLOG_XACT_COMMIT) + { + xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); + xl_xact_parsed_commit parsed; + + ParseCommitRecord(XLogRecGetInfo(record), xlrec, &parsed); + xact_redo_commit(&parsed, XLogRecGetXid(record), + record->EndRecPtr, XLogRecGetOrigin(record)); + } + else if (info == XLOG_XACT_COMMIT_PREPARED) + { + xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); + xl_xact_parsed_commit parsed; + + ParseCommitRecord(XLogRecGetInfo(record), xlrec, &parsed); + xact_redo_commit(&parsed, parsed.twophase_xid, + record->EndRecPtr, XLogRecGetOrigin(record)); + + /* Delete TwoPhaseState gxact entry and/or 2PC file. 
*/ + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + PrepareRedoRemove(parsed.twophase_xid, false); + LWLockRelease(TwoPhaseStateLock); + } + else if (info == XLOG_XACT_ABORT) + { + xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); + xl_xact_parsed_abort parsed; + + ParseAbortRecord(XLogRecGetInfo(record), xlrec, &parsed); + xact_redo_abort(&parsed, XLogRecGetXid(record), + record->EndRecPtr, XLogRecGetOrigin(record)); + } + else if (info == XLOG_XACT_ABORT_PREPARED) + { + xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); + xl_xact_parsed_abort parsed; + + ParseAbortRecord(XLogRecGetInfo(record), xlrec, &parsed); + xact_redo_abort(&parsed, parsed.twophase_xid, + record->EndRecPtr, XLogRecGetOrigin(record)); + + /* Delete TwoPhaseState gxact entry and/or 2PC file. */ + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + PrepareRedoRemove(parsed.twophase_xid, false); + LWLockRelease(TwoPhaseStateLock); + } + else if (info == XLOG_XACT_PREPARE) + { + /* + * Store xid and start/end pointers of the WAL record in TwoPhaseState + * gxact entry. + */ + LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); + PrepareRedoAdd(XLogRecGetData(record), + record->ReadRecPtr, + record->EndRecPtr, + XLogRecGetOrigin(record)); + LWLockRelease(TwoPhaseStateLock); + } + else if (info == XLOG_XACT_ASSIGNMENT) + { + xl_xact_assignment *xlrec = (xl_xact_assignment *) XLogRecGetData(record); + + if (standbyState >= STANDBY_INITIALIZED) + ProcArrayApplyXidAssignment(xlrec->xtop, + xlrec->nsubxacts, xlrec->xsub); + } + else if (info == XLOG_XACT_INVALIDATIONS) + { + /* + * XXX we do ignore this for now, what matters are invalidations + * written into the commit record. + */ + } + else + elog(PANIC, "xact_redo: unknown op code %u", info); +} diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c new file mode 100644 index 0000000..59f94b0 --- /dev/null +++ b/src/backend/access/transam/xlog.c @@ -0,0 +1,8906 @@ +/*------------------------------------------------------------------------- + * + * xlog.c + * PostgreSQL write-ahead log manager + * + * The Write-Ahead Log (WAL) functionality is split into several source + * files, in addition to this one: + * + * xloginsert.c - Functions for constructing WAL records + * xlogrecovery.c - WAL recovery and standby code + * xlogreader.c - Facility for reading WAL files and parsing WAL records + * xlogutils.c - Helper functions for WAL redo routines + * + * This file contains functions for coordinating database startup and + * checkpointing, and managing the write-ahead log buffers when the + * system is running. + * + * StartupXLOG() is the main entry point of the startup process. It + * coordinates database startup, performing WAL recovery, and the + * transition from WAL recovery into normal operations. + * + * XLogInsertRecord() inserts a WAL record into the WAL buffers. Most + * callers should not call this directly, but use the functions in + * xloginsert.c to construct the WAL record. XLogFlush() can be used + * to force the WAL to disk. + * + * In addition to those, there are many other functions for interrogating + * the current system state, and for starting/stopping backups. 
+ * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/xlog.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <ctype.h> +#include <math.h> +#include <time.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <unistd.h> + +#include "access/clog.h" +#include "access/commit_ts.h" +#include "access/heaptoast.h" +#include "access/multixact.h" +#include "access/rewriteheap.h" +#include "access/subtrans.h" +#include "access/timeline.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "access/xlogarchive.h" +#include "access/xloginsert.h" +#include "access/xlogprefetcher.h" +#include "access/xlogreader.h" +#include "access/xlogrecovery.h" +#include "access/xlogutils.h" +#include "backup/basebackup.h" +#include "catalog/catversion.h" +#include "catalog/pg_control.h" +#include "catalog/pg_database.h" +#include "common/controldata_utils.h" +#include "common/file_utils.h" +#include "executor/instrument.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "port/atomics.h" +#include "port/pg_iovec.h" +#include "postmaster/bgwriter.h" +#include "postmaster/startup.h" +#include "postmaster/walwriter.h" +#include "replication/logical.h" +#include "replication/origin.h" +#include "replication/slot.h" +#include "replication/snapbuild.h" +#include "replication/walreceiver.h" +#include "replication/walsender.h" +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/large_object.h" +#include "storage/latch.h" +#include "storage/pmsignal.h" +#include "storage/predicate.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/reinit.h" +#include "storage/smgr.h" +#include "storage/spin.h" +#include "storage/sync.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" +#include "utils/relmapper.h" +#include "utils/pg_rusage.h" +#include "utils/snapmgr.h" +#include "utils/timeout.h" +#include "utils/timestamp.h" + +extern uint32 bootstrap_data_checksum_version; + +/* timeline ID to be used when bootstrapping */ +#define BootstrapTimeLineID 1 + +/* User-settable parameters */ +int max_wal_size_mb = 1024; /* 1 GB */ +int min_wal_size_mb = 80; /* 80 MB */ +int wal_keep_size_mb = 0; +int XLOGbuffers = -1; +int XLogArchiveTimeout = 0; +int XLogArchiveMode = ARCHIVE_MODE_OFF; +char *XLogArchiveCommand = NULL; +bool EnableHotStandby = false; +bool fullPageWrites = true; +bool wal_log_hints = false; +int wal_compression = WAL_COMPRESSION_NONE; +char *wal_consistency_checking_string = NULL; +bool *wal_consistency_checking = NULL; +bool wal_init_zero = true; +bool wal_recycle = true; +bool log_checkpoints = true; +int sync_method = DEFAULT_SYNC_METHOD; +int wal_level = WAL_LEVEL_MINIMAL; +int CommitDelay = 0; /* precommit delay in microseconds */ +int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ +int wal_retrieve_retry_interval = 5000; +int max_slot_wal_keep_size_mb = -1; +int wal_decode_buffer_size = 512 * 1024; +bool track_wal_io_timing = false; + +#ifdef WAL_DEBUG +bool XLOG_DEBUG = false; +#endif + +int wal_segment_size = DEFAULT_XLOG_SEG_SIZE; + +/* + * Number of WAL insertion locks to use. 
A higher value allows more insertions + * to happen concurrently, but adds some CPU overhead to flushing the WAL, + * which needs to iterate all the locks. + */ +#define NUM_XLOGINSERT_LOCKS 8 + +/* + * Max distance from last checkpoint, before triggering a new xlog-based + * checkpoint. + */ +int CheckPointSegments; + +/* Estimated distance between checkpoints, in bytes */ +static double CheckPointDistanceEstimate = 0; +static double PrevCheckPointDistance = 0; + +/* + * GUC support + */ +const struct config_enum_entry sync_method_options[] = { + {"fsync", SYNC_METHOD_FSYNC, false}, +#ifdef HAVE_FSYNC_WRITETHROUGH + {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false}, +#endif +#ifdef HAVE_FDATASYNC + {"fdatasync", SYNC_METHOD_FDATASYNC, false}, +#endif +#ifdef OPEN_SYNC_FLAG + {"open_sync", SYNC_METHOD_OPEN, false}, +#endif +#ifdef OPEN_DATASYNC_FLAG + {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false}, +#endif + {NULL, 0, false} +}; + + +/* + * Although only "on", "off", and "always" are documented, + * we accept all the likely variants of "on" and "off". + */ +const struct config_enum_entry archive_mode_options[] = { + {"always", ARCHIVE_MODE_ALWAYS, false}, + {"on", ARCHIVE_MODE_ON, false}, + {"off", ARCHIVE_MODE_OFF, false}, + {"true", ARCHIVE_MODE_ON, true}, + {"false", ARCHIVE_MODE_OFF, true}, + {"yes", ARCHIVE_MODE_ON, true}, + {"no", ARCHIVE_MODE_OFF, true}, + {"1", ARCHIVE_MODE_ON, true}, + {"0", ARCHIVE_MODE_OFF, true}, + {NULL, 0, false} +}; + +/* + * Statistics for current checkpoint are collected in this global struct. + * Because only the checkpointer or a stand-alone backend can perform + * checkpoints, this will be unused in normal backends. + */ +CheckpointStatsData CheckpointStats; + +/* + * During recovery, lastFullPageWrites keeps track of full_page_writes that + * the replayed WAL records indicate. It's initialized with full_page_writes + * that the recovery starting checkpoint record indicates, and then updated + * each time XLOG_FPW_CHANGE record is replayed. + */ +static bool lastFullPageWrites; + +/* + * Local copy of the state tracked by SharedRecoveryState in shared memory, + * It is false if SharedRecoveryState is RECOVERY_STATE_DONE. True actually + * means "not known, need to check the shared state". + */ +static bool LocalRecoveryInProgress = true; + +/* + * Local state for XLogInsertAllowed(): + * 1: unconditionally allowed to insert XLOG + * 0: unconditionally not allowed to insert XLOG + * -1: must check RecoveryInProgress(); disallow until it is false + * Most processes start with -1 and transition to 1 after seeing that recovery + * is not in progress. But we can also force the value for special cases. + * The coding in XLogInsertAllowed() depends on the first two of these states + * being numerically the same as bool true and false. + */ +static int LocalXLogInsertAllowed = -1; + +/* + * ProcLastRecPtr points to the start of the last XLOG record inserted by the + * current backend. It is updated for all inserts. XactLastRecEnd points to + * end+1 of the last record, and is reset when we end a top-level transaction, + * or start a new one; so it can be used to tell if the current transaction has + * created any XLOG records. + * + * While in parallel mode, this may not be fully up to date. When committing, + * a transaction can assume this covers all xlog records written either by the + * user backend or by any parallel worker which was present at any point during + * the transaction. 
But when aborting, or when still in parallel mode, other + * parallel backends may have written WAL records at later LSNs than the value + * stored here. The parallel leader advances its own copy, when necessary, + * in WaitForParallelWorkersToFinish. + */ +XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr; +XLogRecPtr XactLastRecEnd = InvalidXLogRecPtr; +XLogRecPtr XactLastCommitEnd = InvalidXLogRecPtr; + +/* + * RedoRecPtr is this backend's local copy of the REDO record pointer + * (which is almost but not quite the same as a pointer to the most recent + * CHECKPOINT record). We update this from the shared-memory copy, + * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we + * hold an insertion lock). See XLogInsertRecord for details. We are also + * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck; + * see GetRedoRecPtr. + * + * NB: Code that uses this variable must be prepared not only for the + * possibility that it may be arbitrarily out of date, but also for the + * possibility that it might be set to InvalidXLogRecPtr. We used to + * initialize it as a side effect of the first call to RecoveryInProgress(), + * which meant that most code that might use it could assume that it had a + * real if perhaps stale value. That's no longer the case. + */ +static XLogRecPtr RedoRecPtr; + +/* + * doPageWrites is this backend's local copy of (forcePageWrites || + * fullPageWrites). It is used together with RedoRecPtr to decide whether + * a full-page image of a page need to be taken. + * + * NB: Initially this is false, and there's no guarantee that it will be + * initialized to any other value before it is first used. Any code that + * makes use of it must recheck the value after obtaining a WALInsertLock, + * and respond appropriately if it turns out that the previous value wasn't + * accurate. + */ +static bool doPageWrites; + +/*---------- + * Shared-memory data structures for XLOG control + * + * LogwrtRqst indicates a byte position that we need to write and/or fsync + * the log up to (all records before that point must be written or fsynced). + * LogwrtResult indicates the byte positions we have already written/fsynced. + * These structs are identical but are declared separately to indicate their + * slightly different functions. + * + * To read XLogCtl->LogwrtResult, you must hold either info_lck or + * WALWriteLock. To update it, you need to hold both locks. The point of + * this arrangement is that the value can be examined by code that already + * holds WALWriteLock without needing to grab info_lck as well. In addition + * to the shared variable, each backend has a private copy of LogwrtResult, + * which is updated when convenient. + * + * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst + * (protected by info_lck), but we don't need to cache any copies of it. + * + * info_lck is only held long enough to read/update the protected variables, + * so it's a plain spinlock. The other locks are held longer (potentially + * over I/O operations), so we use LWLocks for them. These locks are: + * + * WALBufMappingLock: must be held to replace a page in the WAL buffer cache. + * It is only held while initializing and changing the mapping. If the + * contents of the buffer being replaced haven't been written yet, the mapping + * lock is released while the write is done, and reacquired afterwards. + * + * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or + * XLogFlush). 
+ *
+ * ControlFileLock: must be held to read/update control file or create
+ * new log file.
+ *
+ *----------
+ */
+
+typedef struct XLogwrtRqst
+{
+ XLogRecPtr Write; /* last byte + 1 to write out */
+ XLogRecPtr Flush; /* last byte + 1 to flush */
+} XLogwrtRqst;
+
+typedef struct XLogwrtResult
+{
+ XLogRecPtr Write; /* last byte + 1 written out */
+ XLogRecPtr Flush; /* last byte + 1 flushed */
+} XLogwrtResult;
+
+/*
+ * Inserting to WAL is protected by a small fixed number of WAL insertion
+ * locks. To insert to the WAL, you must hold one of the locks - it doesn't
+ * matter which one. To lock out other concurrent insertions, you must hold
+ * all of them. Each WAL insertion lock consists of a lightweight lock, plus an
+ * indicator of how far the insertion has progressed (insertingAt).
+ *
+ * The insertingAt values are read when a process wants to flush WAL from
+ * the in-memory buffers to disk, to check that all the insertions to the
+ * region the process is about to write out have finished. You could simply
+ * wait for all currently in-progress insertions to finish, but the
+ * insertingAt indicator allows you to ignore insertions to later in the WAL,
+ * so that you only wait for the insertions that are modifying the buffers
+ * you're about to write out.
+ *
+ * This isn't just an optimization. If all the WAL buffers are dirty, an
+ * inserter that's holding a WAL insert lock might need to evict an old WAL
+ * buffer, which requires flushing the WAL. If it's possible for an inserter
+ * to block on another inserter unnecessarily, deadlock can arise when two
+ * inserters holding a WAL insert lock wait for each other to finish their
+ * insertion.
+ *
+ * Small WAL records that don't cross a page boundary never update the value,
+ * the WAL record is just copied to the page and the lock is released. But
+ * to avoid the deadlock-scenario explained above, the indicator is always
+ * updated before sleeping while holding an insertion lock.
+ *
+ * lastImportantAt contains the LSN of the last important WAL record inserted
+ * using a given lock. This value is used to detect if there has been
+ * important WAL activity since the last time some action, like a checkpoint,
+ * was performed, allowing the action to be skipped if there has been none. The LSN is
+ * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
+ * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
+ * records. Tracking the WAL activity directly in WALInsertLock has the
+ * advantage of not needing any additional locks to update the value.
+ */
+typedef struct
+{
+ LWLock lock;
+ XLogRecPtr insertingAt;
+ XLogRecPtr lastImportantAt;
+} WALInsertLock;
+
+/*
+ * All the WAL insertion locks are allocated as an array in shared memory. We
+ * force the array stride to be a power of 2, which saves a few cycles in
+ * indexing, but more importantly also ensures that individual slots don't
+ * cross cache line boundaries. (Of course, we have to also ensure that the
+ * array start address is suitably aligned.)
+ */
+typedef union WALInsertLockPadded
+{
+ WALInsertLock l;
+ char pad[PG_CACHE_LINE_SIZE];
+} WALInsertLockPadded;
+
+/*
+ * Session status of running backup, used for sanity checks in SQL-callable
+ * functions to start and stop backups.
+ */
+static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
+
+/*
+ * Shared state data for WAL insertion.
+ */ +typedef struct XLogCtlInsert +{ + slock_t insertpos_lck; /* protects CurrBytePos and PrevBytePos */ + + /* + * CurrBytePos is the end of reserved WAL. The next record will be + * inserted at that position. PrevBytePos is the start position of the + * previously inserted (or rather, reserved) record - it is copied to the + * prev-link of the next record. These are stored as "usable byte + * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()). + */ + uint64 CurrBytePos; + uint64 PrevBytePos; + + /* + * Make sure the above heavily-contended spinlock and byte positions are + * on their own cache line. In particular, the RedoRecPtr and full page + * write variables below should be on a different cache line. They are + * read on every WAL insertion, but updated rarely, and we don't want + * those reads to steal the cache line containing Curr/PrevBytePos. + */ + char pad[PG_CACHE_LINE_SIZE]; + + /* + * fullPageWrites is the authoritative value used by all backends to + * determine whether to write full-page image to WAL. This shared value, + * instead of the process-local fullPageWrites, is required because, when + * full_page_writes is changed by SIGHUP, we must WAL-log it before it + * actually affects WAL-logging by backends. Checkpointer sets at startup + * or after SIGHUP. + * + * To read these fields, you must hold an insertion lock. To modify them, + * you must hold ALL the locks. + */ + XLogRecPtr RedoRecPtr; /* current redo point for insertions */ + bool forcePageWrites; /* forcing full-page writes for PITR? */ + bool fullPageWrites; + + /* + * runningBackups is a counter indicating the number of backups currently + * in progress. forcePageWrites is set to true when runningBackups is + * non-zero. lastBackupStart is the latest checkpoint redo location used + * as a starting point for an online backup. + */ + int runningBackups; + XLogRecPtr lastBackupStart; + + /* + * WAL insertion locks. + */ + WALInsertLockPadded *WALInsertLocks; +} XLogCtlInsert; + +/* + * Total shared-memory state for XLOG. + */ +typedef struct XLogCtlData +{ + XLogCtlInsert Insert; + + /* Protected by info_lck: */ + XLogwrtRqst LogwrtRqst; + XLogRecPtr RedoRecPtr; /* a recent copy of Insert->RedoRecPtr */ + FullTransactionId ckptFullXid; /* nextXid of latest checkpoint */ + XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */ + XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */ + + XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG segment */ + + /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */ + XLogRecPtr unloggedLSN; + slock_t ulsn_lck; + + /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */ + pg_time_t lastSegSwitchTime; + XLogRecPtr lastSegSwitchLSN; + + /* + * Protected by info_lck and WALWriteLock (you must hold either lock to + * read it, but both to update) + */ + XLogwrtResult LogwrtResult; + + /* + * Latest initialized page in the cache (last byte position + 1). + * + * To change the identity of a buffer (and InitializedUpTo), you need to + * hold WALBufMappingLock. To change the identity of a buffer that's + * still dirty, the old page needs to be written out first, and for that + * you need WALWriteLock, and you need to ensure that there are no + * in-progress insertions to the page by calling + * WaitXLogInsertionsToFinish(). + */ + XLogRecPtr InitializedUpTo; + + /* + * These values do not change after startup, although the pointed-to pages + * and xlblocks values certainly do. 
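Both the pad[] member of XLogCtlInsert and the WALInsertLockPadded union above rely on the same cache-line-padding trick. The following standalone sketch (hypothetical DemoInsertLock layout, assumed 128-byte cache line; not part of xlog.c) shows the idea in isolation and can be compiled on its own.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_CACHE_LINE_SIZE 128    /* stand-in for PG_CACHE_LINE_SIZE */

typedef struct
{
    uint64_t    lock_word;          /* stand-in for the LWLock itself */
    uint64_t    inserting_at;       /* progress indicator */
    uint64_t    last_important_at;  /* LSN of last important record */
} DemoInsertLock;

typedef union
{
    DemoInsertLock l;
    char        pad[DEMO_CACHE_LINE_SIZE];
} DemoInsertLockPadded;

int
main(void)
{
    /* The union forces every array element onto its own cache line (given a
     * suitably aligned base), so two processes hammering on different slots
     * never share a line. */
    assert(sizeof(DemoInsertLockPadded) == DEMO_CACHE_LINE_SIZE);
    printf("slot stride = %zu bytes\n", sizeof(DemoInsertLockPadded));
    return 0;
}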
xlblocks values are protected by + * WALBufMappingLock. + */ + char *pages; /* buffers for unwritten XLOG pages */ + XLogRecPtr *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */ + int XLogCacheBlck; /* highest allocated xlog buffer index */ + + /* + * InsertTimeLineID is the timeline into which new WAL is being inserted + * and flushed. It is zero during recovery, and does not change once set. + * + * If we create a new timeline when the system was started up, + * PrevTimeLineID is the old timeline's ID that we forked off from. + * Otherwise it's equal to InsertTimeLineID. + */ + TimeLineID InsertTimeLineID; + TimeLineID PrevTimeLineID; + + /* + * SharedRecoveryState indicates if we're still in crash or archive + * recovery. Protected by info_lck. + */ + RecoveryState SharedRecoveryState; + + /* + * InstallXLogFileSegmentActive indicates whether the checkpointer should + * arrange for future segments by recycling and/or PreallocXlogFiles(). + * Protected by ControlFileLock. Only the startup process changes it. If + * true, anyone can use InstallXLogFileSegment(). If false, the startup + * process owns the exclusive right to install segments, by reading from + * the archive and possibly replacing existing files. + */ + bool InstallXLogFileSegmentActive; + + /* + * WalWriterSleeping indicates whether the WAL writer is currently in + * low-power mode (and hence should be nudged if an async commit occurs). + * Protected by info_lck. + */ + bool WalWriterSleeping; + + /* + * During recovery, we keep a copy of the latest checkpoint record here. + * lastCheckPointRecPtr points to start of checkpoint record and + * lastCheckPointEndPtr points to end+1 of checkpoint record. Used by the + * checkpointer when it wants to create a restartpoint. + * + * Protected by info_lck. + */ + XLogRecPtr lastCheckPointRecPtr; + XLogRecPtr lastCheckPointEndPtr; + CheckPoint lastCheckPoint; + + /* + * lastFpwDisableRecPtr points to the start of the last replayed + * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled. + */ + XLogRecPtr lastFpwDisableRecPtr; + + slock_t info_lck; /* locks shared variables shown above */ +} XLogCtlData; + +static XLogCtlData *XLogCtl = NULL; + +/* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */ +static WALInsertLockPadded *WALInsertLocks = NULL; + +/* + * We maintain an image of pg_control in shared memory. + */ +static ControlFileData *ControlFile = NULL; + +/* + * Calculate the amount of space left on the page after 'endptr'. Beware + * multiple evaluation! + */ +#define INSERT_FREESPACE(endptr) \ + (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ)) + +/* Macro to advance to next buffer index. */ +#define NextBufIdx(idx) \ + (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1)) + +/* + * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or + * would hold if it was in cache, the page containing 'recptr'. + */ +#define XLogRecPtrToBufIdx(recptr) \ + (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1)) + +/* + * These are the number of bytes in a WAL page usable for WAL data. + */ +#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD) + +/* + * Convert values of GUCs measured in megabytes to equiv. segment count. + * Rounds down. + */ +#define ConvertToXSegs(x, segsize) XLogMBVarToSegs((x), (segsize)) + +/* The number of bytes in a WAL segment usable for WAL data. */ +static int UsableBytesInSegment; + +/* + * Private, possibly out-of-date copy of shared LogwrtResult. + * See discussion above. 
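As a standalone illustration of the INSERT_FREESPACE and XLogRecPtrToBufIdx arithmetic above (assumed 8 kB pages and a made-up buffer count; not part of xlog.c): every WAL page has exactly one buffer slot it can occupy, so the slot and the remaining space on the page follow directly from the LSN.

#include <stdint.h>
#include <stdio.h>

#define DEMO_BLCKSZ     8192    /* stand-in for XLOG_BLCKSZ */
#define DEMO_NBUFFERS   512     /* stand-in for XLogCacheBlck + 1 */

int
main(void)
{
    uint64_t    recptr = 81930000;  /* arbitrary example LSN */
    uint64_t    idx = (recptr / DEMO_BLCKSZ) % DEMO_NBUFFERS;
    uint64_t    freespace = (recptr % DEMO_BLCKSZ == 0) ?
        0 : DEMO_BLCKSZ - recptr % DEMO_BLCKSZ;

    /* The page is found by dividing out the block size and wrapping around
     * the buffer array; free space is whatever is left after this offset. */
    printf("buffer index %llu, free space %llu bytes\n",
           (unsigned long long) idx, (unsigned long long) freespace);
    return 0;
}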
+ */ +static XLogwrtResult LogwrtResult = {0, 0}; + +/* + * openLogFile is -1 or a kernel FD for an open log file segment. + * openLogSegNo identifies the segment, and openLogTLI the corresponding TLI. + * These variables are only used to write the XLOG, and so will normally refer + * to the active segment. + * + * Note: call Reserve/ReleaseExternalFD to track consumption of this FD. + */ +static int openLogFile = -1; +static XLogSegNo openLogSegNo = 0; +static TimeLineID openLogTLI = 0; + +/* + * Local copies of equivalent fields in the control file. When running + * crash recovery, LocalMinRecoveryPoint is set to InvalidXLogRecPtr as we + * expect to replay all the WAL available, and updateMinRecoveryPoint is + * switched to false to prevent any updates while replaying records. + * Those values are kept consistent as long as crash recovery runs. + */ +static XLogRecPtr LocalMinRecoveryPoint; +static TimeLineID LocalMinRecoveryPointTLI; +static bool updateMinRecoveryPoint = true; + +/* For WALInsertLockAcquire/Release functions */ +static int MyLockNo = 0; +static bool holdingAllLocks = false; + +#ifdef WAL_DEBUG +static MemoryContext walDebugCxt = NULL; +#endif + +static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, + XLogRecPtr EndOfLog, + TimeLineID newTLI); +static void CheckRequiredParameterValues(void); +static void XLogReportParameters(void); +static int LocalSetXLogInsertAllowed(void); +static void CreateEndOfRecoveryRecord(void); +static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, + XLogRecPtr missingContrecPtr, + TimeLineID newTLI); +static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags); +static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo); +static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void); + +static void AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, + bool opportunistic); +static void XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible); +static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, + bool find_free, XLogSegNo max_segno, + TimeLineID tli); +static void XLogFileClose(void); +static void PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli); +static void RemoveTempXlogFiles(void); +static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, + XLogRecPtr endptr, TimeLineID insertTLI); +static void RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo, + XLogSegNo *endlogSegNo, TimeLineID insertTLI); +static void UpdateLastRemovedPtr(char *filename); +static void ValidateXLOGDirectoryStructure(void); +static void CleanupBackupHistory(void); +static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force); +static bool PerformRecoveryXLogAction(void); +static void InitControlFile(uint64 sysidentifier); +static void WriteControlFile(void); +static void ReadControlFile(void); +static void UpdateControlFile(void); +static char *str_time(pg_time_t tnow); + +static void pg_backup_start_callback(int code, Datum arg); + +static int get_sync_bit(int method); + +static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch, + XLogRecData *rdata, + XLogRecPtr StartPos, XLogRecPtr EndPos, + TimeLineID tli); +static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, + XLogRecPtr *EndPos, XLogRecPtr *PrevPtr); +static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, + XLogRecPtr *PrevPtr); +static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto); +static char *GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli); +static XLogRecPtr 
XLogBytePosToRecPtr(uint64 bytepos); +static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos); +static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr); + +static void WALInsertLockAcquire(void); +static void WALInsertLockAcquireExclusive(void); +static void WALInsertLockRelease(void); +static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); + +/* + * Insert an XLOG record represented by an already-constructed chain of data + * chunks. This is a low-level routine; to construct the WAL record header + * and data, use the higher-level routines in xloginsert.c. + * + * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this + * WAL record applies to, that were not included in the record as full page + * images. If fpw_lsn <= RedoRecPtr, the function does not perform the + * insertion and returns InvalidXLogRecPtr. The caller can then recalculate + * which pages need a full-page image, and retry. If fpw_lsn is invalid, the + * record is always inserted. + * + * 'flags' gives more in-depth control on the record being inserted. See + * XLogSetRecordFlags() for details. + * + * 'topxid_included' tells whether the top-transaction id is logged along with + * current subtransaction. See XLogRecordAssemble(). + * + * The first XLogRecData in the chain must be for the record header, and its + * data must be MAXALIGNed. XLogInsertRecord fills in the xl_prev and + * xl_crc fields in the header, the rest of the header must already be filled + * by the caller. + * + * Returns XLOG pointer to end of record (beginning of next record). + * This can be used as LSN for data pages affected by the logged action. + * (LSN is the XLOG point up to which the XLOG must be flushed to disk + * before the data page can be written out. This implements the basic + * WAL rule "write the log before the data".) + */ +XLogRecPtr +XLogInsertRecord(XLogRecData *rdata, + XLogRecPtr fpw_lsn, + uint8 flags, + int num_fpi, + bool topxid_included) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + pg_crc32c rdata_crc; + bool inserted; + XLogRecord *rechdr = (XLogRecord *) rdata->data; + uint8 info = rechdr->xl_info & ~XLR_INFO_MASK; + bool isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID && + info == XLOG_SWITCH); + XLogRecPtr StartPos; + XLogRecPtr EndPos; + bool prevDoPageWrites = doPageWrites; + TimeLineID insertTLI; + + /* we assume that all of the record header is in the first chunk */ + Assert(rdata->len >= SizeOfXLogRecord); + + /* cross-check on whether we should be here or not */ + if (!XLogInsertAllowed()) + elog(ERROR, "cannot make new WAL entries during recovery"); + + /* + * Given that we're not in recovery, InsertTimeLineID is set and can't + * change, so we can read it without a lock. + */ + insertTLI = XLogCtl->InsertTimeLineID; + + /*---------- + * + * We have now done all the preparatory work we can without holding a + * lock or modifying shared state. From here on, inserting the new WAL + * record to the shared WAL buffer cache is a two-step process: + * + * 1. Reserve the right amount of space from the WAL. The current head of + * reserved space is kept in Insert->CurrBytePos, and is protected by + * insertpos_lck. + * + * 2. Copy the record to the reserved WAL space. This involves finding the + * correct WAL buffer containing the reserved space, and copying the + * record in place. This can be done concurrently in multiple processes. + * + * To keep track of which insertions are still in-progress, each concurrent + * inserter acquires an insertion lock. 
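A standalone sketch of the fpw_lsn retry contract described at the top of this comment, with made-up LSNs and hypothetical demo_* names standing in for the real machinery: when the insert routine bails out because a full-page image is now required, the caller recomputes its images and tries again.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t DemoRecPtr;
#define DEMO_INVALID_PTR ((DemoRecPtr) 0)

static DemoRecPtr demo_redo_ptr = 1000;     /* pretend checkpoint REDO LSN */

/* Pretend insert routine: refuses when the caller decided against a
 * full-page image based on a now-stale redo pointer. */
static DemoRecPtr
demo_insert(DemoRecPtr fpw_lsn)
{
    if (fpw_lsn != DEMO_INVALID_PTR && fpw_lsn <= demo_redo_ptr)
        return DEMO_INVALID_PTR;
    return 2000;                            /* pretend end-of-record LSN */
}

int
main(void)
{
    DemoRecPtr  fpw_lsn = 900;              /* oldest page LSN lacking an FPI */
    DemoRecPtr  end;

    /* Caller-side contract: on an invalid result, recompute which pages need
     * full-page images and retry the insertion. */
    while ((end = demo_insert(fpw_lsn)) == DEMO_INVALID_PTR)
        fpw_lsn = DEMO_INVALID_PTR;         /* pretend the FPIs were added */

    printf("record ends at %llu\n", (unsigned long long) end);
    return 0;
}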
In addition to just indicating that + * an insertion is in progress, the lock tells others how far the inserter + * has progressed. There is a small fixed number of insertion locks, + * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page + * boundary, it updates the value stored in the lock to the how far it has + * inserted, to allow the previous buffer to be flushed. + * + * Holding onto an insertion lock also protects RedoRecPtr and + * fullPageWrites from changing until the insertion is finished. + * + * Step 2 can usually be done completely in parallel. If the required WAL + * page is not initialized yet, you have to grab WALBufMappingLock to + * initialize it, but the WAL writer tries to do that ahead of insertions + * to avoid that from happening in the critical path. + * + *---------- + */ + START_CRIT_SECTION(); + if (isLogSwitch) + WALInsertLockAcquireExclusive(); + else + WALInsertLockAcquire(); + + /* + * Check to see if my copy of RedoRecPtr is out of date. If so, may have + * to go back and have the caller recompute everything. This can only + * happen just after a checkpoint, so it's better to be slow in this case + * and fast otherwise. + * + * Also check to see if fullPageWrites or forcePageWrites was just turned + * on; if we weren't already doing full-page writes then go back and + * recompute. + * + * If we aren't doing full-page writes then RedoRecPtr doesn't actually + * affect the contents of the XLOG record, so we'll update our local copy + * but not force a recomputation. (If doPageWrites was just turned off, + * we could recompute the record without full pages, but we choose not to + * bother.) + */ + if (RedoRecPtr != Insert->RedoRecPtr) + { + Assert(RedoRecPtr < Insert->RedoRecPtr); + RedoRecPtr = Insert->RedoRecPtr; + } + doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites); + + if (doPageWrites && + (!prevDoPageWrites || + (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr))) + { + /* + * Oops, some buffer now needs to be backed up that the caller didn't + * back up. Start over. + */ + WALInsertLockRelease(); + END_CRIT_SECTION(); + return InvalidXLogRecPtr; + } + + /* + * Reserve space for the record in the WAL. This also sets the xl_prev + * pointer. + */ + if (isLogSwitch) + inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev); + else + { + ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos, + &rechdr->xl_prev); + inserted = true; + } + + if (inserted) + { + /* + * Now that xl_prev has been filled in, calculate CRC of the record + * header. + */ + rdata_crc = rechdr->xl_crc; + COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc)); + FIN_CRC32C(rdata_crc); + rechdr->xl_crc = rdata_crc; + + /* + * All the record data, including the header, is now ready to be + * inserted. Copy the record in the space reserved. + */ + CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata, + StartPos, EndPos, insertTLI); + + /* + * Unless record is flagged as not important, update LSN of last + * important record in the current slot. When holding all locks, just + * update the first one. + */ + if ((flags & XLOG_MARK_UNIMPORTANT) == 0) + { + int lockno = holdingAllLocks ? 0 : MyLockNo; + + WALInsertLocks[lockno].l.lastImportantAt = StartPos; + } + } + else + { + /* + * This was an xlog-switch record, but the current insert location was + * already exactly at the beginning of a segment, so there was no need + * to do anything. + */ + } + + /* + * Done! Let others know that we're finished. 
+ */ + WALInsertLockRelease(); + + END_CRIT_SECTION(); + + MarkCurrentTransactionIdLoggedIfAny(); + + /* + * Mark top transaction id is logged (if needed) so that we should not try + * to log it again with the next WAL record in the current subtransaction. + */ + if (topxid_included) + MarkSubxactTopXidLogged(); + + /* + * Update shared LogwrtRqst.Write, if we crossed page boundary. + */ + if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ) + { + SpinLockAcquire(&XLogCtl->info_lck); + /* advance global request to include new block(s) */ + if (XLogCtl->LogwrtRqst.Write < EndPos) + XLogCtl->LogwrtRqst.Write = EndPos; + /* update local result copy while I have the chance */ + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + } + + /* + * If this was an XLOG_SWITCH record, flush the record and the empty + * padding space that fills the rest of the segment, and perform + * end-of-segment actions (eg, notifying archiver). + */ + if (isLogSwitch) + { + TRACE_POSTGRESQL_WAL_SWITCH(); + XLogFlush(EndPos); + + /* + * Even though we reserved the rest of the segment for us, which is + * reflected in EndPos, we return a pointer to just the end of the + * xlog-switch record. + */ + if (inserted) + { + EndPos = StartPos + SizeOfXLogRecord; + if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ) + { + uint64 offset = XLogSegmentOffset(EndPos, wal_segment_size); + + if (offset == EndPos % XLOG_BLCKSZ) + EndPos += SizeOfXLogLongPHD; + else + EndPos += SizeOfXLogShortPHD; + } + } + } + +#ifdef WAL_DEBUG + if (XLOG_DEBUG) + { + static XLogReaderState *debug_reader = NULL; + XLogRecord *record; + DecodedXLogRecord *decoded; + StringInfoData buf; + StringInfoData recordBuf; + char *errormsg = NULL; + MemoryContext oldCxt; + + oldCxt = MemoryContextSwitchTo(walDebugCxt); + + initStringInfo(&buf); + appendStringInfo(&buf, "INSERT @ %X/%X: ", LSN_FORMAT_ARGS(EndPos)); + + /* + * We have to piece together the WAL record data from the XLogRecData + * entries, so that we can pass it to the rm_desc function as one + * contiguous chunk. + */ + initStringInfo(&recordBuf); + for (; rdata != NULL; rdata = rdata->next) + appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len); + + /* We also need temporary space to decode the record. */ + record = (XLogRecord *) recordBuf.data; + decoded = (DecodedXLogRecord *) + palloc(DecodeXLogRecordRequiredSpace(record->xl_tot_len)); + + if (!debug_reader) + debug_reader = XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(), NULL); + + if (!debug_reader) + { + appendStringInfoString(&buf, "error decoding record: out of memory while allocating a WAL reading processor"); + } + else if (!DecodeXLogRecord(debug_reader, + decoded, + record, + EndPos, + &errormsg)) + { + appendStringInfo(&buf, "error decoding record: %s", + errormsg ? errormsg : "no error message"); + } + else + { + appendStringInfoString(&buf, " - "); + + debug_reader->record = decoded; + xlog_outdesc(&buf, debug_reader); + debug_reader->record = NULL; + } + elog(LOG, "%s", buf.data); + + pfree(decoded); + pfree(buf.data); + pfree(recordBuf.data); + MemoryContextSwitchTo(oldCxt); + } +#endif + + /* + * Update our global variables + */ + ProcLastRecPtr = StartPos; + XactLastRecEnd = EndPos; + + /* Report WAL traffic to the instrumentation. */ + if (inserted) + { + pgWalUsage.wal_bytes += rechdr->xl_tot_len; + pgWalUsage.wal_records++; + pgWalUsage.wal_fpi += num_fpi; + } + + return EndPos; +} + +/* + * Reserves the right amount of space for a record of given size from the WAL. 
+ * *StartPos is set to the beginning of the reserved section, *EndPos to + * its end+1. *PrevPtr is set to the beginning of the previous record; it is + * used to set the xl_prev of this record. + * + * This is the performance critical part of XLogInsert that must be serialized + * across backends. The rest can happen mostly in parallel. Try to keep this + * section as short as possible, insertpos_lck can be heavily contended on a + * busy system. + * + * NB: The space calculation here must match the code in CopyXLogRecordToWAL, + * where we actually copy the record to the reserved space. + */ +static void +ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos, + XLogRecPtr *PrevPtr) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + uint64 startbytepos; + uint64 endbytepos; + uint64 prevbytepos; + + size = MAXALIGN(size); + + /* All (non xlog-switch) records should contain data. */ + Assert(size > SizeOfXLogRecord); + + /* + * The duration the spinlock needs to be held is minimized by minimizing + * the calculations that have to be done while holding the lock. The + * current tip of reserved WAL is kept in CurrBytePos, as a byte position + * that only counts "usable" bytes in WAL, that is, it excludes all WAL + * page headers. The mapping between "usable" byte positions and physical + * positions (XLogRecPtrs) can be done outside the locked region, and + * because the usable byte position doesn't include any headers, reserving + * X bytes from WAL is almost as simple as "CurrBytePos += X". + */ + SpinLockAcquire(&Insert->insertpos_lck); + + startbytepos = Insert->CurrBytePos; + endbytepos = startbytepos + size; + prevbytepos = Insert->PrevBytePos; + Insert->CurrBytePos = endbytepos; + Insert->PrevBytePos = startbytepos; + + SpinLockRelease(&Insert->insertpos_lck); + + *StartPos = XLogBytePosToRecPtr(startbytepos); + *EndPos = XLogBytePosToEndRecPtr(endbytepos); + *PrevPtr = XLogBytePosToRecPtr(prevbytepos); + + /* + * Check that the conversions between "usable byte positions" and + * XLogRecPtrs work consistently in both directions. + */ + Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos); + Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos); + Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos); +} + +/* + * Like ReserveXLogInsertLocation(), but for an xlog-switch record. + * + * A log-switch record is handled slightly differently. The rest of the + * segment will be reserved for this insertion, as indicated by the returned + * *EndPos value. However, if we are already at the beginning of the current + * segment, *StartPos and *EndPos are set to the current location without + * reserving any space, and the function returns false. +*/ +static bool +ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + uint64 startbytepos; + uint64 endbytepos; + uint64 prevbytepos; + uint32 size = MAXALIGN(SizeOfXLogRecord); + XLogRecPtr ptr; + uint32 segleft; + + /* + * These calculations are a bit heavy-weight to be done while holding a + * spinlock, but since we're holding all the WAL insertion locks, there + * are no other inserters competing for it. GetXLogInsertRecPtr() does + * compete for it, but that's not called very frequently. 
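The heart of ReserveXLogInsertLocation above is just advancing a byte counter inside a very short critical section. A standalone sketch of that pattern, with a pthread mutex standing in for the spinlock and made-up byte positions (hypothetical names, not the real data structures):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t insertpos_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t curr_byte_pos = 512;    /* running tip of reserved WAL */
static uint64_t prev_byte_pos = 448;    /* start of the previous record */

static void
demo_reserve(uint64_t size, uint64_t *start, uint64_t *end, uint64_t *prev)
{
    pthread_mutex_lock(&insertpos_lock);
    *start = curr_byte_pos;
    *end = curr_byte_pos + size;
    *prev = prev_byte_pos;
    curr_byte_pos = *end;       /* reserve the range */
    prev_byte_pos = *start;     /* this record becomes the "previous" one */
    pthread_mutex_unlock(&insertpos_lock);
}

int
main(void)
{
    uint64_t    s, e, p;

    demo_reserve(64, &s, &e, &p);
    printf("reserved [%llu, %llu), prev record at %llu\n",
           (unsigned long long) s, (unsigned long long) e,
           (unsigned long long) p);
    return 0;
}

Everything else, converting the byte positions to physical LSNs and copying the data, can then happen outside the critical section.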
+ */ + SpinLockAcquire(&Insert->insertpos_lck); + + startbytepos = Insert->CurrBytePos; + + ptr = XLogBytePosToEndRecPtr(startbytepos); + if (XLogSegmentOffset(ptr, wal_segment_size) == 0) + { + SpinLockRelease(&Insert->insertpos_lck); + *EndPos = *StartPos = ptr; + return false; + } + + endbytepos = startbytepos + size; + prevbytepos = Insert->PrevBytePos; + + *StartPos = XLogBytePosToRecPtr(startbytepos); + *EndPos = XLogBytePosToEndRecPtr(endbytepos); + + segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size); + if (segleft != wal_segment_size) + { + /* consume the rest of the segment */ + *EndPos += segleft; + endbytepos = XLogRecPtrToBytePos(*EndPos); + } + Insert->CurrBytePos = endbytepos; + Insert->PrevBytePos = startbytepos; + + SpinLockRelease(&Insert->insertpos_lck); + + *PrevPtr = XLogBytePosToRecPtr(prevbytepos); + + Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0); + Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos); + Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos); + Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos); + + return true; +} + +/* + * Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved + * area in the WAL. + */ +static void +CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, + XLogRecPtr StartPos, XLogRecPtr EndPos, TimeLineID tli) +{ + char *currpos; + int freespace; + int written; + XLogRecPtr CurrPos; + XLogPageHeader pagehdr; + + /* + * Get a pointer to the right place in the right WAL buffer to start + * inserting to. + */ + CurrPos = StartPos; + currpos = GetXLogBuffer(CurrPos, tli); + freespace = INSERT_FREESPACE(CurrPos); + + /* + * there should be enough space for at least the first field (xl_tot_len) + * on this page. + */ + Assert(freespace >= sizeof(uint32)); + + /* Copy record data */ + written = 0; + while (rdata != NULL) + { + char *rdata_data = rdata->data; + int rdata_len = rdata->len; + + while (rdata_len > freespace) + { + /* + * Write what fits on this page, and continue on the next page. + */ + Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0); + memcpy(currpos, rdata_data, freespace); + rdata_data += freespace; + rdata_len -= freespace; + written += freespace; + CurrPos += freespace; + + /* + * Get pointer to beginning of next page, and set the xlp_rem_len + * in the page header. Set XLP_FIRST_IS_CONTRECORD. + * + * It's safe to set the contrecord flag and xlp_rem_len without a + * lock on the page. All the other flags were already set when the + * page was initialized, in AdvanceXLInsertBuffer, and we're the + * only backend that needs to set the contrecord flag. 
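A standalone arithmetic sketch of the continuation-record bookkeeping handled here: when a record spills past the end of a page, each following page's header records how many bytes of the record are still to come (xlp_rem_len). The block and header sizes below are assumed example values, not the real constants.

#include <stdint.h>
#include <stdio.h>

#define DEMO_BLCKSZ     8192    /* assumed page size */
#define DEMO_SHORT_PHD  24      /* assumed short page header size */

int
main(void)
{
    uint32_t    total_len = 3000;   /* whole record */
    uint32_t    freespace = 1800;   /* room left on the current page */
    uint32_t    written = 0;

    while (total_len - written > freespace)
    {
        written += freespace;       /* fill the current page */
        /* The next page starts with a continuation header whose xlp_rem_len
         * says how much of the record is still to come. */
        printf("new page: xlp_rem_len = %u\n", total_len - written);
        freespace = DEMO_BLCKSZ - DEMO_SHORT_PHD;
    }
    printf("final page holds the last %u bytes\n", total_len - written);
    return 0;
}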
+ */ + currpos = GetXLogBuffer(CurrPos, tli); + pagehdr = (XLogPageHeader) currpos; + pagehdr->xlp_rem_len = write_len - written; + pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD; + + /* skip over the page header */ + if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0) + { + CurrPos += SizeOfXLogLongPHD; + currpos += SizeOfXLogLongPHD; + } + else + { + CurrPos += SizeOfXLogShortPHD; + currpos += SizeOfXLogShortPHD; + } + freespace = INSERT_FREESPACE(CurrPos); + } + + Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0); + memcpy(currpos, rdata_data, rdata_len); + currpos += rdata_len; + CurrPos += rdata_len; + freespace -= rdata_len; + written += rdata_len; + + rdata = rdata->next; + } + Assert(written == write_len); + + /* + * If this was an xlog-switch, it's not enough to write the switch record, + * we also have to consume all the remaining space in the WAL segment. We + * have already reserved that space, but we need to actually fill it. + */ + if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0) + { + /* An xlog-switch record doesn't contain any data besides the header */ + Assert(write_len == SizeOfXLogRecord); + + /* Assert that we did reserve the right amount of space */ + Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0); + + /* Use up all the remaining space on the current page */ + CurrPos += freespace; + + /* + * Cause all remaining pages in the segment to be flushed, leaving the + * XLog position where it should be, at the start of the next segment. + * We do this one page at a time, to make sure we don't deadlock + * against ourselves if wal_buffers < wal_segment_size. + */ + while (CurrPos < EndPos) + { + /* + * The minimal action to flush the page would be to call + * WALInsertLockUpdateInsertingAt(CurrPos) followed by + * AdvanceXLInsertBuffer(...). The page would be left initialized + * mostly to zeros, except for the page header (always the short + * variant, as this is never a segment's first page). + * + * The large vistas of zeros are good for compressibility, but the + * headers interrupting them every XLOG_BLCKSZ (with values that + * differ from page to page) are not. The effect varies with + * compression tool, but bzip2 for instance compresses about an + * order of magnitude worse if those headers are left in place. + * + * Rather than complicating AdvanceXLInsertBuffer itself (which is + * called in heavily-loaded circumstances as well as this lightly- + * loaded one) with variant behavior, we just use GetXLogBuffer + * (which itself calls the two methods we need) to get the pointer + * and zero most of the page. Then we just zero the page header. + */ + currpos = GetXLogBuffer(CurrPos, tli); + MemSet(currpos, 0, SizeOfXLogShortPHD); + + CurrPos += XLOG_BLCKSZ; + } + } + else + { + /* Align the end position, so that the next record starts aligned */ + CurrPos = MAXALIGN64(CurrPos); + } + + if (CurrPos != EndPos) + elog(PANIC, "space reserved for WAL record does not match what was written"); +} + +/* + * Acquire a WAL insertion lock, for inserting to WAL. + */ +static void +WALInsertLockAcquire(void) +{ + bool immed; + + /* + * It doesn't matter which of the WAL insertion locks we acquire, so try + * the one we used last time. If the system isn't particularly busy, it's + * a good bet that it's still available, and it's good to have some + * affinity to a particular lock so that you don't unnecessarily bounce + * cache lines between processes when there's no contention. 
+ * + * If this is the first time through in this backend, pick a lock + * (semi-)randomly. This allows the locks to be used evenly if you have a + * lot of very short connections. + */ + static int lockToTry = -1; + + if (lockToTry == -1) + lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS; + MyLockNo = lockToTry; + + /* + * The insertingAt value is initially set to 0, as we don't know our + * insert location yet. + */ + immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE); + if (!immed) + { + /* + * If we couldn't get the lock immediately, try another lock next + * time. On a system with more insertion locks than concurrent + * inserters, this causes all the inserters to eventually migrate to a + * lock that no-one else is using. On a system with more inserters + * than locks, it still helps to distribute the inserters evenly + * across the locks. + */ + lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS; + } +} + +/* + * Acquire all WAL insertion locks, to prevent other backends from inserting + * to WAL. + */ +static void +WALInsertLockAcquireExclusive(void) +{ + int i; + + /* + * When holding all the locks, all but the last lock's insertingAt + * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real + * XLogRecPtr value, to make sure that no-one blocks waiting on those. + */ + for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++) + { + LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE); + LWLockUpdateVar(&WALInsertLocks[i].l.lock, + &WALInsertLocks[i].l.insertingAt, + PG_UINT64_MAX); + } + /* Variable value reset to 0 at release */ + LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE); + + holdingAllLocks = true; +} + +/* + * Release our insertion lock (or locks, if we're holding them all). + * + * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the + * next time the lock is acquired. + */ +static void +WALInsertLockRelease(void) +{ + if (holdingAllLocks) + { + int i; + + for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) + LWLockReleaseClearVar(&WALInsertLocks[i].l.lock, + &WALInsertLocks[i].l.insertingAt, + 0); + + holdingAllLocks = false; + } + else + { + LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock, + &WALInsertLocks[MyLockNo].l.insertingAt, + 0); + } +} + +/* + * Update our insertingAt value, to let others know that we've finished + * inserting up to that point. + */ +static void +WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt) +{ + if (holdingAllLocks) + { + /* + * We use the last lock to mark our actual position, see comments in + * WALInsertLockAcquireExclusive. + */ + LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock, + &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt, + insertingAt); + } + else + LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock, + &WALInsertLocks[MyLockNo].l.insertingAt, + insertingAt); +} + +/* + * Wait for any WAL insertions < upto to finish. + * + * Returns the location of the oldest insertion that is still in-progress. + * Any WAL prior to that point has been fully copied into WAL buffers, and + * can be flushed out to disk. Because this waits for any insertions older + * than 'upto' to finish, the return value is always >= 'upto'. + * + * Note: When you are about to write out WAL, you must call this function + * *before* acquiring WALWriteLock, to avoid deadlocks. 
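WALInsertLockAcquire above keeps some affinity to the last lock it used and only migrates when that lock turns out to be busy. A standalone sketch of the same policy, using pthread mutexes and hypothetical names (the real code uses LWLocks and MyProc->pgprocno):

#include <pthread.h>
#include <stdio.h>

#define DEMO_NLOCKS 8

static pthread_mutex_t pool[DEMO_NLOCKS];
static int  lock_to_try = -1;   /* per-backend state in the real thing */

static int
demo_lock_acquire(int backend_id)
{
    int         slot;

    if (lock_to_try == -1)
        lock_to_try = backend_id % DEMO_NLOCKS; /* first time: spread out */
    slot = lock_to_try;

    if (pthread_mutex_trylock(&pool[slot]) != 0)
    {
        /* Contended: wait for it this time, but prefer a different slot next
         * time so concurrent inserters drift apart across the pool. */
        pthread_mutex_lock(&pool[slot]);
        lock_to_try = (lock_to_try + 1) % DEMO_NLOCKS;
    }
    return slot;
}

int
main(void)
{
    int         i, slot;

    for (i = 0; i < DEMO_NLOCKS; i++)
        pthread_mutex_init(&pool[i], NULL);

    slot = demo_lock_acquire(1234);
    printf("acquired slot %d\n", slot);
    pthread_mutex_unlock(&pool[slot]);
    return 0;
}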
This function might + * need to wait for an insertion to finish (or at least advance to next + * uninitialized page), and the inserter might need to evict an old WAL buffer + * to make room for a new one, which in turn requires WALWriteLock. + */ +static XLogRecPtr +WaitXLogInsertionsToFinish(XLogRecPtr upto) +{ + uint64 bytepos; + XLogRecPtr reservedUpto; + XLogRecPtr finishedUpto; + XLogCtlInsert *Insert = &XLogCtl->Insert; + int i; + + if (MyProc == NULL) + elog(PANIC, "cannot wait without a PGPROC structure"); + + /* Read the current insert position */ + SpinLockAcquire(&Insert->insertpos_lck); + bytepos = Insert->CurrBytePos; + SpinLockRelease(&Insert->insertpos_lck); + reservedUpto = XLogBytePosToEndRecPtr(bytepos); + + /* + * No-one should request to flush a piece of WAL that hasn't even been + * reserved yet. However, it can happen if there is a block with a bogus + * LSN on disk, for example. XLogFlush checks for that situation and + * complains, but only after the flush. Here we just assume that to mean + * that all WAL that has been reserved needs to be finished. In this + * corner-case, the return value can be smaller than 'upto' argument. + */ + if (upto > reservedUpto) + { + ereport(LOG, + (errmsg("request to flush past end of generated WAL; request %X/%X, current position %X/%X", + LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto)))); + upto = reservedUpto; + } + + /* + * Loop through all the locks, sleeping on any in-progress insert older + * than 'upto'. + * + * finishedUpto is our return value, indicating the point upto which all + * the WAL insertions have been finished. Initialize it to the head of + * reserved WAL, and as we iterate through the insertion locks, back it + * out for any insertion that's still in progress. + */ + finishedUpto = reservedUpto; + for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) + { + XLogRecPtr insertingat = InvalidXLogRecPtr; + + do + { + /* + * See if this insertion is in progress. LWLockWaitForVar will + * wait for the lock to be released, or for the 'value' to be set + * by a LWLockUpdateVar call. When a lock is initially acquired, + * its value is 0 (InvalidXLogRecPtr), which means that we don't + * know where it's inserting yet. We will have to wait for it. If + * it's a small insertion, the record will most likely fit on the + * same page and the inserter will release the lock without ever + * calling LWLockUpdateVar. But if it has to sleep, it will + * advertise the insertion point with LWLockUpdateVar before + * sleeping. + */ + if (LWLockWaitForVar(&WALInsertLocks[i].l.lock, + &WALInsertLocks[i].l.insertingAt, + insertingat, &insertingat)) + { + /* the lock was free, so no insertion in progress */ + insertingat = InvalidXLogRecPtr; + break; + } + + /* + * This insertion is still in progress. Have to wait, unless the + * inserter has proceeded past 'upto'. + */ + } while (insertingat < upto); + + if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto) + finishedUpto = insertingat; + } + return finishedUpto; +} + +/* + * Get a pointer to the right location in the WAL buffer containing the + * given XLogRecPtr. + * + * If the page is not initialized yet, it is initialized. That might require + * evicting an old dirty buffer from the buffer cache, which means I/O. + * + * The caller must ensure that the page containing the requested location + * isn't evicted yet, and won't be evicted. The way to ensure that is to + * hold onto a WAL insertion lock with the insertingAt position set to + * something <= ptr. 
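Leaving the waiting aside, the core calculation in WaitXLogInsertionsToFinish above is a clamp: start from the reserved tip and pull it back to the oldest advertised in-progress insertion. A standalone sketch with made-up positions (a zero entry stands for "no insertion in progress on this lock"):

#include <stdint.h>
#include <stdio.h>

#define DEMO_NLOCKS  8
#define DEMO_INVALID 0          /* stand-in for InvalidXLogRecPtr */

int
main(void)
{
    /* Advertised insertingAt per slot; 0 means no insertion in progress. */
    uint64_t    inserting_at[DEMO_NLOCKS] = {0, 5200, 0, 4800, 0, 0, 6100, 0};
    uint64_t    reserved_upto = 7000;   /* tip of reserved WAL */
    uint64_t    finished_upto = reserved_upto;
    int         i;

    for (i = 0; i < DEMO_NLOCKS; i++)
    {
        if (inserting_at[i] != DEMO_INVALID &&
            inserting_at[i] < finished_upto)
            finished_upto = inserting_at[i];
    }
    /* Everything before finished_upto is fully copied into buffers and is
     * safe to write out; with these numbers that is 4800. */
    printf("can flush up to %llu\n", (unsigned long long) finished_upto);
    return 0;
}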
GetXLogBuffer() will update insertingAt if it needs + * to evict an old page from the buffer. (This means that once you call + * GetXLogBuffer() with a given 'ptr', you must not access anything before + * that point anymore, and must not call GetXLogBuffer() with an older 'ptr' + * later, because older buffers might be recycled already) + */ +static char * +GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli) +{ + int idx; + XLogRecPtr endptr; + static uint64 cachedPage = 0; + static char *cachedPos = NULL; + XLogRecPtr expectedEndPtr; + + /* + * Fast path for the common case that we need to access again the same + * page as last time. + */ + if (ptr / XLOG_BLCKSZ == cachedPage) + { + Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC); + Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ)); + return cachedPos + ptr % XLOG_BLCKSZ; + } + + /* + * The XLog buffer cache is organized so that a page is always loaded to a + * particular buffer. That way we can easily calculate the buffer a given + * page must be loaded into, from the XLogRecPtr alone. + */ + idx = XLogRecPtrToBufIdx(ptr); + + /* + * See what page is loaded in the buffer at the moment. It could be the + * page we're looking for, or something older. It can't be anything newer + * - that would imply the page we're looking for has already been written + * out to disk and evicted, and the caller is responsible for making sure + * that doesn't happen. + * + * However, we don't hold a lock while we read the value. If someone has + * just initialized the page, it's possible that we get a "torn read" of + * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In + * that case we will see a bogus value. That's ok, we'll grab the mapping + * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than + * the page we're looking for. But it means that when we do this unlocked + * read, we might see a value that appears to be ahead of the page we're + * looking for. Don't PANIC on that, until we've verified the value while + * holding the lock. + */ + expectedEndPtr = ptr; + expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ; + + endptr = XLogCtl->xlblocks[idx]; + if (expectedEndPtr != endptr) + { + XLogRecPtr initializedUpto; + + /* + * Before calling AdvanceXLInsertBuffer(), which can block, let others + * know how far we're finished with inserting the record. + * + * NB: If 'ptr' points to just after the page header, advertise a + * position at the beginning of the page rather than 'ptr' itself. If + * there are no other insertions running, someone might try to flush + * up to our advertised location. If we advertised a position after + * the page header, someone might try to flush the page header, even + * though page might actually not be initialized yet. As the first + * inserter on the page, we are effectively responsible for making + * sure that it's initialized, before we let insertingAt to move past + * the page header. 
+ */ + if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD && + XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ) + initializedUpto = ptr - SizeOfXLogShortPHD; + else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD && + XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ) + initializedUpto = ptr - SizeOfXLogLongPHD; + else + initializedUpto = ptr; + + WALInsertLockUpdateInsertingAt(initializedUpto); + + AdvanceXLInsertBuffer(ptr, tli, false); + endptr = XLogCtl->xlblocks[idx]; + + if (expectedEndPtr != endptr) + elog(PANIC, "could not find WAL buffer for %X/%X", + LSN_FORMAT_ARGS(ptr)); + } + else + { + /* + * Make sure the initialization of the page is visible to us, and + * won't arrive later to overwrite the WAL data we write on the page. + */ + pg_memory_barrier(); + } + + /* + * Found the buffer holding this page. Return a pointer to the right + * offset within the page. + */ + cachedPage = ptr / XLOG_BLCKSZ; + cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ; + + Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC); + Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ)); + + return cachedPos + ptr % XLOG_BLCKSZ; +} + +/* + * Converts a "usable byte position" to XLogRecPtr. A usable byte position + * is the position starting from the beginning of WAL, excluding all WAL + * page headers. + */ +static XLogRecPtr +XLogBytePosToRecPtr(uint64 bytepos) +{ + uint64 fullsegs; + uint64 fullpages; + uint64 bytesleft; + uint32 seg_offset; + XLogRecPtr result; + + fullsegs = bytepos / UsableBytesInSegment; + bytesleft = bytepos % UsableBytesInSegment; + + if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD) + { + /* fits on first page of segment */ + seg_offset = bytesleft + SizeOfXLogLongPHD; + } + else + { + /* account for the first page on segment with long header */ + seg_offset = XLOG_BLCKSZ; + bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD; + + fullpages = bytesleft / UsableBytesInPage; + bytesleft = bytesleft % UsableBytesInPage; + + seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + } + + XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result); + + return result; +} + +/* + * Like XLogBytePosToRecPtr, but if the position is at a page boundary, + * returns a pointer to the beginning of the page (ie. before page header), + * not to where the first xlog record on that page would go to. This is used + * when converting a pointer to the end of a record. + */ +static XLogRecPtr +XLogBytePosToEndRecPtr(uint64 bytepos) +{ + uint64 fullsegs; + uint64 fullpages; + uint64 bytesleft; + uint32 seg_offset; + XLogRecPtr result; + + fullsegs = bytepos / UsableBytesInSegment; + bytesleft = bytepos % UsableBytesInSegment; + + if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD) + { + /* fits on first page of segment */ + if (bytesleft == 0) + seg_offset = 0; + else + seg_offset = bytesleft + SizeOfXLogLongPHD; + } + else + { + /* account for the first page on segment with long header */ + seg_offset = XLOG_BLCKSZ; + bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD; + + fullpages = bytesleft / UsableBytesInPage; + bytesleft = bytesleft % UsableBytesInPage; + + if (bytesleft == 0) + seg_offset += fullpages * XLOG_BLCKSZ + bytesleft; + else + seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + } + + XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result); + + return result; +} + +/* + * Convert an XLogRecPtr to a "usable byte position". 
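The conversion routines above map "usable byte positions" (which pretend page headers do not exist) back to physical WAL positions by re-inserting the headers. A standalone worked example with assumed sizes (16 MB segments, 8 kB pages, 40-byte long and 24-byte short headers; the real values come from the page header structs), not a substitute for the real functions:

#include <stdint.h>
#include <stdio.h>

#define DEMO_BLCKSZ        8192
#define DEMO_SEG_SIZE      (16 * 1024 * 1024)
#define DEMO_LONG_PHD      40   /* assumed long (first-page) header size */
#define DEMO_SHORT_PHD     24   /* assumed short header size */

#define USABLE_IN_PAGE     (DEMO_BLCKSZ - DEMO_SHORT_PHD)
#define USABLE_IN_SEGMENT  \
    ((DEMO_SEG_SIZE / DEMO_BLCKSZ) * USABLE_IN_PAGE - \
     (DEMO_LONG_PHD - DEMO_SHORT_PHD))

static uint64_t
demo_bytepos_to_ptr(uint64_t bytepos)
{
    uint64_t    fullsegs = bytepos / USABLE_IN_SEGMENT;
    uint64_t    bytesleft = bytepos % USABLE_IN_SEGMENT;
    uint64_t    seg_offset;

    if (bytesleft < DEMO_BLCKSZ - DEMO_LONG_PHD)
        seg_offset = bytesleft + DEMO_LONG_PHD;     /* first page of segment */
    else
    {
        bytesleft -= DEMO_BLCKSZ - DEMO_LONG_PHD;   /* skip the first page */
        seg_offset = DEMO_BLCKSZ
            + (bytesleft / USABLE_IN_PAGE) * DEMO_BLCKSZ
            + bytesleft % USABLE_IN_PAGE + DEMO_SHORT_PHD;
    }
    return fullsegs * DEMO_SEG_SIZE + seg_offset;
}

int
main(void)
{
    /* 10000 usable bytes fall past the first page, so the physical offset
     * is 10000 plus one long header plus one short header: 10064. */
    printf("byte position 10000 -> physical offset %llu\n",
           (unsigned long long) demo_bytepos_to_ptr(10000));
    return 0;
}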
+ */ +static uint64 +XLogRecPtrToBytePos(XLogRecPtr ptr) +{ + uint64 fullsegs; + uint32 fullpages; + uint32 offset; + uint64 result; + + XLByteToSeg(ptr, fullsegs, wal_segment_size); + + fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ; + offset = ptr % XLOG_BLCKSZ; + + if (fullpages == 0) + { + result = fullsegs * UsableBytesInSegment; + if (offset > 0) + { + Assert(offset >= SizeOfXLogLongPHD); + result += offset - SizeOfXLogLongPHD; + } + } + else + { + result = fullsegs * UsableBytesInSegment + + (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */ + (fullpages - 1) * UsableBytesInPage; /* full pages */ + if (offset > 0) + { + Assert(offset >= SizeOfXLogShortPHD); + result += offset - SizeOfXLogShortPHD; + } + } + + return result; +} + +/* + * Initialize XLOG buffers, writing out old buffers if they still contain + * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is + * true, initialize as many pages as we can without having to write out + * unwritten data. Any new pages are initialized to zeros, with pages headers + * initialized properly. + */ +static void +AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + int nextidx; + XLogRecPtr OldPageRqstPtr; + XLogwrtRqst WriteRqst; + XLogRecPtr NewPageEndPtr = InvalidXLogRecPtr; + XLogRecPtr NewPageBeginPtr; + XLogPageHeader NewPage; + int npages pg_attribute_unused() = 0; + + LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE); + + /* + * Now that we have the lock, check if someone initialized the page + * already. + */ + while (upto >= XLogCtl->InitializedUpTo || opportunistic) + { + nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo); + + /* + * Get ending-offset of the buffer page we need to replace (this may + * be zero if the buffer hasn't been used yet). Fall through if it's + * already written out. + */ + OldPageRqstPtr = XLogCtl->xlblocks[nextidx]; + if (LogwrtResult.Write < OldPageRqstPtr) + { + /* + * Nope, got work to do. If we just want to pre-initialize as much + * as we can without flushing, give up now. + */ + if (opportunistic) + break; + + /* Before waiting, get info_lck and update LogwrtResult */ + SpinLockAcquire(&XLogCtl->info_lck); + if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr) + XLogCtl->LogwrtRqst.Write = OldPageRqstPtr; + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * Now that we have an up-to-date LogwrtResult value, see if we + * still need to write it or if someone else already did. + */ + if (LogwrtResult.Write < OldPageRqstPtr) + { + /* + * Must acquire write lock. Release WALBufMappingLock first, + * to make sure that all insertions that we need to wait for + * can finish (up to this same position). Otherwise we risk + * deadlock. 
+ */ + LWLockRelease(WALBufMappingLock); + + WaitXLogInsertionsToFinish(OldPageRqstPtr); + + LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); + + LogwrtResult = XLogCtl->LogwrtResult; + if (LogwrtResult.Write >= OldPageRqstPtr) + { + /* OK, someone wrote it already */ + LWLockRelease(WALWriteLock); + } + else + { + /* Have to write it ourselves */ + TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START(); + WriteRqst.Write = OldPageRqstPtr; + WriteRqst.Flush = 0; + XLogWrite(WriteRqst, tli, false); + LWLockRelease(WALWriteLock); + PendingWalStats.wal_buffers_full++; + TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); + } + /* Re-acquire WALBufMappingLock and retry */ + LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE); + continue; + } + } + + /* + * Now the next buffer slot is free and we can set it up to be the + * next output page. + */ + NewPageBeginPtr = XLogCtl->InitializedUpTo; + NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ; + + Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx); + + NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ); + + /* + * Be sure to re-zero the buffer so that bytes beyond what we've + * written will look like zeroes and not valid XLOG records... + */ + MemSet((char *) NewPage, 0, XLOG_BLCKSZ); + + /* + * Fill the new page's header + */ + NewPage->xlp_magic = XLOG_PAGE_MAGIC; + + /* NewPage->xlp_info = 0; */ /* done by memset */ + NewPage->xlp_tli = tli; + NewPage->xlp_pageaddr = NewPageBeginPtr; + + /* NewPage->xlp_rem_len = 0; */ /* done by memset */ + + /* + * If online backup is not in progress, mark the header to indicate + * that WAL records beginning in this page have removable backup + * blocks. This allows the WAL archiver to know whether it is safe to + * compress archived WAL data by transforming full-block records into + * the non-full-block format. It is sufficient to record this at the + * page level because we force a page switch (in fact a segment + * switch) when starting a backup, so the flag will be off before any + * records can be written during the backup. At the end of a backup, + * the last page will be marked as all unsafe when perhaps only part + * is unsafe, but at worst the archiver would miss the opportunity to + * compress a few records. + */ + if (!Insert->forcePageWrites) + NewPage->xlp_info |= XLP_BKP_REMOVABLE; + + /* + * If first page of an XLOG segment file, make it a long header. + */ + if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0) + { + XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage; + + NewLongPage->xlp_sysid = ControlFile->system_identifier; + NewLongPage->xlp_seg_size = wal_segment_size; + NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ; + NewPage->xlp_info |= XLP_LONG_HEADER; + } + + /* + * Make sure the initialization of the page becomes visible to others + * before the xlblocks update. GetXLogBuffer() reads xlblocks without + * holding a lock. + */ + pg_write_barrier(); + + *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr; + + XLogCtl->InitializedUpTo = NewPageEndPtr; + + npages++; + } + LWLockRelease(WALBufMappingLock); + +#ifdef WAL_DEBUG + if (XLOG_DEBUG && npages > 0) + { + elog(DEBUG1, "initialized %d pages, up to %X/%X", + npages, LSN_FORMAT_ARGS(NewPageEndPtr)); + } +#endif +} + +/* + * Calculate CheckPointSegments based on max_wal_size_mb and + * checkpoint_completion_target. 
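To put example numbers on the sizing rule implemented just below: with max_wal_size = 1 GB, 16 MB segments and checkpoint_completion_target = 0.9 (assumed settings), the trigger distance comes out to 33 segments. A standalone version of the arithmetic:

#include <stdio.h>

int
main(void)
{
    double      max_wal_size_mb = 1024;     /* example setting */
    double      segment_mb = 16;            /* example segment size */
    double      completion_target = 0.9;    /* example setting */
    double      target;
    int         checkpoint_segments;

    /* Segments of WAL allowed to accumulate before a checkpoint triggers,
     * leaving headroom for the WAL written while the checkpoint runs. */
    target = (max_wal_size_mb / segment_mb) / (1.0 + completion_target);
    checkpoint_segments = (int) target;     /* round down */
    if (checkpoint_segments < 1)
        checkpoint_segments = 1;

    printf("CheckPointSegments = %d\n", checkpoint_segments);
    return 0;
}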
+ */ +static void +CalculateCheckpointSegments(void) +{ + double target; + + /*------- + * Calculate the distance at which to trigger a checkpoint, to avoid + * exceeding max_wal_size_mb. This is based on two assumptions: + * + * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept + * WAL for two checkpoint cycles to allow us to recover from the + * secondary checkpoint if the first checkpoint failed, though we + * only did this on the primary anyway, not on standby. Keeping just + * one checkpoint simplifies processing and reduces disk space in + * many smaller databases.) + * b) during checkpoint, we consume checkpoint_completion_target * + * number of segments consumed between checkpoints. + *------- + */ + target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) / + (1.0 + CheckPointCompletionTarget); + + /* round down */ + CheckPointSegments = (int) target; + + if (CheckPointSegments < 1) + CheckPointSegments = 1; +} + +void +assign_max_wal_size(int newval, void *extra) +{ + max_wal_size_mb = newval; + CalculateCheckpointSegments(); +} + +void +assign_checkpoint_completion_target(double newval, void *extra) +{ + CheckPointCompletionTarget = newval; + CalculateCheckpointSegments(); +} + +/* + * At a checkpoint, how many WAL segments to recycle as preallocated future + * XLOG segments? Returns the highest segment that should be preallocated. + */ +static XLogSegNo +XLOGfileslop(XLogRecPtr lastredoptr) +{ + XLogSegNo minSegNo; + XLogSegNo maxSegNo; + double distance; + XLogSegNo recycleSegNo; + + /* + * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb + * correspond to. Always recycle enough segments to meet the minimum, and + * remove enough segments to stay below the maximum. + */ + minSegNo = lastredoptr / wal_segment_size + + ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1; + maxSegNo = lastredoptr / wal_segment_size + + ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1; + + /* + * Between those limits, recycle enough segments to get us through to the + * estimated end of next checkpoint. + * + * To estimate where the next checkpoint will finish, assume that the + * system runs steadily consuming CheckPointDistanceEstimate bytes between + * every checkpoint. + */ + distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate; + /* add 10% for good measure. */ + distance *= 1.10; + + recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) / + wal_segment_size); + + if (recycleSegNo < minSegNo) + recycleSegNo = minSegNo; + if (recycleSegNo > maxSegNo) + recycleSegNo = maxSegNo; + + return recycleSegNo; +} + +/* + * Check whether we've consumed enough xlog space that a checkpoint is needed. + * + * new_segno indicates a log file that has just been filled up (or read + * during recovery). We measure the distance from RedoRecPtr to new_segno + * and see if that exceeds CheckPointSegments. + * + * Note: it is caller's responsibility that RedoRecPtr is up-to-date. + */ +bool +XLogCheckpointNeeded(XLogSegNo new_segno) +{ + XLogSegNo old_segno; + + XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size); + + if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1)) + return true; + return false; +} + +/* + * Write and/or fsync the log at least as far as WriteRqst indicates. + * + * If flexible == true, we don't have to write as far as WriteRqst, but + * may stop at any convenient boundary (such as a cache or logfile boundary). 
+ * This option allows us to avoid uselessly issuing multiple writes when a + * single one would do. + * + * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst) + * must be called before grabbing the lock, to make sure the data is ready to + * write. + */ +static void +XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) +{ + bool ispartialpage; + bool last_iteration; + bool finishing_seg; + int curridx; + int npages; + int startidx; + uint32 startoffset; + + /* We should always be inside a critical section here */ + Assert(CritSectionCount > 0); + + /* + * Update local LogwrtResult (caller probably did this already, but...) + */ + LogwrtResult = XLogCtl->LogwrtResult; + + /* + * Since successive pages in the xlog cache are consecutively allocated, + * we can usually gather multiple pages together and issue just one + * write() call. npages is the number of pages we have determined can be + * written together; startidx is the cache block index of the first one, + * and startoffset is the file offset at which it should go. The latter + * two variables are only valid when npages > 0, but we must initialize + * all of them to keep the compiler quiet. + */ + npages = 0; + startidx = 0; + startoffset = 0; + + /* + * Within the loop, curridx is the cache block index of the page to + * consider writing. Begin at the buffer containing the next unwritten + * page, or last partially written page. + */ + curridx = XLogRecPtrToBufIdx(LogwrtResult.Write); + + while (LogwrtResult.Write < WriteRqst.Write) + { + /* + * Make sure we're not ahead of the insert process. This could happen + * if we're passed a bogus WriteRqst.Write that is past the end of the + * last page that's been initialized by AdvanceXLInsertBuffer. + */ + XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx]; + + if (LogwrtResult.Write >= EndPtr) + elog(PANIC, "xlog write request %X/%X is past end of log %X/%X", + LSN_FORMAT_ARGS(LogwrtResult.Write), + LSN_FORMAT_ARGS(EndPtr)); + + /* Advance LogwrtResult.Write to end of current buffer page */ + LogwrtResult.Write = EndPtr; + ispartialpage = WriteRqst.Write < LogwrtResult.Write; + + if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size)) + { + /* + * Switch to new logfile segment. We cannot have any pending + * pages here (since we dump what we have at segment end). + */ + Assert(npages == 0); + if (openLogFile >= 0) + XLogFileClose(); + XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size); + openLogTLI = tli; + + /* create/use new log file */ + openLogFile = XLogFileInit(openLogSegNo, tli); + ReserveExternalFD(); + } + + /* Make sure we have the current logfile open */ + if (openLogFile < 0) + { + XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size); + openLogTLI = tli; + openLogFile = XLogFileOpen(openLogSegNo, tli); + ReserveExternalFD(); + } + + /* Add current page to the set of pending pages-to-dump */ + if (npages == 0) + { + /* first of group */ + startidx = curridx; + startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ, + wal_segment_size); + } + npages++; + + /* + * Dump the set if this will be the last loop iteration, or if we are + * at the last page of the cache area (since the next page won't be + * contiguous in memory), or if we are at the end of the logfile + * segment. 
+ */ + last_iteration = WriteRqst.Write <= LogwrtResult.Write; + + finishing_seg = !ispartialpage && + (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size; + + if (last_iteration || + curridx == XLogCtl->XLogCacheBlck || + finishing_seg) + { + char *from; + Size nbytes; + Size nleft; + int written; + instr_time start; + + /* OK to write the page(s) */ + from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ; + nbytes = npages * (Size) XLOG_BLCKSZ; + nleft = nbytes; + do + { + errno = 0; + + /* Measure I/O timing to write WAL data */ + if (track_wal_io_timing) + INSTR_TIME_SET_CURRENT(start); + + pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); + written = pg_pwrite(openLogFile, from, nleft, startoffset); + pgstat_report_wait_end(); + + /* + * Increment the I/O timing and the number of times WAL data + * were written out to disk. + */ + if (track_wal_io_timing) + { + instr_time duration; + + INSTR_TIME_SET_CURRENT(duration); + INSTR_TIME_SUBTRACT(duration, start); + PendingWalStats.wal_write_time += INSTR_TIME_GET_MICROSEC(duration); + } + + PendingWalStats.wal_write++; + + if (written <= 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno; + + if (errno == EINTR) + continue; + + save_errno = errno; + XLogFileName(xlogfname, tli, openLogSegNo, + wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to log file %s " + "at offset %u, length %zu: %m", + xlogfname, startoffset, nleft))); + } + nleft -= written; + from += written; + startoffset += written; + } while (nleft > 0); + + npages = 0; + + /* + * If we just wrote the whole last page of a logfile segment, + * fsync the segment immediately. This avoids having to go back + * and re-open prior segments when an fsync request comes along + * later. Doing it here ensures that one and only one backend will + * perform this fsync. + * + * This is also the right place to notify the Archiver that the + * segment is ready to copy to archival storage, and to update the + * timer for archive_timeout, and to signal for a checkpoint if + * too many logfile segments have been used since the last + * checkpoint. + */ + if (finishing_seg) + { + issue_xlog_fsync(openLogFile, openLogSegNo, tli); + + /* signal that we need to wakeup walsenders later */ + WalSndWakeupRequest(); + + LogwrtResult.Flush = LogwrtResult.Write; /* end of page */ + + if (XLogArchivingActive()) + XLogArchiveNotifySeg(openLogSegNo, tli); + + XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL); + XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush; + + /* + * Request a checkpoint if we've consumed too much xlog since + * the last one. For speed, we first check using the local + * copy of RedoRecPtr, which might be out of date; if it looks + * like a checkpoint is needed, forcibly update RedoRecPtr and + * recheck. 
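+ *
+ * (Checkpoints triggered by WAL volume in this way are the "requested"
+ * checkpoints reported in pg_stat_bgwriter.checkpoints_req, as opposed
+ * to the "timed" ones driven by checkpoint_timeout.)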
+ */ + if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo)) + { + (void) GetRedoRecPtr(); + if (XLogCheckpointNeeded(openLogSegNo)) + RequestCheckpoint(CHECKPOINT_CAUSE_XLOG); + } + } + } + + if (ispartialpage) + { + /* Only asked to write a partial page */ + LogwrtResult.Write = WriteRqst.Write; + break; + } + curridx = NextBufIdx(curridx); + + /* If flexible, break out of loop as soon as we wrote something */ + if (flexible && npages == 0) + break; + } + + Assert(npages == 0); + + /* + * If asked to flush, do so + */ + if (LogwrtResult.Flush < WriteRqst.Flush && + LogwrtResult.Flush < LogwrtResult.Write) + { + /* + * Could get here without iterating above loop, in which case we might + * have no open file or the wrong one. However, we do not need to + * fsync more than one file. + */ + if (sync_method != SYNC_METHOD_OPEN && + sync_method != SYNC_METHOD_OPEN_DSYNC) + { + if (openLogFile >= 0 && + !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size)) + XLogFileClose(); + if (openLogFile < 0) + { + XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size); + openLogTLI = tli; + openLogFile = XLogFileOpen(openLogSegNo, tli); + ReserveExternalFD(); + } + + issue_xlog_fsync(openLogFile, openLogSegNo, tli); + } + + /* signal that we need to wakeup walsenders later */ + WalSndWakeupRequest(); + + LogwrtResult.Flush = LogwrtResult.Write; + } + + /* + * Update shared-memory status + * + * We make sure that the shared 'request' values do not fall behind the + * 'result' values. This is not absolutely essential, but it saves some + * code in a couple of places. + */ + { + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->LogwrtResult = LogwrtResult; + if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write) + XLogCtl->LogwrtRqst.Write = LogwrtResult.Write; + if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush) + XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush; + SpinLockRelease(&XLogCtl->info_lck); + } +} + +/* + * Record the LSN for an asynchronous transaction commit/abort + * and nudge the WALWriter if there is work for it to do. + * (This should not be called for synchronous commits.) + */ +void +XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN) +{ + XLogRecPtr WriteRqstPtr = asyncXactLSN; + bool sleeping; + + SpinLockAcquire(&XLogCtl->info_lck); + LogwrtResult = XLogCtl->LogwrtResult; + sleeping = XLogCtl->WalWriterSleeping; + if (XLogCtl->asyncXactLSN < asyncXactLSN) + XLogCtl->asyncXactLSN = asyncXactLSN; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * If the WALWriter is sleeping, we should kick it to make it come out of + * low-power mode. Otherwise, determine whether there's a full page of + * WAL available to write. + */ + if (!sleeping) + { + /* back off to last completed page boundary */ + WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ; + + /* if we have already flushed that far, we're done */ + if (WriteRqstPtr <= LogwrtResult.Flush) + return; + } + + /* + * Nudge the WALWriter: it has a full page of WAL to write, or we want it + * to come out of low-power mode so that this async commit will reach disk + * within the expected amount of time. + */ + if (ProcGlobal->walwriterLatch) + SetLatch(ProcGlobal->walwriterLatch); +} + +/* + * Record the LSN up to which we can remove WAL because it's not required by + * any replication slot. 
+ */ +void +XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn) +{ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->replicationSlotMinLSN = lsn; + SpinLockRelease(&XLogCtl->info_lck); +} + + +/* + * Return the oldest LSN we must retain to satisfy the needs of some + * replication slot. + */ +static XLogRecPtr +XLogGetReplicationSlotMinimumLSN(void) +{ + XLogRecPtr retval; + + SpinLockAcquire(&XLogCtl->info_lck); + retval = XLogCtl->replicationSlotMinLSN; + SpinLockRelease(&XLogCtl->info_lck); + + return retval; +} + +/* + * Advance minRecoveryPoint in control file. + * + * If we crash during recovery, we must reach this point again before the + * database is consistent. + * + * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint + * is only updated if it's not already greater than or equal to 'lsn'. + */ +static void +UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) +{ + /* Quick check using our local copy of the variable */ + if (!updateMinRecoveryPoint || (!force && lsn <= LocalMinRecoveryPoint)) + return; + + /* + * An invalid minRecoveryPoint means that we need to recover all the WAL, + * i.e., we're doing crash recovery. We never modify the control file's + * value in that case, so we can short-circuit future checks here too. The + * local values of minRecoveryPoint and minRecoveryPointTLI should not be + * updated until crash recovery finishes. We only do this for the startup + * process as it should not update its own reference of minRecoveryPoint + * until it has finished crash recovery to make sure that all WAL + * available is replayed in this case. This also saves from extra locks + * taken on the control file from the startup process. + */ + if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery) + { + updateMinRecoveryPoint = false; + return; + } + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + /* update local copy */ + LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; + LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + + if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint)) + updateMinRecoveryPoint = false; + else if (force || LocalMinRecoveryPoint < lsn) + { + XLogRecPtr newMinRecoveryPoint; + TimeLineID newMinRecoveryPointTLI; + + /* + * To avoid having to update the control file too often, we update it + * all the way to the last record being replayed, even though 'lsn' + * would suffice for correctness. This also allows the 'force' case + * to not need a valid 'lsn' value. + * + * Another important reason for doing it this way is that the passed + * 'lsn' value could be bogus, i.e., past the end of available WAL, if + * the caller got it from a corrupted heap page. Accepting such a + * value as the min recovery point would prevent us from coming up at + * all. Instead, we just log a warning and continue with recovery. + * (See also the comments about corrupt LSNs in XLogFlush.) 
+ */ + newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI); + if (!force && newMinRecoveryPoint < lsn) + elog(WARNING, + "xlog min recovery request %X/%X is past current point %X/%X", + LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint)); + + /* update control file */ + if (ControlFile->minRecoveryPoint < newMinRecoveryPoint) + { + ControlFile->minRecoveryPoint = newMinRecoveryPoint; + ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI; + UpdateControlFile(); + LocalMinRecoveryPoint = newMinRecoveryPoint; + LocalMinRecoveryPointTLI = newMinRecoveryPointTLI; + + ereport(DEBUG2, + (errmsg_internal("updated min recovery point to %X/%X on timeline %u", + LSN_FORMAT_ARGS(newMinRecoveryPoint), + newMinRecoveryPointTLI))); + } + } + LWLockRelease(ControlFileLock); +} + +/* + * Ensure that all XLOG data through the given position is flushed to disk. + * + * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not + * already held, and we try to avoid acquiring it if possible. + */ +void +XLogFlush(XLogRecPtr record) +{ + XLogRecPtr WriteRqstPtr; + XLogwrtRqst WriteRqst; + TimeLineID insertTLI = XLogCtl->InsertTimeLineID; + + /* + * During REDO, we are reading not writing WAL. Therefore, instead of + * trying to flush the WAL, we should update minRecoveryPoint instead. We + * test XLogInsertAllowed(), not InRecovery, because we need checkpointer + * to act this way too, and because when it tries to write the + * end-of-recovery checkpoint, it should indeed flush. + */ + if (!XLogInsertAllowed()) + { + UpdateMinRecoveryPoint(record, false); + return; + } + + /* Quick exit if already known flushed */ + if (record <= LogwrtResult.Flush) + return; + +#ifdef WAL_DEBUG + if (XLOG_DEBUG) + elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X", + LSN_FORMAT_ARGS(record), + LSN_FORMAT_ARGS(LogwrtResult.Write), + LSN_FORMAT_ARGS(LogwrtResult.Flush)); +#endif + + START_CRIT_SECTION(); + + /* + * Since fsync is usually a horribly expensive operation, we try to + * piggyback as much data as we can on each fsync: if we see any more data + * entered into the xlog buffer, we'll write and fsync that too, so that + * the final value of LogwrtResult.Flush is as large as possible. This + * gives us some chance of avoiding another fsync immediately after. + */ + + /* initialize to given target; may increase below */ + WriteRqstPtr = record; + + /* + * Now wait until we get the write lock, or someone else does the flush + * for us. + */ + for (;;) + { + XLogRecPtr insertpos; + + /* read LogwrtResult and update local state */ + SpinLockAcquire(&XLogCtl->info_lck); + if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write) + WriteRqstPtr = XLogCtl->LogwrtRqst.Write; + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + + /* done already? */ + if (record <= LogwrtResult.Flush) + break; + + /* + * Before actually performing the write, wait for all in-flight + * insertions to the pages we're about to write to finish. + */ + insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr); + + /* + * Try to get the write lock. If we can't get it immediately, wait + * until it's released, and recheck if we still need to do the flush + * or if the backend that held the lock did it for us already. This + * helps to maintain a good rate of group committing when the system + * is bottlenecked by the speed of fsyncing. + */ + if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE)) + { + /* + * The lock is now free, but we didn't acquire it yet. 
Before we + * do, loop back to check if someone else flushed the record for + * us already. + */ + continue; + } + + /* Got the lock; recheck whether request is satisfied */ + LogwrtResult = XLogCtl->LogwrtResult; + if (record <= LogwrtResult.Flush) + { + LWLockRelease(WALWriteLock); + break; + } + + /* + * Sleep before flush! By adding a delay here, we may give further + * backends the opportunity to join the backlog of group commit + * followers; this can significantly improve transaction throughput, + * at the risk of increasing transaction latency. + * + * We do not sleep if enableFsync is not turned on, nor if there are + * fewer than CommitSiblings other backends with active transactions. + */ + if (CommitDelay > 0 && enableFsync && + MinimumActiveBackends(CommitSiblings)) + { + pg_usleep(CommitDelay); + + /* + * Re-check how far we can now flush the WAL. It's generally not + * safe to call WaitXLogInsertionsToFinish while holding + * WALWriteLock, because an in-progress insertion might need to + * also grab WALWriteLock to make progress. But we know that all + * the insertions up to insertpos have already finished, because + * that's what the earlier WaitXLogInsertionsToFinish() returned. + * We're only calling it again to allow insertpos to be moved + * further forward, not to actually wait for anyone. + */ + insertpos = WaitXLogInsertionsToFinish(insertpos); + } + + /* try to write/flush later additions to XLOG as well */ + WriteRqst.Write = insertpos; + WriteRqst.Flush = insertpos; + + XLogWrite(WriteRqst, insertTLI, false); + + LWLockRelease(WALWriteLock); + /* done */ + break; + } + + END_CRIT_SECTION(); + + /* wake up walsenders now that we've released heavily contended locks */ + WalSndWakeupProcessRequests(); + + /* + * If we still haven't flushed to the request point then we have a + * problem; most likely, the requested flush point is past end of XLOG. + * This has been seen to occur when a disk page has a corrupted LSN. + * + * Formerly we treated this as a PANIC condition, but that hurts the + * system's robustness rather than helping it: we do not want to take down + * the whole system due to corruption on one data page. In particular, if + * the bad page is encountered again during recovery then we would be + * unable to restart the database at all! (This scenario actually + * happened in the field several times with 7.1 releases.) As of 8.4, bad + * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem; + * the only time we can reach here during recovery is while flushing the + * end-of-recovery checkpoint record, and we don't expect that to have a + * bad LSN. + * + * Note that for calls from xact.c, the ERROR will be promoted to PANIC + * since xact.c calls this routine inside a critical section. However, + * calls from bufmgr.c are not within critical sections and so we will not + * force a restart for a bad LSN on a data page. + */ + if (LogwrtResult.Flush < record) + elog(ERROR, + "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X", + LSN_FORMAT_ARGS(record), + LSN_FORMAT_ARGS(LogwrtResult.Flush)); +} + +/* + * Write & flush xlog, but without specifying exactly where to. + * + * We normally write only completed blocks; but if there is nothing to do on + * that basis, we check for unwritten async commits in the current incomplete + * block, and write through the latest one of those. Thus, if async commits + * are not being used, we will write complete blocks only. 
+ * + * If, based on the above, there's anything to write we do so immediately. But + * to avoid calling fsync, fdatasync et. al. at a rate that'd impact + * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's + * more than wal_writer_flush_after unflushed blocks. + * + * We can guarantee that async commits reach disk after at most three + * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite + * to write "flexibly", meaning it can stop at the end of the buffer ring; + * this makes a difference only with very high load or long wal_writer_delay, + * but imposes one extra cycle for the worst case for async commits.) + * + * This routine is invoked periodically by the background walwriter process. + * + * Returns true if there was any work to do, even if we skipped flushing due + * to wal_writer_delay/wal_writer_flush_after. + */ +bool +XLogBackgroundFlush(void) +{ + XLogwrtRqst WriteRqst; + bool flexible = true; + static TimestampTz lastflush; + TimestampTz now; + int flushbytes; + TimeLineID insertTLI; + + /* XLOG doesn't need flushing during recovery */ + if (RecoveryInProgress()) + return false; + + /* + * Since we're not in recovery, InsertTimeLineID is set and can't change, + * so we can read it without a lock. + */ + insertTLI = XLogCtl->InsertTimeLineID; + + /* read LogwrtResult and update local state */ + SpinLockAcquire(&XLogCtl->info_lck); + LogwrtResult = XLogCtl->LogwrtResult; + WriteRqst = XLogCtl->LogwrtRqst; + SpinLockRelease(&XLogCtl->info_lck); + + /* back off to last completed page boundary */ + WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ; + + /* if we have already flushed that far, consider async commit records */ + if (WriteRqst.Write <= LogwrtResult.Flush) + { + SpinLockAcquire(&XLogCtl->info_lck); + WriteRqst.Write = XLogCtl->asyncXactLSN; + SpinLockRelease(&XLogCtl->info_lck); + flexible = false; /* ensure it all gets written */ + } + + /* + * If already known flushed, we're done. Just need to check if we are + * holding an open file handle to a logfile that's no longer in use, + * preventing the file from being deleted. + */ + if (WriteRqst.Write <= LogwrtResult.Flush) + { + if (openLogFile >= 0) + { + if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size)) + { + XLogFileClose(); + } + } + return false; + } + + /* + * Determine how far to flush WAL, based on the wal_writer_delay and + * wal_writer_flush_after GUCs. + */ + now = GetCurrentTimestamp(); + flushbytes = + WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ; + + if (WalWriterFlushAfter == 0 || lastflush == 0) + { + /* first call, or block based limits disabled */ + WriteRqst.Flush = WriteRqst.Write; + lastflush = now; + } + else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay)) + { + /* + * Flush the writes at least every WalWriterDelay ms. This is + * important to bound the amount of time it takes for an asynchronous + * commit to hit disk. 
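+ *
+ * (With the stock settings of wal_writer_delay = 200ms and
+ * wal_writer_flush_after = 1MB, a slow trickle of async commits is
+ * flushed on the first walwriter cycle that falls more than 200ms after
+ * the previous flush, while a backlog of more than 1MB of unflushed WAL
+ * is flushed immediately by the branch below.)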
+ */ + WriteRqst.Flush = WriteRqst.Write; + lastflush = now; + } + else if (flushbytes >= WalWriterFlushAfter) + { + /* exceeded wal_writer_flush_after blocks, flush */ + WriteRqst.Flush = WriteRqst.Write; + lastflush = now; + } + else + { + /* no flushing, this time round */ + WriteRqst.Flush = 0; + } + +#ifdef WAL_DEBUG + if (XLOG_DEBUG) + elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X", + LSN_FORMAT_ARGS(WriteRqst.Write), + LSN_FORMAT_ARGS(WriteRqst.Flush), + LSN_FORMAT_ARGS(LogwrtResult.Write), + LSN_FORMAT_ARGS(LogwrtResult.Flush)); +#endif + + START_CRIT_SECTION(); + + /* now wait for any in-progress insertions to finish and get write lock */ + WaitXLogInsertionsToFinish(WriteRqst.Write); + LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); + LogwrtResult = XLogCtl->LogwrtResult; + if (WriteRqst.Write > LogwrtResult.Write || + WriteRqst.Flush > LogwrtResult.Flush) + { + XLogWrite(WriteRqst, insertTLI, flexible); + } + LWLockRelease(WALWriteLock); + + END_CRIT_SECTION(); + + /* wake up walsenders now that we've released heavily contended locks */ + WalSndWakeupProcessRequests(); + + /* + * Great, done. To take some work off the critical path, try to initialize + * as many of the no-longer-needed WAL buffers for future use as we can. + */ + AdvanceXLInsertBuffer(InvalidXLogRecPtr, insertTLI, true); + + /* + * If we determined that we need to write data, but somebody else + * wrote/flushed already, it should be considered as being active, to + * avoid hibernating too early. + */ + return true; +} + +/* + * Test whether XLOG data has been flushed up to (at least) the given position. + * + * Returns true if a flush is still needed. (It may be that someone else + * is already in process of flushing that far, however.) + */ +bool +XLogNeedsFlush(XLogRecPtr record) +{ + /* + * During recovery, we don't flush WAL but update minRecoveryPoint + * instead. So "needs flush" is taken to mean whether minRecoveryPoint + * would need to be updated. + */ + if (RecoveryInProgress()) + { + /* + * An invalid minRecoveryPoint means that we need to recover all the + * WAL, i.e., we're doing crash recovery. We never modify the control + * file's value in that case, so we can short-circuit future checks + * here too. This triggers a quick exit path for the startup process, + * which cannot update its local copy of minRecoveryPoint as long as + * it has not replayed all WAL available when doing crash recovery. + */ + if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery) + updateMinRecoveryPoint = false; + + /* Quick exit if already known to be updated or cannot be updated */ + if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint) + return false; + + /* + * Update local copy of minRecoveryPoint. But if the lock is busy, + * just return a conservative guess. + */ + if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED)) + return true; + LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; + LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + LWLockRelease(ControlFileLock); + + /* + * Check minRecoveryPoint for any other process than the startup + * process doing crash recovery, which should not update the control + * file value if crash recovery is still running. 
+ */ + if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint)) + updateMinRecoveryPoint = false; + + /* check again */ + if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint) + return false; + else + return true; + } + + /* Quick exit if already known flushed */ + if (record <= LogwrtResult.Flush) + return false; + + /* read LogwrtResult and update local state */ + SpinLockAcquire(&XLogCtl->info_lck); + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + + /* check again */ + if (record <= LogwrtResult.Flush) + return false; + + return true; +} + +/* + * Try to make a given XLOG file segment exist. + * + * logsegno: identify segment. + * + * *added: on return, true if this call raised the number of extant segments. + * + * path: on return, this char[MAXPGPATH] has the path to the logsegno file. + * + * Returns -1 or FD of opened file. A -1 here is not an error; a caller + * wanting an open segment should attempt to open "path", which usually will + * succeed. (This is weird, but it's efficient for the callers.) + */ +static int +XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, + bool *added, char *path) +{ + char tmppath[MAXPGPATH]; + PGAlignedXLogBlock zbuffer; + XLogSegNo installed_segno; + XLogSegNo max_segno; + int fd; + int save_errno; + + Assert(logtli != 0); + + XLogFilePath(path, logtli, logsegno, wal_segment_size); + + /* + * Try to use existent file (checkpoint maker may have created it already) + */ + *added = false; + fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method)); + if (fd < 0) + { + if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + } + else + return fd; + + /* + * Initialize an empty (all zeroes) segment. NOTE: it is possible that + * another process is doing the same thing. If so, we will end up + * pre-creating an extra log segment. That seems OK, and better than + * holding the lock throughout this lengthy process. + */ + elog(DEBUG2, "creating and filling new WAL file"); + + snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); + + unlink(tmppath); + + /* do not use get_sync_bit() here --- want to fsync only at end of fill */ + fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", tmppath))); + + memset(zbuffer.data, 0, XLOG_BLCKSZ); + + pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE); + save_errno = 0; + if (wal_init_zero) + { + struct iovec iov[PG_IOV_MAX]; + int blocks; + + /* + * Zero-fill the file. With this setting, we do this the hard way to + * ensure that all the file space has really been allocated. On + * platforms that allow "holes" in files, just seeking to the end + * doesn't allocate intermediate space. This way, we know that we + * have all the space and (after the fsync below) that all the + * indirect blocks are down on disk. Therefore, fdatasync(2) or + * O_DSYNC will be sufficient to sync future writes to the log file. + */ + + /* Prepare to write out a lot of copies of our zero buffer at once. */ + for (int i = 0; i < lengthof(iov); ++i) + { + iov[i].iov_base = zbuffer.data; + iov[i].iov_len = XLOG_BLCKSZ; + } + + /* Loop, writing as many blocks as we can for each system call. 
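+ *
+ * (Each pg_pwritev_with_retry() call covers up to PG_IOV_MAX iovecs of
+ * XLOG_BLCKSZ bytes apiece; with the usual limits of 32 iovecs and 8kB
+ * blocks, a 16MB segment is zero-filled in 64 system calls.)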
*/ + blocks = wal_segment_size / XLOG_BLCKSZ; + for (int i = 0; i < blocks;) + { + int iovcnt = Min(blocks - i, lengthof(iov)); + off_t offset = i * XLOG_BLCKSZ; + + if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0) + { + save_errno = errno; + break; + } + + i += iovcnt; + } + } + else + { + /* + * Otherwise, seeking to the end and writing a solitary byte is + * enough. + */ + errno = 0; + if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1) + { + /* if write didn't set errno, assume no disk space */ + save_errno = errno ? errno : ENOSPC; + } + } + pgstat_report_wait_end(); + + if (save_errno) + { + /* + * If we fail to make the file, delete it to release disk space + */ + unlink(tmppath); + + close(fd); + + errno = save_errno; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } + + pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC); + if (pg_fsync(fd) != 0) + { + int save_errno = errno; + + close(fd); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + + if (close(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", tmppath))); + + /* + * Now move the segment into place with its final name. Cope with + * possibility that someone else has created the file while we were + * filling ours: if so, use ours to pre-create a future log segment. + */ + installed_segno = logsegno; + + /* + * XXX: What should we use as max_segno? We used to use XLOGfileslop when + * that was a constant, but that was always a bit dubious: normally, at a + * checkpoint, XLOGfileslop was the offset from the checkpoint record, but + * here, it was the offset from the insert location. We can't do the + * normal XLOGfileslop calculation here because we don't have access to + * the prior checkpoint's redo location. So somewhat arbitrarily, just use + * CheckPointSegments. + */ + max_segno = logsegno + CheckPointSegments; + if (InstallXLogFileSegment(&installed_segno, tmppath, true, max_segno, + logtli)) + { + *added = true; + elog(DEBUG2, "done creating and filling new WAL file"); + } + else + { + /* + * No need for any more future segments, or InstallXLogFileSegment() + * failed to rename the file into place. If the rename failed, a + * caller opening the file may fail. + */ + unlink(tmppath); + elog(DEBUG2, "abandoned new WAL file"); + } + + return -1; +} + +/* + * Create a new XLOG file segment, or open a pre-existing one. + * + * logsegno: identify segment to be created/opened. + * + * Returns FD of opened file. + * + * Note: errors here are ERROR not PANIC because we might or might not be + * inside a critical section (eg, during checkpoint there is no reason to + * take down the system on failure). They will promote to PANIC if we are + * in a critical section. + */ +int +XLogFileInit(XLogSegNo logsegno, TimeLineID logtli) +{ + bool ignore_added; + char path[MAXPGPATH]; + int fd; + + Assert(logtli != 0); + + fd = XLogFileInitInternal(logsegno, logtli, &ignore_added, path); + if (fd >= 0) + return fd; + + /* Now open original target segment (might not be file I just made) */ + fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method)); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + return fd; +} + +/* + * Create a new XLOG file segment by copying a pre-existing one. 
+ * + * destsegno: identify segment to be created. + * + * srcTLI, srcsegno: identify segment to be copied (could be from + * a different timeline) + * + * upto: how much of the source file to copy (the rest is filled with + * zeros) + * + * Currently this is only used during recovery, and so there are no locking + * considerations. But we should be just as tense as XLogFileInit to avoid + * emplacing a bogus file. + */ +static void +XLogFileCopy(TimeLineID destTLI, XLogSegNo destsegno, + TimeLineID srcTLI, XLogSegNo srcsegno, + int upto) +{ + char path[MAXPGPATH]; + char tmppath[MAXPGPATH]; + PGAlignedXLogBlock buffer; + int srcfd; + int fd; + int nbytes; + + /* + * Open the source file + */ + XLogFilePath(path, srcTLI, srcsegno, wal_segment_size); + srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (srcfd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + + /* + * Copy into a temp file name. + */ + snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); + + unlink(tmppath); + + /* do not use get_sync_bit() here --- want to fsync only at end of fill */ + fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", tmppath))); + + /* + * Do the data copying. + */ + for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer)) + { + int nread; + + nread = upto - nbytes; + + /* + * The part that is not read from the source file is filled with + * zeros. + */ + if (nread < sizeof(buffer)) + memset(buffer.data, 0, sizeof(buffer)); + + if (nread > 0) + { + int r; + + if (nread > sizeof(buffer)) + nread = sizeof(buffer); + pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ); + r = read(srcfd, buffer.data, nread); + if (r != nread) + { + if (r < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + path))); + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %zu", + path, r, (Size) nread))); + } + pgstat_report_wait_end(); + } + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE); + if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer)) + { + int save_errno = errno; + + /* + * If we fail to make the file, delete it to release disk space + */ + unlink(tmppath); + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? save_errno : ENOSPC; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + } + + pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC); + if (pg_fsync(fd) != 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", tmppath))); + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", tmppath))); + + if (CloseTransientFile(srcfd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + + /* + * Now move the segment into place with its final name. + */ + if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, destTLI)) + elog(ERROR, "InstallXLogFileSegment should not have failed"); +} + +/* + * Install a new XLOG segment file as a current or future log segment. 
+ * + * This is used both to install a newly-created segment (which has a temp + * filename while it's being created) and to recycle an old segment. + * + * *segno: identify segment to install as (or first possible target). + * When find_free is true, this is modified on return to indicate the + * actual installation location or last segment searched. + * + * tmppath: initial name of file to install. It will be renamed into place. + * + * find_free: if true, install the new segment at the first empty segno + * number at or after the passed numbers. If false, install the new segment + * exactly where specified, deleting any existing segment file there. + * + * max_segno: maximum segment number to install the new file as. Fail if no + * free slot is found between *segno and max_segno. (Ignored when find_free + * is false.) + * + * tli: The timeline on which the new segment should be installed. + * + * Returns true if the file was installed successfully. false indicates that + * max_segno limit was exceeded, the startup process has disabled this + * function for now, or an error occurred while renaming the file into place. + */ +static bool +InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, + bool find_free, XLogSegNo max_segno, TimeLineID tli) +{ + char path[MAXPGPATH]; + struct stat stat_buf; + + Assert(tli != 0); + + XLogFilePath(path, tli, *segno, wal_segment_size); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + if (!XLogCtl->InstallXLogFileSegmentActive) + { + LWLockRelease(ControlFileLock); + return false; + } + + if (!find_free) + { + /* Force installation: get rid of any pre-existing segment file */ + durable_unlink(path, DEBUG1); + } + else + { + /* Find a free slot to put it in */ + while (stat(path, &stat_buf) == 0) + { + if ((*segno) >= max_segno) + { + /* Failed to find a free slot within specified range */ + LWLockRelease(ControlFileLock); + return false; + } + (*segno)++; + XLogFilePath(path, tli, *segno, wal_segment_size); + } + } + + /* + * Perform the rename using link if available, paranoidly trying to avoid + * overwriting an existing file (there shouldn't be one). + */ + if (durable_rename_excl(tmppath, path, LOG) != 0) + { + LWLockRelease(ControlFileLock); + /* durable_rename_excl already emitted log message */ + return false; + } + + LWLockRelease(ControlFileLock); + + return true; +} + +/* + * Open a pre-existing logfile segment for writing. + */ +int +XLogFileOpen(XLogSegNo segno, TimeLineID tli) +{ + char path[MAXPGPATH]; + int fd; + + XLogFilePath(path, tli, segno, wal_segment_size); + + fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method)); + if (fd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + + return fd; +} + +/* + * Close the current logfile segment for writing. + */ +static void +XLogFileClose(void) +{ + Assert(openLogFile >= 0); + + /* + * WAL segment files will not be re-read in normal operation, so we advise + * the OS to release any cached pages. But do not do so if WAL archiving + * or streaming is active, because archiver and walsender process could + * use the cache to read the WAL segment. 
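+ *
+ * (XLogIsNeeded() is true once wal_level is replica or higher, the level
+ * required for archiving and streaming, so the DONTNEED advice below is
+ * only issued on wal_level = minimal systems.)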
+ */ +#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) + if (!XLogIsNeeded()) + (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED); +#endif + + if (close(openLogFile) != 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno = errno; + + XLogFileName(xlogfname, openLogTLI, openLogSegNo, wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", xlogfname))); + } + + openLogFile = -1; + ReleaseExternalFD(); +} + +/* + * Preallocate log files beyond the specified log endpoint. + * + * XXX this is currently extremely conservative, since it forces only one + * future log segment to exist, and even that only if we are 75% done with + * the current one. This is only appropriate for very low-WAL-volume systems. + * High-volume systems will be OK once they've built up a sufficient set of + * recycled log segments, but the startup transient is likely to include + * a lot of segment creations by foreground processes, which is not so good. + * + * XLogFileInitInternal() can ereport(ERROR). All known causes indicate big + * trouble; for example, a full filesystem is one cause. The checkpoint WAL + * and/or ControlFile updates already completed. If a RequestCheckpoint() + * initiated the present checkpoint and an ERROR ends this function, the + * command that called RequestCheckpoint() fails. That's not ideal, but it's + * not worth contorting more functions to use caller-specified elevel values. + * (With or without RequestCheckpoint(), an ERROR forestalls some inessential + * reporting and resource reclamation.) + */ +static void +PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli) +{ + XLogSegNo _logSegNo; + int lf; + bool added; + char path[MAXPGPATH]; + uint64 offset; + + if (!XLogCtl->InstallXLogFileSegmentActive) + return; /* unlocked check says no */ + + XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size); + offset = XLogSegmentOffset(endptr - 1, wal_segment_size); + if (offset >= (uint32) (0.75 * wal_segment_size)) + { + _logSegNo++; + lf = XLogFileInitInternal(_logSegNo, tli, &added, path); + if (lf >= 0) + close(lf); + if (added) + CheckpointStats.ckpt_segs_added++; + } +} + +/* + * Throws an error if the given log segment has already been removed or + * recycled. The caller should only pass a segment that it knows to have + * existed while the server has been running, as this function always + * succeeds if no WAL segments have been removed since startup. + * 'tli' is only used in the error message. + * + * Note: this function guarantees to keep errno unchanged on return. + * This supports callers that use this to possibly deliver a better + * error message about a missing file, while still being able to throw + * a normal file-access error afterwards, if this does return. + */ +void +CheckXLogRemoved(XLogSegNo segno, TimeLineID tli) +{ + int save_errno = errno; + XLogSegNo lastRemovedSegNo; + + SpinLockAcquire(&XLogCtl->info_lck); + lastRemovedSegNo = XLogCtl->lastRemovedSegNo; + SpinLockRelease(&XLogCtl->info_lck); + + if (segno <= lastRemovedSegNo) + { + char filename[MAXFNAMELEN]; + + XLogFileName(filename, tli, segno, wal_segment_size); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("requested WAL segment %s has already been removed", + filename))); + } + errno = save_errno; +} + +/* + * Return the last WAL segment removed, or 0 if no segment has been removed + * since startup. 
+ * + * NB: the result can be out of date arbitrarily fast, the caller has to deal + * with that. + */ +XLogSegNo +XLogGetLastRemovedSegno(void) +{ + XLogSegNo lastRemovedSegNo; + + SpinLockAcquire(&XLogCtl->info_lck); + lastRemovedSegNo = XLogCtl->lastRemovedSegNo; + SpinLockRelease(&XLogCtl->info_lck); + + return lastRemovedSegNo; +} + + +/* + * Update the last removed segno pointer in shared memory, to reflect that the + * given XLOG file has been removed. + */ +static void +UpdateLastRemovedPtr(char *filename) +{ + uint32 tli; + XLogSegNo segno; + + XLogFromFileName(filename, &tli, &segno, wal_segment_size); + + SpinLockAcquire(&XLogCtl->info_lck); + if (segno > XLogCtl->lastRemovedSegNo) + XLogCtl->lastRemovedSegNo = segno; + SpinLockRelease(&XLogCtl->info_lck); +} + +/* + * Remove all temporary log files in pg_wal + * + * This is called at the beginning of recovery after a previous crash, + * at a point where no other processes write fresh WAL data. + */ +static void +RemoveTempXlogFiles(void) +{ + DIR *xldir; + struct dirent *xlde; + + elog(DEBUG2, "removing all temporary WAL segments"); + + xldir = AllocateDir(XLOGDIR); + while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) + { + char path[MAXPGPATH]; + + if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0) + continue; + + snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name); + unlink(path); + elog(DEBUG2, "removed temporary WAL segment \"%s\"", path); + } + FreeDir(xldir); +} + +/* + * Recycle or remove all log files older or equal to passed segno. + * + * endptr is current (or recent) end of xlog, and lastredoptr is the + * redo pointer of the last checkpoint. These are used to determine + * whether we want to recycle rather than delete no-longer-wanted log files. + * + * insertTLI is the current timeline for XLOG insertion. Any recycled + * segments should be reused for this timeline. + */ +static void +RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr, + TimeLineID insertTLI) +{ + DIR *xldir; + struct dirent *xlde; + char lastoff[MAXFNAMELEN]; + XLogSegNo endlogSegNo; + XLogSegNo recycleSegNo; + + /* Initialize info about where to try to recycle to */ + XLByteToSeg(endptr, endlogSegNo, wal_segment_size); + recycleSegNo = XLOGfileslop(lastredoptr); + + /* + * Construct a filename of the last segment to be kept. The timeline ID + * doesn't matter, we ignore that in the comparison. (During recovery, + * InsertTimeLineID isn't set, so we can't use that.) + */ + XLogFileName(lastoff, 0, segno, wal_segment_size); + + elog(DEBUG2, "attempting to remove WAL segments older than log file %s", + lastoff); + + xldir = AllocateDir(XLOGDIR); + + while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) + { + /* Ignore files that are not XLOG segments */ + if (!IsXLogFileName(xlde->d_name) && + !IsPartialXLogFileName(xlde->d_name)) + continue; + + /* + * We ignore the timeline part of the XLOG segment identifiers in + * deciding whether a segment is still needed. This ensures that we + * won't prematurely remove a segment from a parent timeline. We could + * probably be a little more proactive about removing segments of + * non-parent timelines, but that would be a whole lot more + * complicated. + * + * We use the alphanumeric sorting property of the filenames to decide + * which ones are earlier than the lastoff segment. 
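+ *
+ * (A WAL segment file name is 24 hexadecimal digits, of which the first
+ * 8 are the timeline ID and the remaining 16 the segment number, so the
+ * comparison below starts at offset 8 to skip the timeline part.)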
+ */ + if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0) + { + if (XLogArchiveCheckDone(xlde->d_name)) + { + /* Update the last removed location in shared memory first */ + UpdateLastRemovedPtr(xlde->d_name); + + RemoveXlogFile(xlde->d_name, recycleSegNo, &endlogSegNo, + insertTLI); + } + } + } + + FreeDir(xldir); +} + +/* + * Remove WAL files that are not part of the given timeline's history. + * + * This is called during recovery, whenever we switch to follow a new + * timeline, and at the end of recovery when we create a new timeline. We + * wouldn't otherwise care about extra WAL files lying in pg_wal, but they + * might be leftover pre-allocated or recycled WAL segments on the old timeline + * that we haven't used yet, and contain garbage. If we just leave them in + * pg_wal, they will eventually be archived, and we can't let that happen. + * Files that belong to our timeline history are valid, because we have + * successfully replayed them, but from others we can't be sure. + * + * 'switchpoint' is the current point in WAL where we switch to new timeline, + * and 'newTLI' is the new timeline we switch to. + */ +void +RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI) +{ + DIR *xldir; + struct dirent *xlde; + char switchseg[MAXFNAMELEN]; + XLogSegNo endLogSegNo; + XLogSegNo switchLogSegNo; + XLogSegNo recycleSegNo; + + /* + * Initialize info about where to begin the work. This will recycle, + * somewhat arbitrarily, 10 future segments. + */ + XLByteToPrevSeg(switchpoint, switchLogSegNo, wal_segment_size); + XLByteToSeg(switchpoint, endLogSegNo, wal_segment_size); + recycleSegNo = endLogSegNo + 10; + + /* + * Construct a filename of the last segment to be kept. + */ + XLogFileName(switchseg, newTLI, switchLogSegNo, wal_segment_size); + + elog(DEBUG2, "attempting to remove WAL segments newer than log file %s", + switchseg); + + xldir = AllocateDir(XLOGDIR); + + while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) + { + /* Ignore files that are not XLOG segments */ + if (!IsXLogFileName(xlde->d_name)) + continue; + + /* + * Remove files that are on a timeline older than the new one we're + * switching to, but with a segment number >= the first segment on the + * new timeline. + */ + if (strncmp(xlde->d_name, switchseg, 8) < 0 && + strcmp(xlde->d_name + 8, switchseg + 8) > 0) + { + /* + * If the file has already been marked as .ready, however, don't + * remove it yet. It should be OK to remove it - files that are + * not part of our timeline history are not required for recovery + * - but seems safer to let them be archived and removed later. + */ + if (!XLogArchiveIsReady(xlde->d_name)) + RemoveXlogFile(xlde->d_name, recycleSegNo, &endLogSegNo, + newTLI); + } + } + + FreeDir(xldir); +} + +/* + * Recycle or remove a log file that's no longer needed. + * + * segname is the name of the segment to recycle or remove. recycleSegNo + * is the segment number to recycle up to. endlogSegNo is the segment + * number of the current (or recent) end of WAL. + * + * endlogSegNo gets incremented if the segment is recycled so as it is not + * checked again with future callers of this function. + * + * insertTLI is the current timeline for XLOG insertion. Any recycled segments + * should be used for this timeline. 
+ */ +static void +RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo, + XLogSegNo *endlogSegNo, TimeLineID insertTLI) +{ + char path[MAXPGPATH]; +#ifdef WIN32 + char newpath[MAXPGPATH]; +#endif + struct stat statbuf; + + snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname); + + /* + * Before deleting the file, see if it can be recycled as a future log + * segment. Only recycle normal files, because we don't want to recycle + * symbolic links pointing to a separate archive directory. + */ + if (wal_recycle && + *endlogSegNo <= recycleSegNo && + XLogCtl->InstallXLogFileSegmentActive && /* callee rechecks this */ + lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) && + InstallXLogFileSegment(endlogSegNo, path, + true, recycleSegNo, insertTLI)) + { + ereport(DEBUG2, + (errmsg_internal("recycled write-ahead log file \"%s\"", + segname))); + CheckpointStats.ckpt_segs_recycled++; + /* Needn't recheck that slot on future iterations */ + (*endlogSegNo)++; + } + else + { + /* No need for any more future segments, or recycling failed ... */ + int rc; + + ereport(DEBUG2, + (errmsg_internal("removing write-ahead log file \"%s\"", + segname))); + +#ifdef WIN32 + + /* + * On Windows, if another process (e.g another backend) holds the file + * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file + * will still show up in directory listing until the last handle is + * closed. To avoid confusing the lingering deleted file for a live + * WAL file that needs to be archived, rename it before deleting it. + * + * If another process holds the file open without FILE_SHARE_DELETE + * flag, rename will fail. We'll try again at the next checkpoint. + */ + snprintf(newpath, MAXPGPATH, "%s.deleted", path); + if (rename(path, newpath) != 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\": %m", + path))); + return; + } + rc = durable_unlink(newpath, LOG); +#else + rc = durable_unlink(path, LOG); +#endif + if (rc != 0) + { + /* Message already logged by durable_unlink() */ + return; + } + CheckpointStats.ckpt_segs_removed++; + } + + XLogArchiveCleanup(segname); +} + +/* + * Verify whether pg_wal and pg_wal/archive_status exist. + * If the latter does not exist, recreate it. + * + * It is not the goal of this function to verify the contents of these + * directories, but to help in cases where someone has performed a cluster + * copy for PITR purposes but omitted pg_wal from the copy. + * + * We could also recreate pg_wal if it doesn't exist, but a deliberate + * policy decision was made not to. It is fairly common for pg_wal to be + * a symlink, and if that was the DBA's intent then automatically making a + * plain directory would result in degraded performance with no notice. 
+ */ +static void +ValidateXLOGDirectoryStructure(void) +{ + char path[MAXPGPATH]; + struct stat stat_buf; + + /* Check for pg_wal; if it doesn't exist, error out */ + if (stat(XLOGDIR, &stat_buf) != 0 || + !S_ISDIR(stat_buf.st_mode)) + ereport(FATAL, + (errmsg("required WAL directory \"%s\" does not exist", + XLOGDIR))); + + /* Check for archive_status */ + snprintf(path, MAXPGPATH, XLOGDIR "/archive_status"); + if (stat(path, &stat_buf) == 0) + { + /* Check for weird cases where it exists but isn't a directory */ + if (!S_ISDIR(stat_buf.st_mode)) + ereport(FATAL, + (errmsg("required WAL directory \"%s\" does not exist", + path))); + } + else + { + ereport(LOG, + (errmsg("creating missing WAL directory \"%s\"", path))); + if (MakePGDirectory(path) < 0) + ereport(FATAL, + (errmsg("could not create missing directory \"%s\": %m", + path))); + } +} + +/* + * Remove previous backup history files. This also retries creation of + * .ready files for any backup history files for which XLogArchiveNotify + * failed earlier. + */ +static void +CleanupBackupHistory(void) +{ + DIR *xldir; + struct dirent *xlde; + char path[MAXPGPATH + sizeof(XLOGDIR)]; + + xldir = AllocateDir(XLOGDIR); + + while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) + { + if (IsBackupHistoryFileName(xlde->d_name)) + { + if (XLogArchiveCheckDone(xlde->d_name)) + { + elog(DEBUG2, "removing WAL backup history file \"%s\"", + xlde->d_name); + snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name); + unlink(path); + XLogArchiveCleanup(xlde->d_name); + } + } + } + + FreeDir(xldir); +} + +/* + * I/O routines for pg_control + * + * *ControlFile is a buffer in shared memory that holds an image of the + * contents of pg_control. WriteControlFile() initializes pg_control + * given a preloaded buffer, ReadControlFile() loads the buffer from + * the pg_control file (during postmaster or standalone-backend startup), + * and UpdateControlFile() rewrites pg_control after we modify xlog state. + * InitControlFile() fills the buffer with initial values. + * + * For simplicity, WriteControlFile() initializes the fields of pg_control + * that are related to checking backend/database compatibility, and + * ReadControlFile() verifies they are correct. We could split out the + * I/O and compatibility-check functions, but there seems no need currently. + */ + +static void +InitControlFile(uint64 sysidentifier) +{ + char mock_auth_nonce[MOCK_AUTH_NONCE_LEN]; + + /* + * Generate a random nonce. This is used for authentication requests that + * will fail because the user does not exist. The nonce is used to create + * a genuine-looking password challenge for the non-existent user, in lieu + * of an actual stored password. 
+ */ + if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN)) + ereport(PANIC, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not generate secret authorization token"))); + + memset(ControlFile, 0, sizeof(ControlFileData)); + /* Initialize pg_control status fields */ + ControlFile->system_identifier = sysidentifier; + memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN); + ControlFile->state = DB_SHUTDOWNED; + ControlFile->unloggedLSN = FirstNormalUnloggedLSN; + + /* Set important parameter values for use when replaying WAL */ + ControlFile->MaxConnections = MaxConnections; + ControlFile->max_worker_processes = max_worker_processes; + ControlFile->max_wal_senders = max_wal_senders; + ControlFile->max_prepared_xacts = max_prepared_xacts; + ControlFile->max_locks_per_xact = max_locks_per_xact; + ControlFile->wal_level = wal_level; + ControlFile->wal_log_hints = wal_log_hints; + ControlFile->track_commit_timestamp = track_commit_timestamp; + ControlFile->data_checksum_version = bootstrap_data_checksum_version; +} + +static void +WriteControlFile(void) +{ + int fd; + char buffer[PG_CONTROL_FILE_SIZE]; /* need not be aligned */ + + /* + * Ensure that the size of the pg_control data structure is sane. See the + * comments for these symbols in pg_control.h. + */ + StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE, + "pg_control is too large for atomic disk writes"); + StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE, + "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE"); + + /* + * Initialize version and compatibility-check fields + */ + ControlFile->pg_control_version = PG_CONTROL_VERSION; + ControlFile->catalog_version_no = CATALOG_VERSION_NO; + + ControlFile->maxAlign = MAXIMUM_ALIGNOF; + ControlFile->floatFormat = FLOATFORMAT_VALUE; + + ControlFile->blcksz = BLCKSZ; + ControlFile->relseg_size = RELSEG_SIZE; + ControlFile->xlog_blcksz = XLOG_BLCKSZ; + ControlFile->xlog_seg_size = wal_segment_size; + + ControlFile->nameDataLen = NAMEDATALEN; + ControlFile->indexMaxKeys = INDEX_MAX_KEYS; + + ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE; + ControlFile->loblksize = LOBLKSIZE; + + ControlFile->float8ByVal = FLOAT8PASSBYVAL; + + /* Contents are protected with a CRC */ + INIT_CRC32C(ControlFile->crc); + COMP_CRC32C(ControlFile->crc, + (char *) ControlFile, + offsetof(ControlFileData, crc)); + FIN_CRC32C(ControlFile->crc); + + /* + * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding + * the excess over sizeof(ControlFileData). This reduces the odds of + * premature-EOF errors when reading pg_control. We'll still fail when we + * check the contents of the file, but hopefully with a more specific + * error than "couldn't read pg_control". 
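+ *
+ * (PG_CONTROL_FILE_SIZE is 8192 bytes, while the static assertions above
+ * ensure that ControlFileData itself fits within PG_CONTROL_MAX_SAFE_SIZE,
+ * 512 bytes, so the meaningful payload still fits in one disk sector.)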
+ */ + memset(buffer, 0, PG_CONTROL_FILE_SIZE); + memcpy(buffer, ControlFile, sizeof(ControlFileData)); + + fd = BasicOpenFile(XLOG_CONTROL_FILE, + O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + if (fd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + XLOG_CONTROL_FILE))); + + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE); + if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", + XLOG_CONTROL_FILE))); + } + pgstat_report_wait_end(); + + pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC); + if (pg_fsync(fd) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + XLOG_CONTROL_FILE))); + pgstat_report_wait_end(); + + if (close(fd) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", + XLOG_CONTROL_FILE))); +} + +static void +ReadControlFile(void) +{ + pg_crc32c crc; + int fd; + static char wal_segsz_str[20]; + int r; + + /* + * Read data... + */ + fd = BasicOpenFile(XLOG_CONTROL_FILE, + O_RDWR | PG_BINARY); + if (fd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + XLOG_CONTROL_FILE))); + + pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ); + r = read(fd, ControlFile, sizeof(ControlFileData)); + if (r != sizeof(ControlFileData)) + { + if (r < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + XLOG_CONTROL_FILE))); + else + ereport(PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %zu", + XLOG_CONTROL_FILE, r, sizeof(ControlFileData)))); + } + pgstat_report_wait_end(); + + close(fd); + + /* + * Check for expected pg_control format version. If this is wrong, the + * CRC check will likely fail because we'll be checking the wrong number + * of bytes. Complaining about wrong version will probably be more + * enlightening than complaining about wrong CRC. + */ + + if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x)," + " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).", + ControlFile->pg_control_version, ControlFile->pg_control_version, + PG_CONTROL_VERSION, PG_CONTROL_VERSION), + errhint("This could be a problem of mismatched byte ordering. It looks like you need to initdb."))); + + if (ControlFile->pg_control_version != PG_CONTROL_VERSION) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d," + " but the server was compiled with PG_CONTROL_VERSION %d.", + ControlFile->pg_control_version, PG_CONTROL_VERSION), + errhint("It looks like you need to initdb."))); + + /* Now check the CRC. */ + INIT_CRC32C(crc); + COMP_CRC32C(crc, + (char *) ControlFile, + offsetof(ControlFileData, crc)); + FIN_CRC32C(crc); + + if (!EQ_CRC32C(crc, ControlFile->crc)) + ereport(FATAL, + (errmsg("incorrect checksum in control file"))); + + /* + * Do compatibility checking immediately. 
If the database isn't + * compatible with the backend executable, we want to abort before we can + * possibly do any damage. + */ + if (ControlFile->catalog_version_no != CATALOG_VERSION_NO) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d," + " but the server was compiled with CATALOG_VERSION_NO %d.", + ControlFile->catalog_version_no, CATALOG_VERSION_NO), + errhint("It looks like you need to initdb."))); + if (ControlFile->maxAlign != MAXIMUM_ALIGNOF) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with MAXALIGN %d," + " but the server was compiled with MAXALIGN %d.", + ControlFile->maxAlign, MAXIMUM_ALIGNOF), + errhint("It looks like you need to initdb."))); + if (ControlFile->floatFormat != FLOATFORMAT_VALUE) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster appears to use a different floating-point number format than the server executable."), + errhint("It looks like you need to initdb."))); + if (ControlFile->blcksz != BLCKSZ) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with BLCKSZ %d," + " but the server was compiled with BLCKSZ %d.", + ControlFile->blcksz, BLCKSZ), + errhint("It looks like you need to recompile or initdb."))); + if (ControlFile->relseg_size != RELSEG_SIZE) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with RELSEG_SIZE %d," + " but the server was compiled with RELSEG_SIZE %d.", + ControlFile->relseg_size, RELSEG_SIZE), + errhint("It looks like you need to recompile or initdb."))); + if (ControlFile->xlog_blcksz != XLOG_BLCKSZ) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with XLOG_BLCKSZ %d," + " but the server was compiled with XLOG_BLCKSZ %d.", + ControlFile->xlog_blcksz, XLOG_BLCKSZ), + errhint("It looks like you need to recompile or initdb."))); + if (ControlFile->nameDataLen != NAMEDATALEN) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with NAMEDATALEN %d," + " but the server was compiled with NAMEDATALEN %d.", + ControlFile->nameDataLen, NAMEDATALEN), + errhint("It looks like you need to recompile or initdb."))); + if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d," + " but the server was compiled with INDEX_MAX_KEYS %d.", + ControlFile->indexMaxKeys, INDEX_MAX_KEYS), + errhint("It looks like you need to recompile or initdb."))); + if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d," + " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.", + ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE), + errhint("It looks like you need to recompile or initdb."))); + if (ControlFile->loblksize != LOBLKSIZE) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with LOBLKSIZE %d," + " but the server was compiled with LOBLKSIZE 
%d.", + ControlFile->loblksize, (int) LOBLKSIZE), + errhint("It looks like you need to recompile or initdb."))); + +#ifdef USE_FLOAT8_BYVAL + if (ControlFile->float8ByVal != true) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL" + " but the server was compiled with USE_FLOAT8_BYVAL."), + errhint("It looks like you need to recompile or initdb."))); +#else + if (ControlFile->float8ByVal != false) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL" + " but the server was compiled without USE_FLOAT8_BYVAL."), + errhint("It looks like you need to recompile or initdb."))); +#endif + + wal_segment_size = ControlFile->xlog_seg_size; + + if (!IsValidWalSegSize(wal_segment_size)) + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg_plural("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte", + "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes", + wal_segment_size, + wal_segment_size))); + + snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size); + SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL, + PGC_S_DYNAMIC_DEFAULT); + + /* check and update variables dependent on wal_segment_size */ + if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2) + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("\"min_wal_size\" must be at least twice \"wal_segment_size\""))); + + if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2) + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\""))); + + UsableBytesInSegment = + (wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) - + (SizeOfXLogLongPHD - SizeOfXLogShortPHD); + + CalculateCheckpointSegments(); + + /* Make the initdb settings visible as GUC variables, too */ + SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no", + PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT); +} + +/* + * Utility wrapper to update the control file. Note that the control + * file gets flushed. + */ +static void +UpdateControlFile(void) +{ + update_controlfile(DataDir, ControlFile, true); +} + +/* + * Returns the unique system identifier from control file. + */ +uint64 +GetSystemIdentifier(void) +{ + Assert(ControlFile != NULL); + return ControlFile->system_identifier; +} + +/* + * Returns the random nonce from control file. + */ +char * +GetMockAuthenticationNonce(void) +{ + Assert(ControlFile != NULL); + return ControlFile->mock_authentication_nonce; +} + +/* + * Are checksums enabled for data pages? + */ +bool +DataChecksumsEnabled(void) +{ + Assert(ControlFile != NULL); + return (ControlFile->data_checksum_version > 0); +} + +/* + * Returns a fake LSN for unlogged relations. + * + * Each call generates an LSN that is greater than any previous value + * returned. The current counter value is saved and restored across clean + * shutdowns, but like unlogged relations, does not survive a crash. This can + * be used in lieu of real LSN values returned by XLogInsert, if you need an + * LSN-like increasing sequence of numbers without writing any WAL. 
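+ *
+ * (One such caller is GiST: gistGetFakeLSN() uses this for unlogged
+ * indexes, so the page LSN/NSN comparisons used to detect concurrent
+ * page splits keep working without writing any WAL.)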
+ */ +XLogRecPtr +GetFakeLSNForUnloggedRel(void) +{ + XLogRecPtr nextUnloggedLSN; + + /* increment the unloggedLSN counter, need SpinLock */ + SpinLockAcquire(&XLogCtl->ulsn_lck); + nextUnloggedLSN = XLogCtl->unloggedLSN++; + SpinLockRelease(&XLogCtl->ulsn_lck); + + return nextUnloggedLSN; +} + +/* + * Auto-tune the number of XLOG buffers. + * + * The preferred setting for wal_buffers is about 3% of shared_buffers, with + * a maximum of one XLOG segment (there is little reason to think that more + * is helpful, at least so long as we force an fsync when switching log files) + * and a minimum of 8 blocks (which was the default value prior to PostgreSQL + * 9.1, when auto-tuning was added). + * + * This should not be called until NBuffers has received its final value. + */ +static int +XLOGChooseNumBuffers(void) +{ + int xbuffers; + + xbuffers = NBuffers / 32; + if (xbuffers > (wal_segment_size / XLOG_BLCKSZ)) + xbuffers = (wal_segment_size / XLOG_BLCKSZ); + if (xbuffers < 8) + xbuffers = 8; + return xbuffers; +} + +/* + * GUC check_hook for wal_buffers + */ +bool +check_wal_buffers(int *newval, void **extra, GucSource source) +{ + /* + * -1 indicates a request for auto-tune. + */ + if (*newval == -1) + { + /* + * If we haven't yet changed the boot_val default of -1, just let it + * be. We'll fix it when XLOGShmemSize is called. + */ + if (XLOGbuffers == -1) + return true; + + /* Otherwise, substitute the auto-tune value */ + *newval = XLOGChooseNumBuffers(); + } + + /* + * We clamp manually-set values to at least 4 blocks. Prior to PostgreSQL + * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer + * the case, we just silently treat such values as a request for the + * minimum. (We could throw an error instead, but that doesn't seem very + * helpful.) + */ + if (*newval < 4) + *newval = 4; + + return true; +} + +/* + * Read the control file, set respective GUCs. + * + * This is to be called during startup, including a crash recovery cycle, + * unless in bootstrap mode, where no control file yet exists. As there's no + * usable shared memory yet (its sizing can depend on the contents of the + * control file!), first store the contents in local memory. XLOGShmemInit() + * will then copy it to shared memory later. + * + * reset just controls whether previous contents are to be expected (in the + * reset case, there's a dangling pointer into old shared memory), or not. + */ +void +LocalProcessControlFile(bool reset) +{ + Assert(reset || ControlFile == NULL); + ControlFile = palloc(sizeof(ControlFileData)); + ReadControlFile(); +} + +/* + * Initialization of shared memory for XLOG + */ +Size +XLOGShmemSize(void) +{ + Size size; + + /* + * If the value of wal_buffers is -1, use the preferred auto-tune value. + * This isn't an amazingly clean place to do this, but we must wait till + * NBuffers has received its final value, and must do it before using the + * value of XLOGbuffers to do anything important. + * + * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT. + * However, if the DBA explicitly set wal_buffers = -1 in the config file, + * then PGC_S_DYNAMIC_DEFAULT will fail to override that and we must force + * the matter with PGC_S_OVERRIDE. + */ + if (XLOGbuffers == -1) + { + char buf[32]; + + snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers()); + SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, + PGC_S_DYNAMIC_DEFAULT); + if (XLOGbuffers == -1) /* failed to apply it? 
*/ + SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, + PGC_S_OVERRIDE); + } + Assert(XLOGbuffers > 0); + + /* XLogCtl */ + size = sizeof(XLogCtlData); + + /* WAL insertion locks, plus alignment */ + size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1)); + /* xlblocks array */ + size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers)); + /* extra alignment padding for XLOG I/O buffers */ + size = add_size(size, XLOG_BLCKSZ); + /* and the buffers themselves */ + size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers)); + + /* + * Note: we don't count ControlFileData, it comes out of the "slop factor" + * added by CreateSharedMemoryAndSemaphores. This lets us use this + * routine again below to compute the actual allocation size. + */ + + return size; +} + +void +XLOGShmemInit(void) +{ + bool foundCFile, + foundXLog; + char *allocptr; + int i; + ControlFileData *localControlFile; + +#ifdef WAL_DEBUG + + /* + * Create a memory context for WAL debugging that's exempt from the normal + * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if + * an allocation fails, but wal_debug is not for production use anyway. + */ + if (walDebugCxt == NULL) + { + walDebugCxt = AllocSetContextCreate(TopMemoryContext, + "WAL Debug", + ALLOCSET_DEFAULT_SIZES); + MemoryContextAllowInCriticalSection(walDebugCxt, true); + } +#endif + + + XLogCtl = (XLogCtlData *) + ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog); + + localControlFile = ControlFile; + ControlFile = (ControlFileData *) + ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile); + + if (foundCFile || foundXLog) + { + /* both should be present or neither */ + Assert(foundCFile && foundXLog); + + /* Initialize local copy of WALInsertLocks */ + WALInsertLocks = XLogCtl->Insert.WALInsertLocks; + + if (localControlFile) + pfree(localControlFile); + return; + } + memset(XLogCtl, 0, sizeof(XLogCtlData)); + + /* + * Already have read control file locally, unless in bootstrap mode. Move + * contents into shared memory. + */ + if (localControlFile) + { + memcpy(ControlFile, localControlFile, sizeof(ControlFileData)); + pfree(localControlFile); + } + + /* + * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a + * multiple of the alignment for same, so no extra alignment padding is + * needed here. + */ + allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData); + XLogCtl->xlblocks = (XLogRecPtr *) allocptr; + memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers); + allocptr += sizeof(XLogRecPtr) * XLOGbuffers; + + + /* WAL insertion locks. Ensure they're aligned to the full padded size */ + allocptr += sizeof(WALInsertLockPadded) - + ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded); + WALInsertLocks = XLogCtl->Insert.WALInsertLocks = + (WALInsertLockPadded *) allocptr; + allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS; + + for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) + { + LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT); + WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr; + WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr; + } + + /* + * Align the start of the page buffers to a full xlog block size boundary. + * This simplifies some calculations in XLOG insertion. It is also + * required for O_DIRECT. + */ + allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr); + XLogCtl->pages = allocptr; + memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers); + + /* + * Do basic initialization of XLogCtl shared data. 
(StartupXLOG will fill + * in additional info.) + */ + XLogCtl->XLogCacheBlck = XLOGbuffers - 1; + XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH; + XLogCtl->InstallXLogFileSegmentActive = false; + XLogCtl->WalWriterSleeping = false; + + SpinLockInit(&XLogCtl->Insert.insertpos_lck); + SpinLockInit(&XLogCtl->info_lck); + SpinLockInit(&XLogCtl->ulsn_lck); +} + +/* + * This func must be called ONCE on system install. It creates pg_control + * and the initial XLOG segment. + */ +void +BootStrapXLOG(void) +{ + CheckPoint checkPoint; + char *buffer; + XLogPageHeader page; + XLogLongPageHeader longpage; + XLogRecord *record; + char *recptr; + uint64 sysidentifier; + struct timeval tv; + pg_crc32c crc; + + /* allow ordinary WAL segment creation, like StartupXLOG() would */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + XLogCtl->InstallXLogFileSegmentActive = true; + LWLockRelease(ControlFileLock); + + /* + * Select a hopefully-unique system identifier code for this installation. + * We use the result of gettimeofday(), including the fractional seconds + * field, as being about as unique as we can easily get. (Think not to + * use random(), since it hasn't been seeded and there's no portable way + * to seed it other than the system clock value...) The upper half of the + * uint64 value is just the tv_sec part, while the lower half contains the + * tv_usec part (which must fit in 20 bits), plus 12 bits from our current + * PID for a little extra uniqueness. A person knowing this encoding can + * determine the initialization time of the installation, which could + * perhaps be useful sometimes. + */ + gettimeofday(&tv, NULL); + sysidentifier = ((uint64) tv.tv_sec) << 32; + sysidentifier |= ((uint64) tv.tv_usec) << 12; + sysidentifier |= getpid() & 0xFFF; + + /* page buffer must be aligned suitably for O_DIRECT */ + buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ); + page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer); + memset(page, 0, XLOG_BLCKSZ); + + /* + * Set up information for the initial checkpoint record + * + * The initial checkpoint record is written to the beginning of the WAL + * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not + * used, so that we can use 0/0 to mean "before any valid WAL segment". 
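+ *
+ * (For illustration, assuming the default 16 MB wal_segment_size and an
+ * 8-byte MAXALIGN, SizeOfXLogLongPHD is 40 bytes and the redo pointer
+ * computed just below is 0x1000000 + 0x28 = 0x1000028, i.e. the very
+ * first record in segment 000000010000000000000001.)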
+ */ + checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD; + checkPoint.ThisTimeLineID = BootstrapTimeLineID; + checkPoint.PrevTimeLineID = BootstrapTimeLineID; + checkPoint.fullPageWrites = fullPageWrites; + checkPoint.nextXid = + FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId); + checkPoint.nextOid = FirstGenbkiObjectId; + checkPoint.nextMulti = FirstMultiXactId; + checkPoint.nextMultiOffset = 0; + checkPoint.oldestXid = FirstNormalTransactionId; + checkPoint.oldestXidDB = Template1DbOid; + checkPoint.oldestMulti = FirstMultiXactId; + checkPoint.oldestMultiDB = Template1DbOid; + checkPoint.oldestCommitTsXid = InvalidTransactionId; + checkPoint.newestCommitTsXid = InvalidTransactionId; + checkPoint.time = (pg_time_t) time(NULL); + checkPoint.oldestActiveXid = InvalidTransactionId; + + ShmemVariableCache->nextXid = checkPoint.nextXid; + ShmemVariableCache->nextOid = checkPoint.nextOid; + ShmemVariableCache->oidCount = 0; + MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); + AdvanceOldestClogXid(checkPoint.oldestXid); + SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); + SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); + SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId); + + /* Set up the XLOG page header */ + page->xlp_magic = XLOG_PAGE_MAGIC; + page->xlp_info = XLP_LONG_HEADER; + page->xlp_tli = BootstrapTimeLineID; + page->xlp_pageaddr = wal_segment_size; + longpage = (XLogLongPageHeader) page; + longpage->xlp_sysid = sysidentifier; + longpage->xlp_seg_size = wal_segment_size; + longpage->xlp_xlog_blcksz = XLOG_BLCKSZ; + + /* Insert the initial checkpoint record */ + recptr = ((char *) page + SizeOfXLogLongPHD); + record = (XLogRecord *) recptr; + record->xl_prev = 0; + record->xl_xid = InvalidTransactionId; + record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint); + record->xl_info = XLOG_CHECKPOINT_SHUTDOWN; + record->xl_rmid = RM_XLOG_ID; + recptr += SizeOfXLogRecord; + /* fill the XLogRecordDataHeaderShort struct */ + *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT; + *(recptr++) = sizeof(checkPoint); + memcpy(recptr, &checkPoint, sizeof(checkPoint)); + recptr += sizeof(checkPoint); + Assert(recptr - (char *) record == record->xl_tot_len); + + INIT_CRC32C(crc); + COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord); + COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc)); + FIN_CRC32C(crc); + record->xl_crc = crc; + + /* Create first XLOG segment file */ + openLogTLI = BootstrapTimeLineID; + openLogFile = XLogFileInit(1, BootstrapTimeLineID); + + /* + * We needn't bother with Reserve/ReleaseExternalFD here, since we'll + * close the file again in a moment. 
+ */ + + /* Write the first page with the initial record */ + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE); + if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write bootstrap write-ahead log file: %m"))); + } + pgstat_report_wait_end(); + + pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC); + if (pg_fsync(openLogFile) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not fsync bootstrap write-ahead log file: %m"))); + pgstat_report_wait_end(); + + if (close(openLogFile) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close bootstrap write-ahead log file: %m"))); + + openLogFile = -1; + + /* Now create pg_control */ + InitControlFile(sysidentifier); + ControlFile->time = checkPoint.time; + ControlFile->checkPoint = checkPoint.redo; + ControlFile->checkPointCopy = checkPoint; + + /* some additional ControlFile fields are set in WriteControlFile() */ + WriteControlFile(); + + /* Bootstrap the commit log, too */ + BootStrapCLOG(); + BootStrapCommitTs(); + BootStrapSUBTRANS(); + BootStrapMultiXact(); + + pfree(buffer); + + /* + * Force control file to be read - in contrast to normal processing we'd + * otherwise never run the checks and GUC related initializations therein. + */ + ReadControlFile(); +} + +static char * +str_time(pg_time_t tnow) +{ + static char buf[128]; + + pg_strftime(buf, sizeof(buf), + "%Y-%m-%d %H:%M:%S %Z", + pg_localtime(&tnow, log_timezone)); + + return buf; +} + +/* + * Initialize the first WAL segment on new timeline. + */ +static void +XLogInitNewTimeline(TimeLineID endTLI, XLogRecPtr endOfLog, TimeLineID newTLI) +{ + char xlogfname[MAXFNAMELEN]; + XLogSegNo endLogSegNo; + XLogSegNo startLogSegNo; + + /* we always switch to a new timeline after archive recovery */ + Assert(endTLI != newTLI); + + /* + * Update min recovery point one last time. + */ + UpdateMinRecoveryPoint(InvalidXLogRecPtr, true); + + /* + * Calculate the last segment on the old timeline, and the first segment + * on the new timeline. If the switch happens in the middle of a segment, + * they are the same, but if the switch happens exactly at a segment + * boundary, startLogSegNo will be endLogSegNo + 1. + */ + XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size); + XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size); + + /* + * Initialize the starting WAL segment for the new timeline. If the switch + * happens in the middle of a segment, copy data from the last WAL segment + * of the old timeline up to the switch point, to the starting WAL segment + * on the new timeline. + */ + if (endLogSegNo == startLogSegNo) + { + /* + * Make a copy of the file on the new timeline. + * + * Writing WAL isn't allowed yet, so there are no locking + * considerations. But we should be just as tense as XLogFileInit to + * avoid emplacing a bogus file. + */ + XLogFileCopy(newTLI, endLogSegNo, endTLI, endLogSegNo, + XLogSegmentOffset(endOfLog, wal_segment_size)); + } + else + { + /* + * The switch happened at a segment boundary, so just create the next + * segment on the new timeline. 
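+ *
+ * (For illustration, with 16 MB segments: if endOfLog is exactly
+ * 0/3000000, XLByteToPrevSeg above gave endLogSegNo = 2 while
+ * XLByteToSeg gave startLogSegNo = 3, so this branch simply creates
+ * segment 3 on the new timeline instead of copying anything.)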
+ */ + int fd; + + fd = XLogFileInit(startLogSegNo, newTLI); + + if (close(fd) != 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno = errno; + + XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", xlogfname))); + } + } + + /* + * Let's just make real sure there are not .ready or .done flags posted + * for the new segment. + */ + XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size); + XLogArchiveCleanup(xlogfname); +} + +/* + * Perform cleanup actions at the conclusion of archive recovery. + */ +static void +CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog, + TimeLineID newTLI) +{ + /* + * Execute the recovery_end_command, if any. + */ + if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0) + ExecuteRecoveryCommand(recoveryEndCommand, + "recovery_end_command", + true, + WAIT_EVENT_RECOVERY_END_COMMAND); + + /* + * We switched to a new timeline. Clean up segments on the old timeline. + * + * If there are any higher-numbered segments on the old timeline, remove + * them. They might contain valid WAL, but they might also be + * pre-allocated files containing garbage. In any case, they are not part + * of the new timeline's history so we don't need them. + */ + RemoveNonParentXlogFiles(EndOfLog, newTLI); + + /* + * If the switch happened in the middle of a segment, what to do with the + * last, partial segment on the old timeline? If we don't archive it, and + * the server that created the WAL never archives it either (e.g. because + * it was hit by a meteor), it will never make it to the archive. That's + * OK from our point of view, because the new segment that we created with + * the new TLI contains all the WAL from the old timeline up to the switch + * point. But if you later try to do PITR to the "missing" WAL on the old + * timeline, recovery won't find it in the archive. It's physically + * present in the new file with new TLI, but recovery won't look there + * when it's recovering to the older timeline. On the other hand, if we + * archive the partial segment, and the original server on that timeline + * is still running and archives the completed version of the same segment + * later, it will fail. (We used to do that in 9.4 and below, and it + * caused such problems). + * + * As a compromise, we rename the last segment with the .partial suffix, + * and archive it. Archive recovery will never try to read .partial + * segments, so they will normally go unused. But in the odd PITR case, + * the administrator can copy them manually to the pg_wal directory + * (removing the suffix). They can be useful in debugging, too. + * + * If a .done or .ready file already exists for the old timeline, however, + * we had already determined that the segment is complete, so we can let + * it be archived normally. (In particular, if it was restored from the + * archive to begin with, it's expected to have a .done file). 
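+ *
+ * (For illustration, with 16 MB segments: if recovery on timeline 1
+ * ended mid-segment at 0/15038A0 and we switched to timeline 2, new WAL
+ * goes to 000000020000000000000001, while the old
+ * 000000010000000000000001 is renamed below to
+ * 000000010000000000000001.partial and handed to the archiver.)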
+ */ + if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 && + XLogArchivingActive()) + { + char origfname[MAXFNAMELEN]; + XLogSegNo endLogSegNo; + + XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size); + XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size); + + if (!XLogArchiveIsReadyOrDone(origfname)) + { + char origpath[MAXPGPATH]; + char partialfname[MAXFNAMELEN]; + char partialpath[MAXPGPATH]; + + XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size); + snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname); + snprintf(partialpath, MAXPGPATH, "%s.partial", origpath); + + /* + * Make sure there's no .done or .ready file for the .partial + * file. + */ + XLogArchiveCleanup(partialfname); + + durable_rename(origpath, partialpath, ERROR); + XLogArchiveNotify(partialfname); + } + } +} + +/* + * Check to see if required parameters are set high enough on this server + * for various aspects of recovery operation. + * + * Note that all the parameters which this function tests need to be + * listed in Administrator's Overview section in high-availability.sgml. + * If you change them, don't forget to update the list. + */ +static void +CheckRequiredParameterValues(void) +{ + /* + * For archive recovery, the WAL must be generated with at least 'replica' + * wal_level. + */ + if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL) + { + ereport(FATAL, + (errmsg("WAL was generated with wal_level=minimal, cannot continue recovering"), + errdetail("This happens if you temporarily set wal_level=minimal on the server."), + errhint("Use a backup taken after setting wal_level to higher than minimal."))); + } + + /* + * For Hot Standby, the WAL must be generated with 'replica' mode, and we + * must have at least as many backend slots as the primary. + */ + if (ArchiveRecoveryRequested && EnableHotStandby) + { + /* We ignore autovacuum_max_workers when we make this test. */ + RecoveryRequiresIntParameter("max_connections", + MaxConnections, + ControlFile->MaxConnections); + RecoveryRequiresIntParameter("max_worker_processes", + max_worker_processes, + ControlFile->max_worker_processes); + RecoveryRequiresIntParameter("max_wal_senders", + max_wal_senders, + ControlFile->max_wal_senders); + RecoveryRequiresIntParameter("max_prepared_transactions", + max_prepared_xacts, + ControlFile->max_prepared_xacts); + RecoveryRequiresIntParameter("max_locks_per_transaction", + max_locks_per_xact, + ControlFile->max_locks_per_xact); + } +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup + */ +void +StartupXLOG(void) +{ + XLogCtlInsert *Insert; + CheckPoint checkPoint; + bool wasShutdown; + bool didCrash; + bool haveTblspcMap; + bool haveBackupLabel; + XLogRecPtr EndOfLog; + TimeLineID EndOfLogTLI; + TimeLineID newTLI; + bool performedWalRecovery; + EndOfWalRecoveryInfo *endOfRecoveryInfo; + XLogRecPtr abortedRecPtr; + XLogRecPtr missingContrecPtr; + TransactionId oldestActiveXID; + bool promoted = false; + + /* + * We should have an aux process resource owner to use, and we should not + * be in a transaction that's installed some other resowner. + */ + Assert(AuxProcessResourceOwner != NULL); + Assert(CurrentResourceOwner == NULL || + CurrentResourceOwner == AuxProcessResourceOwner); + CurrentResourceOwner = AuxProcessResourceOwner; + + /* + * Check that contents look valid. 
+ */ + if (!XRecOffIsValid(ControlFile->checkPoint)) + ereport(FATAL, + (errmsg("control file contains invalid checkpoint location"))); + + switch (ControlFile->state) + { + case DB_SHUTDOWNED: + + /* + * This is the expected case, so don't be chatty in standalone + * mode + */ + ereport(IsPostmasterEnvironment ? LOG : NOTICE, + (errmsg("database system was shut down at %s", + str_time(ControlFile->time)))); + break; + + case DB_SHUTDOWNED_IN_RECOVERY: + ereport(LOG, + (errmsg("database system was shut down in recovery at %s", + str_time(ControlFile->time)))); + break; + + case DB_SHUTDOWNING: + ereport(LOG, + (errmsg("database system shutdown was interrupted; last known up at %s", + str_time(ControlFile->time)))); + break; + + case DB_IN_CRASH_RECOVERY: + ereport(LOG, + (errmsg("database system was interrupted while in recovery at %s", + str_time(ControlFile->time)), + errhint("This probably means that some data is corrupted and" + " you will have to use the last backup for recovery."))); + break; + + case DB_IN_ARCHIVE_RECOVERY: + ereport(LOG, + (errmsg("database system was interrupted while in recovery at log time %s", + str_time(ControlFile->checkPointCopy.time)), + errhint("If this has occurred more than once some data might be corrupted" + " and you might need to choose an earlier recovery target."))); + break; + + case DB_IN_PRODUCTION: + ereport(LOG, + (errmsg("database system was interrupted; last known up at %s", + str_time(ControlFile->time)))); + break; + + default: + ereport(FATAL, + (errmsg("control file contains invalid database cluster state"))); + } + + /* This is just to allow attaching to startup process with a debugger */ +#ifdef XLOG_REPLAY_DELAY + if (ControlFile->state != DB_SHUTDOWNED) + pg_usleep(60000000L); +#endif + + /* + * Verify that pg_wal and pg_wal/archive_status exist. In cases where + * someone has performed a copy for PITR, these directories may have been + * excluded and need to be re-created. + */ + ValidateXLOGDirectoryStructure(); + + /* Set up timeout handler needed to report startup progress. */ + if (!IsBootstrapProcessingMode()) + RegisterTimeout(STARTUP_PROGRESS_TIMEOUT, + startup_progress_timeout_handler); + + /*---------- + * If we previously crashed, perform a couple of actions: + * + * - The pg_wal directory may still include some temporary WAL segments + * used when creating a new segment, so perform some clean up to not + * bloat this path. This is done first as there is no point to sync + * this temporary data. + * + * - There might be data which we had written, intending to fsync it, but + * which we had not actually fsync'd yet. Therefore, a power failure in + * the near future might cause earlier unflushed writes to be lost, even + * though more recent data written to disk from here on would be + * persisted. To avoid that, fsync the entire data directory. + */ + if (ControlFile->state != DB_SHUTDOWNED && + ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY) + { + RemoveTempXlogFiles(); + SyncDataDirectory(); + didCrash = true; + } + else + didCrash = false; + + /* + * Prepare for WAL recovery if needed. + * + * InitWalRecovery analyzes the control file and the backup label file, if + * any. It updates the in-memory ControlFile buffer according to the + * starting checkpoint, and sets InRecovery and ArchiveRecoveryRequested. + * It also applies the tablespace map file, if any. 
+ */ + InitWalRecovery(ControlFile, &wasShutdown, + &haveBackupLabel, &haveTblspcMap); + checkPoint = ControlFile->checkPointCopy; + + /* initialize shared memory variables from the checkpoint record */ + ShmemVariableCache->nextXid = checkPoint.nextXid; + ShmemVariableCache->nextOid = checkPoint.nextOid; + ShmemVariableCache->oidCount = 0; + MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); + AdvanceOldestClogXid(checkPoint.oldestXid); + SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); + SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); + SetCommitTsLimit(checkPoint.oldestCommitTsXid, + checkPoint.newestCommitTsXid); + XLogCtl->ckptFullXid = checkPoint.nextXid; + + /* + * Clear out any old relcache cache files. This is *necessary* if we do + * any WAL replay, since that would probably result in the cache files + * being out of sync with database reality. In theory we could leave them + * in place if the database had been cleanly shut down, but it seems + * safest to just remove them always and let them be rebuilt during the + * first backend startup. These files needs to be removed from all + * directories including pg_tblspc, however the symlinks are created only + * after reading tablespace_map file in case of archive recovery from + * backup, so needs to clear old relcache files here after creating + * symlinks. + */ + RelationCacheInitFileRemove(); + + /* + * Initialize replication slots, before there's a chance to remove + * required resources. + */ + StartupReplicationSlots(); + + /* + * Startup logical state, needs to be setup now so we have proper data + * during crash recovery. + */ + StartupReorderBuffer(); + + /* + * Startup CLOG. This must be done after ShmemVariableCache->nextXid has + * been initialized and before we accept connections or begin WAL replay. + */ + StartupCLOG(); + + /* + * Startup MultiXact. We need to do this early to be able to replay + * truncations. + */ + StartupMultiXact(); + + /* + * Ditto for commit timestamps. Activate the facility if the setting is + * enabled in the control file, as there should be no tracking of commit + * timestamps done when the setting was disabled. This facility can be + * started or stopped when replaying a XLOG_PARAMETER_CHANGE record. + */ + if (ControlFile->track_commit_timestamp) + StartupCommitTs(); + + /* + * Recover knowledge about replay progress of known replication partners. + */ + StartupReplicationOrigin(); + + /* + * Initialize unlogged LSN. On a clean shutdown, it's restored from the + * control file. On recovery, all unlogged relations are blown away, so + * the unlogged LSN counter can be reset too. + */ + if (ControlFile->state == DB_SHUTDOWNED) + XLogCtl->unloggedLSN = ControlFile->unloggedLSN; + else + XLogCtl->unloggedLSN = FirstNormalUnloggedLSN; + + /* + * Copy any missing timeline history files between 'now' and the recovery + * target timeline from archive to pg_wal. While we don't need those files + * ourselves - the history file of the recovery target timeline covers all + * the previous timelines in the history too - a cascading standby server + * might be interested in them. Or, if you archive the WAL from this + * server to a different archive than the primary, it'd be good for all + * the history files to get archived there after failover, so that you can + * use one of the old timelines as a PITR target. Timeline history files + * are small, so it's better to copy them unnecessarily than not copy them + * and regret later. 
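+ *
+ * (Timeline history files are small text files named like
+ * 00000002.history, so copying a few of them unnecessarily costs almost
+ * nothing.)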
+ */ + restoreTimeLineHistoryFiles(checkPoint.ThisTimeLineID, recoveryTargetTLI); + + /* + * Before running in recovery, scan pg_twophase and fill in its status to + * be able to work on entries generated by redo. Doing a scan before + * taking any recovery action has the merit to discard any 2PC files that + * are newer than the first record to replay, saving from any conflicts at + * replay. This avoids as well any subsequent scans when doing recovery + * of the on-disk two-phase data. + */ + restoreTwoPhaseData(); + + /* + * When starting with crash recovery, reset pgstat data - it might not be + * valid. Otherwise restore pgstat data. It's safe to do this here, + * because postmaster will not yet have started any other processes. + * + * NB: Restoring replication slot stats relies on slot state to have + * already been restored from disk. + * + * TODO: With a bit of extra work we could just start with a pgstat file + * associated with the checkpoint redo location we're starting from. + */ + if (didCrash) + pgstat_discard_stats(); + else + pgstat_restore_stats(); + + lastFullPageWrites = checkPoint.fullPageWrites; + + RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; + doPageWrites = lastFullPageWrites; + + /* REDO */ + if (InRecovery) + { + /* Initialize state for RecoveryInProgress() */ + SpinLockAcquire(&XLogCtl->info_lck); + if (InArchiveRecovery) + XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE; + else + XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * Update pg_control to show that we are recovering and to show the + * selected checkpoint as the place we are starting from. We also mark + * pg_control with any minimum recovery stop point obtained from a + * backup history file. + * + * No need to hold ControlFileLock yet, we aren't up far enough. + */ + UpdateControlFile(); + + /* + * If there was a backup label file, it's done its job and the info + * has now been propagated into pg_control. We must get rid of the + * label file so that if we crash during recovery, we'll pick up at + * the latest recovery restartpoint instead of going all the way back + * to the backup start point. It seems prudent though to just rename + * the file out of the way rather than delete it completely. + */ + if (haveBackupLabel) + { + unlink(BACKUP_LABEL_OLD); + durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL); + } + + /* + * If there was a tablespace_map file, it's done its job and the + * symlinks have been created. We must get rid of the map file so + * that if we crash during recovery, we don't create symlinks again. + * It seems prudent though to just rename the file out of the way + * rather than delete it completely. + */ + if (haveTblspcMap) + { + unlink(TABLESPACE_MAP_OLD); + durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL); + } + + /* + * Initialize our local copy of minRecoveryPoint. When doing crash + * recovery we want to replay up to the end of WAL. Particularly, in + * the case of a promoted standby minRecoveryPoint value in the + * control file is only updated after the first checkpoint. However, + * if the instance crashes before the first post-recovery checkpoint + * is completed then recovery will use a stale location causing the + * startup process to think that there are still invalid page + * references when checking for data consistency. 
+ */ + if (InArchiveRecovery) + { + LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; + LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + } + else + { + LocalMinRecoveryPoint = InvalidXLogRecPtr; + LocalMinRecoveryPointTLI = 0; + } + + /* Check that the GUCs used to generate the WAL allow recovery */ + CheckRequiredParameterValues(); + + /* + * We're in recovery, so unlogged relations may be trashed and must be + * reset. This should be done BEFORE allowing Hot Standby + * connections, so that read-only backends don't try to read whatever + * garbage is left over from before. + */ + ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP); + + /* + * Likewise, delete any saved transaction snapshot files that got left + * behind by crashed backends. + */ + DeleteAllExportedSnapshotFiles(); + + /* + * Initialize for Hot Standby, if enabled. We won't let backends in + * yet, not until we've reached the min recovery point specified in + * control file and we've established a recovery snapshot from a + * running-xacts WAL record. + */ + if (ArchiveRecoveryRequested && EnableHotStandby) + { + TransactionId *xids; + int nxids; + + ereport(DEBUG1, + (errmsg_internal("initializing for hot standby"))); + + InitRecoveryTransactionEnvironment(); + + if (wasShutdown) + oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids); + else + oldestActiveXID = checkPoint.oldestActiveXid; + Assert(TransactionIdIsValid(oldestActiveXID)); + + /* Tell procarray about the range of xids it has to deal with */ + ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextXid)); + + /* + * Startup subtrans only. CLOG, MultiXact and commit timestamp + * have already been started up and other SLRUs are not maintained + * during recovery and need not be started yet. + */ + StartupSUBTRANS(oldestActiveXID); + + /* + * If we're beginning at a shutdown checkpoint, we know that + * nothing was running on the primary at this point. So fake-up an + * empty running-xacts record and use that here and now. Recover + * additional standby state for prepared transactions. + */ + if (wasShutdown) + { + RunningTransactionsData running; + TransactionId latestCompletedXid; + + /* + * Construct a RunningTransactions snapshot representing a + * shut down server, with only prepared transactions still + * alive. We're never overflowed at this point because all + * subxids are listed with their parent prepared transactions. + */ + running.xcnt = nxids; + running.subxcnt = 0; + running.subxid_overflow = false; + running.nextXid = XidFromFullTransactionId(checkPoint.nextXid); + running.oldestRunningXid = oldestActiveXID; + latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid); + TransactionIdRetreat(latestCompletedXid); + Assert(TransactionIdIsNormal(latestCompletedXid)); + running.latestCompletedXid = latestCompletedXid; + running.xids = xids; + + ProcArrayApplyRecoveryInfo(&running); + + StandbyRecoverPreparedTransactions(); + } + } + + /* + * We're all set for replaying the WAL now. Do it. + */ + PerformWalRecovery(); + performedWalRecovery = true; + } + else + performedWalRecovery = false; + + /* + * Finish WAL recovery. + */ + endOfRecoveryInfo = FinishWalRecovery(); + EndOfLog = endOfRecoveryInfo->endOfLog; + EndOfLogTLI = endOfRecoveryInfo->endOfLogTLI; + abortedRecPtr = endOfRecoveryInfo->abortedRecPtr; + missingContrecPtr = endOfRecoveryInfo->missingContrecPtr; + + /* + * Reset ps status display, so as no information related to recovery + * shows up. 
+ */ + set_ps_display(""); + + /* + * When recovering from a backup (we are in recovery, and archive recovery + * was requested), complain if we did not roll forward far enough to reach + * the point where the database is consistent. For regular online + * backup-from-primary, that means reaching the end-of-backup WAL record + * (at which point we reset backupStartPoint to be Invalid), for + * backup-from-replica (which can't inject records into the WAL stream), + * that point is when we reach the minRecoveryPoint in pg_control (which + * we purposefully copy last when backing up from a replica). For + * pg_rewind (which creates a backup_label with a method of "pg_rewind") + * or snapshot-style backups (which don't), backupEndRequired will be set + * to false. + * + * Note: it is indeed okay to look at the local variable + * LocalMinRecoveryPoint here, even though ControlFile->minRecoveryPoint + * might be further ahead --- ControlFile->minRecoveryPoint cannot have + * been advanced beyond the WAL we processed. + */ + if (InRecovery && + (EndOfLog < LocalMinRecoveryPoint || + !XLogRecPtrIsInvalid(ControlFile->backupStartPoint))) + { + /* + * Ran off end of WAL before reaching end-of-backup WAL record, or + * minRecoveryPoint. That's a bad sign, indicating that you tried to + * recover from an online backup but never called pg_backup_stop(), or + * you didn't archive all the WAL needed. + */ + if (ArchiveRecoveryRequested || ControlFile->backupEndRequired) + { + if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) || ControlFile->backupEndRequired) + ereport(FATAL, + (errmsg("WAL ends before end of online backup"), + errhint("All WAL generated while online backup was taken must be available at recovery."))); + else + ereport(FATAL, + (errmsg("WAL ends before consistent recovery point"))); + } + } + + /* + * Reset unlogged relations to the contents of their INIT fork. This is + * done AFTER recovery is complete so as to include any unlogged relations + * created during recovery, but BEFORE recovery is marked as having + * completed successfully. Otherwise we'd not retry if any of the post + * end-of-recovery steps fail. + */ + if (InRecovery) + ResetUnloggedRelations(UNLOGGED_RELATION_INIT); + + /* + * Pre-scan prepared transactions to find out the range of XIDs present. + * This information is not quite needed yet, but it is positioned here so + * as potential problems are detected before any on-disk change is done. + */ + oldestActiveXID = PrescanPreparedTransactions(NULL, NULL); + + /* + * Allow ordinary WAL segment creation before possibly switching to a new + * timeline, which creates a new segment, and after the last ReadRecord(). + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + XLogCtl->InstallXLogFileSegmentActive = true; + LWLockRelease(ControlFileLock); + + /* + * Consider whether we need to assign a new timeline ID. + * + * If we did archive recovery, we always assign a new ID. This handles a + * couple of issues. If we stopped short of the end of WAL during + * recovery, then we are clearly generating a new timeline and must assign + * it a unique new ID. Even if we ran to the end, modifying the current + * last segment is problematic because it may result in trying to + * overwrite an already-archived copy of that segment, and we encourage + * DBAs to make their archive_commands reject that. We can dodge the + * problem by making the new active segment have a new timeline ID. + * + * In a normal crash recovery, we can just extend the timeline we were in. 
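+ *
+ * (For illustration: if recovery ran on timeline 3 but the archive
+ * already contains 00000004.history, findNewestTimeLine() below returns
+ * 4 and the new timeline becomes 5, an ID that no reachable history
+ * file has used before.)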
+ */ + newTLI = endOfRecoveryInfo->lastRecTLI; + if (ArchiveRecoveryRequested) + { + newTLI = findNewestTimeLine(recoveryTargetTLI) + 1; + ereport(LOG, + (errmsg("selected new timeline ID: %u", newTLI))); + + /* + * Make a writable copy of the last WAL segment. (Note that we also + * have a copy of the last block of the old WAL in + * endOfRecovery->lastPage; we will use that below.) + */ + XLogInitNewTimeline(EndOfLogTLI, EndOfLog, newTLI); + + /* + * Remove the signal files out of the way, so that we don't + * accidentally re-enter archive recovery mode in a subsequent crash. + */ + if (endOfRecoveryInfo->standby_signal_file_found) + durable_unlink(STANDBY_SIGNAL_FILE, FATAL); + + if (endOfRecoveryInfo->recovery_signal_file_found) + durable_unlink(RECOVERY_SIGNAL_FILE, FATAL); + + /* + * Write the timeline history file, and have it archived. After this + * point (or rather, as soon as the file is archived), the timeline + * will appear as "taken" in the WAL archive and to any standby + * servers. If we crash before actually switching to the new + * timeline, standby servers will nevertheless think that we switched + * to the new timeline, and will try to connect to the new timeline. + * To minimize the window for that, try to do as little as possible + * between here and writing the end-of-recovery record. + */ + writeTimeLineHistory(newTLI, recoveryTargetTLI, + EndOfLog, endOfRecoveryInfo->recoveryStopReason); + + ereport(LOG, + (errmsg("archive recovery complete"))); + } + + /* Save the selected TimeLineID in shared memory, too */ + XLogCtl->InsertTimeLineID = newTLI; + XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI; + + /* + * Actually, if WAL ended in an incomplete record, skip the parts that + * made it through and start writing after the portion that persisted. + * (It's critical to first write an OVERWRITE_CONTRECORD message, which + * we'll do as soon as we're open for writing new WAL.) + */ + if (!XLogRecPtrIsInvalid(missingContrecPtr)) + { + /* + * We should only have a missingContrecPtr if we're not switching to + * a new timeline. When a timeline switch occurs, WAL is copied from + * the old timeline to the new only up to the end of the last complete + * record, so there can't be an incomplete WAL record that we need to + * disregard. + */ + Assert(newTLI == endOfRecoveryInfo->lastRecTLI); + Assert(!XLogRecPtrIsInvalid(abortedRecPtr)); + EndOfLog = missingContrecPtr; + } + + /* + * Prepare to write WAL starting at EndOfLog location, and init xlog + * buffer cache using the block containing the last record from the + * previous incarnation. + */ + Insert = &XLogCtl->Insert; + Insert->PrevBytePos = XLogRecPtrToBytePos(endOfRecoveryInfo->lastRec); + Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog); + + /* + * Tricky point here: lastPage contains the *last* block that the LastRec + * record spans, not the one it starts in. The last block is indeed the + * one we want to use. 
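+ *
+ * (For illustration, with 8 kB WAL blocks: if EndOfLog is 0/3000120,
+ * lastPageBeginPtr is 0/3000000, so the first 0x120 bytes of that block
+ * are copied into the shared WAL buffer below and the remaining
+ * 0x2000 - 0x120 bytes are zeroed.)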
+ */ + if (EndOfLog % XLOG_BLCKSZ != 0) + { + char *page; + int len; + int firstIdx; + + firstIdx = XLogRecPtrToBufIdx(EndOfLog); + len = EndOfLog - endOfRecoveryInfo->lastPageBeginPtr; + Assert(len < XLOG_BLCKSZ); + + /* Copy the valid part of the last block, and zero the rest */ + page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ]; + memcpy(page, endOfRecoveryInfo->lastPage, len); + memset(page + len, 0, XLOG_BLCKSZ - len); + + XLogCtl->xlblocks[firstIdx] = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ; + XLogCtl->InitializedUpTo = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ; + } + else + { + /* + * There is no partial block to copy. Just set InitializedUpTo, and + * let the first attempt to insert a log record to initialize the next + * buffer. + */ + XLogCtl->InitializedUpTo = EndOfLog; + } + + LogwrtResult.Write = LogwrtResult.Flush = EndOfLog; + + XLogCtl->LogwrtResult = LogwrtResult; + + XLogCtl->LogwrtRqst.Write = EndOfLog; + XLogCtl->LogwrtRqst.Flush = EndOfLog; + + /* + * Preallocate additional log files, if wanted. + */ + PreallocXlogFiles(EndOfLog, newTLI); + + /* + * Okay, we're officially UP. + */ + InRecovery = false; + + /* start the archive_timeout timer and LSN running */ + XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL); + XLogCtl->lastSegSwitchLSN = EndOfLog; + + /* also initialize latestCompletedXid, to nextXid - 1 */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid; + FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid); + LWLockRelease(ProcArrayLock); + + /* + * Start up subtrans, if not already done for hot standby. (commit + * timestamps are started below, if necessary.) + */ + if (standbyState == STANDBY_DISABLED) + StartupSUBTRANS(oldestActiveXID); + + /* + * Perform end of recovery actions for any SLRUs that need it. + */ + TrimCLOG(); + TrimMultiXact(); + + /* + * Reload shared-memory state for prepared transactions. This needs to + * happen before renaming the last partial segment of the old timeline as + * it may be possible that we have to recovery some transactions from it. + */ + RecoverPreparedTransactions(); + + /* Shut down xlogreader */ + ShutdownWalRecovery(); + + /* Enable WAL writes for this backend only. */ + LocalSetXLogInsertAllowed(); + + /* If necessary, write overwrite-contrecord before doing anything else */ + if (!XLogRecPtrIsInvalid(abortedRecPtr)) + { + Assert(!XLogRecPtrIsInvalid(missingContrecPtr)); + CreateOverwriteContrecordRecord(abortedRecPtr, missingContrecPtr, newTLI); + } + + /* + * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE + * record before resource manager writes cleanup WAL records or checkpoint + * record is written. + */ + Insert->fullPageWrites = lastFullPageWrites; + UpdateFullPageWrites(); + + /* + * Emit checkpoint or end-of-recovery record in XLOG, if required. + */ + if (performedWalRecovery) + promoted = PerformRecoveryXLogAction(); + + /* + * If any of the critical GUCs have changed, log them before we allow + * backends to write WAL. + */ + XLogReportParameters(); + + /* If this is archive recovery, perform post-recovery cleanup actions. */ + if (ArchiveRecoveryRequested) + CleanupAfterArchiveRecovery(EndOfLogTLI, EndOfLog, newTLI); + + /* + * Local WAL inserts enabled, so it's time to finish initialization of + * commit timestamp. + */ + CompleteCommitTsInitialization(); + + /* + * All done with end-of-recovery actions. 
+ * + * Now allow backends to write WAL and update the control file status in + * consequence. SharedRecoveryState, that controls if backends can write + * WAL, is updated while holding ControlFileLock to prevent other backends + * to look at an inconsistent state of the control file in shared memory. + * There is still a small window during which backends can write WAL and + * the control file is still referring to a system not in DB_IN_PRODUCTION + * state while looking at the on-disk control file. + * + * Also, we use info_lck to update SharedRecoveryState to ensure that + * there are no race conditions concerning visibility of other recent + * updates to shared memory. + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->state = DB_IN_PRODUCTION; + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE; + SpinLockRelease(&XLogCtl->info_lck); + + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + /* + * Shutdown the recovery environment. This must occur after + * RecoverPreparedTransactions() (see notes in lock_twophase_recover()) + * and after switching SharedRecoveryState to RECOVERY_STATE_DONE so as + * any session building a snapshot will not rely on KnownAssignedXids as + * RecoveryInProgress() would return false at this stage. This is + * particularly critical for prepared 2PC transactions, that would still + * need to be included in snapshots once recovery has ended. + */ + if (standbyState != STANDBY_DISABLED) + ShutdownRecoveryTransactionEnvironment(); + + /* + * If there were cascading standby servers connected to us, nudge any wal + * sender processes to notice that we've been promoted. + */ + WalSndWakeup(); + + /* + * If this was a promotion, request an (online) checkpoint now. This isn't + * required for consistency, but the last restartpoint might be far back, + * and in case of a crash, recovering from it might take a longer than is + * appropriate now that we're not in standby mode anymore. + */ + if (promoted) + RequestCheckpoint(CHECKPOINT_FORCE); +} + +/* + * Callback from PerformWalRecovery(), called when we switch from crash + * recovery to archive recovery mode. Updates the control file accordingly. + */ +void +SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI) +{ + /* initialize minRecoveryPoint to this record */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->state = DB_IN_ARCHIVE_RECOVERY; + if (ControlFile->minRecoveryPoint < EndRecPtr) + { + ControlFile->minRecoveryPoint = EndRecPtr; + ControlFile->minRecoveryPointTLI = replayTLI; + } + /* update local copy */ + LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; + LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + + /* + * The startup process can update its local copy of minRecoveryPoint from + * this point. + */ + updateMinRecoveryPoint = true; + + UpdateControlFile(); + + /* + * We update SharedRecoveryState while holding the lock on ControlFileLock + * so both states are consistent in shared memory. + */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE; + SpinLockRelease(&XLogCtl->info_lck); + + LWLockRelease(ControlFileLock); +} + +/* + * Callback from PerformWalRecovery(), called when we reach the end of backup. + * Updates the control file accordingly. + */ +void +ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli) +{ + /* + * We have reached the end of base backup, as indicated by pg_control. 
The + * data on disk is now consistent (unless minRecovery point is further + * ahead, which can happen if we crashed during previous recovery). Reset + * backupStartPoint and backupEndPoint, and update minRecoveryPoint to + * make sure we don't allow starting up at an earlier point even if + * recovery is stopped and restarted soon after this. + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + if (ControlFile->minRecoveryPoint < EndRecPtr) + { + ControlFile->minRecoveryPoint = EndRecPtr; + ControlFile->minRecoveryPointTLI = tli; + } + + ControlFile->backupStartPoint = InvalidXLogRecPtr; + ControlFile->backupEndPoint = InvalidXLogRecPtr; + ControlFile->backupEndRequired = false; + UpdateControlFile(); + + LWLockRelease(ControlFileLock); +} + +/* + * Perform whatever XLOG actions are necessary at end of REDO. + * + * The goal here is to make sure that we'll be able to recover properly if + * we crash again. If we choose to write a checkpoint, we'll write a shutdown + * checkpoint rather than an on-line one. This is not particularly critical, + * but since we may be assigning a new TLI, using a shutdown checkpoint allows + * us to have the rule that TLI only changes in shutdown checkpoints, which + * allows some extra error checking in xlog_redo. + */ +static bool +PerformRecoveryXLogAction(void) +{ + bool promoted = false; + + /* + * Perform a checkpoint to update all our recovery activity to disk. + * + * Note that we write a shutdown checkpoint rather than an on-line one. + * This is not particularly critical, but since we may be assigning a new + * TLI, using a shutdown checkpoint allows us to have the rule that TLI + * only changes in shutdown checkpoints, which allows some extra error + * checking in xlog_redo. + * + * In promotion, only create a lightweight end-of-recovery record instead + * of a full checkpoint. A checkpoint is requested later, after we're + * fully out of recovery mode and already accepting queries. + */ + if (ArchiveRecoveryRequested && IsUnderPostmaster && + PromoteIsTriggered()) + { + promoted = true; + + /* + * Insert a special WAL record to mark the end of recovery, since we + * aren't doing a checkpoint. That means that the checkpointer process + * may likely be in the middle of a time-smoothed restartpoint and + * could continue to be for minutes after this. That sounds strange, + * but the effect is roughly the same and it would be stranger to try + * to come out of the restartpoint and then checkpoint. We request a + * checkpoint later anyway, just for safety. + */ + CreateEndOfRecoveryRecord(); + } + else + { + RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY | + CHECKPOINT_IMMEDIATE | + CHECKPOINT_WAIT); + } + + return promoted; +} + +/* + * Is the system still in recovery? + * + * Unlike testing InRecovery, this works in any process that's connected to + * shared memory. + */ +bool +RecoveryInProgress(void) +{ + /* + * We check shared state each time only until we leave recovery mode. We + * can't re-enter recovery, so there's no need to keep checking after the + * shared variable has once been seen false. + */ + if (!LocalRecoveryInProgress) + return false; + else + { + /* + * use volatile pointer to make sure we make a fresh read of the + * shared variable. + */ + volatile XLogCtlData *xlogctl = XLogCtl; + + LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE); + + /* + * Note: We don't need a memory barrier when we're still in recovery. 
+ * We might exit recovery immediately after return, so the caller + * can't rely on 'true' meaning that we're still in recovery anyway. + */ + + return LocalRecoveryInProgress; + } +} + +/* + * Returns current recovery state from shared memory. + * + * This returned state is kept consistent with the contents of the control + * file. See details about the possible values of RecoveryState in xlog.h. + */ +RecoveryState +GetRecoveryState(void) +{ + RecoveryState retval; + + SpinLockAcquire(&XLogCtl->info_lck); + retval = XLogCtl->SharedRecoveryState; + SpinLockRelease(&XLogCtl->info_lck); + + return retval; +} + +/* + * Is this process allowed to insert new WAL records? + * + * Ordinarily this is essentially equivalent to !RecoveryInProgress(). + * But we also have provisions for forcing the result "true" or "false" + * within specific processes regardless of the global state. + */ +bool +XLogInsertAllowed(void) +{ + /* + * If value is "unconditionally true" or "unconditionally false", just + * return it. This provides the normal fast path once recovery is known + * done. + */ + if (LocalXLogInsertAllowed >= 0) + return (bool) LocalXLogInsertAllowed; + + /* + * Else, must check to see if we're still in recovery. + */ + if (RecoveryInProgress()) + return false; + + /* + * On exit from recovery, reset to "unconditionally true", since there is + * no need to keep checking. + */ + LocalXLogInsertAllowed = 1; + return true; +} + +/* + * Make XLogInsertAllowed() return true in the current process only. + * + * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later, + * and even call LocalSetXLogInsertAllowed() again after that. + * + * Returns the previous value of LocalXLogInsertAllowed. + */ +static int +LocalSetXLogInsertAllowed(void) +{ + int oldXLogAllowed = LocalXLogInsertAllowed; + + LocalXLogInsertAllowed = 1; + + return oldXLogAllowed; +} + +/* + * Return the current Redo pointer from shared memory. + * + * As a side-effect, the local RedoRecPtr copy is updated. + */ +XLogRecPtr +GetRedoRecPtr(void) +{ + XLogRecPtr ptr; + + /* + * The possibly not up-to-date copy in XlogCtl is enough. Even if we + * grabbed a WAL insertion lock to read the authoritative value in + * Insert->RedoRecPtr, someone might update it just after we've released + * the lock. + */ + SpinLockAcquire(&XLogCtl->info_lck); + ptr = XLogCtl->RedoRecPtr; + SpinLockRelease(&XLogCtl->info_lck); + + if (RedoRecPtr < ptr) + RedoRecPtr = ptr; + + return RedoRecPtr; +} + +/* + * Return information needed to decide whether a modified block needs a + * full-page image to be included in the WAL record. + * + * The returned values are cached copies from backend-private memory, and + * possibly out-of-date or, indeed, uninitialized, in which case they will + * be InvalidXLogRecPtr and false, respectively. XLogInsertRecord will + * re-check them against up-to-date values, while holding the WAL insert lock. + */ +void +GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p) +{ + *RedoRecPtr_p = RedoRecPtr; + *doPageWrites_p = doPageWrites; +} + +/* + * GetInsertRecPtr -- Returns the current insert position. + * + * NOTE: The value *actually* returned is the position of the last full + * xlog page. It lags behind the real insert position by at most 1 page. + * For that, we don't need to scan through WAL insertion locks, and an + * approximation is enough for the current usage of this function. 
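+ *
+ * A minimal sketch of that kind of use (hypothetical caller and variable
+ * names):
+ *
+ *		XLogRecPtr	insertpos = GetInsertRecPtr();
+ *
+ *		if (insertpos - prev_redo_ptr >= checkpoint_trigger_distance)
+ *			... ask for a checkpoint ...
+ *
+ * At that granularity, lagging the true insert position by up to one page
+ * does not matter.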
+ */ +XLogRecPtr +GetInsertRecPtr(void) +{ + XLogRecPtr recptr; + + SpinLockAcquire(&XLogCtl->info_lck); + recptr = XLogCtl->LogwrtRqst.Write; + SpinLockRelease(&XLogCtl->info_lck); + + return recptr; +} + +/* + * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL + * position known to be fsync'd to disk. This should only be used on a + * system that is known not to be in recovery. + */ +XLogRecPtr +GetFlushRecPtr(TimeLineID *insertTLI) +{ + Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE); + + SpinLockAcquire(&XLogCtl->info_lck); + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * If we're writing and flushing WAL, the time line can't be changing, so + * no lock is required. + */ + if (insertTLI) + *insertTLI = XLogCtl->InsertTimeLineID; + + return LogwrtResult.Flush; +} + +/* + * GetWALInsertionTimeLine -- Returns the current timeline of a system that + * is not in recovery. + */ +TimeLineID +GetWALInsertionTimeLine(void) +{ + Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE); + + /* Since the value can't be changing, no lock is required. */ + return XLogCtl->InsertTimeLineID; +} + +/* + * GetLastImportantRecPtr -- Returns the LSN of the last important record + * inserted. All records not explicitly marked as unimportant are considered + * important. + * + * The LSN is determined by computing the maximum of + * WALInsertLocks[i].lastImportantAt. + */ +XLogRecPtr +GetLastImportantRecPtr(void) +{ + XLogRecPtr res = InvalidXLogRecPtr; + int i; + + for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) + { + XLogRecPtr last_important; + + /* + * Need to take a lock to prevent torn reads of the LSN, which are + * possible on some of the supported platforms. WAL insert locks only + * support exclusive mode, so we have to use that. + */ + LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE); + last_important = WALInsertLocks[i].l.lastImportantAt; + LWLockRelease(&WALInsertLocks[i].l.lock); + + if (res < last_important) + res = last_important; + } + + return res; +} + +/* + * Get the time and LSN of the last xlog segment switch + */ +pg_time_t +GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN) +{ + pg_time_t result; + + /* Need WALWriteLock, but shared lock is sufficient */ + LWLockAcquire(WALWriteLock, LW_SHARED); + result = XLogCtl->lastSegSwitchTime; + *lastSwitchLSN = XLogCtl->lastSegSwitchLSN; + LWLockRelease(WALWriteLock); + + return result; +} + +/* + * This must be called ONCE during postmaster or standalone-backend shutdown + */ +void +ShutdownXLOG(int code, Datum arg) +{ + /* + * We should have an aux process resource owner to use, and we should not + * be in a transaction that's installed some other resowner. + */ + Assert(AuxProcessResourceOwner != NULL); + Assert(CurrentResourceOwner == NULL || + CurrentResourceOwner == AuxProcessResourceOwner); + CurrentResourceOwner = AuxProcessResourceOwner; + + /* Don't be chatty in standalone mode */ + ereport(IsPostmasterEnvironment ? LOG : NOTICE, + (errmsg("shutting down"))); + + /* + * Signal walsenders to move to stopping state. + */ + WalSndInitStopping(); + + /* + * Wait for WAL senders to be in stopping state. This prevents commands + * from writing new WAL. 
+ */ + WalSndWaitStopping(); + + if (RecoveryInProgress()) + CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + else + { + /* + * If archiving is enabled, rotate the last XLOG file so that all the + * remaining records are archived (postmaster wakes up the archiver + * process one more time at the end of shutdown). The checkpoint + * record will go to the next XLOG file and won't be archived (yet). + */ + if (XLogArchivingActive()) + RequestXLogSwitch(false); + + CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + } +} + +/* + * Log start of a checkpoint. + */ +static void +LogCheckpointStart(int flags, bool restartpoint) +{ + if (restartpoint) + ereport(LOG, + /* translator: the placeholders show checkpoint options */ + (errmsg("restartpoint starting:%s%s%s%s%s%s%s%s", + (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", + (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "", + (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", + (flags & CHECKPOINT_FORCE) ? " force" : "", + (flags & CHECKPOINT_WAIT) ? " wait" : "", + (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "", + (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "", + (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : ""))); + else + ereport(LOG, + /* translator: the placeholders show checkpoint options */ + (errmsg("checkpoint starting:%s%s%s%s%s%s%s%s", + (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", + (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "", + (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", + (flags & CHECKPOINT_FORCE) ? " force" : "", + (flags & CHECKPOINT_WAIT) ? " wait" : "", + (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "", + (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "", + (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : ""))); +} + +/* + * Log end of a checkpoint. + */ +static void +LogCheckpointEnd(bool restartpoint) +{ + long write_msecs, + sync_msecs, + total_msecs, + longest_msecs, + average_msecs; + uint64 average_sync_time; + + CheckpointStats.ckpt_end_t = GetCurrentTimestamp(); + + write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t, + CheckpointStats.ckpt_sync_t); + + sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t, + CheckpointStats.ckpt_sync_end_t); + + /* Accumulate checkpoint timing summary data, in milliseconds. */ + PendingCheckpointerStats.checkpoint_write_time += write_msecs; + PendingCheckpointerStats.checkpoint_sync_time += sync_msecs; + + /* + * All of the published timing statistics are accounted for. Only + * continue if a log message is to be written. + */ + if (!log_checkpoints) + return; + + total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t, + CheckpointStats.ckpt_end_t); + + /* + * Timing values returned from CheckpointStats are in microseconds. + * Convert to milliseconds for consistent printing. 
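+ *
+ * The "+ 999" below rounds up to the next whole millisecond; for example,
+ * a longest sync of 1500 us becomes (1500 + 999) / 1000 = 2 ms, and any
+ * nonzero sub-millisecond value is reported as 1 ms rather than 0.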
+ */ + longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000); + + average_sync_time = 0; + if (CheckpointStats.ckpt_sync_rels > 0) + average_sync_time = CheckpointStats.ckpt_agg_sync_time / + CheckpointStats.ckpt_sync_rels; + average_msecs = (long) ((average_sync_time + 999) / 1000); + + if (restartpoint) + ereport(LOG, + (errmsg("restartpoint complete: wrote %d buffers (%.1f%%); " + "%d WAL file(s) added, %d removed, %d recycled; " + "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; " + "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; " + "distance=%d kB, estimate=%d kB", + CheckpointStats.ckpt_bufs_written, + (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, + CheckpointStats.ckpt_segs_added, + CheckpointStats.ckpt_segs_removed, + CheckpointStats.ckpt_segs_recycled, + write_msecs / 1000, (int) (write_msecs % 1000), + sync_msecs / 1000, (int) (sync_msecs % 1000), + total_msecs / 1000, (int) (total_msecs % 1000), + CheckpointStats.ckpt_sync_rels, + longest_msecs / 1000, (int) (longest_msecs % 1000), + average_msecs / 1000, (int) (average_msecs % 1000), + (int) (PrevCheckPointDistance / 1024.0), + (int) (CheckPointDistanceEstimate / 1024.0)))); + else + ereport(LOG, + (errmsg("checkpoint complete: wrote %d buffers (%.1f%%); " + "%d WAL file(s) added, %d removed, %d recycled; " + "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; " + "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; " + "distance=%d kB, estimate=%d kB", + CheckpointStats.ckpt_bufs_written, + (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, + CheckpointStats.ckpt_segs_added, + CheckpointStats.ckpt_segs_removed, + CheckpointStats.ckpt_segs_recycled, + write_msecs / 1000, (int) (write_msecs % 1000), + sync_msecs / 1000, (int) (sync_msecs % 1000), + total_msecs / 1000, (int) (total_msecs % 1000), + CheckpointStats.ckpt_sync_rels, + longest_msecs / 1000, (int) (longest_msecs % 1000), + average_msecs / 1000, (int) (average_msecs % 1000), + (int) (PrevCheckPointDistance / 1024.0), + (int) (CheckPointDistanceEstimate / 1024.0)))); +} + +/* + * Update the estimate of distance between checkpoints. + * + * The estimate is used to calculate the number of WAL segments to keep + * preallocated, see XLOGfileslop(). + */ +static void +UpdateCheckPointDistanceEstimate(uint64 nbytes) +{ + /* + * To estimate the number of segments consumed between checkpoints, keep a + * moving average of the amount of WAL generated in previous checkpoint + * cycles. However, if the load is bursty, with quiet periods and busy + * periods, we want to cater for the peak load. So instead of a plain + * moving average, let the average decline slowly if the previous cycle + * used less WAL than estimated, but bump it up immediately if it used + * more. + * + * When checkpoints are triggered by max_wal_size, this should converge to + * CheckpointSegments * wal_segment_size, + * + * Note: This doesn't pay any attention to what caused the checkpoint. + * Checkpoints triggered manually with CHECKPOINT command, or by e.g. + * starting a base backup, are counted the same as those created + * automatically. The slow-decline will largely mask them out, if they are + * not frequent. If they are frequent, it seems reasonable to count them + * in as any others; if you issue a manual checkpoint every 5 minutes and + * never let a timed checkpoint happen, it makes sense to base the + * preallocation on that 5 minute interval rather than whatever + * checkpoint_timeout is set to. 
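+ *
+ * As a worked example with made-up numbers: if the estimate currently
+ * stands at 100 MB and a cycle generates 160 MB of WAL, the estimate jumps
+ * straight to 160 MB; if the next cycle generates only 100 MB again, the
+ * estimate declines gently to 0.90 * 160 + 0.10 * 100 = 154 MB.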
+ */ + PrevCheckPointDistance = nbytes; + if (CheckPointDistanceEstimate < nbytes) + CheckPointDistanceEstimate = nbytes; + else + CheckPointDistanceEstimate = + (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes); +} + +/* + * Update the ps display for a process running a checkpoint. Note that + * this routine should not do any allocations so as it can be called + * from a critical section. + */ +static void +update_checkpoint_display(int flags, bool restartpoint, bool reset) +{ + /* + * The status is reported only for end-of-recovery and shutdown + * checkpoints or shutdown restartpoints. Updating the ps display is + * useful in those situations as it may not be possible to rely on + * pg_stat_activity to see the status of the checkpointer or the startup + * process. + */ + if ((flags & (CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IS_SHUTDOWN)) == 0) + return; + + if (reset) + set_ps_display(""); + else + { + char activitymsg[128]; + + snprintf(activitymsg, sizeof(activitymsg), "performing %s%s%s", + (flags & CHECKPOINT_END_OF_RECOVERY) ? "end-of-recovery " : "", + (flags & CHECKPOINT_IS_SHUTDOWN) ? "shutdown " : "", + restartpoint ? "restartpoint" : "checkpoint"); + set_ps_display(activitymsg); + } +} + + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + * + * flags is a bitwise OR of the following: + * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. + * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. + * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, + * ignoring checkpoint_completion_target parameter. + * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred + * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or + * CHECKPOINT_END_OF_RECOVERY). + * CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables. + * + * Note: flags contains other bits, of interest here only for logging purposes. + * In particular note that this routine is synchronous and does not pay + * attention to CHECKPOINT_WAIT. + * + * If !shutdown then we are writing an online checkpoint. This is a very special + * kind of operation and WAL record because the checkpoint action occurs over + * a period of time yet logically occurs at just a single LSN. The logical + * position of the WAL record (redo ptr) is the same or earlier than the + * physical position. When we replay WAL we locate the checkpoint via its + * physical position then read the redo ptr and actually start replay at the + * earlier logical position. Note that we don't write *anything* to WAL at + * the logical position, so that location could be any other kind of WAL record. + * All of this mechanism allows us to continue working while we checkpoint. + * As a result, timing of actions is critical here and be careful to note that + * this function will likely take minutes to execute on a busy system. + */ +void +CreateCheckPoint(int flags) +{ + bool shutdown; + CheckPoint checkPoint; + XLogRecPtr recptr; + XLogSegNo _logSegNo; + XLogCtlInsert *Insert = &XLogCtl->Insert; + uint32 freespace; + XLogRecPtr PriorRedoPtr; + XLogRecPtr curInsert; + XLogRecPtr last_important_lsn; + VirtualTransactionId *vxids; + int nvxids; + int oldXLogAllowed = 0; + + /* + * An end-of-recovery checkpoint is really a shutdown checkpoint, just + * issued at a different time. 
+ */ + if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY)) + shutdown = true; + else + shutdown = false; + + /* sanity check */ + if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0) + elog(ERROR, "can't create a checkpoint during recovery"); + + /* + * Prepare to accumulate statistics. + * + * Note: because it is possible for log_checkpoints to change while a + * checkpoint proceeds, we always accumulate stats, even if + * log_checkpoints is currently off. + */ + MemSet(&CheckpointStats, 0, sizeof(CheckpointStats)); + CheckpointStats.ckpt_start_t = GetCurrentTimestamp(); + + /* + * Let smgr prepare for checkpoint; this has to happen outside the + * critical section and before we determine the REDO pointer. Note that + * smgr must not do anything that'd have to be undone if we decide no + * checkpoint is needed. + */ + SyncPreCheckpoint(); + + /* + * Use a critical section to force system panic if we have trouble. + */ + START_CRIT_SECTION(); + + if (shutdown) + { + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->state = DB_SHUTDOWNING; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + } + + /* Begin filling in the checkpoint WAL record */ + MemSet(&checkPoint, 0, sizeof(checkPoint)); + checkPoint.time = (pg_time_t) time(NULL); + + /* + * For Hot Standby, derive the oldestActiveXid before we fix the redo + * pointer. This allows us to begin accumulating changes to assemble our + * starting snapshot of locks and transactions. + */ + if (!shutdown && XLogStandbyInfoActive()) + checkPoint.oldestActiveXid = GetOldestActiveTransactionId(); + else + checkPoint.oldestActiveXid = InvalidTransactionId; + + /* + * Get location of last important record before acquiring insert locks (as + * GetLastImportantRecPtr() also locks WAL locks). + */ + last_important_lsn = GetLastImportantRecPtr(); + + /* + * We must block concurrent insertions while examining insert state to + * determine the checkpoint REDO pointer. + */ + WALInsertLockAcquireExclusive(); + curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos); + + /* + * If this isn't a shutdown or forced checkpoint, and if there has been no + * WAL activity requiring a checkpoint, skip it. The idea here is to + * avoid inserting duplicate checkpoints when the system is idle. + */ + if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY | + CHECKPOINT_FORCE)) == 0) + { + if (last_important_lsn == ControlFile->checkPoint) + { + WALInsertLockRelease(); + END_CRIT_SECTION(); + ereport(DEBUG1, + (errmsg_internal("checkpoint skipped because system is idle"))); + return; + } + } + + /* + * An end-of-recovery checkpoint is created before anyone is allowed to + * write WAL. To allow us to write the checkpoint record, temporarily + * enable XLogInsertAllowed. + */ + if (flags & CHECKPOINT_END_OF_RECOVERY) + oldXLogAllowed = LocalSetXLogInsertAllowed(); + + checkPoint.ThisTimeLineID = XLogCtl->InsertTimeLineID; + if (flags & CHECKPOINT_END_OF_RECOVERY) + checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID; + else + checkPoint.PrevTimeLineID = checkPoint.ThisTimeLineID; + + checkPoint.fullPageWrites = Insert->fullPageWrites; + + /* + * Compute new REDO record ptr = location of next XLOG record. + * + * NB: this is NOT necessarily where the checkpoint record itself will be, + * since other backends may insert more XLOG records while we're off doing + * the buffer flush work. Those XLOG records are logically after the + * checkpoint, even though physically before it. Got that? 
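+ *
+ * As a purely illustrative sketch (the LSNs are made up), the WAL can end
+ * up laid out like this:
+ *
+ *		redo ptr        records from other backends      checkpoint record
+ *		0/3000060  ...  0/3000098, 0/30000D0, ...   ...  0/3000158
+ *
+ * Replay later starts at the redo ptr, so those intervening records are
+ * replayed even though they were inserted before the checkpoint record
+ * itself.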
+ */ + freespace = INSERT_FREESPACE(curInsert); + if (freespace == 0) + { + if (XLogSegmentOffset(curInsert, wal_segment_size) == 0) + curInsert += SizeOfXLogLongPHD; + else + curInsert += SizeOfXLogShortPHD; + } + checkPoint.redo = curInsert; + + /* + * Here we update the shared RedoRecPtr for future XLogInsert calls; this + * must be done while holding all the insertion locks. + * + * Note: if we fail to complete the checkpoint, RedoRecPtr will be left + * pointing past where it really needs to point. This is okay; the only + * consequence is that XLogInsert might back up whole buffers that it + * didn't really need to. We can't postpone advancing RedoRecPtr because + * XLogInserts that happen while we are dumping buffers must assume that + * their buffer changes are not included in the checkpoint. + */ + RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; + + /* + * Now we can release the WAL insertion locks, allowing other xacts to + * proceed while we are flushing disk buffers. + */ + WALInsertLockRelease(); + + /* Update the info_lck-protected copy of RedoRecPtr as well */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->RedoRecPtr = checkPoint.redo; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * If enabled, log checkpoint start. We postpone this until now so as not + * to log anything if we decided to skip the checkpoint. + */ + if (log_checkpoints) + LogCheckpointStart(flags, false); + + /* Update the process title */ + update_checkpoint_display(flags, false, false); + + TRACE_POSTGRESQL_CHECKPOINT_START(flags); + + /* + * Get the other info we need for the checkpoint record. + * + * We don't need to save oldestClogXid in the checkpoint, it only matters + * for the short period in which clog is being truncated, and if we crash + * during that we'll redo the clog truncation and fix up oldestClogXid + * there. + */ + LWLockAcquire(XidGenLock, LW_SHARED); + checkPoint.nextXid = ShmemVariableCache->nextXid; + checkPoint.oldestXid = ShmemVariableCache->oldestXid; + checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB; + LWLockRelease(XidGenLock); + + LWLockAcquire(CommitTsLock, LW_SHARED); + checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid; + checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid; + LWLockRelease(CommitTsLock); + + LWLockAcquire(OidGenLock, LW_SHARED); + checkPoint.nextOid = ShmemVariableCache->nextOid; + if (!shutdown) + checkPoint.nextOid += ShmemVariableCache->oidCount; + LWLockRelease(OidGenLock); + + MultiXactGetCheckptMulti(shutdown, + &checkPoint.nextMulti, + &checkPoint.nextMultiOffset, + &checkPoint.oldestMulti, + &checkPoint.oldestMultiDB); + + /* + * Having constructed the checkpoint record, ensure all shmem disk buffers + * and commit-log buffers are flushed to disk. + * + * This I/O could fail for various reasons. If so, we will fail to + * complete the checkpoint, but there is no reason to force a system + * panic. Accordingly, exit critical section while doing it. + */ + END_CRIT_SECTION(); + + /* + * In some cases there are groups of actions that must all occur on one + * side or the other of a checkpoint record. Before flushing the + * checkpoint record we must explicitly wait for any backend currently + * performing those groups of actions. + * + * One example is end of transaction, so we must wait for any transactions + * that are currently in commit critical sections. 
If an xact inserted + * its commit record into XLOG just before the REDO point, then a crash + * restart from the REDO point would not replay that record, which means + * that our flushing had better include the xact's update of pg_xact. So + * we wait till he's out of his commit critical section before proceeding. + * See notes in RecordTransactionCommit(). + * + * Because we've already released the insertion locks, this test is a bit + * fuzzy: it is possible that we will wait for xacts we didn't really need + * to wait for. But the delay should be short and it seems better to make + * checkpoint take a bit longer than to hold off insertions longer than + * necessary. (In fact, the whole reason we have this issue is that xact.c + * does commit record XLOG insertion and clog update as two separate steps + * protected by different locks, but again that seems best on grounds of + * minimizing lock contention.) + * + * A transaction that has not yet set delayChkptFlags when we look cannot + * be at risk, since it has not inserted its commit record yet; and one + * that's already cleared it is not at risk either, since it's done fixing + * clog and we will correctly flush the update below. So we cannot miss + * any xacts we need to wait for. + */ + vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START); + if (nvxids > 0) + { + do + { + pg_usleep(10000L); /* wait for 10 msec */ + } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids, + DELAY_CHKPT_START)); + } + pfree(vxids); + + CheckPointGuts(checkPoint.redo, flags); + + vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE); + if (nvxids > 0) + { + do + { + pg_usleep(10000L); /* wait for 10 msec */ + } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids, + DELAY_CHKPT_COMPLETE)); + } + pfree(vxids); + + /* + * Take a snapshot of running transactions and write this to WAL. This + * allows us to reconstruct the state of running transactions during + * archive recovery, if required. Skip, if this info disabled. + * + * If we are shutting down, or Startup process is completing crash + * recovery we don't need to write running xact data. + */ + if (!shutdown && XLogStandbyInfoActive()) + LogStandbySnapshot(); + + START_CRIT_SECTION(); + + /* + * Now insert the checkpoint record into XLOG. + */ + XLogBeginInsert(); + XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint)); + recptr = XLogInsert(RM_XLOG_ID, + shutdown ? XLOG_CHECKPOINT_SHUTDOWN : + XLOG_CHECKPOINT_ONLINE); + + XLogFlush(recptr); + + /* + * We mustn't write any new WAL after a shutdown checkpoint, or it will be + * overwritten at next startup. No-one should even try, this just allows + * sanity-checking. In the case of an end-of-recovery checkpoint, we want + * to just temporarily disable writing until the system has exited + * recovery. + */ + if (shutdown) + { + if (flags & CHECKPOINT_END_OF_RECOVERY) + LocalXLogInsertAllowed = oldXLogAllowed; + else + LocalXLogInsertAllowed = 0; /* never again write WAL */ + } + + /* + * We now have ProcLastRecPtr = start of actual checkpoint record, recptr + * = end of actual checkpoint record. + */ + if (shutdown && checkPoint.redo != ProcLastRecPtr) + ereport(PANIC, + (errmsg("concurrent write-ahead log activity while database system is shutting down"))); + + /* + * Remember the prior checkpoint's redo ptr for + * UpdateCheckPointDistanceEstimate() + */ + PriorRedoPtr = ControlFile->checkPointCopy.redo; + + /* + * Update the control file. 
+ */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + if (shutdown) + ControlFile->state = DB_SHUTDOWNED; + ControlFile->checkPoint = ProcLastRecPtr; + ControlFile->checkPointCopy = checkPoint; + /* crash recovery should always recover to the end of WAL */ + ControlFile->minRecoveryPoint = InvalidXLogRecPtr; + ControlFile->minRecoveryPointTLI = 0; + + /* + * Persist unloggedLSN value. It's reset on crash recovery, so this goes + * unused on non-shutdown checkpoints, but seems useful to store it always + * for debugging purposes. + */ + SpinLockAcquire(&XLogCtl->ulsn_lck); + ControlFile->unloggedLSN = XLogCtl->unloggedLSN; + SpinLockRelease(&XLogCtl->ulsn_lck); + + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + /* Update shared-memory copy of checkpoint XID/epoch */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->ckptFullXid = checkPoint.nextXid; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * We are now done with critical updates; no need for system panic if we + * have trouble while fooling with old log segments. + */ + END_CRIT_SECTION(); + + /* + * Let smgr do post-checkpoint cleanup (eg, deleting old files). + */ + SyncPostCheckpoint(); + + /* + * Update the average distance between checkpoints if the prior checkpoint + * exists. + */ + if (PriorRedoPtr != InvalidXLogRecPtr) + UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr); + + /* + * Delete old log files, those no longer needed for last checkpoint to + * prevent the disk holding the xlog from growing full. + */ + XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); + KeepLogSeg(recptr, &_logSegNo); + if (InvalidateObsoleteReplicationSlots(_logSegNo)) + { + /* + * Some slots have been invalidated; recalculate the old-segment + * horizon, starting again from RedoRecPtr. + */ + XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); + KeepLogSeg(recptr, &_logSegNo); + } + _logSegNo--; + RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr, + checkPoint.ThisTimeLineID); + + /* + * Make more log segments if needed. (Do this after recycling old log + * segments, since that may supply some of the needed files.) + */ + if (!shutdown) + PreallocXlogFiles(recptr, checkPoint.ThisTimeLineID); + + /* + * Truncate pg_subtrans if possible. We can throw away all data before + * the oldest XMIN of any running transaction. No future transaction will + * attempt to reference any pg_subtrans entry older than that (see Asserts + * in subtrans.c). During recovery, though, we mustn't do this because + * StartupSUBTRANS hasn't been called yet. + */ + if (!RecoveryInProgress()) + TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning()); + + /* Real work is done; log and update stats. */ + LogCheckpointEnd(false); + + /* Reset the process title */ + update_checkpoint_display(flags, false, true); + + TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written, + NBuffers, + CheckpointStats.ckpt_segs_added, + CheckpointStats.ckpt_segs_removed, + CheckpointStats.ckpt_segs_recycled); +} + +/* + * Mark the end of recovery in WAL though without running a full checkpoint. + * We can expect that a restartpoint is likely to be in progress as we + * do this, though we are unwilling to wait for it to complete. + * + * CreateRestartPoint() allows for the case where recovery may end before + * the restartpoint completes so there is no concern of concurrent behaviour. 
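+ *
+ * The record itself carries only the end-of-recovery timestamp and the old
+ * and new timeline IDs (the fields filled in below), so writing it is
+ * cheap; the expensive part of a real checkpoint, flushing all dirty
+ * buffers, is deferred to the checkpoint requested afterwards.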
+ */ +static void +CreateEndOfRecoveryRecord(void) +{ + xl_end_of_recovery xlrec; + XLogRecPtr recptr; + + /* sanity check */ + if (!RecoveryInProgress()) + elog(ERROR, "can only be used to end recovery"); + + xlrec.end_time = GetCurrentTimestamp(); + + WALInsertLockAcquireExclusive(); + xlrec.ThisTimeLineID = XLogCtl->InsertTimeLineID; + xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID; + WALInsertLockRelease(); + + START_CRIT_SECTION(); + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery)); + recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY); + + XLogFlush(recptr); + + /* + * Update the control file so that crash recovery can follow the timeline + * changes to this point. + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->minRecoveryPoint = recptr; + ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + END_CRIT_SECTION(); +} + +/* + * Write an OVERWRITE_CONTRECORD message. + * + * When on WAL replay we expect a continuation record at the start of a page + * that is not there, recovery ends and WAL writing resumes at that point. + * But it's wrong to resume writing new WAL back at the start of the record + * that was broken, because downstream consumers of that WAL (physical + * replicas) are not prepared to "rewind". So the first action after + * finishing replay of all valid WAL must be to write a record of this type + * at the point where the contrecord was missing; to support xlogreader + * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added + * to the page header where the record occurs. xlogreader has an ad-hoc + * mechanism to report metadata about the broken record, which is what we + * use here. + * + * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to + * skip the record it was reading, and pass back the LSN of the skipped + * record, so that its caller can verify (on "replay" of that record) that the + * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten. + * + * 'aborted_lsn' is the beginning position of the record that was incomplete. + * It is included in the WAL record. 'pagePtr' and 'newTLI' point to the + * beginning of the XLOG page where the record is to be inserted. They must + * match the current WAL insert position, they're passed here just so that we + * can verify that. + */ +static XLogRecPtr +CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, + TimeLineID newTLI) +{ + xl_overwrite_contrecord xlrec; + XLogRecPtr recptr; + XLogPageHeader pagehdr; + XLogRecPtr startPos; + + /* sanity checks */ + if (!RecoveryInProgress()) + elog(ERROR, "can only be used at end of recovery"); + if (pagePtr % XLOG_BLCKSZ != 0) + elog(ERROR, "invalid position for missing continuation record %X/%X", + LSN_FORMAT_ARGS(pagePtr)); + + /* The current WAL insert position should be right after the page header */ + startPos = pagePtr; + if (XLogSegmentOffset(startPos, wal_segment_size) == 0) + startPos += SizeOfXLogLongPHD; + else + startPos += SizeOfXLogShortPHD; + recptr = GetXLogInsertRecPtr(); + if (recptr != startPos) + elog(ERROR, "invalid WAL insert position %X/%X for OVERWRITE_CONTRECORD", + LSN_FORMAT_ARGS(recptr)); + + START_CRIT_SECTION(); + + /* + * Initialize the XLOG page header (by GetXLogBuffer), and set the + * XLP_FIRST_IS_OVERWRITE_CONTRECORD flag. + * + * No other backend is allowed to write WAL yet, so acquiring the WAL + * insertion lock is just pro forma. 
+ */ + WALInsertLockAcquire(); + pagehdr = (XLogPageHeader) GetXLogBuffer(pagePtr, newTLI); + pagehdr->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD; + WALInsertLockRelease(); + + /* + * Insert the XLOG_OVERWRITE_CONTRECORD record as the first record on the + * page. We know it becomes the first record, because no other backend is + * allowed to write WAL yet. + */ + XLogBeginInsert(); + xlrec.overwritten_lsn = aborted_lsn; + xlrec.overwrite_time = GetCurrentTimestamp(); + XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord)); + recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD); + + /* check that the record was inserted to the right place */ + if (ProcLastRecPtr != startPos) + elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%X", + LSN_FORMAT_ARGS(ProcLastRecPtr)); + + XLogFlush(recptr); + + END_CRIT_SECTION(); + + return recptr; +} + +/* + * Flush all data in shared memory to disk, and fsync + * + * This is the common code shared between regular checkpoints and + * recovery restartpoints. + */ +static void +CheckPointGuts(XLogRecPtr checkPointRedo, int flags) +{ + CheckPointRelationMap(); + CheckPointReplicationSlots(); + CheckPointSnapBuild(); + CheckPointLogicalRewriteHeap(); + CheckPointReplicationOrigin(); + + /* Write out all dirty data in SLRUs and the main buffer pool */ + TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags); + CheckpointStats.ckpt_write_t = GetCurrentTimestamp(); + CheckPointCLOG(); + CheckPointCommitTs(); + CheckPointSUBTRANS(); + CheckPointMultiXact(); + CheckPointPredicate(); + CheckPointBuffers(flags); + + /* Perform all queued up fsyncs */ + TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START(); + CheckpointStats.ckpt_sync_t = GetCurrentTimestamp(); + ProcessSyncRequests(); + CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp(); + TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE(); + + /* We deliberately delay 2PC checkpointing as long as possible */ + CheckPointTwoPhase(checkPointRedo); +} + +/* + * Save a checkpoint for recovery restart if appropriate + * + * This function is called each time a checkpoint record is read from XLOG. + * It must determine whether the checkpoint represents a safe restartpoint or + * not. If so, the checkpoint record is stashed in shared memory so that + * CreateRestartPoint can consult it. (Note that the latter function is + * executed by the checkpointer, while this one will be executed by the + * startup process.) + */ +static void +RecoveryRestartPoint(const CheckPoint *checkPoint, XLogReaderState *record) +{ + /* + * Also refrain from creating a restartpoint if we have seen any + * references to non-existent pages. Restarting recovery from the + * restartpoint would not see the references, so we would lose the + * cross-check that the pages belonged to a relation that was dropped + * later. + */ + if (XLogHaveInvalidPages()) + { + elog(trace_recovery(DEBUG2), + "could not record restart point at %X/%X because there " + "are unresolved references to invalid pages", + LSN_FORMAT_ARGS(checkPoint->redo)); + return; + } + + /* + * Copy the checkpoint record to shared memory, so that checkpointer can + * work out the next time it wants to perform a restartpoint. + */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->lastCheckPointRecPtr = record->ReadRecPtr; + XLogCtl->lastCheckPointEndPtr = record->EndRecPtr; + XLogCtl->lastCheckPoint = *checkPoint; + SpinLockRelease(&XLogCtl->info_lck); +} + +/* + * Establish a restartpoint if possible. 
+ * + * This is similar to CreateCheckPoint, but is used during WAL recovery + * to establish a point from which recovery can roll forward without + * replaying the entire recovery log. + * + * Returns true if a new restartpoint was established. We can only establish + * a restartpoint if we have replayed a safe checkpoint record since last + * restartpoint. + */ +bool +CreateRestartPoint(int flags) +{ + XLogRecPtr lastCheckPointRecPtr; + XLogRecPtr lastCheckPointEndPtr; + CheckPoint lastCheckPoint; + XLogRecPtr PriorRedoPtr; + XLogRecPtr receivePtr; + XLogRecPtr replayPtr; + TimeLineID replayTLI; + XLogRecPtr endptr; + XLogSegNo _logSegNo; + TimestampTz xtime; + + /* Concurrent checkpoint/restartpoint cannot happen */ + Assert(!IsUnderPostmaster || MyBackendType == B_CHECKPOINTER); + + /* Get a local copy of the last safe checkpoint record. */ + SpinLockAcquire(&XLogCtl->info_lck); + lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr; + lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr; + lastCheckPoint = XLogCtl->lastCheckPoint; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * Check that we're still in recovery mode. It's ok if we exit recovery + * mode after this check, the restart point is valid anyway. + */ + if (!RecoveryInProgress()) + { + ereport(DEBUG2, + (errmsg_internal("skipping restartpoint, recovery has already ended"))); + return false; + } + + /* + * If the last checkpoint record we've replayed is already our last + * restartpoint, we can't perform a new restart point. We still update + * minRecoveryPoint in that case, so that if this is a shutdown restart + * point, we won't start up earlier than before. That's not strictly + * necessary, but when hot standby is enabled, it would be rather weird if + * the database opened up for read-only connections at a point-in-time + * before the last shutdown. Such time travel is still possible in case of + * immediate shutdown, though. + * + * We don't explicitly advance minRecoveryPoint when we do create a + * restartpoint. It's assumed that flushing the buffers will do that as a + * side-effect. + */ + if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) || + lastCheckPoint.redo <= ControlFile->checkPointCopy.redo) + { + ereport(DEBUG2, + (errmsg_internal("skipping restartpoint, already performed at %X/%X", + LSN_FORMAT_ARGS(lastCheckPoint.redo)))); + + UpdateMinRecoveryPoint(InvalidXLogRecPtr, true); + if (flags & CHECKPOINT_IS_SHUTDOWN) + { + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY; + UpdateControlFile(); + LWLockRelease(ControlFileLock); + } + return false; + } + + /* + * Update the shared RedoRecPtr so that the startup process can calculate + * the number of segments replayed since last restartpoint, and request a + * restartpoint if it exceeds CheckPointSegments. + * + * Like in CreateCheckPoint(), hold off insertions to update it, although + * during recovery this is just pro forma, because no WAL insertions are + * happening. + */ + WALInsertLockAcquireExclusive(); + RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo; + WALInsertLockRelease(); + + /* Also update the info_lck-protected copy */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->RedoRecPtr = lastCheckPoint.redo; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * Prepare to accumulate statistics. + * + * Note: because it is possible for log_checkpoints to change while a + * checkpoint proceeds, we always accumulate stats, even if + * log_checkpoints is currently off. 
+ */ + MemSet(&CheckpointStats, 0, sizeof(CheckpointStats)); + CheckpointStats.ckpt_start_t = GetCurrentTimestamp(); + + if (log_checkpoints) + LogCheckpointStart(flags, true); + + /* Update the process title */ + update_checkpoint_display(flags, true, false); + + CheckPointGuts(lastCheckPoint.redo, flags); + + /* + * Remember the prior checkpoint's redo ptr for + * UpdateCheckPointDistanceEstimate() + */ + PriorRedoPtr = ControlFile->checkPointCopy.redo; + + /* + * Update pg_control, using current time. Check that it still shows an + * older checkpoint, else do nothing; this is a quick hack to make sure + * nothing really bad happens if somehow we get here after the + * end-of-recovery checkpoint. + */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + if (ControlFile->checkPointCopy.redo < lastCheckPoint.redo) + { + /* + * Update the checkpoint information. We do this even if the cluster + * does not show DB_IN_ARCHIVE_RECOVERY to match with the set of WAL + * segments recycled below. + */ + ControlFile->checkPoint = lastCheckPointRecPtr; + ControlFile->checkPointCopy = lastCheckPoint; + + /* + * Ensure minRecoveryPoint is past the checkpoint record and update it + * if the control file still shows DB_IN_ARCHIVE_RECOVERY. Normally, + * this will have happened already while writing out dirty buffers, + * but not necessarily - e.g. because no buffers were dirtied. We do + * this because a backup performed in recovery uses minRecoveryPoint + * to determine which WAL files must be included in the backup, and + * the file (or files) containing the checkpoint record must be + * included, at a minimum. Note that for an ordinary restart of + * recovery there's no value in having the minimum recovery point any + * earlier than this anyway, because redo will begin just after the + * checkpoint record. + */ + if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY) + { + if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr) + { + ControlFile->minRecoveryPoint = lastCheckPointEndPtr; + ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID; + + /* update local copy */ + LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; + LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + } + if (flags & CHECKPOINT_IS_SHUTDOWN) + ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY; + } + UpdateControlFile(); + } + LWLockRelease(ControlFileLock); + + /* + * Update the average distance between checkpoints/restartpoints if the + * prior checkpoint exists. + */ + if (PriorRedoPtr != InvalidXLogRecPtr) + UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr); + + /* + * Delete old log files, those no longer needed for last restartpoint to + * prevent the disk holding the xlog from growing full. + */ + XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); + + /* + * Retreat _logSegNo using the current end of xlog replayed or received, + * whichever is later. + */ + receivePtr = GetWalRcvFlushRecPtr(NULL, NULL); + replayPtr = GetXLogReplayRecPtr(&replayTLI); + endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr; + KeepLogSeg(endptr, &_logSegNo); + if (InvalidateObsoleteReplicationSlots(_logSegNo)) + { + /* + * Some slots have been invalidated; recalculate the old-segment + * horizon, starting again from RedoRecPtr. + */ + XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); + KeepLogSeg(endptr, &_logSegNo); + } + _logSegNo--; + + /* + * Try to recycle segments on a useful timeline. 
If we've been promoted + * since the beginning of this restartpoint, use the new timeline chosen + * at end of recovery. If we're still in recovery, use the timeline we're + * currently replaying. + * + * There is no guarantee that the WAL segments will be useful on the + * current timeline; if recovery proceeds to a new timeline right after + * this, the pre-allocated WAL segments on this timeline will not be used, + * and will go wasted until recycled on the next restartpoint. We'll live + * with that. + */ + if (!RecoveryInProgress()) + replayTLI = XLogCtl->InsertTimeLineID; + + RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr, replayTLI); + + /* + * Make more log segments if needed. (Do this after recycling old log + * segments, since that may supply some of the needed files.) + */ + PreallocXlogFiles(endptr, replayTLI); + + /* + * Truncate pg_subtrans if possible. We can throw away all data before + * the oldest XMIN of any running transaction. No future transaction will + * attempt to reference any pg_subtrans entry older than that (see Asserts + * in subtrans.c). When hot standby is disabled, though, we mustn't do + * this because StartupSUBTRANS hasn't been called yet. + */ + if (EnableHotStandby) + TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning()); + + /* Real work is done; log and update stats. */ + LogCheckpointEnd(true); + + /* Reset the process title */ + update_checkpoint_display(flags, true, true); + + xtime = GetLatestXTime(); + ereport((log_checkpoints ? LOG : DEBUG2), + (errmsg("recovery restart point at %X/%X", + LSN_FORMAT_ARGS(lastCheckPoint.redo)), + xtime ? errdetail("Last completed transaction was at log time %s.", + timestamptz_to_str(xtime)) : 0)); + + /* + * Finally, execute archive_cleanup_command, if any. + */ + if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0) + ExecuteRecoveryCommand(archiveCleanupCommand, + "archive_cleanup_command", + false, + WAIT_EVENT_ARCHIVE_CLEANUP_COMMAND); + + return true; +} + +/* + * Report availability of WAL for the given target LSN + * (typically a slot's restart_lsn) + * + * Returns one of the following enum values: + * + * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of + * max_wal_size. + * + * * WALAVAIL_EXTENDED means it is still available by preserving extra + * segments beyond max_wal_size. If max_slot_wal_keep_size is smaller + * than max_wal_size, this state is not returned. + * + * * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will + * remove reserved segments. The walsender using this slot may return to the + * above. + * + * * WALAVAIL_REMOVED means it has been removed. A replication stream on + * a slot with this LSN cannot continue after a restart. + * + * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL. + */ +WALAvailability +GetWALAvailability(XLogRecPtr targetLSN) +{ + XLogRecPtr currpos; /* current write LSN */ + XLogSegNo currSeg; /* segid of currpos */ + XLogSegNo targetSeg; /* segid of targetLSN */ + XLogSegNo oldestSeg; /* actual oldest segid */ + XLogSegNo oldestSegMaxWalSize; /* oldest segid kept by max_wal_size */ + XLogSegNo oldestSlotSeg; /* oldest segid kept by slot */ + uint64 keepSegs; + + /* + * slot does not reserve WAL. Either deactivated, or has never been active + */ + if (XLogRecPtrIsInvalid(targetLSN)) + return WALAVAIL_INVALID_LSN; + + /* + * Calculate the oldest segment currently reserved by all slots, + * considering wal_keep_size and max_slot_wal_keep_size. 
Initialize + * oldestSlotSeg to the current segment. + */ + currpos = GetXLogWriteRecPtr(); + XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size); + KeepLogSeg(currpos, &oldestSlotSeg); + + /* + * Find the oldest extant segment file. We get 1 until checkpoint removes + * the first WAL segment file since startup, which causes the status being + * wrong under certain abnormal conditions but that doesn't actually harm. + */ + oldestSeg = XLogGetLastRemovedSegno() + 1; + + /* calculate oldest segment by max_wal_size */ + XLByteToSeg(currpos, currSeg, wal_segment_size); + keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1; + + if (currSeg > keepSegs) + oldestSegMaxWalSize = currSeg - keepSegs; + else + oldestSegMaxWalSize = 1; + + /* the segment we care about */ + XLByteToSeg(targetLSN, targetSeg, wal_segment_size); + + /* + * No point in returning reserved or extended status values if the + * targetSeg is known to be lost. + */ + if (targetSeg >= oldestSlotSeg) + { + /* show "reserved" when targetSeg is within max_wal_size */ + if (targetSeg >= oldestSegMaxWalSize) + return WALAVAIL_RESERVED; + + /* being retained by slots exceeding max_wal_size */ + return WALAVAIL_EXTENDED; + } + + /* WAL segments are no longer retained but haven't been removed yet */ + if (targetSeg >= oldestSeg) + return WALAVAIL_UNRESERVED; + + /* Definitely lost */ + return WALAVAIL_REMOVED; +} + + +/* + * Retreat *logSegNo to the last segment that we need to retain because of + * either wal_keep_size or replication slots. + * + * This is calculated by subtracting wal_keep_size from the given xlog + * location, recptr and by making sure that that result is below the + * requirement of replication slots. For the latter criterion we do consider + * the effects of max_slot_wal_keep_size: reserve at most that much space back + * from recptr. + * + * Note about replication slots: if this function calculates a value + * that's further ahead than what slots need reserved, then affected + * slots need to be invalidated and this function invoked again. + * XXX it might be a good idea to rewrite this function so that + * invalidation is optionally done here, instead. + */ +static void +KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo) +{ + XLogSegNo currSegNo; + XLogSegNo segno; + XLogRecPtr keep; + + XLByteToSeg(recptr, currSegNo, wal_segment_size); + segno = currSegNo; + + /* + * Calculate how many segments are kept by slots first, adjusting for + * max_slot_wal_keep_size. + */ + keep = XLogGetReplicationSlotMinimumLSN(); + if (keep != InvalidXLogRecPtr && keep < recptr) + { + XLByteToSeg(keep, segno, wal_segment_size); + + /* Cap by max_slot_wal_keep_size ... 
*/ + if (max_slot_wal_keep_size_mb >= 0) + { + uint64 slot_keep_segs; + + slot_keep_segs = + ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size); + + if (currSegNo - segno > slot_keep_segs) + segno = currSegNo - slot_keep_segs; + } + } + + /* but, keep at least wal_keep_size if that's set */ + if (wal_keep_size_mb > 0) + { + uint64 keep_segs; + + keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size); + if (currSegNo - segno < keep_segs) + { + /* avoid underflow, don't go below 1 */ + if (currSegNo <= keep_segs) + segno = 1; + else + segno = currSegNo - keep_segs; + } + } + + /* don't delete WAL segments newer than the calculated segment */ + if (segno < *logSegNo) + *logSegNo = segno; +} + +/* + * Write a NEXTOID log record + */ +void +XLogPutNextOid(Oid nextOid) +{ + XLogBeginInsert(); + XLogRegisterData((char *) (&nextOid), sizeof(Oid)); + (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID); + + /* + * We need not flush the NEXTOID record immediately, because any of the + * just-allocated OIDs could only reach disk as part of a tuple insert or + * update that would have its own XLOG record that must follow the NEXTOID + * record. Therefore, the standard buffer LSN interlock applied to those + * records will ensure no such OID reaches disk before the NEXTOID record + * does. + * + * Note, however, that the above statement only covers state "within" the + * database. When we use a generated OID as a file or directory name, we + * are in a sense violating the basic WAL rule, because that filesystem + * change may reach disk before the NEXTOID WAL record does. The impact + * of this is that if a database crash occurs immediately afterward, we + * might after restart re-generate the same OID and find that it conflicts + * with the leftover file or directory. But since for safety's sake we + * always loop until finding a nonconflicting filename, this poses no real + * problem in practice. See pgsql-hackers discussion 27-Sep-2006. + */ +} + +/* + * Write an XLOG SWITCH record. + * + * Here we just blindly issue an XLogInsert request for the record. + * All the magic happens inside XLogInsert. + * + * The return value is either the end+1 address of the switch record, + * or the end+1 address of the prior segment if we did not need to + * write a switch record because we are already at segment start. + */ +XLogRecPtr +RequestXLogSwitch(bool mark_unimportant) +{ + XLogRecPtr RecPtr; + + /* XLOG SWITCH has no data */ + XLogBeginInsert(); + + if (mark_unimportant) + XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); + RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH); + + return RecPtr; +} + +/* + * Write a RESTORE POINT record + */ +XLogRecPtr +XLogRestorePoint(const char *rpName) +{ + XLogRecPtr RecPtr; + xl_restore_point xlrec; + + xlrec.rp_time = GetCurrentTimestamp(); + strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN); + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point)); + + RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT); + + ereport(LOG, + (errmsg("restore point \"%s\" created at %X/%X", + rpName, LSN_FORMAT_ARGS(RecPtr)))); + + return RecPtr; +} + +/* + * Check if any of the GUC parameters that are critical for hot standby + * have changed, and update the value in pg_control file if necessary. 
+ */ +static void +XLogReportParameters(void) +{ + if (wal_level != ControlFile->wal_level || + wal_log_hints != ControlFile->wal_log_hints || + MaxConnections != ControlFile->MaxConnections || + max_worker_processes != ControlFile->max_worker_processes || + max_wal_senders != ControlFile->max_wal_senders || + max_prepared_xacts != ControlFile->max_prepared_xacts || + max_locks_per_xact != ControlFile->max_locks_per_xact || + track_commit_timestamp != ControlFile->track_commit_timestamp) + { + /* + * The change in number of backend slots doesn't need to be WAL-logged + * if archiving is not enabled, as you can't start archive recovery + * with wal_level=minimal anyway. We don't really care about the + * values in pg_control either if wal_level=minimal, but seems better + * to keep them up-to-date to avoid confusion. + */ + if (wal_level != ControlFile->wal_level || XLogIsNeeded()) + { + xl_parameter_change xlrec; + XLogRecPtr recptr; + + xlrec.MaxConnections = MaxConnections; + xlrec.max_worker_processes = max_worker_processes; + xlrec.max_wal_senders = max_wal_senders; + xlrec.max_prepared_xacts = max_prepared_xacts; + xlrec.max_locks_per_xact = max_locks_per_xact; + xlrec.wal_level = wal_level; + xlrec.wal_log_hints = wal_log_hints; + xlrec.track_commit_timestamp = track_commit_timestamp; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + + recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE); + XLogFlush(recptr); + } + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + ControlFile->MaxConnections = MaxConnections; + ControlFile->max_worker_processes = max_worker_processes; + ControlFile->max_wal_senders = max_wal_senders; + ControlFile->max_prepared_xacts = max_prepared_xacts; + ControlFile->max_locks_per_xact = max_locks_per_xact; + ControlFile->wal_level = wal_level; + ControlFile->wal_log_hints = wal_log_hints; + ControlFile->track_commit_timestamp = track_commit_timestamp; + UpdateControlFile(); + + LWLockRelease(ControlFileLock); + } +} + +/* + * Update full_page_writes in shared memory, and write an + * XLOG_FPW_CHANGE record if necessary. + * + * Note: this function assumes there is no other process running + * concurrently that could update it. + */ +void +UpdateFullPageWrites(void) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + bool recoveryInProgress; + + /* + * Do nothing if full_page_writes has not been changed. + * + * It's safe to check the shared full_page_writes without the lock, + * because we assume that there is no concurrently running process which + * can update it. + */ + if (fullPageWrites == Insert->fullPageWrites) + return; + + /* + * Perform this outside critical section so that the WAL insert + * initialization done by RecoveryInProgress() doesn't trigger an + * assertion failure. + */ + recoveryInProgress = RecoveryInProgress(); + + START_CRIT_SECTION(); + + /* + * It's always safe to take full page images, even when not strictly + * required, but not the other round. So if we're setting full_page_writes + * to true, first set it true and then write the WAL record. If we're + * setting it to false, first write the WAL record and then set the global + * flag. + */ + if (fullPageWrites) + { + WALInsertLockAcquireExclusive(); + Insert->fullPageWrites = true; + WALInsertLockRelease(); + } + + /* + * Write an XLOG_FPW_CHANGE record. This allows us to keep track of + * full_page_writes during archive recovery, if required. 
+ */ + if (XLogStandbyInfoActive() && !recoveryInProgress) + { + XLogBeginInsert(); + XLogRegisterData((char *) (&fullPageWrites), sizeof(bool)); + + XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE); + } + + if (!fullPageWrites) + { + WALInsertLockAcquireExclusive(); + Insert->fullPageWrites = false; + WALInsertLockRelease(); + } + END_CRIT_SECTION(); +} + +/* + * XLOG resource manager's routines + * + * Definitions of info values are in include/catalog/pg_control.h, though + * not all record types are related to control file updates. + * + * NOTE: Some XLOG record types that are directly related to WAL recovery + * are handled in xlogrecovery_redo(). + */ +void +xlog_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + XLogRecPtr lsn = record->EndRecPtr; + + /* + * In XLOG rmgr, backup blocks are only used by XLOG_FPI and + * XLOG_FPI_FOR_HINT records. + */ + Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT || + !XLogRecHasAnyBlockRefs(record)); + + if (info == XLOG_NEXTOID) + { + Oid nextOid; + + /* + * We used to try to take the maximum of ShmemVariableCache->nextOid + * and the recorded nextOid, but that fails if the OID counter wraps + * around. Since no OID allocation should be happening during replay + * anyway, better to just believe the record exactly. We still take + * OidGenLock while setting the variable, just in case. + */ + memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid)); + LWLockAcquire(OidGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextOid = nextOid; + ShmemVariableCache->oidCount = 0; + LWLockRelease(OidGenLock); + } + else if (info == XLOG_CHECKPOINT_SHUTDOWN) + { + CheckPoint checkPoint; + TimeLineID replayTLI; + + memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); + /* In a SHUTDOWN checkpoint, believe the counters exactly */ + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextXid = checkPoint.nextXid; + LWLockRelease(XidGenLock); + LWLockAcquire(OidGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextOid = checkPoint.nextOid; + ShmemVariableCache->oidCount = 0; + LWLockRelease(OidGenLock); + MultiXactSetNextMXact(checkPoint.nextMulti, + checkPoint.nextMultiOffset); + + MultiXactAdvanceOldest(checkPoint.oldestMulti, + checkPoint.oldestMultiDB); + + /* + * No need to set oldestClogXid here as well; it'll be set when we + * redo an xl_clog_truncate if it changed since initialization. + */ + SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); + + /* + * If we see a shutdown checkpoint while waiting for an end-of-backup + * record, the backup was canceled and the end-of-backup record will + * never arrive. + */ + if (ArchiveRecoveryRequested && + !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) && + XLogRecPtrIsInvalid(ControlFile->backupEndPoint)) + ereport(PANIC, + (errmsg("online backup was canceled, recovery cannot continue"))); + + /* + * If we see a shutdown checkpoint, we know that nothing was running + * on the primary at this point. So fake-up an empty running-xacts + * record and use that here and now. Recover additional standby state + * for prepared transactions. + */ + if (standbyState >= STANDBY_INITIALIZED) + { + TransactionId *xids; + int nxids; + TransactionId oldestActiveXID; + TransactionId latestCompletedXid; + RunningTransactionsData running; + + oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids); + + /* + * Construct a RunningTransactions snapshot representing a shut + * down server, with only prepared transactions still alive. 
We're + * never overflowed at this point because all subxids are listed + * with their parent prepared transactions. + */ + running.xcnt = nxids; + running.subxcnt = 0; + running.subxid_overflow = false; + running.nextXid = XidFromFullTransactionId(checkPoint.nextXid); + running.oldestRunningXid = oldestActiveXID; + latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid); + TransactionIdRetreat(latestCompletedXid); + Assert(TransactionIdIsNormal(latestCompletedXid)); + running.latestCompletedXid = latestCompletedXid; + running.xids = xids; + + ProcArrayApplyRecoveryInfo(&running); + + StandbyRecoverPreparedTransactions(); + } + + /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; + LWLockRelease(ControlFileLock); + + /* Update shared-memory copy of checkpoint XID/epoch */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->ckptFullXid = checkPoint.nextXid; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * We should've already switched to the new TLI before replaying this + * record. + */ + (void) GetCurrentReplayRecPtr(&replayTLI); + if (checkPoint.ThisTimeLineID != replayTLI) + ereport(PANIC, + (errmsg("unexpected timeline ID %u (should be %u) in shutdown checkpoint record", + checkPoint.ThisTimeLineID, replayTLI))); + + RecoveryRestartPoint(&checkPoint, record); + } + else if (info == XLOG_CHECKPOINT_ONLINE) + { + CheckPoint checkPoint; + TimeLineID replayTLI; + + memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); + /* In an ONLINE checkpoint, treat the XID counter as a minimum */ + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + if (FullTransactionIdPrecedes(ShmemVariableCache->nextXid, + checkPoint.nextXid)) + ShmemVariableCache->nextXid = checkPoint.nextXid; + LWLockRelease(XidGenLock); + + /* + * We ignore the nextOid counter in an ONLINE checkpoint, preferring + * to track OID assignment through XLOG_NEXTOID records. The nextOid + * counter is from the start of the checkpoint and might well be stale + * compared to later XLOG_NEXTOID records. We could try to take the + * maximum of the nextOid counter and our latest value, but since + * there's no particular guarantee about the speed with which the OID + * counter wraps around, that's a risky thing to do. In any case, + * users of the nextOid counter are required to avoid assignment of + * duplicates, so that a somewhat out-of-date value should be safe. + */ + + /* Handle multixact */ + MultiXactAdvanceNextMXact(checkPoint.nextMulti, + checkPoint.nextMultiOffset); + + /* + * NB: This may perform multixact truncation when replaying WAL + * generated by an older primary. 
+ */ + MultiXactAdvanceOldest(checkPoint.oldestMulti, + checkPoint.oldestMultiDB); + if (TransactionIdPrecedes(ShmemVariableCache->oldestXid, + checkPoint.oldestXid)) + SetTransactionIdLimit(checkPoint.oldestXid, + checkPoint.oldestXidDB); + /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; + LWLockRelease(ControlFileLock); + + /* Update shared-memory copy of checkpoint XID/epoch */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->ckptFullXid = checkPoint.nextXid; + SpinLockRelease(&XLogCtl->info_lck); + + /* TLI should not change in an on-line checkpoint */ + (void) GetCurrentReplayRecPtr(&replayTLI); + if (checkPoint.ThisTimeLineID != replayTLI) + ereport(PANIC, + (errmsg("unexpected timeline ID %u (should be %u) in online checkpoint record", + checkPoint.ThisTimeLineID, replayTLI))); + + RecoveryRestartPoint(&checkPoint, record); + } + else if (info == XLOG_OVERWRITE_CONTRECORD) + { + /* nothing to do here, handled in xlogrecovery_redo() */ + } + else if (info == XLOG_END_OF_RECOVERY) + { + xl_end_of_recovery xlrec; + TimeLineID replayTLI; + + memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery)); + + /* + * For Hot Standby, we could treat this like a Shutdown Checkpoint, + * but this case is rarer and harder to test, so the benefit doesn't + * outweigh the potential extra cost of maintenance. + */ + + /* + * We should've already switched to the new TLI before replaying this + * record. + */ + (void) GetCurrentReplayRecPtr(&replayTLI); + if (xlrec.ThisTimeLineID != replayTLI) + ereport(PANIC, + (errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record", + xlrec.ThisTimeLineID, replayTLI))); + } + else if (info == XLOG_NOOP) + { + /* nothing to do here */ + } + else if (info == XLOG_SWITCH) + { + /* nothing to do here */ + } + else if (info == XLOG_RESTORE_POINT) + { + /* nothing to do here, handled in xlogrecovery.c */ + } + else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT) + { + /* + * XLOG_FPI records contain nothing else but one or more block + * references. Every block reference must include a full-page image + * even if full_page_writes was disabled when the record was generated + * - otherwise there would be no point in this record. + * + * XLOG_FPI_FOR_HINT records are generated when a page needs to be + * WAL-logged because of a hint bit update. They are only generated + * when checksums and/or wal_log_hints are enabled. They may include + * no full-page images if full_page_writes was disabled when they were + * generated. In this case there is nothing to do here. + * + * No recovery conflicts are generated by these generic records - if a + * resource manager needs to generate conflicts, it has to define a + * separate WAL record type and redo routine. 
+ */ + for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + { + Buffer buffer; + + if (!XLogRecHasBlockImage(record, block_id)) + { + if (info == XLOG_FPI) + elog(ERROR, "XLOG_FPI record did not contain a full-page image"); + continue; + } + + if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED) + elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block"); + UnlockReleaseBuffer(buffer); + } + } + else if (info == XLOG_BACKUP_END) + { + /* nothing to do here, handled in xlogrecovery_redo() */ + } + else if (info == XLOG_PARAMETER_CHANGE) + { + xl_parameter_change xlrec; + + /* Update our copy of the parameters in pg_control */ + memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change)); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->MaxConnections = xlrec.MaxConnections; + ControlFile->max_worker_processes = xlrec.max_worker_processes; + ControlFile->max_wal_senders = xlrec.max_wal_senders; + ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts; + ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact; + ControlFile->wal_level = xlrec.wal_level; + ControlFile->wal_log_hints = xlrec.wal_log_hints; + + /* + * Update minRecoveryPoint to ensure that if recovery is aborted, we + * recover back up to this point before allowing hot standby again. + * This is important if the max_* settings are decreased, to ensure + * you don't run queries against the WAL preceding the change. The + * local copies cannot be updated as long as crash recovery is + * happening and we expect all the WAL to be replayed. + */ + if (InArchiveRecovery) + { + LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; + LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + } + if (LocalMinRecoveryPoint != InvalidXLogRecPtr && LocalMinRecoveryPoint < lsn) + { + TimeLineID replayTLI; + + (void) GetCurrentReplayRecPtr(&replayTLI); + ControlFile->minRecoveryPoint = lsn; + ControlFile->minRecoveryPointTLI = replayTLI; + } + + CommitTsParameterChange(xlrec.track_commit_timestamp, + ControlFile->track_commit_timestamp); + ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp; + + UpdateControlFile(); + LWLockRelease(ControlFileLock); + + /* Check to see if any parameter change gives a problem on recovery */ + CheckRequiredParameterValues(); + } + else if (info == XLOG_FPW_CHANGE) + { + bool fpw; + + memcpy(&fpw, XLogRecGetData(record), sizeof(bool)); + + /* + * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that + * do_pg_backup_start() and do_pg_backup_stop() can check whether + * full_page_writes has been disabled during online backup. + */ + if (!fpw) + { + SpinLockAcquire(&XLogCtl->info_lck); + if (XLogCtl->lastFpwDisableRecPtr < record->ReadRecPtr) + XLogCtl->lastFpwDisableRecPtr = record->ReadRecPtr; + SpinLockRelease(&XLogCtl->info_lck); + } + + /* Keep track of full_page_writes */ + lastFullPageWrites = fpw; + } +} + +/* + * Return the (possible) sync flag used for opening a file, depending on the + * value of the GUC wal_sync_method. + */ +static int +get_sync_bit(int method) +{ + int o_direct_flag = 0; + + /* If fsync is disabled, never open in sync mode */ + if (!enableFsync) + return 0; + + /* + * Optimize writes by bypassing kernel cache with O_DIRECT when using + * O_SYNC/O_FSYNC and O_DSYNC. 
But only if archiving and streaming are + * disabled, otherwise the archive command or walsender process will read + * the WAL soon after writing it, which is guaranteed to cause a physical + * read if we bypassed the kernel cache. We also skip the + * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same + * reason. + * + * Never use O_DIRECT in walreceiver process for similar reasons; the WAL + * written by walreceiver is normally read by the startup process soon + * after it's written. Also, walreceiver performs unaligned writes, which + * don't work with O_DIRECT, so it is required for correctness too. + */ + if (!XLogIsNeeded() && !AmWalReceiverProcess()) + o_direct_flag = PG_O_DIRECT; + + switch (method) + { + /* + * enum values for all sync options are defined even if they are + * not supported on the current platform. But if not, they are + * not included in the enum option array, and therefore will never + * be seen here. + */ + case SYNC_METHOD_FSYNC: + case SYNC_METHOD_FSYNC_WRITETHROUGH: + case SYNC_METHOD_FDATASYNC: + return 0; +#ifdef OPEN_SYNC_FLAG + case SYNC_METHOD_OPEN: + return OPEN_SYNC_FLAG | o_direct_flag; +#endif +#ifdef OPEN_DATASYNC_FLAG + case SYNC_METHOD_OPEN_DSYNC: + return OPEN_DATASYNC_FLAG | o_direct_flag; +#endif + default: + /* can't happen (unless we are out of sync with option array) */ + elog(ERROR, "unrecognized wal_sync_method: %d", method); + return 0; /* silence warning */ + } +} + +/* + * GUC support + */ +void +assign_xlog_sync_method(int new_sync_method, void *extra) +{ + if (sync_method != new_sync_method) + { + /* + * To ensure that no blocks escape unsynced, force an fsync on the + * currently open log segment (if any). Also, if the open flag is + * changing, close the log file so it will be reopened (with new flag + * bit) at next use. + */ + if (openLogFile >= 0) + { + pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN); + if (pg_fsync(openLogFile) != 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno; + + save_errno = errno; + XLogFileName(xlogfname, openLogTLI, openLogSegNo, + wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", xlogfname))); + } + + pgstat_report_wait_end(); + if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method)) + XLogFileClose(); + } + } +} + + +/* + * Issue appropriate kind of fsync (if any) for an XLOG output file. + * + * 'fd' is a file descriptor for the XLOG file to be fsync'd. + * 'segno' is for error reporting purposes. + */ +void +issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) +{ + char *msg = NULL; + instr_time start; + + Assert(tli != 0); + + /* + * Quick exit if fsync is disabled or write() has already synced the WAL + * file. 
+ */ + if (!enableFsync || + sync_method == SYNC_METHOD_OPEN || + sync_method == SYNC_METHOD_OPEN_DSYNC) + return; + + /* Measure I/O timing to sync the WAL file */ + if (track_wal_io_timing) + INSTR_TIME_SET_CURRENT(start); + + pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC); + switch (sync_method) + { + case SYNC_METHOD_FSYNC: + if (pg_fsync_no_writethrough(fd) != 0) + msg = _("could not fsync file \"%s\": %m"); + break; +#ifdef HAVE_FSYNC_WRITETHROUGH + case SYNC_METHOD_FSYNC_WRITETHROUGH: + if (pg_fsync_writethrough(fd) != 0) + msg = _("could not fsync write-through file \"%s\": %m"); + break; +#endif +#ifdef HAVE_FDATASYNC + case SYNC_METHOD_FDATASYNC: + if (pg_fdatasync(fd) != 0) + msg = _("could not fdatasync file \"%s\": %m"); + break; +#endif + case SYNC_METHOD_OPEN: + case SYNC_METHOD_OPEN_DSYNC: + /* not reachable */ + Assert(false); + break; + default: + elog(PANIC, "unrecognized wal_sync_method: %d", sync_method); + break; + } + + /* PANIC if failed to fsync */ + if (msg) + { + char xlogfname[MAXFNAMELEN]; + int save_errno = errno; + + XLogFileName(xlogfname, tli, segno, wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg(msg, xlogfname))); + } + + pgstat_report_wait_end(); + + /* + * Increment the I/O timing and the number of times WAL files were synced. + */ + if (track_wal_io_timing) + { + instr_time duration; + + INSTR_TIME_SET_CURRENT(duration); + INSTR_TIME_SUBTRACT(duration, start); + PendingWalStats.wal_sync_time += INSTR_TIME_GET_MICROSEC(duration); + } + + PendingWalStats.wal_sync++; +} + +/* + * do_pg_backup_start is the workhorse of the user-visible pg_backup_start() + * function. It creates the necessary starting checkpoint and constructs the + * backup label and tablespace map. + * + * Input parameters are "backupidstr" (the backup label string) and "fast" + * (if true, we do the checkpoint in immediate mode to make it faster). + * + * The backup label and tablespace map contents are appended to *labelfile and + * *tblspcmapfile, and the caller is responsible for including them in the + * backup archive as 'backup_label' and 'tablespace_map'. + * tblspcmapfile is required mainly for tar format in windows as native windows + * utilities are not able to create symlinks while extracting files from tar. + * However for consistency and platform-independence, we do it the same way + * everywhere. + * + * If "tablespaces" isn't NULL, it receives a list of tablespaceinfo structs + * describing the cluster's tablespaces. + * + * Returns the minimum WAL location that must be present to restore from this + * backup, and the corresponding timeline ID in *starttli_p. + * + * Every successfully started backup must be stopped by calling + * do_pg_backup_stop() or do_pg_abort_backup(). There can be many + * backups active at the same time. + * + * It is the responsibility of the caller of this function to verify the + * permissions of the calling user! + */ +XLogRecPtr +do_pg_backup_start(const char *backupidstr, bool fast, TimeLineID *starttli_p, + StringInfo labelfile, List **tablespaces, + StringInfo tblspcmapfile) +{ + bool backup_started_in_recovery = false; + XLogRecPtr checkpointloc; + XLogRecPtr startpoint; + TimeLineID starttli; + pg_time_t stamp_time; + char strfbuf[128]; + char xlogfilename[MAXFNAMELEN]; + XLogSegNo _logSegNo; + + backup_started_in_recovery = RecoveryInProgress(); + + /* + * During recovery, we don't need to check WAL level. 
Because, if WAL + * level is not sufficient, it's impossible to get here during recovery. + */ + if (!backup_started_in_recovery && !XLogIsNeeded()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL level not sufficient for making an online backup"), + errhint("wal_level must be set to \"replica\" or \"logical\" at server start."))); + + if (strlen(backupidstr) > MAXPGPATH) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("backup label too long (max %d bytes)", + MAXPGPATH))); + + /* + * Mark backup active in shared memory. We must do full-page WAL writes + * during an on-line backup even if not doing so at other times, because + * it's quite possible for the backup dump to obtain a "torn" (partially + * written) copy of a database page if it reads the page concurrently with + * our write to the same page. This can be fixed as long as the first + * write to the page in the WAL sequence is a full-page write. Hence, we + * turn on forcePageWrites and then force a CHECKPOINT, to ensure there + * are no dirty pages in shared memory that might get dumped while the + * backup is in progress without having a corresponding WAL record. (Once + * the backup is complete, we need not force full-page writes anymore, + * since we expect that any pages not modified during the backup interval + * must have been correctly captured by the backup.) + * + * Note that forcePageWrites has no effect during an online backup from + * the standby. + * + * We must hold all the insertion locks to change the value of + * forcePageWrites, to ensure adequate interlocking against + * XLogInsertRecord(). + */ + WALInsertLockAcquireExclusive(); + XLogCtl->Insert.runningBackups++; + XLogCtl->Insert.forcePageWrites = true; + WALInsertLockRelease(); + + /* Ensure we release forcePageWrites if fail below */ + PG_ENSURE_ERROR_CLEANUP(pg_backup_start_callback, (Datum) 0); + { + bool gotUniqueStartpoint = false; + DIR *tblspcdir; + struct dirent *de; + tablespaceinfo *ti; + int datadirpathlen; + + /* + * Force an XLOG file switch before the checkpoint, to ensure that the + * WAL segment the checkpoint is written to doesn't contain pages with + * old timeline IDs. That would otherwise happen if you called + * pg_backup_start() right after restoring from a PITR archive: the + * first WAL segment containing the startup checkpoint has pages in + * the beginning with the old timeline ID. That can cause trouble at + * recovery: we won't have a history file covering the old timeline if + * pg_wal directory was not included in the base backup and the WAL + * archive was cleared too before starting the backup. + * + * This also ensures that we have emitted a WAL page header that has + * XLP_BKP_REMOVABLE off before we emit the checkpoint record. + * Therefore, if a WAL archiver (such as pglesslog) is trying to + * compress out removable backup blocks, it won't remove any that + * occur after this point. + * + * During recovery, we skip forcing XLOG file switch, which means that + * the backup taken during recovery is not available for the special + * recovery case described above. + */ + if (!backup_started_in_recovery) + RequestXLogSwitch(false); + + do + { + bool checkpointfpw; + + /* + * Force a CHECKPOINT. Aside from being necessary to prevent torn + * page problems, this guarantees that two successive backup runs + * will have different checkpoint positions and hence different + * history file names, even if nothing happened in between. 
+ * + * During recovery, establish a restartpoint if possible. We use + * the last restartpoint as the backup starting checkpoint. This + * means that two successive backup runs can have same checkpoint + * positions. + * + * Since the fact that we are executing do_pg_backup_start() + * during recovery means that checkpointer is running, we can use + * RequestCheckpoint() to establish a restartpoint. + * + * We use CHECKPOINT_IMMEDIATE only if requested by user (via + * passing fast = true). Otherwise this can take awhile. + */ + RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | + (fast ? CHECKPOINT_IMMEDIATE : 0)); + + /* + * Now we need to fetch the checkpoint record location, and also + * its REDO pointer. The oldest point in WAL that would be needed + * to restore starting from the checkpoint is precisely the REDO + * pointer. + */ + LWLockAcquire(ControlFileLock, LW_SHARED); + checkpointloc = ControlFile->checkPoint; + startpoint = ControlFile->checkPointCopy.redo; + starttli = ControlFile->checkPointCopy.ThisTimeLineID; + checkpointfpw = ControlFile->checkPointCopy.fullPageWrites; + LWLockRelease(ControlFileLock); + + if (backup_started_in_recovery) + { + XLogRecPtr recptr; + + /* + * Check to see if all WAL replayed during online backup + * (i.e., since last restartpoint used as backup starting + * checkpoint) contain full-page writes. + */ + SpinLockAcquire(&XLogCtl->info_lck); + recptr = XLogCtl->lastFpwDisableRecPtr; + SpinLockRelease(&XLogCtl->info_lck); + + if (!checkpointfpw || startpoint <= recptr) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL generated with full_page_writes=off was replayed " + "since last restartpoint"), + errhint("This means that the backup being taken on the standby " + "is corrupt and should not be used. " + "Enable full_page_writes and run CHECKPOINT on the primary, " + "and then try an online backup again."))); + + /* + * During recovery, since we don't use the end-of-backup WAL + * record and don't write the backup history file, the + * starting WAL location doesn't need to be unique. This means + * that two base backups started at the same time might use + * the same checkpoint as starting locations. + */ + gotUniqueStartpoint = true; + } + + /* + * If two base backups are started at the same time (in WAL sender + * processes), we need to make sure that they use different + * checkpoints as starting locations, because we use the starting + * WAL location as a unique identifier for the base backup in the + * end-of-backup WAL record and when we write the backup history + * file. Perhaps it would be better generate a separate unique ID + * for each backup instead of forcing another checkpoint, but + * taking a checkpoint right after another is not that expensive + * either because only few buffers have been dirtied yet. + */ + WALInsertLockAcquireExclusive(); + if (XLogCtl->Insert.lastBackupStart < startpoint) + { + XLogCtl->Insert.lastBackupStart = startpoint; + gotUniqueStartpoint = true; + } + WALInsertLockRelease(); + } while (!gotUniqueStartpoint); + + XLByteToSeg(startpoint, _logSegNo, wal_segment_size); + XLogFileName(xlogfilename, starttli, _logSegNo, wal_segment_size); + + /* + * Construct tablespace_map file. 
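+		 * Each line of the map has the form "<tablespace OID> <symlink target>"
+		 * (newlines, carriage returns and backslashes in the target are
+		 * backslash-escaped), for example "16391 /mnt/ssd/space1".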
+ */ + datadirpathlen = strlen(DataDir); + + /* Collect information about all tablespaces */ + tblspcdir = AllocateDir("pg_tblspc"); + while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL) + { + char fullpath[MAXPGPATH + 10]; + char linkpath[MAXPGPATH]; + char *relpath = NULL; + int rllen; + StringInfoData escapedpath; + char *s; + + /* Skip anything that doesn't look like a tablespace */ + if (strspn(de->d_name, "0123456789") != strlen(de->d_name)) + continue; + + snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name); + + /* + * Skip anything that isn't a symlink/junction. For testing only, + * we sometimes use allow_in_place_tablespaces to create + * directories directly under pg_tblspc, which would fail below. + */ + if (get_dirent_type(fullpath, de, false, ERROR) != PGFILETYPE_LNK) + continue; + +#if defined(HAVE_READLINK) || defined(WIN32) + rllen = readlink(fullpath, linkpath, sizeof(linkpath)); + if (rllen < 0) + { + ereport(WARNING, + (errmsg("could not read symbolic link \"%s\": %m", + fullpath))); + continue; + } + else if (rllen >= sizeof(linkpath)) + { + ereport(WARNING, + (errmsg("symbolic link \"%s\" target is too long", + fullpath))); + continue; + } + linkpath[rllen] = '\0'; + + /* + * Build a backslash-escaped version of the link path to include + * in the tablespace map file. + */ + initStringInfo(&escapedpath); + for (s = linkpath; *s; s++) + { + if (*s == '\n' || *s == '\r' || *s == '\\') + appendStringInfoChar(&escapedpath, '\\'); + appendStringInfoChar(&escapedpath, *s); + } + + /* + * Relpath holds the relative path of the tablespace directory + * when it's located within PGDATA, or NULL if it's located + * elsewhere. + */ + if (rllen > datadirpathlen && + strncmp(linkpath, DataDir, datadirpathlen) == 0 && + IS_DIR_SEP(linkpath[datadirpathlen])) + relpath = linkpath + datadirpathlen + 1; + + ti = palloc(sizeof(tablespaceinfo)); + ti->oid = pstrdup(de->d_name); + ti->path = pstrdup(linkpath); + ti->rpath = relpath ? pstrdup(relpath) : NULL; + ti->size = -1; + + if (tablespaces) + *tablespaces = lappend(*tablespaces, ti); + + appendStringInfo(tblspcmapfile, "%s %s\n", + ti->oid, escapedpath.data); + + pfree(escapedpath.data); +#else + + /* + * If the platform does not have symbolic links, it should not be + * possible to have tablespaces - clearly somebody else created + * them. Warn about it and ignore. + */ + ereport(WARNING, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("tablespaces are not supported on this platform"))); +#endif + } + FreeDir(tblspcdir); + + /* + * Construct backup label file. + */ + + /* Use the log timezone here, not the session timezone */ + stamp_time = (pg_time_t) time(NULL); + pg_strftime(strfbuf, sizeof(strfbuf), + "%Y-%m-%d %H:%M:%S %Z", + pg_localtime(&stamp_time, log_timezone)); + appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n", + LSN_FORMAT_ARGS(startpoint), xlogfilename); + appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n", + LSN_FORMAT_ARGS(checkpointloc)); + appendStringInfo(labelfile, "BACKUP METHOD: streamed\n"); + appendStringInfo(labelfile, "BACKUP FROM: %s\n", + backup_started_in_recovery ? "standby" : "primary"); + appendStringInfo(labelfile, "START TIME: %s\n", strfbuf); + appendStringInfo(labelfile, "LABEL: %s\n", backupidstr); + appendStringInfo(labelfile, "START TIMELINE: %u\n", starttli); + } + PG_END_ENSURE_ERROR_CLEANUP(pg_backup_start_callback, (Datum) 0); + + /* + * Mark that the start phase has correctly finished for the backup. 
+ */ + sessionBackupState = SESSION_BACKUP_RUNNING; + + /* + * We're done. As a convenience, return the starting WAL location. + */ + if (starttli_p) + *starttli_p = starttli; + return startpoint; +} + +/* Error cleanup callback for pg_backup_start */ +static void +pg_backup_start_callback(int code, Datum arg) +{ + /* Update backup counters and forcePageWrites on failure */ + WALInsertLockAcquireExclusive(); + + Assert(XLogCtl->Insert.runningBackups > 0); + XLogCtl->Insert.runningBackups--; + + if (XLogCtl->Insert.runningBackups == 0) + { + XLogCtl->Insert.forcePageWrites = false; + } + WALInsertLockRelease(); +} + +/* + * Utility routine to fetch the session-level status of a backup running. + */ +SessionBackupState +get_backup_status(void) +{ + return sessionBackupState; +} + +/* + * do_pg_backup_stop + * + * Utility function called at the end of an online backup. It cleans up the + * backup state and can optionally wait for WAL segments to be archived. + * + * Returns the last WAL location that must be present to restore from this + * backup, and the corresponding timeline ID in *stoptli_p. + * + * It is the responsibility of the caller of this function to verify the + * permissions of the calling user! + */ +XLogRecPtr +do_pg_backup_stop(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) +{ + bool backup_started_in_recovery = false; + XLogRecPtr startpoint; + XLogRecPtr stoppoint; + TimeLineID stoptli; + pg_time_t stamp_time; + char strfbuf[128]; + char histfilepath[MAXPGPATH]; + char startxlogfilename[MAXFNAMELEN]; + char stopxlogfilename[MAXFNAMELEN]; + char lastxlogfilename[MAXFNAMELEN]; + char histfilename[MAXFNAMELEN]; + char backupfrom[20]; + XLogSegNo _logSegNo; + FILE *fp; + char ch; + int seconds_before_warning; + int waits = 0; + bool reported_waiting = false; + char *remaining; + char *ptr; + uint32 hi, + lo; + + backup_started_in_recovery = RecoveryInProgress(); + + /* + * During recovery, we don't need to check WAL level. Because, if WAL + * level is not sufficient, it's impossible to get here during recovery. + */ + if (!backup_started_in_recovery && !XLogIsNeeded()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL level not sufficient for making an online backup"), + errhint("wal_level must be set to \"replica\" or \"logical\" at server start."))); + + /* + * OK to update backup counters, forcePageWrites, and session-level lock. + * + * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them. + * Otherwise they can be updated inconsistently, and which might cause + * do_pg_abort_backup() to fail. + */ + WALInsertLockAcquireExclusive(); + + /* + * It is expected that each do_pg_backup_start() call is matched by + * exactly one do_pg_backup_stop() call. + */ + Assert(XLogCtl->Insert.runningBackups > 0); + XLogCtl->Insert.runningBackups--; + + if (XLogCtl->Insert.runningBackups == 0) + { + XLogCtl->Insert.forcePageWrites = false; + } + + /* + * Clean up session-level lock. + * + * You might think that WALInsertLockRelease() can be called before + * cleaning up session-level lock because session-level lock doesn't need + * to be protected with WAL insertion lock. But since + * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be + * cleaned up before it. + */ + sessionBackupState = SESSION_BACKUP_NONE; + + WALInsertLockRelease(); + + /* + * Read and parse the START WAL LOCATION line (this code is pretty crude, + * but we are not expecting any variability in the file format). 
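+	 * A label file typically begins with a line such as
+	 *   START WAL LOCATION: 0/2000028 (file 000000010000000000000002)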
+ */ + if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c", + &hi, &lo, startxlogfilename, + &ch) != 4 || ch != '\n') + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); + startpoint = ((uint64) hi) << 32 | lo; + remaining = strchr(labelfile, '\n') + 1; /* %n is not portable enough */ + + /* + * Parse the BACKUP FROM line. If we are taking an online backup from the + * standby, we confirm that the standby has not been promoted during the + * backup. + */ + ptr = strstr(remaining, "BACKUP FROM:"); + if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); + if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("the standby was promoted during online backup"), + errhint("This means that the backup being taken is corrupt " + "and should not be used. " + "Try taking another online backup."))); + + /* + * During recovery, we don't write an end-of-backup record. We assume that + * pg_control was backed up last and its minimum recovery point can be + * available as the backup end location. Since we don't have an + * end-of-backup record, we use the pg_control value to check whether + * we've reached the end of backup when starting recovery from this + * backup. We have no way of checking if pg_control wasn't backed up last + * however. + * + * We don't force a switch to new WAL file but it is still possible to + * wait for all the required files to be archived if waitforarchive is + * true. This is okay if we use the backup to start a standby and fetch + * the missing WAL using streaming replication. But in the case of an + * archive recovery, a user should set waitforarchive to true and wait for + * them to be archived to ensure that all the required files are + * available. + * + * We return the current minimum recovery point as the backup end + * location. Note that it can be greater than the exact backup end + * location if the minimum recovery point is updated after the backup of + * pg_control. This is harmless for current uses. + * + * XXX currently a backup history file is for informational and debug + * purposes only. It's not essential for an online backup. Furthermore, + * even if it's created, it will not be archived during recovery because + * an archiver is not invoked. So it doesn't seem worthwhile to write a + * backup history file during recovery. + */ + if (backup_started_in_recovery) + { + XLogRecPtr recptr; + + /* + * Check to see if all WAL replayed during online backup contain + * full-page writes. + */ + SpinLockAcquire(&XLogCtl->info_lck); + recptr = XLogCtl->lastFpwDisableRecPtr; + SpinLockRelease(&XLogCtl->info_lck); + + if (startpoint <= recptr) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL generated with full_page_writes=off was replayed " + "during online backup"), + errhint("This means that the backup being taken on the standby " + "is corrupt and should not be used. 
" + "Enable full_page_writes and run CHECKPOINT on the primary, " + "and then try an online backup again."))); + + + LWLockAcquire(ControlFileLock, LW_SHARED); + stoppoint = ControlFile->minRecoveryPoint; + stoptli = ControlFile->minRecoveryPointTLI; + LWLockRelease(ControlFileLock); + } + else + { + /* + * Write the backup-end xlog record + */ + XLogBeginInsert(); + XLogRegisterData((char *) (&startpoint), sizeof(startpoint)); + stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END); + + /* + * Given that we're not in recovery, InsertTimeLineID is set and can't + * change, so we can read it without a lock. + */ + stoptli = XLogCtl->InsertTimeLineID; + + /* + * Force a switch to a new xlog segment file, so that the backup is + * valid as soon as archiver moves out the current segment file. + */ + RequestXLogSwitch(false); + + XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size); + XLogFileName(stopxlogfilename, stoptli, _logSegNo, wal_segment_size); + + /* Use the log timezone here, not the session timezone */ + stamp_time = (pg_time_t) time(NULL); + pg_strftime(strfbuf, sizeof(strfbuf), + "%Y-%m-%d %H:%M:%S %Z", + pg_localtime(&stamp_time, log_timezone)); + + /* + * Write the backup history file + */ + XLByteToSeg(startpoint, _logSegNo, wal_segment_size); + BackupHistoryFilePath(histfilepath, stoptli, _logSegNo, + startpoint, wal_segment_size); + fp = AllocateFile(histfilepath, "w"); + if (!fp) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + histfilepath))); + fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n", + LSN_FORMAT_ARGS(startpoint), startxlogfilename); + fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n", + LSN_FORMAT_ARGS(stoppoint), stopxlogfilename); + + /* + * Transfer remaining lines including label and start timeline to + * history file. + */ + fprintf(fp, "%s", remaining); + fprintf(fp, "STOP TIME: %s\n", strfbuf); + fprintf(fp, "STOP TIMELINE: %u\n", stoptli); + if (fflush(fp) || ferror(fp) || FreeFile(fp)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + histfilepath))); + + /* + * Clean out any no-longer-needed history files. As a side effect, + * this will post a .ready file for the newly created history file, + * notifying the archiver that history file may be archived + * immediately. + */ + CleanupBackupHistory(); + } + + /* + * If archiving is enabled, wait for all the required WAL files to be + * archived before returning. If archiving isn't enabled, the required WAL + * needs to be transported via streaming replication (hopefully with + * wal_keep_size set high enough), or some more exotic mechanism like + * polling and copying files from pg_wal with script. We have no knowledge + * of those mechanisms, so it's up to the user to ensure that he gets all + * the required WAL. + * + * We wait until both the last WAL file filled during backup and the + * history file have been archived, and assume that the alphabetic sorting + * property of the WAL files ensures any earlier WAL files are safely + * archived as well. + * + * We wait forever, since archive_command is supposed to work and we + * assume the admin wanted his backup to work completely. If you don't + * wish to wait, then either waitforarchive should be passed in as false, + * or you can set statement_timeout. Also, some notices are issued to + * clue in anyone who might be doing this interactively. 
+ */ + + if (waitforarchive && + ((!backup_started_in_recovery && XLogArchivingActive()) || + (backup_started_in_recovery && XLogArchivingAlways()))) + { + XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size); + XLogFileName(lastxlogfilename, stoptli, _logSegNo, wal_segment_size); + + XLByteToSeg(startpoint, _logSegNo, wal_segment_size); + BackupHistoryFileName(histfilename, stoptli, _logSegNo, + startpoint, wal_segment_size); + + seconds_before_warning = 60; + waits = 0; + + while (XLogArchiveIsBusy(lastxlogfilename) || + XLogArchiveIsBusy(histfilename)) + { + CHECK_FOR_INTERRUPTS(); + + if (!reported_waiting && waits > 5) + { + ereport(NOTICE, + (errmsg("base backup done, waiting for required WAL segments to be archived"))); + reported_waiting = true; + } + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 1000L, + WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE); + ResetLatch(MyLatch); + + if (++waits >= seconds_before_warning) + { + seconds_before_warning *= 2; /* This wraps in >10 years... */ + ereport(WARNING, + (errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)", + waits), + errhint("Check that your archive_command is executing properly. " + "You can safely cancel this backup, " + "but the database backup will not be usable without all the WAL segments."))); + } + } + + ereport(NOTICE, + (errmsg("all required WAL segments have been archived"))); + } + else if (waitforarchive) + ereport(NOTICE, + (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup"))); + + /* + * We're done. As a convenience, return the ending WAL location. + */ + if (stoptli_p) + *stoptli_p = stoptli; + return stoppoint; +} + + +/* + * do_pg_abort_backup: abort a running backup + * + * This does just the most basic steps of do_pg_backup_stop(), by taking the + * system out of backup mode, thus making it a lot more safe to call from + * an error handler. + * + * The caller can pass 'arg' as 'true' or 'false' to control whether a warning + * is emitted. + * + * NB: This gets used as a before_shmem_exit handler, hence the odd-looking + * signature. + */ +void +do_pg_abort_backup(int code, Datum arg) +{ + bool emit_warning = DatumGetBool(arg); + + /* + * Quick exit if session does not have a running backup. + */ + if (sessionBackupState != SESSION_BACKUP_RUNNING) + return; + + WALInsertLockAcquireExclusive(); + Assert(XLogCtl->Insert.runningBackups > 0); + XLogCtl->Insert.runningBackups--; + + if (XLogCtl->Insert.runningBackups == 0) + { + XLogCtl->Insert.forcePageWrites = false; + } + + sessionBackupState = SESSION_BACKUP_NONE; + WALInsertLockRelease(); + + if (emit_warning) + ereport(WARNING, + (errmsg("aborting backup due to backend exiting before pg_backup_stop was called"))); +} + +/* + * Register a handler that will warn about unterminated backups at end of + * session, unless this has already been done. 
+ */ +void +register_persistent_abort_backup_handler(void) +{ + static bool already_done = false; + + if (already_done) + return; + before_shmem_exit(do_pg_abort_backup, DatumGetBool(true)); + already_done = true; +} + +/* + * Get latest WAL insert pointer + */ +XLogRecPtr +GetXLogInsertRecPtr(void) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + uint64 current_bytepos; + + SpinLockAcquire(&Insert->insertpos_lck); + current_bytepos = Insert->CurrBytePos; + SpinLockRelease(&Insert->insertpos_lck); + + return XLogBytePosToRecPtr(current_bytepos); +} + +/* + * Get latest WAL write pointer + */ +XLogRecPtr +GetXLogWriteRecPtr(void) +{ + SpinLockAcquire(&XLogCtl->info_lck); + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + + return LogwrtResult.Write; +} + +/* + * Returns the redo pointer of the last checkpoint or restartpoint. This is + * the oldest point in WAL that we still need, if we have to restart recovery. + */ +void +GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli) +{ + LWLockAcquire(ControlFileLock, LW_SHARED); + *oldrecptr = ControlFile->checkPointCopy.redo; + *oldtli = ControlFile->checkPointCopy.ThisTimeLineID; + LWLockRelease(ControlFileLock); +} + +/* Thin wrapper around ShutdownWalRcv(). */ +void +XLogShutdownWalRcv(void) +{ + ShutdownWalRcv(); + + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + XLogCtl->InstallXLogFileSegmentActive = false; + LWLockRelease(ControlFileLock); +} + +/* Enable WAL file recycling and preallocation. */ +void +SetInstallXLogFileSegmentActive(void) +{ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + XLogCtl->InstallXLogFileSegmentActive = true; + LWLockRelease(ControlFileLock); +} + +bool +IsInstallXLogFileSegmentActive(void) +{ + bool result; + + LWLockAcquire(ControlFileLock, LW_SHARED); + result = XLogCtl->InstallXLogFileSegmentActive; + LWLockRelease(ControlFileLock); + + return result; +} + +/* + * Update the WalWriterSleeping flag. + */ +void +SetWalWriterSleeping(bool sleeping) +{ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->WalWriterSleeping = sleeping; + SpinLockRelease(&XLogCtl->info_lck); +} diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c new file mode 100644 index 0000000..6516a74 --- /dev/null +++ b/src/backend/access/transam/xlogarchive.c @@ -0,0 +1,762 @@ +/*------------------------------------------------------------------------- + * + * xlogarchive.c + * Functions for archiving WAL files and restoring from the archive. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/xlogarchive.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <sys/stat.h> +#include <sys/wait.h> +#include <signal.h> +#include <unistd.h> + +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogarchive.h" +#include "common/archive.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/startup.h" +#include "postmaster/pgarch.h" +#include "replication/walsender.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" + +/* + * Attempt to retrieve the specified file from off-line archival storage. + * If successful, fill "path" with its complete path (note that this will be + * a temp file name that doesn't follow the normal naming convention), and + * return true. 
+ * + * If not successful, fill "path" with the name of the normal on-line file + * (which may or may not actually exist, but we'll try to use it), and return + * false. + * + * For fixed-size files, the caller may pass the expected size as an + * additional crosscheck on successful recovery. If the file size is not + * known, set expectedSize = 0. + * + * When 'cleanupEnabled' is false, refrain from deleting any old WAL segments + * in the archive. This is used when fetching the initial checkpoint record, + * when we are not yet sure how far back we need the WAL. + */ +bool +RestoreArchivedFile(char *path, const char *xlogfname, + const char *recovername, off_t expectedSize, + bool cleanupEnabled) +{ + char xlogpath[MAXPGPATH]; + char *xlogRestoreCmd; + char lastRestartPointFname[MAXPGPATH]; + int rc; + struct stat stat_buf; + XLogSegNo restartSegNo; + XLogRecPtr restartRedoPtr; + TimeLineID restartTli; + + /* + * Ignore restore_command when not in archive recovery (meaning we are in + * crash recovery). + */ + if (!ArchiveRecoveryRequested) + goto not_available; + + /* In standby mode, restore_command might not be supplied */ + if (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0) + goto not_available; + + /* + * When doing archive recovery, we always prefer an archived log file even + * if a file of the same name exists in XLOGDIR. The reason is that the + * file in XLOGDIR could be an old, un-filled or partly-filled version + * that was copied and restored as part of backing up $PGDATA. + * + * We could try to optimize this slightly by checking the local copy + * lastchange timestamp against the archived copy, but we have no API to + * do this, nor can we guarantee that the lastchange timestamp was + * preserved correctly when we copied to archive. Our aim is robustness, + * so we elect not to do this. + * + * If we cannot obtain the log file from the archive, however, we will try + * to use the XLOGDIR file if it exists. This is so that we can make use + * of log segments that weren't yet transferred to the archive. + * + * Notice that we don't actually overwrite any files when we copy back + * from archive because the restore_command may inadvertently restore + * inappropriate xlogs, or they may be corrupt, so we may wish to fallback + * to the segments remaining in current XLOGDIR later. The + * copy-from-archive filename is always the same, ensuring that we don't + * run out of disk space on long recoveries. + */ + snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername); + + /* + * Make sure there is no existing file named recovername. + */ + if (stat(xlogpath, &stat_buf) != 0) + { + if (errno != ENOENT) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + xlogpath))); + } + else + { + if (unlink(xlogpath) != 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + xlogpath))); + } + + /* + * Calculate the archive file cutoff point for use during log shipping + * replication. All files earlier than this point can be deleted from the + * archive, though there is no requirement to do so. + * + * If cleanup is not enabled, initialise this with the filename of + * InvalidXLogRecPtr, which will prevent the deletion of any WAL files + * from the archive because of the alphabetic sorting property of WAL + * filenames. 
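+	 * (XLogFileName() called with timeline 0 and segment 0 below produces
+	 * "000000000000000000000000", which sorts before any real WAL segment
+	 * file name.)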
+ * + * Once we have successfully located the redo pointer of the checkpoint + * from which we start recovery we never request a file prior to the redo + * pointer of the last restartpoint. When redo begins we know that we have + * successfully located it, so there is no need for additional status + * flags to signify the point when we can begin deleting WAL files from + * the archive. + */ + if (cleanupEnabled) + { + GetOldestRestartPoint(&restartRedoPtr, &restartTli); + XLByteToSeg(restartRedoPtr, restartSegNo, wal_segment_size); + XLogFileName(lastRestartPointFname, restartTli, restartSegNo, + wal_segment_size); + /* we shouldn't need anything earlier than last restart point */ + Assert(strcmp(lastRestartPointFname, xlogfname) <= 0); + } + else + XLogFileName(lastRestartPointFname, 0, 0L, wal_segment_size); + + /* Build the restore command to execute */ + xlogRestoreCmd = BuildRestoreCommand(recoveryRestoreCommand, + xlogpath, xlogfname, + lastRestartPointFname); + if (xlogRestoreCmd == NULL) + elog(ERROR, "could not build restore command \"%s\"", + recoveryRestoreCommand); + + ereport(DEBUG3, + (errmsg_internal("executing restore command \"%s\"", + xlogRestoreCmd))); + + pgstat_report_wait_start(WAIT_EVENT_RESTORE_COMMAND); + + /* + * PreRestoreCommand() informs the SIGTERM handler for the startup process + * that it should proc_exit() right away. This is done for the duration + * of the system() call because there isn't a good way to break out while + * it is executing. Since we might call proc_exit() in a signal handler, + * it is best to put any additional logic before or after the + * PreRestoreCommand()/PostRestoreCommand() section. + */ + PreRestoreCommand(); + + /* + * Copy xlog from archival storage to XLOGDIR + */ + rc = system(xlogRestoreCmd); + + PostRestoreCommand(); + + pgstat_report_wait_end(); + pfree(xlogRestoreCmd); + + if (rc == 0) + { + /* + * command apparently succeeded, but let's make sure the file is + * really there now and has the correct size. + */ + if (stat(xlogpath, &stat_buf) == 0) + { + if (expectedSize > 0 && stat_buf.st_size != expectedSize) + { + int elevel; + + /* + * If we find a partial file in standby mode, we assume it's + * because it's just being copied to the archive, and keep + * trying. + * + * Otherwise treat a wrong-sized file as FATAL to ensure the + * DBA would notice it, but is that too strong? We could try + * to plow ahead with a local copy of the file ... but the + * problem is that there probably isn't one, and we'd + * incorrectly conclude we've reached the end of WAL and we're + * done recovering ... + */ + if (StandbyMode && stat_buf.st_size < expectedSize) + elevel = DEBUG1; + else + elevel = FATAL; + ereport(elevel, + (errmsg("archive file \"%s\" has wrong size: %lld instead of %lld", + xlogfname, + (long long int) stat_buf.st_size, + (long long int) expectedSize))); + return false; + } + else + { + ereport(LOG, + (errmsg("restored log file \"%s\" from archive", + xlogfname))); + strcpy(path, xlogpath); + return true; + } + } + else + { + /* stat failed */ + int elevel = (errno == ENOENT) ? LOG : FATAL; + + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", xlogpath), + errdetail("restore_command returned a zero exit status, but stat() failed."))); + } + } + + /* + * Remember, we rollforward UNTIL the restore fails so failure here is + * just part of the process... 
that makes it difficult to determine + * whether the restore failed because there isn't an archive to restore, + * or because the administrator has specified the restore program + * incorrectly. We have to assume the former. + * + * However, if the failure was due to any sort of signal, it's best to + * punt and abort recovery. (If we "return false" here, upper levels will + * assume that recovery is complete and start up the database!) It's + * essential to abort on child SIGINT and SIGQUIT, because per spec + * system() ignores SIGINT and SIGQUIT while waiting; if we see one of + * those it's a good bet we should have gotten it too. + * + * On SIGTERM, assume we have received a fast shutdown request, and exit + * cleanly. It's pure chance whether we receive the SIGTERM first, or the + * child process. If we receive it first, the signal handler will call + * proc_exit, otherwise we do it here. If we or the child process received + * SIGTERM for any other reason than a fast shutdown request, postmaster + * will perform an immediate shutdown when it sees us exiting + * unexpectedly. + * + * We treat hard shell errors such as "command not found" as fatal, too. + */ + if (wait_result_is_signal(rc, SIGTERM)) + proc_exit(1); + + ereport(wait_result_is_any_signal(rc, true) ? FATAL : DEBUG2, + (errmsg("could not restore file \"%s\" from archive: %s", + xlogfname, wait_result_to_str(rc)))); + +not_available: + + /* + * if an archived file is not available, there might still be a version of + * this file in XLOGDIR, so return that as the filename to open. + * + * In many recovery scenarios we expect this to fail also, but if so that + * just means we've reached the end of WAL. + */ + snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname); + return false; +} + +/* + * Attempt to execute an external shell command during recovery. + * + * 'command' is the shell command to be executed, 'commandName' is a + * human-readable name describing the command emitted in the logs. If + * 'failOnSignal' is true and the command is killed by a signal, a FATAL + * error is thrown. Otherwise a WARNING is emitted. + * + * This is currently used for recovery_end_command and archive_cleanup_command. + */ +void +ExecuteRecoveryCommand(const char *command, const char *commandName, + bool failOnSignal, uint32 wait_event_info) +{ + char xlogRecoveryCmd[MAXPGPATH]; + char lastRestartPointFname[MAXPGPATH]; + char *dp; + char *endp; + const char *sp; + int rc; + XLogSegNo restartSegNo; + XLogRecPtr restartRedoPtr; + TimeLineID restartTli; + + Assert(command && commandName); + + /* + * Calculate the archive file cutoff point for use during log shipping + * replication. All files earlier than this point can be deleted from the + * archive, though there is no requirement to do so. 
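+	 * For example, with archive_cleanup_command set to
+	 * 'pg_archivecleanup /mnt/server/archive %r', the %r below is replaced
+	 * with the file name of that cutoff segment.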
+ */ + GetOldestRestartPoint(&restartRedoPtr, &restartTli); + XLByteToSeg(restartRedoPtr, restartSegNo, wal_segment_size); + XLogFileName(lastRestartPointFname, restartTli, restartSegNo, + wal_segment_size); + + /* + * construct the command to be executed + */ + dp = xlogRecoveryCmd; + endp = xlogRecoveryCmd + MAXPGPATH - 1; + *endp = '\0'; + + for (sp = command; *sp; sp++) + { + if (*sp == '%') + { + switch (sp[1]) + { + case 'r': + /* %r: filename of last restartpoint */ + sp++; + strlcpy(dp, lastRestartPointFname, endp - dp); + dp += strlen(dp); + break; + case '%': + /* convert %% to a single % */ + sp++; + if (dp < endp) + *dp++ = *sp; + break; + default: + /* otherwise treat the % as not special */ + if (dp < endp) + *dp++ = *sp; + break; + } + } + else + { + if (dp < endp) + *dp++ = *sp; + } + } + *dp = '\0'; + + ereport(DEBUG3, + (errmsg_internal("executing %s \"%s\"", commandName, command))); + + /* + * execute the constructed command + */ + pgstat_report_wait_start(wait_event_info); + rc = system(xlogRecoveryCmd); + pgstat_report_wait_end(); + + if (rc != 0) + { + /* + * If the failure was due to any sort of signal, it's best to punt and + * abort recovery. See comments in RestoreArchivedFile(). + */ + ereport((failOnSignal && wait_result_is_any_signal(rc, true)) ? FATAL : WARNING, + /*------ + translator: First %s represents a postgresql.conf parameter name like + "recovery_end_command", the 2nd is the value of that parameter, the + third an already translated error message. */ + (errmsg("%s \"%s\": %s", commandName, + command, wait_result_to_str(rc)))); + } +} + + +/* + * A file was restored from the archive under a temporary filename (path), + * and now we want to keep it. Rename it under the permanent filename in + * pg_wal (xlogfname), replacing any existing file with the same name. + */ +void +KeepFileRestoredFromArchive(const char *path, const char *xlogfname) +{ + char xlogfpath[MAXPGPATH]; + bool reload = false; + struct stat statbuf; + + snprintf(xlogfpath, MAXPGPATH, XLOGDIR "/%s", xlogfname); + + if (stat(xlogfpath, &statbuf) == 0) + { + char oldpath[MAXPGPATH]; + +#ifdef WIN32 + static unsigned int deletedcounter = 1; + + /* + * On Windows, if another process (e.g a walsender process) holds the + * file open in FILE_SHARE_DELETE mode, unlink will succeed, but the + * file will still show up in directory listing until the last handle + * is closed, and we cannot rename the new file in its place until + * that. To avoid that problem, rename the old file to a temporary + * name first. Use a counter to create a unique filename, because the + * same file might be restored from the archive multiple times, and a + * walsender could still be holding onto an old deleted version of it. + */ + snprintf(oldpath, MAXPGPATH, "%s.deleted%u", + xlogfpath, deletedcounter++); + if (rename(xlogfpath, oldpath) != 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + xlogfpath, oldpath))); + } +#else + /* same-size buffers, so this never truncates */ + strlcpy(oldpath, xlogfpath, MAXPGPATH); +#endif + if (unlink(oldpath) != 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + xlogfpath))); + reload = true; + } + + durable_rename(path, xlogfpath, ERROR); + + /* + * Create .done file forcibly to prevent the restored segment from being + * archived again later. 
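The ".ready" and ".done" markers referred to here live under pg_wal/archive_status and simply mirror the segment's own name; the archiver renames ".ready" to ".done" once archive_command has succeeded for that file. A standalone sketch of the path construction, mirroring StatusFilePath(); the segment name is illustrative:

#include <stdio.h>

int
main(void)
{
	const char *xlog = "0000000100000001000000C6";	/* illustrative segment name */
	char		ready[256];
	char		done[256];

	/* XLOGDIR is "pg_wal" in a live cluster; archive_status sits inside it */
	snprintf(ready, sizeof(ready), "pg_wal/archive_status/%s.ready", xlog);
	snprintf(done, sizeof(done), "pg_wal/archive_status/%s.done", xlog);

	printf("%s\n%s\n", ready, done);
	return 0;
}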
+ */ + if (XLogArchiveMode != ARCHIVE_MODE_ALWAYS) + XLogArchiveForceDone(xlogfname); + else + XLogArchiveNotify(xlogfname); + + /* + * If the existing file was replaced, since walsenders might have it open, + * request them to reload a currently-open segment. This is only required + * for WAL segments, walsenders don't hold other files open, but there's + * no harm in doing this too often, and we don't know what kind of a file + * we're dealing with here. + */ + if (reload) + WalSndRqstFileReload(); + + /* + * Signal walsender that new WAL has arrived. Again, this isn't necessary + * if we restored something other than a WAL segment, but it does no harm + * either. + */ + WalSndWakeup(); +} + +/* + * XLogArchiveNotify + * + * Create an archive notification file + * + * The name of the notification file is the message that will be picked up + * by the archiver, e.g. we write 0000000100000001000000C6.ready + * and the archiver then knows to archive XLOGDIR/0000000100000001000000C6, + * then when complete, rename it to 0000000100000001000000C6.done + */ +void +XLogArchiveNotify(const char *xlog) +{ + char archiveStatusPath[MAXPGPATH]; + FILE *fd; + + /* insert an otherwise empty file called <XLOG>.ready */ + StatusFilePath(archiveStatusPath, xlog, ".ready"); + fd = AllocateFile(archiveStatusPath, "w"); + if (fd == NULL) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not create archive status file \"%s\": %m", + archiveStatusPath))); + return; + } + if (FreeFile(fd)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write archive status file \"%s\": %m", + archiveStatusPath))); + return; + } + + /* + * Timeline history files are given the highest archival priority to lower + * the chance that a promoted standby will choose a timeline that is + * already in use. However, the archiver ordinarily tries to gather + * multiple files to archive from each scan of the archive_status + * directory, which means that newly created timeline history files could + * be left unarchived for a while. To ensure that the archiver picks up + * timeline history files as soon as possible, we force the archiver to + * scan the archive_status directory the next time it looks for a file to + * archive. + */ + if (IsTLHistoryFileName(xlog)) + PgArchForceDirScan(); + + /* Notify archiver that it's got something to do */ + if (IsUnderPostmaster) + PgArchWakeup(); +} + +/* + * Convenience routine to notify using segment number representation of filename + */ +void +XLogArchiveNotifySeg(XLogSegNo segno, TimeLineID tli) +{ + char xlog[MAXFNAMELEN]; + + Assert(tli != 0); + + XLogFileName(xlog, tli, segno, wal_segment_size); + XLogArchiveNotify(xlog); +} + +/* + * XLogArchiveForceDone + * + * Emit notification forcibly that an XLOG segment file has been successfully + * archived, by creating <XLOG>.done regardless of whether <XLOG>.ready + * exists or not. 
+ */ +void +XLogArchiveForceDone(const char *xlog) +{ + char archiveReady[MAXPGPATH]; + char archiveDone[MAXPGPATH]; + struct stat stat_buf; + FILE *fd; + + /* Exit if already known done */ + StatusFilePath(archiveDone, xlog, ".done"); + if (stat(archiveDone, &stat_buf) == 0) + return; + + /* If .ready exists, rename it to .done */ + StatusFilePath(archiveReady, xlog, ".ready"); + if (stat(archiveReady, &stat_buf) == 0) + { + (void) durable_rename(archiveReady, archiveDone, WARNING); + return; + } + + /* insert an otherwise empty file called <XLOG>.done */ + fd = AllocateFile(archiveDone, "w"); + if (fd == NULL) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not create archive status file \"%s\": %m", + archiveDone))); + return; + } + if (FreeFile(fd)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write archive status file \"%s\": %m", + archiveDone))); + return; + } +} + +/* + * XLogArchiveCheckDone + * + * This is called when we are ready to delete or recycle an old XLOG segment + * file or backup history file. If it is okay to delete it then return true. + * If it is not time to delete it, make sure a .ready file exists, and return + * false. + * + * If <XLOG>.done exists, then return true; else if <XLOG>.ready exists, + * then return false; else create <XLOG>.ready and return false. + * + * The reason we do things this way is so that if the original attempt to + * create <XLOG>.ready fails, we'll retry during subsequent checkpoints. + */ +bool +XLogArchiveCheckDone(const char *xlog) +{ + char archiveStatusPath[MAXPGPATH]; + struct stat stat_buf; + + /* The file is always deletable if archive_mode is "off". */ + if (!XLogArchivingActive()) + return true; + + /* + * During archive recovery, the file is deletable if archive_mode is not + * "always". + */ + if (!XLogArchivingAlways() && + GetRecoveryState() == RECOVERY_STATE_ARCHIVE) + return true; + + /* + * At this point of the logic, note that we are either a primary with + * archive_mode set to "on" or "always", or a standby with archive_mode + * set to "always". + */ + + /* First check for .done --- this means archiver is done with it */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + /* check for .ready --- this means archiver is still busy with it */ + StatusFilePath(archiveStatusPath, xlog, ".ready"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return false; + + /* Race condition --- maybe archiver just finished, so recheck */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + /* Retry creation of the .ready file */ + XLogArchiveNotify(xlog); + return false; +} + +/* + * XLogArchiveIsBusy + * + * Check to see if an XLOG segment file is still unarchived. + * This is almost but not quite the inverse of XLogArchiveCheckDone: in + * the first place we aren't chartered to recreate the .ready file, and + * in the second place we should consider that if the file is already gone + * then it's not busy. (This check is needed to handle the race condition + * that a checkpoint already deleted the no-longer-needed file.) 
+ */ +bool +XLogArchiveIsBusy(const char *xlog) +{ + char archiveStatusPath[MAXPGPATH]; + struct stat stat_buf; + + /* First check for .done --- this means archiver is done with it */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return false; + + /* check for .ready --- this means archiver is still busy with it */ + StatusFilePath(archiveStatusPath, xlog, ".ready"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + /* Race condition --- maybe archiver just finished, so recheck */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return false; + + /* + * Check to see if the WAL file has been removed by checkpoint, which + * implies it has already been archived, and explains why we can't see a + * status file for it. + */ + snprintf(archiveStatusPath, MAXPGPATH, XLOGDIR "/%s", xlog); + if (stat(archiveStatusPath, &stat_buf) != 0 && + errno == ENOENT) + return false; + + return true; +} + +/* + * XLogArchiveIsReadyOrDone + * + * Check to see if an XLOG segment file has a .ready or .done file. + * This is similar to XLogArchiveIsBusy(), but returns true if the file + * is already archived or is about to be archived. + * + * This is currently only used at recovery. During normal operation this + * would be racy: the file might get removed or marked with .ready as we're + * checking it, or immediately after we return. + */ +bool +XLogArchiveIsReadyOrDone(const char *xlog) +{ + char archiveStatusPath[MAXPGPATH]; + struct stat stat_buf; + + /* First check for .done --- this means archiver is done with it */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + /* check for .ready --- this means archiver is still busy with it */ + StatusFilePath(archiveStatusPath, xlog, ".ready"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + /* Race condition --- maybe archiver just finished, so recheck */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + return false; +} + +/* + * XLogArchiveIsReady + * + * Check to see if an XLOG segment file has an archive notification (.ready) + * file. + */ +bool +XLogArchiveIsReady(const char *xlog) +{ + char archiveStatusPath[MAXPGPATH]; + struct stat stat_buf; + + StatusFilePath(archiveStatusPath, xlog, ".ready"); + if (stat(archiveStatusPath, &stat_buf) == 0) + return true; + + return false; +} + +/* + * XLogArchiveCleanup + * + * Cleanup archive notification file(s) for a particular xlog segment + */ +void +XLogArchiveCleanup(const char *xlog) +{ + char archiveStatusPath[MAXPGPATH]; + + /* Remove the .done file */ + StatusFilePath(archiveStatusPath, xlog, ".done"); + unlink(archiveStatusPath); + /* should we complain about failure? */ + + /* Remove the .ready file if present --- normally it shouldn't be */ + StatusFilePath(archiveStatusPath, xlog, ".ready"); + unlink(archiveStatusPath); + /* should we complain about failure? */ +} diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c new file mode 100644 index 0000000..02bd919 --- /dev/null +++ b/src/backend/access/transam/xlogfuncs.c @@ -0,0 +1,648 @@ +/*------------------------------------------------------------------------- + * + * xlogfuncs.c + * + * PostgreSQL write-ahead log manager user interface functions + * + * This file contains WAL control and information functions. 
+ * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/xlogfuncs.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <unistd.h> + +#include "access/htup_details.h" +#include "access/xlog_internal.h" +#include "access/xlogrecovery.h" +#include "access/xlogutils.h" +#include "catalog/pg_type.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "replication/walreceiver.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/smgr.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/numeric.h" +#include "utils/pg_lsn.h" +#include "utils/timestamp.h" +#include "utils/tuplestore.h" + +/* + * Store label file and tablespace map during backups. + */ +static StringInfo label_file; +static StringInfo tblspc_map_file; + +/* + * pg_backup_start: set up for taking an on-line backup dump + * + * Essentially what this does is to create a backup label file in $PGDATA, + * where it will be archived as part of the backup dump. The label file + * contains the user-supplied label string (typically this would be used + * to tell where the backup dump will be stored) and the starting time and + * starting WAL location for the dump. + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +Datum +pg_backup_start(PG_FUNCTION_ARGS) +{ + text *backupid = PG_GETARG_TEXT_PP(0); + bool fast = PG_GETARG_BOOL(1); + char *backupidstr; + XLogRecPtr startpoint; + SessionBackupState status = get_backup_status(); + MemoryContext oldcontext; + + backupidstr = text_to_cstring(backupid); + + if (status == SESSION_BACKUP_RUNNING) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("a backup is already in progress in this session"))); + + /* + * Label file and tablespace map file need to be long-lived, since they + * are read in pg_backup_stop. + */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + label_file = makeStringInfo(); + tblspc_map_file = makeStringInfo(); + MemoryContextSwitchTo(oldcontext); + + register_persistent_abort_backup_handler(); + + startpoint = do_pg_backup_start(backupidstr, fast, NULL, label_file, + NULL, tblspc_map_file); + + PG_RETURN_LSN(startpoint); +} + + +/* + * pg_backup_stop: finish taking an on-line backup. + * + * The first parameter (variable 'waitforarchive'), which is optional, + * allows the user to choose if they want to wait for the WAL to be archived + * or if we should just return as soon as the WAL record is written. + * + * Permission checking for this function is managed through the normal + * GRANT system. 
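All of the SQL-callable routines in this file, pg_backup_start() above included, follow the fmgr "version 1" calling convention: arguments arrive through PG_GETARG_* macros and results leave through PG_RETURN_*. The in-core functions are wired up through pg_proc, so they do not need PG_FUNCTION_INFO_V1 or PG_MODULE_MAGIC; a hypothetical loadable-module function written in the same style, invented purely for illustration, would look like this:

#include "postgres.h"

#include "fmgr.h"
#include "utils/pg_lsn.h"

PG_MODULE_MAGIC;

PG_FUNCTION_INFO_V1(demo_lsn_add);

/* demo_lsn_add(pg_lsn, bigint) returns pg_lsn: advance an LSN by a byte count */
Datum
demo_lsn_add(PG_FUNCTION_ARGS)
{
	XLogRecPtr	lsn = PG_GETARG_LSN(0);
	int64		delta = PG_GETARG_INT64(1);

	if (delta < 0)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("delta must not be negative")));

	PG_RETURN_LSN(lsn + delta);
}

It would be exposed with something like CREATE FUNCTION demo_lsn_add(pg_lsn, bigint) RETURNS pg_lsn AS 'MODULE_PATHNAME' LANGUAGE C STRICT; marking it STRICT matters because the sketch never checks PG_ARGISNULL(). The "normal GRANT system" note above means there is no hard-coded superuser test in these functions; access is controlled with GRANT/REVOKE EXECUTE on the function itself.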
+ */ +Datum +pg_backup_stop(PG_FUNCTION_ARGS) +{ +#define PG_STOP_BACKUP_V2_COLS 3 + TupleDesc tupdesc; + Datum values[PG_STOP_BACKUP_V2_COLS]; + bool nulls[PG_STOP_BACKUP_V2_COLS]; + + bool waitforarchive = PG_GETARG_BOOL(0); + XLogRecPtr stoppoint; + SessionBackupState status = get_backup_status(); + + /* Initialize attributes information in the tuple descriptor */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + if (status != SESSION_BACKUP_RUNNING) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("backup is not in progress"), + errhint("Did you call pg_backup_start()?"))); + + /* + * Stop the backup. Return a copy of the backup label and tablespace map + * so they can be written to disk by the caller. + */ + stoppoint = do_pg_backup_stop(label_file->data, waitforarchive, NULL); + + values[0] = LSNGetDatum(stoppoint); + values[1] = CStringGetTextDatum(label_file->data); + values[2] = CStringGetTextDatum(tblspc_map_file->data); + + /* Free structures allocated in TopMemoryContext */ + pfree(label_file->data); + pfree(label_file); + label_file = NULL; + pfree(tblspc_map_file->data); + pfree(tblspc_map_file); + tblspc_map_file = NULL; + + /* Returns the record as Datum */ + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} + +/* + * pg_switch_wal: switch to next xlog file + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +Datum +pg_switch_wal(PG_FUNCTION_ARGS) +{ + XLogRecPtr switchpoint; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + + switchpoint = RequestXLogSwitch(false); + + /* + * As a convenience, return the WAL location of the switch record + */ + PG_RETURN_LSN(switchpoint); +} + +/* + * pg_create_restore_point: a named point for restore + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +Datum +pg_create_restore_point(PG_FUNCTION_ARGS) +{ + text *restore_name = PG_GETARG_TEXT_PP(0); + char *restore_name_str; + XLogRecPtr restorepoint; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + + if (!XLogIsNeeded()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL level not sufficient for creating a restore point"), + errhint("wal_level must be set to \"replica\" or \"logical\" at server start."))); + + restore_name_str = text_to_cstring(restore_name); + + if (strlen(restore_name_str) >= MAXFNAMELEN) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("value too long for restore point (maximum %d characters)", MAXFNAMELEN - 1))); + + restorepoint = XLogRestorePoint(restore_name_str); + + /* + * As a convenience, return the WAL location of the restore point record + */ + PG_RETURN_LSN(restorepoint); +} + +/* + * Report the current WAL write location (same format as pg_backup_start etc) + * + * This is useful for determining how much of WAL is visible to an external + * archiving process. Note that the data before this point is written out + * to the kernel, but is not necessarily synced to disk. 
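The distinction matters when comparing the different positions reported by the functions below: the insert location (reserved in WAL buffers) is always at or ahead of the write location (handed to the kernel), which is in turn at or ahead of the flush location (known durable). A hedged, backend-context sketch outside of recovery, using the same accessor functions the functions below call; the helper itself is hypothetical:

#include "postgres.h"

#include "access/xlog.h"

/* log the three WAL positions; read in this order, the asserted ordering holds */
static void
demo_show_wal_positions(void)
{
	XLogRecPtr	flush = GetFlushRecPtr(NULL);	/* durably synced to disk */
	XLogRecPtr	write = GetXLogWriteRecPtr();	/* written out to the kernel */
	XLogRecPtr	insert = GetXLogInsertRecPtr(); /* reserved in WAL buffers */

	Assert(flush <= write && write <= insert);

	elog(LOG, "flush %X/%X, write %X/%X, insert %X/%X",
		 LSN_FORMAT_ARGS(flush),
		 LSN_FORMAT_ARGS(write),
		 LSN_FORMAT_ARGS(insert));
}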
+ */ +Datum +pg_current_wal_lsn(PG_FUNCTION_ARGS) +{ + XLogRecPtr current_recptr; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + + current_recptr = GetXLogWriteRecPtr(); + + PG_RETURN_LSN(current_recptr); +} + +/* + * Report the current WAL insert location (same format as pg_backup_start etc) + * + * This function is mostly for debugging purposes. + */ +Datum +pg_current_wal_insert_lsn(PG_FUNCTION_ARGS) +{ + XLogRecPtr current_recptr; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + + current_recptr = GetXLogInsertRecPtr(); + + PG_RETURN_LSN(current_recptr); +} + +/* + * Report the current WAL flush location (same format as pg_backup_start etc) + * + * This function is mostly for debugging purposes. + */ +Datum +pg_current_wal_flush_lsn(PG_FUNCTION_ARGS) +{ + XLogRecPtr current_recptr; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("WAL control functions cannot be executed during recovery."))); + + current_recptr = GetFlushRecPtr(NULL); + + PG_RETURN_LSN(current_recptr); +} + +/* + * Report the last WAL receive location (same format as pg_backup_start etc) + * + * This is useful for determining how much of WAL is guaranteed to be received + * and synced to disk by walreceiver. + */ +Datum +pg_last_wal_receive_lsn(PG_FUNCTION_ARGS) +{ + XLogRecPtr recptr; + + recptr = GetWalRcvFlushRecPtr(NULL, NULL); + + if (recptr == 0) + PG_RETURN_NULL(); + + PG_RETURN_LSN(recptr); +} + +/* + * Report the last WAL replay location (same format as pg_backup_start etc) + * + * This is useful for determining how much of WAL is visible to read-only + * connections during recovery. + */ +Datum +pg_last_wal_replay_lsn(PG_FUNCTION_ARGS) +{ + XLogRecPtr recptr; + + recptr = GetXLogReplayRecPtr(NULL); + + if (recptr == 0) + PG_RETURN_NULL(); + + PG_RETURN_LSN(recptr); +} + +/* + * Compute an xlog file name and decimal byte offset given a WAL location, + * such as is returned by pg_backup_stop() or pg_switch_wal(). + * + * Note that a location exactly at a segment boundary is taken to be in + * the previous segment. This is usually the right thing, since the + * expected usage is to determine which xlog file(s) are ready to archive. + */ +Datum +pg_walfile_name_offset(PG_FUNCTION_ARGS) +{ + XLogSegNo xlogsegno; + uint32 xrecoff; + XLogRecPtr locationpoint = PG_GETARG_LSN(0); + char xlogfilename[MAXFNAMELEN]; + Datum values[2]; + bool isnull[2]; + TupleDesc resultTupleDesc; + HeapTuple resultHeapTuple; + Datum result; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("%s cannot be executed during recovery.", + "pg_walfile_name_offset()"))); + + /* + * Construct a tuple descriptor for the result row. This must match this + * function's pg_proc entry! 
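The "previous segment" rule described in the opening comment is visible in the arithmetic: XLByteToPrevSeg() divides (LSN - 1) rather than the LSN itself. A standalone sketch, assuming the default 16MB segment size and an illustrative boundary LSN:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint64_t	segsz = 16 * 1024 * 1024;	/* default wal_segment_size */
	uint64_t		boundary = 2 * segsz;		/* LSN 0/2000000, first byte of a segment */

	/* XLByteToSeg(): the segment that contains the LSN */
	printf("XLByteToSeg     -> segno %llu\n",
		   (unsigned long long) (boundary / segsz));

	/* XLByteToPrevSeg(): what pg_walfile_name_offset()/pg_walfile_name() use */
	printf("XLByteToPrevSeg -> segno %llu\n",
		   (unsigned long long) ((boundary - 1) / segsz));

	return 0;
}

A boundary LSN therefore names the file that has just been filled, which is the one that is ready to archive.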
+ */ + resultTupleDesc = CreateTemplateTupleDesc(2); + TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name", + TEXTOID, -1, 0); + TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset", + INT4OID, -1, 0); + + resultTupleDesc = BlessTupleDesc(resultTupleDesc); + + /* + * xlogfilename + */ + XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size); + XLogFileName(xlogfilename, GetWALInsertionTimeLine(), xlogsegno, + wal_segment_size); + + values[0] = CStringGetTextDatum(xlogfilename); + isnull[0] = false; + + /* + * offset + */ + xrecoff = XLogSegmentOffset(locationpoint, wal_segment_size); + + values[1] = UInt32GetDatum(xrecoff); + isnull[1] = false; + + /* + * Tuple jam: Having first prepared your Datums, then squash together + */ + resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull); + + result = HeapTupleGetDatum(resultHeapTuple); + + PG_RETURN_DATUM(result); +} + +/* + * Compute an xlog file name given a WAL location, + * such as is returned by pg_backup_stop() or pg_switch_wal(). + */ +Datum +pg_walfile_name(PG_FUNCTION_ARGS) +{ + XLogSegNo xlogsegno; + XLogRecPtr locationpoint = PG_GETARG_LSN(0); + char xlogfilename[MAXFNAMELEN]; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("%s cannot be executed during recovery.", + "pg_walfile_name()"))); + + XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size); + XLogFileName(xlogfilename, GetWALInsertionTimeLine(), xlogsegno, + wal_segment_size); + + PG_RETURN_TEXT_P(cstring_to_text(xlogfilename)); +} + +/* + * pg_wal_replay_pause - Request to pause recovery + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +Datum +pg_wal_replay_pause(PG_FUNCTION_ARGS) +{ + if (!RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errhint("Recovery control functions can only be executed during recovery."))); + + if (PromoteIsTriggered()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("standby promotion is ongoing"), + errhint("%s cannot be executed after promotion is triggered.", + "pg_wal_replay_pause()"))); + + SetRecoveryPause(true); + + /* wake up the recovery process so that it can process the pause request */ + WakeupRecovery(); + + PG_RETURN_VOID(); +} + +/* + * pg_wal_replay_resume - resume recovery now + * + * Permission checking for this function is managed through the normal + * GRANT system. 
+ */ +Datum +pg_wal_replay_resume(PG_FUNCTION_ARGS) +{ + if (!RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errhint("Recovery control functions can only be executed during recovery."))); + + if (PromoteIsTriggered()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("standby promotion is ongoing"), + errhint("%s cannot be executed after promotion is triggered.", + "pg_wal_replay_resume()"))); + + SetRecoveryPause(false); + + PG_RETURN_VOID(); +} + +/* + * pg_is_wal_replay_paused + */ +Datum +pg_is_wal_replay_paused(PG_FUNCTION_ARGS) +{ + if (!RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errhint("Recovery control functions can only be executed during recovery."))); + + PG_RETURN_BOOL(GetRecoveryPauseState() != RECOVERY_NOT_PAUSED); +} + +/* + * pg_get_wal_replay_pause_state - Returns the recovery pause state. + * + * Returned values: + * + * 'not paused' - if pause is not requested + * 'pause requested' - if pause is requested but recovery is not yet paused + * 'paused' - if recovery is paused + */ +Datum +pg_get_wal_replay_pause_state(PG_FUNCTION_ARGS) +{ + char *statestr = NULL; + + if (!RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errhint("Recovery control functions can only be executed during recovery."))); + + /* get the recovery pause state */ + switch (GetRecoveryPauseState()) + { + case RECOVERY_NOT_PAUSED: + statestr = "not paused"; + break; + case RECOVERY_PAUSE_REQUESTED: + statestr = "pause requested"; + break; + case RECOVERY_PAUSED: + statestr = "paused"; + break; + } + + Assert(statestr != NULL); + PG_RETURN_TEXT_P(cstring_to_text(statestr)); +} + +/* + * Returns timestamp of latest processed commit/abort record. + * + * When the server has been started normally without recovery the function + * returns NULL. + */ +Datum +pg_last_xact_replay_timestamp(PG_FUNCTION_ARGS) +{ + TimestampTz xtime; + + xtime = GetLatestXTime(); + if (xtime == 0) + PG_RETURN_NULL(); + + PG_RETURN_TIMESTAMPTZ(xtime); +} + +/* + * Returns bool with current recovery mode, a global state. + */ +Datum +pg_is_in_recovery(PG_FUNCTION_ARGS) +{ + PG_RETURN_BOOL(RecoveryInProgress()); +} + +/* + * Compute the difference in bytes between two WAL locations. + */ +Datum +pg_wal_lsn_diff(PG_FUNCTION_ARGS) +{ + Datum result; + + result = DirectFunctionCall2(pg_lsn_mi, + PG_GETARG_DATUM(0), + PG_GETARG_DATUM(1)); + + PG_RETURN_NUMERIC(result); +} + +/* + * Promotes a standby server. + * + * A result of "true" means that promotion has been completed if "wait" is + * "true", or initiated if "wait" is false. 
+ */ +Datum +pg_promote(PG_FUNCTION_ARGS) +{ + bool wait = PG_GETARG_BOOL(0); + int wait_seconds = PG_GETARG_INT32(1); + FILE *promote_file; + int i; + + if (!RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is not in progress"), + errhint("Recovery control functions can only be executed during recovery."))); + + if (wait_seconds <= 0) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("\"wait_seconds\" must not be negative or zero"))); + + /* create the promote signal file */ + promote_file = AllocateFile(PROMOTE_SIGNAL_FILE, "w"); + if (!promote_file) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + PROMOTE_SIGNAL_FILE))); + + if (FreeFile(promote_file)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + PROMOTE_SIGNAL_FILE))); + + /* signal the postmaster */ + if (kill(PostmasterPid, SIGUSR1) != 0) + { + ereport(WARNING, + (errmsg("failed to send signal to postmaster: %m"))); + (void) unlink(PROMOTE_SIGNAL_FILE); + PG_RETURN_BOOL(false); + } + + /* return immediately if waiting was not requested */ + if (!wait) + PG_RETURN_BOOL(true); + + /* wait for the amount of time wanted until promotion */ +#define WAITS_PER_SECOND 10 + for (i = 0; i < WAITS_PER_SECOND * wait_seconds; i++) + { + int rc; + + ResetLatch(MyLatch); + + if (!RecoveryInProgress()) + PG_RETURN_BOOL(true); + + CHECK_FOR_INTERRUPTS(); + + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + 1000L / WAITS_PER_SECOND, + WAIT_EVENT_PROMOTE); + + /* + * Emergency bailout if postmaster has died. This is to avoid the + * necessity for manual cleanup of all postmaster children. + */ + if (rc & WL_POSTMASTER_DEATH) + PG_RETURN_BOOL(false); + } + + ereport(WARNING, + (errmsg_plural("server did not promote within %d second", + "server did not promote within %d seconds", + wait_seconds, + wait_seconds))); + PG_RETURN_BOOL(false); +} diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c new file mode 100644 index 0000000..35cc055 --- /dev/null +++ b/src/backend/access/transam/xloginsert.c @@ -0,0 +1,1318 @@ +/*------------------------------------------------------------------------- + * + * xloginsert.c + * Functions for constructing WAL records + * + * Constructing a WAL record begins with a call to XLogBeginInsert, + * followed by a number of XLogRegister* calls. The registered data is + * collected in private working memory, and finally assembled into a chain + * of XLogRecData structs by a call to XLogRecordAssemble(). See + * access/transam/README for details. 
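Putting that sequence in one place, here is a hedged sketch of how a caller typically builds a record for a single modified buffer. RM_DEMO_ID, XLOG_DEMO_UPDATE and xl_demo_update are hypothetical names invented for the example; the XLog* calls are the API described above, and real callers do this inside a critical section, after modifying the page and calling MarkBufferDirty().

#include "postgres.h"

#include "access/xloginsert.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"

/* hypothetical "main data" payload understood by the matching redo routine */
typedef struct xl_demo_update
{
	uint16		offset;
} xl_demo_update;

static XLogRecPtr
demo_log_update(Buffer buf, uint16 offset)
{
	xl_demo_update xlrec;
	XLogRecPtr	recptr;

	xlrec.offset = offset;

	XLogBeginInsert();

	/* main data, available at redo via XLogRecGetData() */
	XLogRegisterData((char *) &xlrec, sizeof(xlrec));

	/* block reference 0: the page we just modified, standard layout */
	XLogRegisterBuffer(0, buf, REGBUF_STANDARD);

	/* RM_DEMO_ID / XLOG_DEMO_UPDATE are hypothetical rmgr identifiers */
	recptr = XLogInsert(RM_DEMO_ID, XLOG_DEMO_UPDATE);

	/* stamp the page so it cannot be written out before this WAL record */
	PageSetLSN(BufferGetPage(buf), recptr);

	return recptr;
}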
+ * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/xloginsert.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#ifdef USE_LZ4 +#include <lz4.h> +#endif + +#ifdef USE_ZSTD +#include <zstd.h> +#endif + +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xloginsert.h" +#include "catalog/pg_control.h" +#include "common/pg_lzcompress.h" +#include "executor/instrument.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "replication/origin.h" +#include "storage/bufmgr.h" +#include "storage/proc.h" +#include "utils/memutils.h" + +/* + * Guess the maximum buffer size required to store a compressed version of + * backup block image. + */ +#ifdef USE_LZ4 +#define LZ4_MAX_BLCKSZ LZ4_COMPRESSBOUND(BLCKSZ) +#else +#define LZ4_MAX_BLCKSZ 0 +#endif + +#ifdef USE_ZSTD +#define ZSTD_MAX_BLCKSZ ZSTD_COMPRESSBOUND(BLCKSZ) +#else +#define ZSTD_MAX_BLCKSZ 0 +#endif + +#define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(BLCKSZ) + +/* Buffer size required to store a compressed version of backup block image */ +#define COMPRESS_BUFSIZE Max(Max(PGLZ_MAX_BLCKSZ, LZ4_MAX_BLCKSZ), ZSTD_MAX_BLCKSZ) + +/* + * For each block reference registered with XLogRegisterBuffer, we fill in + * a registered_buffer struct. + */ +typedef struct +{ + bool in_use; /* is this slot in use? */ + uint8 flags; /* REGBUF_* flags */ + RelFileNode rnode; /* identifies the relation and block */ + ForkNumber forkno; + BlockNumber block; + Page page; /* page content */ + uint32 rdata_len; /* total length of data in rdata chain */ + XLogRecData *rdata_head; /* head of the chain of data registered with + * this block */ + XLogRecData *rdata_tail; /* last entry in the chain, or &rdata_head if + * empty */ + + XLogRecData bkp_rdatas[2]; /* temporary rdatas used to hold references to + * backup block data in XLogRecordAssemble() */ + + /* buffer to store a compressed version of backup block image */ + char compressed_page[COMPRESS_BUFSIZE]; +} registered_buffer; + +static registered_buffer *registered_buffers; +static int max_registered_buffers; /* allocated size */ +static int max_registered_block_id = 0; /* highest block_id + 1 currently + * registered */ + +/* + * A chain of XLogRecDatas to hold the "main data" of a WAL record, registered + * with XLogRegisterData(...). + */ +static XLogRecData *mainrdata_head; +static XLogRecData *mainrdata_last = (XLogRecData *) &mainrdata_head; +static uint32 mainrdata_len; /* total # of bytes in chain */ + +/* flags for the in-progress insertion */ +static uint8 curinsert_flags = 0; + +/* + * These are used to hold the record header while constructing a record. + * 'hdr_scratch' is not a plain variable, but is palloc'd at initialization, + * because we want it to be MAXALIGNed and padding bytes zeroed. + * + * For simplicity, it's allocated large enough to hold the headers for any + * WAL record. + */ +static XLogRecData hdr_rdt; +static char *hdr_scratch = NULL; + +#define SizeOfXlogOrigin (sizeof(RepOriginId) + sizeof(char)) +#define SizeOfXLogTransactionId (sizeof(TransactionId) + sizeof(char)) + +#define HEADER_SCRATCH_SIZE \ + (SizeOfXLogRecord + \ + MaxSizeOfXLogRecordBlockHeader * (XLR_MAX_BLOCK_ID + 1) + \ + SizeOfXLogRecordDataHeaderLong + SizeOfXlogOrigin + \ + SizeOfXLogTransactionId) + +/* + * An array of XLogRecData structs, to hold registered data. 
+ */ +static XLogRecData *rdatas; +static int num_rdatas; /* entries currently used */ +static int max_rdatas; /* allocated size */ + +static bool begininsert_called = false; + +/* Memory context to hold the registered buffer and data references. */ +static MemoryContext xloginsert_cxt; + +static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, + XLogRecPtr RedoRecPtr, bool doPageWrites, + XLogRecPtr *fpw_lsn, int *num_fpi, + bool *topxid_included); +static bool XLogCompressBackupBlock(char *page, uint16 hole_offset, + uint16 hole_length, char *dest, uint16 *dlen); + +/* + * Begin constructing a WAL record. This must be called before the + * XLogRegister* functions and XLogInsert(). + */ +void +XLogBeginInsert(void) +{ + Assert(max_registered_block_id == 0); + Assert(mainrdata_last == (XLogRecData *) &mainrdata_head); + Assert(mainrdata_len == 0); + + /* cross-check on whether we should be here or not */ + if (!XLogInsertAllowed()) + elog(ERROR, "cannot make new WAL entries during recovery"); + + if (begininsert_called) + elog(ERROR, "XLogBeginInsert was already called"); + + begininsert_called = true; +} + +/* + * Ensure that there are enough buffer and data slots in the working area, + * for subsequent XLogRegisterBuffer, XLogRegisterData and XLogRegisterBufData + * calls. + * + * There is always space for a small number of buffers and data chunks, enough + * for most record types. This function is for the exceptional cases that need + * more. + */ +void +XLogEnsureRecordSpace(int max_block_id, int ndatas) +{ + int nbuffers; + + /* + * This must be called before entering a critical section, because + * allocating memory inside a critical section can fail. repalloc() will + * check the same, but better to check it here too so that we fail + * consistently even if the arrays happen to be large enough already. + */ + Assert(CritSectionCount == 0); + + /* the minimum values can't be decreased */ + if (max_block_id < XLR_NORMAL_MAX_BLOCK_ID) + max_block_id = XLR_NORMAL_MAX_BLOCK_ID; + if (ndatas < XLR_NORMAL_RDATAS) + ndatas = XLR_NORMAL_RDATAS; + + if (max_block_id > XLR_MAX_BLOCK_ID) + elog(ERROR, "maximum number of WAL record block references exceeded"); + nbuffers = max_block_id + 1; + + if (nbuffers > max_registered_buffers) + { + registered_buffers = (registered_buffer *) + repalloc(registered_buffers, sizeof(registered_buffer) * nbuffers); + + /* + * At least the padding bytes in the structs must be zeroed, because + * they are included in WAL data, but initialize it all for tidiness. + */ + MemSet(®istered_buffers[max_registered_buffers], 0, + (nbuffers - max_registered_buffers) * sizeof(registered_buffer)); + max_registered_buffers = nbuffers; + } + + if (ndatas > max_rdatas) + { + rdatas = (XLogRecData *) repalloc(rdatas, sizeof(XLogRecData) * ndatas); + max_rdatas = ndatas; + } +} + +/* + * Reset WAL record construction buffers. + */ +void +XLogResetInsertion(void) +{ + int i; + + for (i = 0; i < max_registered_block_id; i++) + registered_buffers[i].in_use = false; + + num_rdatas = 0; + max_registered_block_id = 0; + mainrdata_len = 0; + mainrdata_last = (XLogRecData *) &mainrdata_head; + curinsert_flags = 0; + begininsert_called = false; +} + +/* + * Register a reference to a buffer with the WAL record being constructed. + * This must be called for every page that the WAL-logged operation modifies. 
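The flags passed here steer the full-page-image decision made later in XLogRecordAssemble(). A hedged, backend-context sketch of typical choices, meant to run between XLogBeginInsert() and XLogInsert(); the buffer variables are hypothetical:

#include "postgres.h"

#include "access/xloginsert.h"
#include "storage/bufmgr.h"

static void
demo_register_buffers(Buffer rebuilt_buf, Buffer updated_buf, Buffer copied_buf)
{
	/* page is re-initialized from scratch at redo, so no image is needed */
	XLogRegisterBuffer(0, rebuilt_buf, REGBUF_WILL_INIT | REGBUF_STANDARD);

	/* ordinary incremental change to a standard-layout page */
	XLogRegisterBuffer(1, updated_buf, REGBUF_STANDARD);

	/* always log a full-page image, regardless of the LSN check */
	XLogRegisterBuffer(2, copied_buf, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
}

REGBUF_KEEP_DATA can be added when the registered per-block data must survive at redo even if a full-page image makes it redundant, and REGBUF_NO_IMAGE suppresses the image entirely.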
+ */ +void +XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags) +{ + registered_buffer *regbuf; + + /* NO_IMAGE doesn't make sense with FORCE_IMAGE */ + Assert(!((flags & REGBUF_FORCE_IMAGE) && (flags & (REGBUF_NO_IMAGE)))); + Assert(begininsert_called); + + if (block_id >= max_registered_block_id) + { + if (block_id >= max_registered_buffers) + elog(ERROR, "too many registered buffers"); + max_registered_block_id = block_id + 1; + } + + regbuf = ®istered_buffers[block_id]; + + BufferGetTag(buffer, ®buf->rnode, ®buf->forkno, ®buf->block); + regbuf->page = BufferGetPage(buffer); + regbuf->flags = flags; + regbuf->rdata_tail = (XLogRecData *) ®buf->rdata_head; + regbuf->rdata_len = 0; + + /* + * Check that this page hasn't already been registered with some other + * block_id. + */ +#ifdef USE_ASSERT_CHECKING + { + int i; + + for (i = 0; i < max_registered_block_id; i++) + { + registered_buffer *regbuf_old = ®istered_buffers[i]; + + if (i == block_id || !regbuf_old->in_use) + continue; + + Assert(!RelFileNodeEquals(regbuf_old->rnode, regbuf->rnode) || + regbuf_old->forkno != regbuf->forkno || + regbuf_old->block != regbuf->block); + } + } +#endif + + regbuf->in_use = true; +} + +/* + * Like XLogRegisterBuffer, but for registering a block that's not in the + * shared buffer pool (i.e. when you don't have a Buffer for it). + */ +void +XLogRegisterBlock(uint8 block_id, RelFileNode *rnode, ForkNumber forknum, + BlockNumber blknum, Page page, uint8 flags) +{ + registered_buffer *regbuf; + + Assert(begininsert_called); + + if (block_id >= max_registered_block_id) + max_registered_block_id = block_id + 1; + + if (block_id >= max_registered_buffers) + elog(ERROR, "too many registered buffers"); + + regbuf = ®istered_buffers[block_id]; + + regbuf->rnode = *rnode; + regbuf->forkno = forknum; + regbuf->block = blknum; + regbuf->page = page; + regbuf->flags = flags; + regbuf->rdata_tail = (XLogRecData *) ®buf->rdata_head; + regbuf->rdata_len = 0; + + /* + * Check that this page hasn't already been registered with some other + * block_id. + */ +#ifdef USE_ASSERT_CHECKING + { + int i; + + for (i = 0; i < max_registered_block_id; i++) + { + registered_buffer *regbuf_old = ®istered_buffers[i]; + + if (i == block_id || !regbuf_old->in_use) + continue; + + Assert(!RelFileNodeEquals(regbuf_old->rnode, regbuf->rnode) || + regbuf_old->forkno != regbuf->forkno || + regbuf_old->block != regbuf->block); + } + } +#endif + + regbuf->in_use = true; +} + +/* + * Add data to the WAL record that's being constructed. + * + * The data is appended to the "main chunk", available at replay with + * XLogRecGetData(). + */ +void +XLogRegisterData(char *data, int len) +{ + XLogRecData *rdata; + + Assert(begininsert_called); + + if (num_rdatas >= max_rdatas) + elog(ERROR, "too much WAL data"); + rdata = &rdatas[num_rdatas++]; + + rdata->data = data; + rdata->len = len; + + /* + * we use the mainrdata_last pointer to track the end of the chain, so no + * need to clear 'next' here. + */ + + mainrdata_last->next = rdata; + mainrdata_last = rdata; + + mainrdata_len += len; +} + +/* + * Add buffer-specific data to the WAL record that's being constructed. + * + * Block_id must reference a block previously registered with + * XLogRegisterBuffer(). If this is called more than once for the same + * block_id, the data is appended. + * + * The maximum amount of data that can be registered per block is 65535 + * bytes. 
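The per-block data registered here is what a redo routine later pulls back out with XLogRecGetBlockData(). A hedged sketch of that redo-side counterpart; the OffsetNumber payload is hypothetical, and the data can be absent when a full-page image made it unnecessary (unless REGBUF_KEEP_DATA was used at insert time):

#include "postgres.h"

#include "access/xlogreader.h"
#include "storage/off.h"

static OffsetNumber
demo_redo_get_offset(XLogReaderState *record)
{
	Size		len;
	char	   *payload;
	OffsetNumber off;

	/* fetch whatever was registered for block reference 0 at insert time */
	payload = XLogRecGetBlockData(record, 0, &len);
	Assert(payload != NULL && len == sizeof(OffsetNumber));

	/* copy out rather than cast, since the payload is not guaranteed aligned */
	memcpy(&off, payload, sizeof(OffsetNumber));
	return off;
}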
That should be plenty; if you need more than BLCKSZ bytes to + * reconstruct the changes to the page, you might as well just log a full + * copy of it. (the "main data" that's not associated with a block is not + * limited) + */ +void +XLogRegisterBufData(uint8 block_id, char *data, int len) +{ + registered_buffer *regbuf; + XLogRecData *rdata; + + Assert(begininsert_called); + + /* find the registered buffer struct */ + regbuf = ®istered_buffers[block_id]; + if (!regbuf->in_use) + elog(ERROR, "no block with id %d registered with WAL insertion", + block_id); + + if (num_rdatas >= max_rdatas) + elog(ERROR, "too much WAL data"); + rdata = &rdatas[num_rdatas++]; + + rdata->data = data; + rdata->len = len; + + regbuf->rdata_tail->next = rdata; + regbuf->rdata_tail = rdata; + regbuf->rdata_len += len; +} + +/* + * Set insert status flags for the upcoming WAL record. + * + * The flags that can be used here are: + * - XLOG_INCLUDE_ORIGIN, to determine if the replication origin should be + * included in the record. + * - XLOG_MARK_UNIMPORTANT, to signal that the record is not important for + * durability, which allows to avoid triggering WAL archiving and other + * background activity. + */ +void +XLogSetRecordFlags(uint8 flags) +{ + Assert(begininsert_called); + curinsert_flags |= flags; +} + +/* + * Insert an XLOG record having the specified RMID and info bytes, with the + * body of the record being the data and buffer references registered earlier + * with XLogRegister* calls. + * + * Returns XLOG pointer to end of record (beginning of next record). + * This can be used as LSN for data pages affected by the logged action. + * (LSN is the XLOG point up to which the XLOG must be flushed to disk + * before the data page can be written out. This implements the basic + * WAL rule "write the log before the data".) + */ +XLogRecPtr +XLogInsert(RmgrId rmid, uint8 info) +{ + XLogRecPtr EndPos; + + /* XLogBeginInsert() must have been called. */ + if (!begininsert_called) + elog(ERROR, "XLogBeginInsert was not called"); + + /* + * The caller can set rmgr bits, XLR_SPECIAL_REL_UPDATE and + * XLR_CHECK_CONSISTENCY; the rest are reserved for use by me. + */ + if ((info & ~(XLR_RMGR_INFO_MASK | + XLR_SPECIAL_REL_UPDATE | + XLR_CHECK_CONSISTENCY)) != 0) + elog(PANIC, "invalid xlog info mask %02X", info); + + TRACE_POSTGRESQL_WAL_INSERT(rmid, info); + + /* + * In bootstrap mode, we don't actually log anything but XLOG resources; + * return a phony record pointer. + */ + if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID) + { + XLogResetInsertion(); + EndPos = SizeOfXLogLongPHD; /* start of 1st chkpt record */ + return EndPos; + } + + do + { + XLogRecPtr RedoRecPtr; + bool doPageWrites; + bool topxid_included = false; + XLogRecPtr fpw_lsn; + XLogRecData *rdt; + int num_fpi = 0; + + /* + * Get values needed to decide whether to do full-page writes. Since + * we don't yet have an insertion lock, these could change under us, + * but XLogInsertRecord will recheck them once it has a lock. + */ + GetFullPageWriteInfo(&RedoRecPtr, &doPageWrites); + + rdt = XLogRecordAssemble(rmid, info, RedoRecPtr, doPageWrites, + &fpw_lsn, &num_fpi, &topxid_included); + + EndPos = XLogInsertRecord(rdt, fpw_lsn, curinsert_flags, num_fpi, + topxid_included); + } while (EndPos == InvalidXLogRecPtr); + + XLogResetInsertion(); + + return EndPos; +} + +/* + * Assemble a WAL record from the registered data and buffers into an + * XLogRecData chain, ready for insertion with XLogInsertRecord(). 
+ * + * The record header fields are filled in, except for the xl_prev field. The + * calculated CRC does not include the record header yet. + * + * If there are any registered buffers, and a full-page image was not taken + * of all of them, *fpw_lsn is set to the lowest LSN among such pages. This + * signals that the assembled record is only good for insertion on the + * assumption that the RedoRecPtr and doPageWrites values were up-to-date. + * + * *topxid_included is set if the topmost transaction ID is logged with the + * current subtransaction. + */ +static XLogRecData * +XLogRecordAssemble(RmgrId rmid, uint8 info, + XLogRecPtr RedoRecPtr, bool doPageWrites, + XLogRecPtr *fpw_lsn, int *num_fpi, bool *topxid_included) +{ + XLogRecData *rdt; + uint32 total_len = 0; + int block_id; + pg_crc32c rdata_crc; + registered_buffer *prev_regbuf = NULL; + XLogRecData *rdt_datas_last; + XLogRecord *rechdr; + char *scratch = hdr_scratch; + + /* + * Note: this function can be called multiple times for the same record. + * All the modifications we do to the rdata chains below must handle that. + */ + + /* The record begins with the fixed-size header */ + rechdr = (XLogRecord *) scratch; + scratch += SizeOfXLogRecord; + + hdr_rdt.next = NULL; + rdt_datas_last = &hdr_rdt; + hdr_rdt.data = hdr_scratch; + + /* + * Enforce consistency checks for this record if user is looking for it. + * Do this before at the beginning of this routine to give the possibility + * for callers of XLogInsert() to pass XLR_CHECK_CONSISTENCY directly for + * a record. + */ + if (wal_consistency_checking[rmid]) + info |= XLR_CHECK_CONSISTENCY; + + /* + * Make an rdata chain containing all the data portions of all block + * references. This includes the data for full-page images. Also append + * the headers for the block references in the scratch buffer. + */ + *fpw_lsn = InvalidXLogRecPtr; + for (block_id = 0; block_id < max_registered_block_id; block_id++) + { + registered_buffer *regbuf = ®istered_buffers[block_id]; + bool needs_backup; + bool needs_data; + XLogRecordBlockHeader bkpb; + XLogRecordBlockImageHeader bimg; + XLogRecordBlockCompressHeader cbimg = {0}; + bool samerel; + bool is_compressed = false; + bool include_image; + + if (!regbuf->in_use) + continue; + + /* Determine if this block needs to be backed up */ + if (regbuf->flags & REGBUF_FORCE_IMAGE) + needs_backup = true; + else if (regbuf->flags & REGBUF_NO_IMAGE) + needs_backup = false; + else if (!doPageWrites) + needs_backup = false; + else + { + /* + * We assume page LSN is first data on *every* page that can be + * passed to XLogInsert, whether it has the standard page layout + * or not. + */ + XLogRecPtr page_lsn = PageGetLSN(regbuf->page); + + needs_backup = (page_lsn <= RedoRecPtr); + if (!needs_backup) + { + if (*fpw_lsn == InvalidXLogRecPtr || page_lsn < *fpw_lsn) + *fpw_lsn = page_lsn; + } + } + + /* Determine if the buffer data needs to included */ + if (regbuf->rdata_len == 0) + needs_data = false; + else if ((regbuf->flags & REGBUF_KEEP_DATA) != 0) + needs_data = true; + else + needs_data = !needs_backup; + + bkpb.id = block_id; + bkpb.fork_flags = regbuf->forkno; + bkpb.data_length = 0; + + if ((regbuf->flags & REGBUF_WILL_INIT) == REGBUF_WILL_INIT) + bkpb.fork_flags |= BKPBLOCK_WILL_INIT; + + /* + * If needs_backup is true or WAL checking is enabled for current + * resource manager, log a full-page write for the current block. 
+ */ + include_image = needs_backup || (info & XLR_CHECK_CONSISTENCY) != 0; + + if (include_image) + { + Page page = regbuf->page; + uint16 compressed_len = 0; + + /* + * The page needs to be backed up, so calculate its hole length + * and offset. + */ + if (regbuf->flags & REGBUF_STANDARD) + { + /* Assume we can omit data between pd_lower and pd_upper */ + uint16 lower = ((PageHeader) page)->pd_lower; + uint16 upper = ((PageHeader) page)->pd_upper; + + if (lower >= SizeOfPageHeaderData && + upper > lower && + upper <= BLCKSZ) + { + bimg.hole_offset = lower; + cbimg.hole_length = upper - lower; + } + else + { + /* No "hole" to remove */ + bimg.hole_offset = 0; + cbimg.hole_length = 0; + } + } + else + { + /* Not a standard page header, don't try to eliminate "hole" */ + bimg.hole_offset = 0; + cbimg.hole_length = 0; + } + + /* + * Try to compress a block image if wal_compression is enabled + */ + if (wal_compression != WAL_COMPRESSION_NONE) + { + is_compressed = + XLogCompressBackupBlock(page, bimg.hole_offset, + cbimg.hole_length, + regbuf->compressed_page, + &compressed_len); + } + + /* + * Fill in the remaining fields in the XLogRecordBlockHeader + * struct + */ + bkpb.fork_flags |= BKPBLOCK_HAS_IMAGE; + + /* Report a full page image constructed for the WAL record */ + *num_fpi += 1; + + /* + * Construct XLogRecData entries for the page content. + */ + rdt_datas_last->next = ®buf->bkp_rdatas[0]; + rdt_datas_last = rdt_datas_last->next; + + bimg.bimg_info = (cbimg.hole_length == 0) ? 0 : BKPIMAGE_HAS_HOLE; + + /* + * If WAL consistency checking is enabled for the resource manager + * of this WAL record, a full-page image is included in the record + * for the block modified. During redo, the full-page is replayed + * only if BKPIMAGE_APPLY is set. + */ + if (needs_backup) + bimg.bimg_info |= BKPIMAGE_APPLY; + + if (is_compressed) + { + /* The current compression is stored in the WAL record */ + bimg.length = compressed_len; + + /* Set the compression method used for this block */ + switch ((WalCompression) wal_compression) + { + case WAL_COMPRESSION_PGLZ: + bimg.bimg_info |= BKPIMAGE_COMPRESS_PGLZ; + break; + + case WAL_COMPRESSION_LZ4: +#ifdef USE_LZ4 + bimg.bimg_info |= BKPIMAGE_COMPRESS_LZ4; +#else + elog(ERROR, "LZ4 is not supported by this build"); +#endif + break; + + case WAL_COMPRESSION_ZSTD: +#ifdef USE_ZSTD + bimg.bimg_info |= BKPIMAGE_COMPRESS_ZSTD; +#else + elog(ERROR, "zstd is not supported by this build"); +#endif + break; + + case WAL_COMPRESSION_NONE: + Assert(false); /* cannot happen */ + break; + /* no default case, so that compiler will warn */ + } + + rdt_datas_last->data = regbuf->compressed_page; + rdt_datas_last->len = compressed_len; + } + else + { + bimg.length = BLCKSZ - cbimg.hole_length; + + if (cbimg.hole_length == 0) + { + rdt_datas_last->data = page; + rdt_datas_last->len = BLCKSZ; + } + else + { + /* must skip the hole */ + rdt_datas_last->data = page; + rdt_datas_last->len = bimg.hole_offset; + + rdt_datas_last->next = ®buf->bkp_rdatas[1]; + rdt_datas_last = rdt_datas_last->next; + + rdt_datas_last->data = + page + (bimg.hole_offset + cbimg.hole_length); + rdt_datas_last->len = + BLCKSZ - (bimg.hole_offset + cbimg.hole_length); + } + } + + total_len += bimg.length; + } + + if (needs_data) + { + /* + * Link the caller-supplied rdata chain for this buffer to the + * overall list. 
+ */ + bkpb.fork_flags |= BKPBLOCK_HAS_DATA; + bkpb.data_length = regbuf->rdata_len; + total_len += regbuf->rdata_len; + + rdt_datas_last->next = regbuf->rdata_head; + rdt_datas_last = regbuf->rdata_tail; + } + + if (prev_regbuf && RelFileNodeEquals(regbuf->rnode, prev_regbuf->rnode)) + { + samerel = true; + bkpb.fork_flags |= BKPBLOCK_SAME_REL; + } + else + samerel = false; + prev_regbuf = regbuf; + + /* Ok, copy the header to the scratch buffer */ + memcpy(scratch, &bkpb, SizeOfXLogRecordBlockHeader); + scratch += SizeOfXLogRecordBlockHeader; + if (include_image) + { + memcpy(scratch, &bimg, SizeOfXLogRecordBlockImageHeader); + scratch += SizeOfXLogRecordBlockImageHeader; + if (cbimg.hole_length != 0 && is_compressed) + { + memcpy(scratch, &cbimg, + SizeOfXLogRecordBlockCompressHeader); + scratch += SizeOfXLogRecordBlockCompressHeader; + } + } + if (!samerel) + { + memcpy(scratch, ®buf->rnode, sizeof(RelFileNode)); + scratch += sizeof(RelFileNode); + } + memcpy(scratch, ®buf->block, sizeof(BlockNumber)); + scratch += sizeof(BlockNumber); + } + + /* followed by the record's origin, if any */ + if ((curinsert_flags & XLOG_INCLUDE_ORIGIN) && + replorigin_session_origin != InvalidRepOriginId) + { + *(scratch++) = (char) XLR_BLOCK_ID_ORIGIN; + memcpy(scratch, &replorigin_session_origin, sizeof(replorigin_session_origin)); + scratch += sizeof(replorigin_session_origin); + } + + /* followed by toplevel XID, if not already included in previous record */ + if (IsSubxactTopXidLogPending()) + { + TransactionId xid = GetTopTransactionIdIfAny(); + + /* Set the flag that the top xid is included in the WAL */ + *topxid_included = true; + + *(scratch++) = (char) XLR_BLOCK_ID_TOPLEVEL_XID; + memcpy(scratch, &xid, sizeof(TransactionId)); + scratch += sizeof(TransactionId); + } + + /* followed by main data, if any */ + if (mainrdata_len > 0) + { + if (mainrdata_len > 255) + { + *(scratch++) = (char) XLR_BLOCK_ID_DATA_LONG; + memcpy(scratch, &mainrdata_len, sizeof(uint32)); + scratch += sizeof(uint32); + } + else + { + *(scratch++) = (char) XLR_BLOCK_ID_DATA_SHORT; + *(scratch++) = (uint8) mainrdata_len; + } + rdt_datas_last->next = mainrdata_head; + rdt_datas_last = mainrdata_last; + total_len += mainrdata_len; + } + rdt_datas_last->next = NULL; + + hdr_rdt.len = (scratch - hdr_scratch); + total_len += hdr_rdt.len; + + /* + * Calculate CRC of the data + * + * Note that the record header isn't added into the CRC initially since we + * don't know the prev-link yet. Thus, the CRC will represent the CRC of + * the whole record in the order: rdata, then backup blocks, then record + * header. + */ + INIT_CRC32C(rdata_crc); + COMP_CRC32C(rdata_crc, hdr_scratch + SizeOfXLogRecord, hdr_rdt.len - SizeOfXLogRecord); + for (rdt = hdr_rdt.next; rdt != NULL; rdt = rdt->next) + COMP_CRC32C(rdata_crc, rdt->data, rdt->len); + + /* + * Fill in the fields in the record header. Prev-link is filled in later, + * once we know where in the WAL the record will be inserted. The CRC does + * not include the record header yet. + */ + rechdr->xl_xid = GetCurrentTransactionIdIfAny(); + rechdr->xl_tot_len = total_len; + rechdr->xl_info = info; + rechdr->xl_rmid = rmid; + rechdr->xl_prev = InvalidXLogRecPtr; + rechdr->xl_crc = rdata_crc; + + return &hdr_rdt; +} + +/* + * Create a compressed version of a backup block image. + * + * Returns false if compression fails (i.e., compressed result is actually + * bigger than original). Otherwise, returns true and sets 'dlen' to + * the length of compressed block image. 
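In other words, a compressed image is kept only when it saves more than the extra compression header it forces into the record. A standalone sketch of that acceptance test with illustrative numbers (an 8192-byte block with a 2000-byte hole, so orig_len is 6192, and a small extra header when a hole exists):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
compressed_image_is_worth_it(int32_t len, int32_t extra_bytes, int32_t orig_len)
{
	/* len < 0 means the compressor itself reported failure */
	return len >= 0 && len + extra_bytes < orig_len;
}

int
main(void)
{
	printf("%d\n", compressed_image_is_worth_it(5800, 2, 6192));	/* 1: image shrinks, keep it */
	printf("%d\n", compressed_image_is_worth_it(6191, 2, 6192));	/* 0: savings too small, use the raw image */
	printf("%d\n", compressed_image_is_worth_it(-1, 2, 6192));		/* 0: compression failed outright */
	return 0;
}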
+ */ +static bool +XLogCompressBackupBlock(char *page, uint16 hole_offset, uint16 hole_length, + char *dest, uint16 *dlen) +{ + int32 orig_len = BLCKSZ - hole_length; + int32 len = -1; + int32 extra_bytes = 0; + char *source; + PGAlignedBlock tmp; + + if (hole_length != 0) + { + /* must skip the hole */ + source = tmp.data; + memcpy(source, page, hole_offset); + memcpy(source + hole_offset, + page + (hole_offset + hole_length), + BLCKSZ - (hole_length + hole_offset)); + + /* + * Extra data needs to be stored in WAL record for the compressed + * version of block image if the hole exists. + */ + extra_bytes = SizeOfXLogRecordBlockCompressHeader; + } + else + source = page; + + switch ((WalCompression) wal_compression) + { + case WAL_COMPRESSION_PGLZ: + len = pglz_compress(source, orig_len, dest, PGLZ_strategy_default); + break; + + case WAL_COMPRESSION_LZ4: +#ifdef USE_LZ4 + len = LZ4_compress_default(source, dest, orig_len, + COMPRESS_BUFSIZE); + if (len <= 0) + len = -1; /* failure */ +#else + elog(ERROR, "LZ4 is not supported by this build"); +#endif + break; + + case WAL_COMPRESSION_ZSTD: +#ifdef USE_ZSTD + len = ZSTD_compress(dest, COMPRESS_BUFSIZE, source, orig_len, + ZSTD_CLEVEL_DEFAULT); + if (ZSTD_isError(len)) + len = -1; /* failure */ +#else + elog(ERROR, "zstd is not supported by this build"); +#endif + break; + + case WAL_COMPRESSION_NONE: + Assert(false); /* cannot happen */ + break; + /* no default case, so that compiler will warn */ + } + + /* + * We recheck the actual size even if compression reports success and see + * if the number of bytes saved by compression is larger than the length + * of extra data needed for the compressed version of block image. + */ + if (len >= 0 && + len + extra_bytes < orig_len) + { + *dlen = (uint16) len; /* successful compression */ + return true; + } + return false; +} + +/* + * Determine whether the buffer referenced has to be backed up. + * + * Since we don't yet have the insert lock, fullPageWrites and forcePageWrites + * could change later, so the result should be used for optimization purposes + * only. + */ +bool +XLogCheckBufferNeedsBackup(Buffer buffer) +{ + XLogRecPtr RedoRecPtr; + bool doPageWrites; + Page page; + + GetFullPageWriteInfo(&RedoRecPtr, &doPageWrites); + + page = BufferGetPage(buffer); + + if (doPageWrites && PageGetLSN(page) <= RedoRecPtr) + return true; /* buffer requires backup */ + + return false; /* buffer does not need to be backed up */ +} + +/* + * Write a backup block if needed when we are setting a hint. Note that + * this may be called for a variety of page types, not just heaps. + * + * Callable while holding just share lock on the buffer content. + * + * We can't use the plain backup block mechanism since that relies on the + * Buffer being exclusively locked. Since some modifications (setting LSN, hint + * bits) are allowed in a sharelocked buffer that can lead to wal checksum + * failures. So instead we copy the page and insert the copied data as normal + * record data. + * + * We only need to do something if page has not yet been full page written in + * this checkpoint round. The LSN of the inserted wal record is returned if we + * had to write, InvalidXLogRecPtr otherwise. + * + * It is possible that multiple concurrent backends could attempt to write WAL + * records. In that case, multiple copies of the same block would be recorded + * in separate WAL records by different backends, though that is still OK from + * a correctness perspective. 
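A hedged sketch of the caller side, loosely modelled on what MarkBufferDirtyHint() does when data checksums or wal_log_hints make hint-bit changes WAL-sensitive. The helper is hypothetical and simplified: the real caller also takes the buffer header lock before stamping the LSN, because only a share lock is held on the buffer content.

#include "postgres.h"

#include "access/xlog.h"
#include "access/xloginsert.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "storage/proc.h"

static void
demo_wal_log_hint(Buffer buf, bool buffer_std)
{
	if (XLogHintBitIsNeeded())
	{
		XLogRecPtr	lsn;

		/* satisfy the Assert below: keep the redo pointer stable meanwhile */
		MyProc->delayChkptFlags |= DELAY_CHKPT_START;

		lsn = XLogSaveBufferForHint(buf, buffer_std);
		if (!XLogRecPtrIsInvalid(lsn))
			PageSetLSN(BufferGetPage(buf), lsn);

		MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
	}
}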
+ */ +XLogRecPtr +XLogSaveBufferForHint(Buffer buffer, bool buffer_std) +{ + XLogRecPtr recptr = InvalidXLogRecPtr; + XLogRecPtr lsn; + XLogRecPtr RedoRecPtr; + + /* + * Ensure no checkpoint can change our view of RedoRecPtr. + */ + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) != 0); + + /* + * Update RedoRecPtr so that we can make the right decision + */ + RedoRecPtr = GetRedoRecPtr(); + + /* + * We assume page LSN is first data on *every* page that can be passed to + * XLogInsert, whether it has the standard page layout or not. Since we're + * only holding a share-lock on the page, we must take the buffer header + * lock when we look at the LSN. + */ + lsn = BufferGetLSNAtomic(buffer); + + if (lsn <= RedoRecPtr) + { + int flags = 0; + PGAlignedBlock copied_buffer; + char *origdata = (char *) BufferGetBlock(buffer); + RelFileNode rnode; + ForkNumber forkno; + BlockNumber blkno; + + /* + * Copy buffer so we don't have to worry about concurrent hint bit or + * lsn updates. We assume pd_lower/upper cannot be changed without an + * exclusive lock, so the contents bkp are not racy. + */ + if (buffer_std) + { + /* Assume we can omit data between pd_lower and pd_upper */ + Page page = BufferGetPage(buffer); + uint16 lower = ((PageHeader) page)->pd_lower; + uint16 upper = ((PageHeader) page)->pd_upper; + + memcpy(copied_buffer.data, origdata, lower); + memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper); + } + else + memcpy(copied_buffer.data, origdata, BLCKSZ); + + XLogBeginInsert(); + + if (buffer_std) + flags |= REGBUF_STANDARD; + + BufferGetTag(buffer, &rnode, &forkno, &blkno); + XLogRegisterBlock(0, &rnode, forkno, blkno, copied_buffer.data, flags); + + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI_FOR_HINT); + } + + return recptr; +} + +/* + * Write a WAL record containing a full image of a page. Caller is responsible + * for writing the page to disk after calling this routine. + * + * Note: If you're using this function, you should be building pages in private + * memory and writing them directly to smgr. If you're using buffers, call + * log_newpage_buffer instead. + * + * If the page follows the standard page layout, with a PageHeader and unused + * space between pd_lower and pd_upper, set 'page_std' to true. That allows + * the unused space to be left out from the WAL record, making it smaller. + */ +XLogRecPtr +log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, + Page page, bool page_std) +{ + int flags; + XLogRecPtr recptr; + + flags = REGBUF_FORCE_IMAGE; + if (page_std) + flags |= REGBUF_STANDARD; + + XLogBeginInsert(); + XLogRegisterBlock(0, rnode, forkNum, blkno, page, flags); + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); + + /* + * The page may be uninitialized. If so, we can't set the LSN because that + * would corrupt the page. + */ + if (!PageIsNew(page)) + { + PageSetLSN(page, recptr); + } + + return recptr; +} + +/* + * Like log_newpage(), but allows logging multiple pages in one operation. + * It is more efficient than calling log_newpage() for each page separately, + * because we can write multiple pages in a single WAL record. + */ +void +log_newpages(RelFileNode *rnode, ForkNumber forkNum, int num_pages, + BlockNumber *blknos, Page *pages, bool page_std) +{ + int flags; + XLogRecPtr recptr; + int i; + int j; + + flags = REGBUF_FORCE_IMAGE; + if (page_std) + flags |= REGBUF_STANDARD; + + /* + * Iterate over all the pages. 
They are collected into batches of + * XLR_MAX_BLOCK_ID pages, and a single WAL-record is written for each + * batch. + */ + XLogEnsureRecordSpace(XLR_MAX_BLOCK_ID - 1, 0); + + i = 0; + while (i < num_pages) + { + int batch_start = i; + int nbatch; + + XLogBeginInsert(); + + nbatch = 0; + while (nbatch < XLR_MAX_BLOCK_ID && i < num_pages) + { + XLogRegisterBlock(nbatch, rnode, forkNum, blknos[i], pages[i], flags); + i++; + nbatch++; + } + + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); + + for (j = batch_start; j < i; j++) + { + /* + * The page may be uninitialized. If so, we can't set the LSN + * because that would corrupt the page. + */ + if (!PageIsNew(pages[j])) + { + PageSetLSN(pages[j], recptr); + } + } + } +} + +/* + * Write a WAL record containing a full image of a page. + * + * Caller should initialize the buffer and mark it dirty before calling this + * function. This function will set the page LSN. + * + * If the page follows the standard page layout, with a PageHeader and unused + * space between pd_lower and pd_upper, set 'page_std' to true. That allows + * the unused space to be left out from the WAL record, making it smaller. + */ +XLogRecPtr +log_newpage_buffer(Buffer buffer, bool page_std) +{ + Page page = BufferGetPage(buffer); + RelFileNode rnode; + ForkNumber forkNum; + BlockNumber blkno; + + /* Shared buffers should be modified in a critical section. */ + Assert(CritSectionCount > 0); + + BufferGetTag(buffer, &rnode, &forkNum, &blkno); + + return log_newpage(&rnode, forkNum, blkno, page, page_std); +} + +/* + * WAL-log a range of blocks in a relation. + * + * An image of all pages with block numbers 'startblk' <= X < 'endblk' is + * written to the WAL. If the range is large, this is done in multiple WAL + * records. + * + * If all page follows the standard page layout, with a PageHeader and unused + * space between pd_lower and pd_upper, set 'page_std' to true. That allows + * the unused space to be left out from the WAL records, making them smaller. + * + * NOTE: This function acquires exclusive-locks on the pages. Typically, this + * is used on a newly-built relation, and the caller is holding a + * AccessExclusiveLock on it, so no other backend can be accessing it at the + * same time. If that's not the case, you must ensure that this does not + * cause a deadlock through some other means. + */ +void +log_newpage_range(Relation rel, ForkNumber forkNum, + BlockNumber startblk, BlockNumber endblk, + bool page_std) +{ + int flags; + BlockNumber blkno; + + flags = REGBUF_FORCE_IMAGE; + if (page_std) + flags |= REGBUF_STANDARD; + + /* + * Iterate over all the pages in the range. They are collected into + * batches of XLR_MAX_BLOCK_ID pages, and a single WAL-record is written + * for each batch. + */ + XLogEnsureRecordSpace(XLR_MAX_BLOCK_ID - 1, 0); + + blkno = startblk; + while (blkno < endblk) + { + Buffer bufpack[XLR_MAX_BLOCK_ID]; + XLogRecPtr recptr; + int nbufs; + int i; + + CHECK_FOR_INTERRUPTS(); + + /* Collect a batch of blocks. */ + nbufs = 0; + while (nbufs < XLR_MAX_BLOCK_ID && blkno < endblk) + { + Buffer buf = ReadBufferExtended(rel, forkNum, blkno, + RBM_NORMAL, NULL); + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * Completely empty pages are not WAL-logged. Writing a WAL record + * would change the LSN, and we don't want that. We want the page + * to stay empty. + */ + if (!PageIsNew(BufferGetPage(buf))) + bufpack[nbufs++] = buf; + else + UnlockReleaseBuffer(buf); + blkno++; + } + + /* Nothing more to do if all remaining blocks were empty. 
*/ + if (nbufs == 0) + break; + + /* Write WAL record for this batch. */ + XLogBeginInsert(); + + START_CRIT_SECTION(); + for (i = 0; i < nbufs; i++) + { + XLogRegisterBuffer(i, bufpack[i], flags); + MarkBufferDirty(bufpack[i]); + } + + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); + + for (i = 0; i < nbufs; i++) + { + PageSetLSN(BufferGetPage(bufpack[i]), recptr); + UnlockReleaseBuffer(bufpack[i]); + } + END_CRIT_SECTION(); + } +} + +/* + * Allocate working buffers needed for WAL record construction. + */ +void +InitXLogInsert(void) +{ + /* Initialize the working areas */ + if (xloginsert_cxt == NULL) + { + xloginsert_cxt = AllocSetContextCreate(TopMemoryContext, + "WAL record construction", + ALLOCSET_DEFAULT_SIZES); + } + + if (registered_buffers == NULL) + { + registered_buffers = (registered_buffer *) + MemoryContextAllocZero(xloginsert_cxt, + sizeof(registered_buffer) * (XLR_NORMAL_MAX_BLOCK_ID + 1)); + max_registered_buffers = XLR_NORMAL_MAX_BLOCK_ID + 1; + } + if (rdatas == NULL) + { + rdatas = MemoryContextAlloc(xloginsert_cxt, + sizeof(XLogRecData) * XLR_NORMAL_RDATAS); + max_rdatas = XLR_NORMAL_RDATAS; + } + + /* + * Allocate a buffer to hold the header information for a WAL record. + */ + if (hdr_scratch == NULL) + hdr_scratch = MemoryContextAllocZero(xloginsert_cxt, + HEADER_SCRATCH_SIZE); +} diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c new file mode 100644 index 0000000..b98b319 --- /dev/null +++ b/src/backend/access/transam/xlogprefetcher.c @@ -0,0 +1,1105 @@ +/*------------------------------------------------------------------------- + * + * xlogprefetcher.c + * Prefetching support for recovery. + * + * Portions Copyright (c) 2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/transam/xlogprefetcher.c + * + * This module provides a drop-in replacement for an XLogReader that tries to + * minimize I/O stalls by looking ahead in the WAL. If blocks that will be + * accessed in the near future are not already in the buffer pool, it initiates + * I/Os that might complete before the caller eventually needs the data. When + * referenced blocks are found in the buffer pool already, the buffer is + * recorded in the decoded record so that XLogReadBufferForRedo() can try to + * avoid a second buffer mapping table lookup. + * + * Currently, only the main fork is considered for prefetching. Currently, + * prefetching is only effective on systems where BufferPrefetch() does + * something useful (mainly Linux). + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/xlog.h" +#include "access/xlogprefetcher.h" +#include "access/xlogreader.h" +#include "access/xlogutils.h" +#include "catalog/pg_class.h" +#include "catalog/pg_control.h" +#include "catalog/storage_xlog.h" +#include "commands/dbcommands_xlog.h" +#include "utils/fmgrprotos.h" +#include "utils/timestamp.h" +#include "funcapi.h" +#include "pgstat.h" +#include "miscadmin.h" +#include "port/atomics.h" +#include "storage/bufmgr.h" +#include "storage/shmem.h" +#include "storage/smgr.h" +#include "utils/guc.h" +#include "utils/hsearch.h" + +/* + * Every time we process this much WAL, we'll update the values in + * pg_stat_recovery_prefetch. 
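+ * (The distance is defined as BLCKSZ below, so with the stock 8 kB block
+ * size the shared counters are refreshed roughly once per 8 kB of decoded
+ * WAL.)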
+ */ +#define XLOGPREFETCHER_STATS_DISTANCE BLCKSZ + +/* + * To detect repeated access to the same block and skip useless extra system + * calls, we remember a small window of recently prefetched blocks. + */ +#define XLOGPREFETCHER_SEQ_WINDOW_SIZE 4 + +/* + * When maintenance_io_concurrency is not saturated, we're prepared to look + * ahead up to N times that number of block references. + */ +#define XLOGPREFETCHER_DISTANCE_MULTIPLIER 4 + +/* Define to log internal debugging messages. */ +/* #define XLOGPREFETCHER_DEBUG_LEVEL LOG */ + +/* GUCs */ +int recovery_prefetch = RECOVERY_PREFETCH_TRY; + +#ifdef USE_PREFETCH +#define RecoveryPrefetchEnabled() \ + (recovery_prefetch != RECOVERY_PREFETCH_OFF && \ + maintenance_io_concurrency > 0) +#else +#define RecoveryPrefetchEnabled() false +#endif + +static int XLogPrefetchReconfigureCount = 0; + +/* + * Enum used to report whether an IO should be started. + */ +typedef enum +{ + LRQ_NEXT_NO_IO, + LRQ_NEXT_IO, + LRQ_NEXT_AGAIN +} LsnReadQueueNextStatus; + +/* + * Type of callback that can decide which block to prefetch next. For now + * there is only one. + */ +typedef LsnReadQueueNextStatus (*LsnReadQueueNextFun) (uintptr_t lrq_private, + XLogRecPtr *lsn); + +/* + * A simple circular queue of LSNs, using to control the number of + * (potentially) inflight IOs. This stands in for a later more general IO + * control mechanism, which is why it has the apparently unnecessary + * indirection through a function pointer. + */ +typedef struct LsnReadQueue +{ + LsnReadQueueNextFun next; + uintptr_t lrq_private; + uint32 max_inflight; + uint32 inflight; + uint32 completed; + uint32 head; + uint32 tail; + uint32 size; + struct + { + bool io; + XLogRecPtr lsn; + } queue[FLEXIBLE_ARRAY_MEMBER]; +} LsnReadQueue; + +/* + * A prefetcher. This is a mechanism that wraps an XLogReader, prefetching + * blocks that will be soon be referenced, to try to avoid IO stalls. + */ +struct XLogPrefetcher +{ + /* WAL reader and current reading state. */ + XLogReaderState *reader; + DecodedXLogRecord *record; + int next_block_id; + + /* When to publish stats. */ + XLogRecPtr next_stats_shm_lsn; + + /* Book-keeping to avoid accessing blocks that don't exist yet. */ + HTAB *filter_table; + dlist_head filter_queue; + + /* Book-keeping to avoid repeat prefetches. */ + RelFileNode recent_rnode[XLOGPREFETCHER_SEQ_WINDOW_SIZE]; + BlockNumber recent_block[XLOGPREFETCHER_SEQ_WINDOW_SIZE]; + int recent_idx; + + /* Book-keeping to disable prefetching temporarily. */ + XLogRecPtr no_readahead_until; + + /* IO depth manager. */ + LsnReadQueue *streaming_read; + + XLogRecPtr begin_ptr; + + int reconfigure_count; +}; + +/* + * A temporary filter used to track block ranges that haven't been created + * yet, whole relations that haven't been created yet, and whole relations + * that (we assume) have already been dropped, or will be created by bulk WAL + * operators. + */ +typedef struct XLogPrefetcherFilter +{ + RelFileNode rnode; + XLogRecPtr filter_until_replayed; + BlockNumber filter_from_block; + dlist_node link; +} XLogPrefetcherFilter; + +/* + * Counters exposed in shared memory for pg_stat_recovery_prefetch. + */ +typedef struct XLogPrefetchStats +{ + pg_atomic_uint64 reset_time; /* Time of last reset. */ + pg_atomic_uint64 prefetch; /* Prefetches initiated. */ + pg_atomic_uint64 hit; /* Blocks already in cache. */ + pg_atomic_uint64 skip_init; /* Zero-inited blocks skipped. */ + pg_atomic_uint64 skip_new; /* New/missing blocks filtered. 
*/ + pg_atomic_uint64 skip_fpw; /* FPWs skipped. */ + pg_atomic_uint64 skip_rep; /* Repeat accesses skipped. */ + + /* Dynamic values */ + int wal_distance; /* Number of WAL bytes ahead. */ + int block_distance; /* Number of block references ahead. */ + int io_depth; /* Number of I/Os in progress. */ +} XLogPrefetchStats; + +static inline void XLogPrefetcherAddFilter(XLogPrefetcher *prefetcher, + RelFileNode rnode, + BlockNumber blockno, + XLogRecPtr lsn); +static inline bool XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, + RelFileNode rnode, + BlockNumber blockno); +static inline void XLogPrefetcherCompleteFilters(XLogPrefetcher *prefetcher, + XLogRecPtr replaying_lsn); +static LsnReadQueueNextStatus XLogPrefetcherNextBlock(uintptr_t pgsr_private, + XLogRecPtr *lsn); + +static XLogPrefetchStats *SharedStats; + +static inline LsnReadQueue * +lrq_alloc(uint32 max_distance, + uint32 max_inflight, + uintptr_t lrq_private, + LsnReadQueueNextFun next) +{ + LsnReadQueue *lrq; + uint32 size; + + Assert(max_distance >= max_inflight); + + size = max_distance + 1; /* full ring buffer has a gap */ + lrq = palloc(offsetof(LsnReadQueue, queue) + sizeof(lrq->queue[0]) * size); + lrq->lrq_private = lrq_private; + lrq->max_inflight = max_inflight; + lrq->size = size; + lrq->next = next; + lrq->head = 0; + lrq->tail = 0; + lrq->inflight = 0; + lrq->completed = 0; + + return lrq; +} + +static inline void +lrq_free(LsnReadQueue *lrq) +{ + pfree(lrq); +} + +static inline uint32 +lrq_inflight(LsnReadQueue *lrq) +{ + return lrq->inflight; +} + +static inline uint32 +lrq_completed(LsnReadQueue *lrq) +{ + return lrq->completed; +} + +static inline void +lrq_prefetch(LsnReadQueue *lrq) +{ + /* Try to start as many IOs as we can within our limits. */ + while (lrq->inflight < lrq->max_inflight && + lrq->inflight + lrq->completed < lrq->size - 1) + { + Assert(((lrq->head + 1) % lrq->size) != lrq->tail); + switch (lrq->next(lrq->lrq_private, &lrq->queue[lrq->head].lsn)) + { + case LRQ_NEXT_AGAIN: + return; + case LRQ_NEXT_IO: + lrq->queue[lrq->head].io = true; + lrq->inflight++; + break; + case LRQ_NEXT_NO_IO: + lrq->queue[lrq->head].io = false; + lrq->completed++; + break; + } + lrq->head++; + if (lrq->head == lrq->size) + lrq->head = 0; + } +} + +static inline void +lrq_complete_lsn(LsnReadQueue *lrq, XLogRecPtr lsn) +{ + /* + * We know that LSNs before 'lsn' have been replayed, so we can now assume + * that any IOs that were started before then have finished. + */ + while (lrq->tail != lrq->head && + lrq->queue[lrq->tail].lsn < lsn) + { + if (lrq->queue[lrq->tail].io) + lrq->inflight--; + else + lrq->completed--; + lrq->tail++; + if (lrq->tail == lrq->size) + lrq->tail = 0; + } + if (RecoveryPrefetchEnabled()) + lrq_prefetch(lrq); +} + +size_t +XLogPrefetchShmemSize(void) +{ + return sizeof(XLogPrefetchStats); +} + +/* + * Reset all counters to zero. 
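+ * This is typically reached via pg_stat_reset_shared('recovery_prefetch').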
+ */ +void +XLogPrefetchResetStats(void) +{ + pg_atomic_write_u64(&SharedStats->reset_time, GetCurrentTimestamp()); + pg_atomic_write_u64(&SharedStats->prefetch, 0); + pg_atomic_write_u64(&SharedStats->hit, 0); + pg_atomic_write_u64(&SharedStats->skip_init, 0); + pg_atomic_write_u64(&SharedStats->skip_new, 0); + pg_atomic_write_u64(&SharedStats->skip_fpw, 0); + pg_atomic_write_u64(&SharedStats->skip_rep, 0); +} + +void +XLogPrefetchShmemInit(void) +{ + bool found; + + SharedStats = (XLogPrefetchStats *) + ShmemInitStruct("XLogPrefetchStats", + sizeof(XLogPrefetchStats), + &found); + + if (!found) + { + pg_atomic_init_u64(&SharedStats->reset_time, GetCurrentTimestamp()); + pg_atomic_init_u64(&SharedStats->prefetch, 0); + pg_atomic_init_u64(&SharedStats->hit, 0); + pg_atomic_init_u64(&SharedStats->skip_init, 0); + pg_atomic_init_u64(&SharedStats->skip_new, 0); + pg_atomic_init_u64(&SharedStats->skip_fpw, 0); + pg_atomic_init_u64(&SharedStats->skip_rep, 0); + } +} + +/* + * Called when any GUC is changed that affects prefetching. + */ +void +XLogPrefetchReconfigure(void) +{ + XLogPrefetchReconfigureCount++; +} + +/* + * Increment a counter in shared memory. This is equivalent to *counter++ on a + * plain uint64 without any memory barrier or locking, except on platforms + * where readers can't read uint64 without possibly observing a torn value. + */ +static inline void +XLogPrefetchIncrement(pg_atomic_uint64 *counter) +{ + Assert(AmStartupProcess() || !IsUnderPostmaster); + pg_atomic_write_u64(counter, pg_atomic_read_u64(counter) + 1); +} + +/* + * Create a prefetcher that is ready to begin prefetching blocks referenced by + * WAL records. + */ +XLogPrefetcher * +XLogPrefetcherAllocate(XLogReaderState *reader) +{ + XLogPrefetcher *prefetcher; + static HASHCTL hash_table_ctl = { + .keysize = sizeof(RelFileNode), + .entrysize = sizeof(XLogPrefetcherFilter) + }; + + prefetcher = palloc0(sizeof(XLogPrefetcher)); + + prefetcher->reader = reader; + prefetcher->filter_table = hash_create("XLogPrefetcherFilterTable", 1024, + &hash_table_ctl, + HASH_ELEM | HASH_BLOBS); + dlist_init(&prefetcher->filter_queue); + + SharedStats->wal_distance = 0; + SharedStats->block_distance = 0; + SharedStats->io_depth = 0; + + /* First usage will cause streaming_read to be allocated. */ + prefetcher->reconfigure_count = XLogPrefetchReconfigureCount - 1; + + return prefetcher; +} + +/* + * Destroy a prefetcher and release all resources. + */ +void +XLogPrefetcherFree(XLogPrefetcher *prefetcher) +{ + lrq_free(prefetcher->streaming_read); + hash_destroy(prefetcher->filter_table); + pfree(prefetcher); +} + +/* + * Provide access to the reader. + */ +XLogReaderState * +XLogPrefetcherGetReader(XLogPrefetcher *prefetcher) +{ + return prefetcher->reader; +} + +/* + * Update the statistics visible in the pg_stat_recovery_prefetch view. + */ +void +XLogPrefetcherComputeStats(XLogPrefetcher *prefetcher) +{ + uint32 io_depth; + uint32 completed; + int64 wal_distance; + + + /* How far ahead of replay are we now? */ + if (prefetcher->reader->decode_queue_tail) + { + wal_distance = + prefetcher->reader->decode_queue_tail->lsn - + prefetcher->reader->decode_queue_head->lsn; + } + else + { + wal_distance = 0; + } + + /* How many IOs are currently in flight and completed? */ + io_depth = lrq_inflight(prefetcher->streaming_read); + completed = lrq_completed(prefetcher->streaming_read); + + /* Update the instantaneous stats visible in pg_stat_recovery_prefetch. 
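+ * (block_distance below is io_depth + completed, i.e. the total number of
+ * block references currently tracked ahead of replay, whether or not they
+ * required starting an I/O.)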
*/ + SharedStats->io_depth = io_depth; + SharedStats->block_distance = io_depth + completed; + SharedStats->wal_distance = wal_distance; + + prefetcher->next_stats_shm_lsn = + prefetcher->reader->ReadRecPtr + XLOGPREFETCHER_STATS_DISTANCE; +} + +/* + * A callback that examines the next block reference in the WAL, and possibly + * starts an IO so that a later read will be fast. + * + * Returns LRQ_NEXT_AGAIN if no more WAL data is available yet. + * + * Returns LRQ_NEXT_IO if the next block reference is for a main fork block + * that isn't in the buffer pool, and the kernel has been asked to start + * reading it to make a future read system call faster. An LSN is written to + * *lsn, and the I/O will be considered to have completed once that LSN is + * replayed. + * + * Returns LRQ_NO_IO if we examined the next block reference and found that it + * was already in the buffer pool, or we decided for various reasons not to + * prefetch. + */ +static LsnReadQueueNextStatus +XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) +{ + XLogPrefetcher *prefetcher = (XLogPrefetcher *) pgsr_private; + XLogReaderState *reader = prefetcher->reader; + XLogRecPtr replaying_lsn = reader->ReadRecPtr; + + /* + * We keep track of the record and block we're up to between calls with + * prefetcher->record and prefetcher->next_block_id. + */ + for (;;) + { + DecodedXLogRecord *record; + + /* Try to read a new future record, if we don't already have one. */ + if (prefetcher->record == NULL) + { + bool nonblocking; + + /* + * If there are already records or an error queued up that could + * be replayed, we don't want to block here. Otherwise, it's OK + * to block waiting for more data: presumably the caller has + * nothing else to do. + */ + nonblocking = XLogReaderHasQueuedRecordOrError(reader); + + /* Readahead is disabled until we replay past a certain point. */ + if (nonblocking && replaying_lsn <= prefetcher->no_readahead_until) + return LRQ_NEXT_AGAIN; + + record = XLogReadAhead(prefetcher->reader, nonblocking); + if (record == NULL) + { + /* + * We can't read any more, due to an error or lack of data in + * nonblocking mode. Don't try to read ahead again until + * we've replayed everything already decoded. + */ + if (nonblocking && prefetcher->reader->decode_queue_tail) + prefetcher->no_readahead_until = + prefetcher->reader->decode_queue_tail->lsn; + + return LRQ_NEXT_AGAIN; + } + + /* + * If prefetching is disabled, we don't need to analyze the record + * or issue any prefetches. We just need to cause one record to + * be decoded. + */ + if (!RecoveryPrefetchEnabled()) + { + *lsn = InvalidXLogRecPtr; + return LRQ_NEXT_NO_IO; + } + + /* We have a new record to process. */ + prefetcher->record = record; + prefetcher->next_block_id = 0; + } + else + { + /* Continue to process from last call, or last loop. */ + record = prefetcher->record; + } + + /* + * Check for operations that require us to filter out block ranges, or + * pause readahead completely. + */ + if (replaying_lsn < record->lsn) + { + uint8 rmid = record->header.xl_rmid; + uint8 record_type = record->header.xl_info & ~XLR_INFO_MASK; + + if (rmid == RM_XLOG_ID) + { + if (record_type == XLOG_CHECKPOINT_SHUTDOWN || + record_type == XLOG_END_OF_RECOVERY) + { + /* + * These records might change the TLI. Avoid potential + * bugs if we were to allow "read TLI" and "replay TLI" to + * differ without more analysis. 
+ */ + prefetcher->no_readahead_until = record->lsn; + +#ifdef XLOGPREFETCHER_DEBUG_LEVEL + elog(XLOGPREFETCHER_DEBUG_LEVEL, + "suppressing all readahead until %X/%X is replayed due to possible TLI change", + LSN_FORMAT_ARGS(record->lsn)); +#endif + + /* Fall through so we move past this record. */ + } + } + else if (rmid == RM_DBASE_ID) + { + /* + * When databases are created with the file-copy strategy, + * there are no WAL records to tell us about the creation of + * individual relations. + */ + if (record_type == XLOG_DBASE_CREATE_FILE_COPY) + { + xl_dbase_create_file_copy_rec *xlrec = + (xl_dbase_create_file_copy_rec *) record->main_data; + RelFileNode rnode = {InvalidOid, xlrec->db_id, InvalidOid}; + + /* + * Don't try to prefetch anything in this database until + * it has been created, or we might confuse the blocks of + * different generations, if a database OID or relfilenode + * is reused. It's also more efficient than discovering + * that relations don't exist on disk yet with ENOENT + * errors. + */ + XLogPrefetcherAddFilter(prefetcher, rnode, 0, record->lsn); + +#ifdef XLOGPREFETCHER_DEBUG_LEVEL + elog(XLOGPREFETCHER_DEBUG_LEVEL, + "suppressing prefetch in database %u until %X/%X is replayed due to raw file copy", + rnode.dbNode, + LSN_FORMAT_ARGS(record->lsn)); +#endif + } + } + else if (rmid == RM_SMGR_ID) + { + if (record_type == XLOG_SMGR_CREATE) + { + xl_smgr_create *xlrec = (xl_smgr_create *) + record->main_data; + + if (xlrec->forkNum == MAIN_FORKNUM) + { + /* + * Don't prefetch anything for this whole relation + * until it has been created. Otherwise we might + * confuse the blocks of different generations, if a + * relfilenode is reused. This also avoids the need + * to discover the problem via extra syscalls that + * report ENOENT. + */ + XLogPrefetcherAddFilter(prefetcher, xlrec->rnode, 0, + record->lsn); + +#ifdef XLOGPREFETCHER_DEBUG_LEVEL + elog(XLOGPREFETCHER_DEBUG_LEVEL, + "suppressing prefetch in relation %u/%u/%u until %X/%X is replayed, which creates the relation", + xlrec->rnode.spcNode, + xlrec->rnode.dbNode, + xlrec->rnode.relNode, + LSN_FORMAT_ARGS(record->lsn)); +#endif + } + } + else if (record_type == XLOG_SMGR_TRUNCATE) + { + xl_smgr_truncate *xlrec = (xl_smgr_truncate *) + record->main_data; + + /* + * Don't consider prefetching anything in the truncated + * range until the truncation has been performed. + */ + XLogPrefetcherAddFilter(prefetcher, xlrec->rnode, + xlrec->blkno, + record->lsn); + +#ifdef XLOGPREFETCHER_DEBUG_LEVEL + elog(XLOGPREFETCHER_DEBUG_LEVEL, + "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, which truncates the relation", + xlrec->rnode.spcNode, + xlrec->rnode.dbNode, + xlrec->rnode.relNode, + xlrec->blkno, + LSN_FORMAT_ARGS(record->lsn)); +#endif + } + } + } + + /* Scan the block references, starting where we left off last time. */ + while (prefetcher->next_block_id <= record->max_block_id) + { + int block_id = prefetcher->next_block_id++; + DecodedBkpBlock *block = &record->blocks[block_id]; + SMgrRelation reln; + PrefetchBufferResult result; + + if (!block->in_use) + continue; + + Assert(!BufferIsValid(block->prefetch_buffer));; + + /* + * Record the LSN of this record. When it's replayed, + * LsnReadQueue will consider any IOs submitted for earlier LSNs + * to be finished. + */ + *lsn = record->lsn; + + /* We don't try to prefetch anything but the main fork for now. 
*/ + if (block->forknum != MAIN_FORKNUM) + { + return LRQ_NEXT_NO_IO; + } + + /* + * If there is a full page image attached, we won't be reading the + * page, so don't bother trying to prefetch. + */ + if (block->has_image) + { + XLogPrefetchIncrement(&SharedStats->skip_fpw); + return LRQ_NEXT_NO_IO; + } + + /* There is no point in reading a page that will be zeroed. */ + if (block->flags & BKPBLOCK_WILL_INIT) + { + XLogPrefetchIncrement(&SharedStats->skip_init); + return LRQ_NEXT_NO_IO; + } + + /* Should we skip prefetching this block due to a filter? */ + if (XLogPrefetcherIsFiltered(prefetcher, block->rnode, block->blkno)) + { + XLogPrefetchIncrement(&SharedStats->skip_new); + return LRQ_NEXT_NO_IO; + } + + /* There is no point in repeatedly prefetching the same block. */ + for (int i = 0; i < XLOGPREFETCHER_SEQ_WINDOW_SIZE; ++i) + { + if (block->blkno == prefetcher->recent_block[i] && + RelFileNodeEquals(block->rnode, prefetcher->recent_rnode[i])) + { + /* + * XXX If we also remembered where it was, we could set + * recent_buffer so that recovery could skip smgropen() + * and a buffer table lookup. + */ + XLogPrefetchIncrement(&SharedStats->skip_rep); + return LRQ_NEXT_NO_IO; + } + } + prefetcher->recent_rnode[prefetcher->recent_idx] = block->rnode; + prefetcher->recent_block[prefetcher->recent_idx] = block->blkno; + prefetcher->recent_idx = + (prefetcher->recent_idx + 1) % XLOGPREFETCHER_SEQ_WINDOW_SIZE; + + /* + * We could try to have a fast path for repeated references to the + * same relation (with some scheme to handle invalidations + * safely), but for now we'll call smgropen() every time. + */ + reln = smgropen(block->rnode, InvalidBackendId); + + /* + * If the relation file doesn't exist on disk, for example because + * we're replaying after a crash and the file will be created and + * then unlinked by WAL that hasn't been replayed yet, suppress + * further prefetching in the relation until this record is + * replayed. + */ + if (!smgrexists(reln, MAIN_FORKNUM)) + { +#ifdef XLOGPREFETCHER_DEBUG_LEVEL + elog(XLOGPREFETCHER_DEBUG_LEVEL, + "suppressing all prefetch in relation %u/%u/%u until %X/%X is replayed, because the relation does not exist on disk", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + LSN_FORMAT_ARGS(record->lsn)); +#endif + XLogPrefetcherAddFilter(prefetcher, block->rnode, 0, + record->lsn); + XLogPrefetchIncrement(&SharedStats->skip_new); + return LRQ_NEXT_NO_IO; + } + + /* + * If the relation isn't big enough to contain the referenced + * block yet, suppress prefetching of this block and higher until + * this record is replayed. + */ + if (block->blkno >= smgrnblocks(reln, block->forknum)) + { +#ifdef XLOGPREFETCHER_DEBUG_LEVEL + elog(XLOGPREFETCHER_DEBUG_LEVEL, + "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, because the relation is too small", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + block->blkno, + LSN_FORMAT_ARGS(record->lsn)); +#endif + XLogPrefetcherAddFilter(prefetcher, block->rnode, block->blkno, + record->lsn); + XLogPrefetchIncrement(&SharedStats->skip_new); + return LRQ_NEXT_NO_IO; + } + + /* Try to initiate prefetching. */ + result = PrefetchSharedBuffer(reln, block->forknum, block->blkno); + if (BufferIsValid(result.recent_buffer)) + { + /* Cache hit, nothing to do. 
*/ + XLogPrefetchIncrement(&SharedStats->hit); + block->prefetch_buffer = result.recent_buffer; + return LRQ_NEXT_NO_IO; + } + else if (result.initiated_io) + { + /* Cache miss, I/O (presumably) started. */ + XLogPrefetchIncrement(&SharedStats->prefetch); + block->prefetch_buffer = InvalidBuffer; + return LRQ_NEXT_IO; + } + else + { + /* + * This shouldn't be possible, because we already determined + * that the relation exists on disk and is big enough. + * Something is wrong with the cache invalidation for + * smgrexists(), smgrnblocks(), or the file was unlinked or + * truncated beneath our feet? + */ + elog(ERROR, + "could not prefetch relation %u/%u/%u block %u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + block->blkno); + } + } + + /* + * Several callsites need to be able to read exactly one record + * without any internal readahead. Examples: xlog.c reading + * checkpoint records with emode set to PANIC, which might otherwise + * cause XLogPageRead() to panic on some future page, and xlog.c + * determining where to start writing WAL next, which depends on the + * contents of the reader's internal buffer after reading one record. + * Therefore, don't even think about prefetching until the first + * record after XLogPrefetcherBeginRead() has been consumed. + */ + if (prefetcher->reader->decode_queue_tail && + prefetcher->reader->decode_queue_tail->lsn == prefetcher->begin_ptr) + return LRQ_NEXT_AGAIN; + + /* Advance to the next record. */ + prefetcher->record = NULL; + } + pg_unreachable(); +} + +/* + * Expose statistics about recovery prefetching. + */ +Datum +pg_stat_get_recovery_prefetch(PG_FUNCTION_ARGS) +{ +#define PG_STAT_GET_RECOVERY_PREFETCH_COLS 10 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + Datum values[PG_STAT_GET_RECOVERY_PREFETCH_COLS]; + bool nulls[PG_STAT_GET_RECOVERY_PREFETCH_COLS]; + + InitMaterializedSRF(fcinfo, 0); + + for (int i = 0; i < PG_STAT_GET_RECOVERY_PREFETCH_COLS; ++i) + nulls[i] = false; + + values[0] = TimestampTzGetDatum(pg_atomic_read_u64(&SharedStats->reset_time)); + values[1] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->prefetch)); + values[2] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->hit)); + values[3] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->skip_init)); + values[4] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->skip_new)); + values[5] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->skip_fpw)); + values[6] = Int64GetDatum(pg_atomic_read_u64(&SharedStats->skip_rep)); + values[7] = Int32GetDatum(SharedStats->wal_distance); + values[8] = Int32GetDatum(SharedStats->block_distance); + values[9] = Int32GetDatum(SharedStats->io_depth); + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + + return (Datum) 0; +} + +/* + * Don't prefetch any blocks >= 'blockno' from a given 'rnode', until 'lsn' + * has been replayed. + */ +static inline void +XLogPrefetcherAddFilter(XLogPrefetcher *prefetcher, RelFileNode rnode, + BlockNumber blockno, XLogRecPtr lsn) +{ + XLogPrefetcherFilter *filter; + bool found; + + filter = hash_search(prefetcher->filter_table, &rnode, HASH_ENTER, &found); + if (!found) + { + /* + * Don't allow any prefetching of this block or higher until replayed. + */ + filter->filter_until_replayed = lsn; + filter->filter_from_block = blockno; + dlist_push_head(&prefetcher->filter_queue, &filter->link); + } + else + { + /* + * We were already filtering this rnode. 
Extend the filter's lifetime + * to cover this WAL record, but leave the lower of the block numbers + * there because we don't want to have to track individual blocks. + */ + filter->filter_until_replayed = lsn; + dlist_delete(&filter->link); + dlist_push_head(&prefetcher->filter_queue, &filter->link); + filter->filter_from_block = Min(filter->filter_from_block, blockno); + } +} + +/* + * Have we replayed any records that caused us to begin filtering a block + * range? That means that relations should have been created, extended or + * dropped as required, so we can stop filtering out accesses to a given + * relfilenode. + */ +static inline void +XLogPrefetcherCompleteFilters(XLogPrefetcher *prefetcher, XLogRecPtr replaying_lsn) +{ + while (unlikely(!dlist_is_empty(&prefetcher->filter_queue))) + { + XLogPrefetcherFilter *filter = dlist_tail_element(XLogPrefetcherFilter, + link, + &prefetcher->filter_queue); + + if (filter->filter_until_replayed >= replaying_lsn) + break; + + dlist_delete(&filter->link); + hash_search(prefetcher->filter_table, filter, HASH_REMOVE, NULL); + } +} + +/* + * Check if a given block should be skipped due to a filter. + */ +static inline bool +XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileNode rnode, + BlockNumber blockno) +{ + /* + * Test for empty queue first, because we expect it to be empty most of + * the time and we can avoid the hash table lookup in that case. + */ + if (unlikely(!dlist_is_empty(&prefetcher->filter_queue))) + { + XLogPrefetcherFilter *filter; + + /* See if the block range is filtered. */ + filter = hash_search(prefetcher->filter_table, &rnode, HASH_FIND, NULL); + if (filter && filter->filter_from_block <= blockno) + { +#ifdef XLOGPREFETCHER_DEBUG_LEVEL + elog(XLOGPREFETCHER_DEBUG_LEVEL, + "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (blocks >= %u filtered)", + rnode.spcNode, rnode.dbNode, rnode.relNode, blockno, + LSN_FORMAT_ARGS(filter->filter_until_replayed), + filter->filter_from_block); +#endif + return true; + } + + /* See if the whole database is filtered. */ + rnode.relNode = InvalidOid; + rnode.spcNode = InvalidOid; + filter = hash_search(prefetcher->filter_table, &rnode, HASH_FIND, NULL); + if (filter) + { +#ifdef XLOGPREFETCHER_DEBUG_LEVEL + elog(XLOGPREFETCHER_DEBUG_LEVEL, + "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (whole database)", + rnode.spcNode, rnode.dbNode, rnode.relNode, blockno, + LSN_FORMAT_ARGS(filter->filter_until_replayed)); +#endif + return true; + } + } + + return false; +} + +/* + * A wrapper for XLogBeginRead() that also resets the prefetcher. + */ +void +XLogPrefetcherBeginRead(XLogPrefetcher *prefetcher, XLogRecPtr recPtr) +{ + /* This will forget about any in-flight IO. */ + prefetcher->reconfigure_count--; + + /* Book-keeping to avoid readahead on first read. */ + prefetcher->begin_ptr = recPtr; + + prefetcher->no_readahead_until = 0; + + /* This will forget about any queued up records in the decoder. */ + XLogBeginRead(prefetcher->reader, recPtr); +} + +/* + * A wrapper for XLogReadRecord() that provides the same interface, but also + * tries to initiate I/O for blocks referenced in future WAL records. + */ +XLogRecord * +XLogPrefetcherReadRecord(XLogPrefetcher *prefetcher, char **errmsg) +{ + DecodedXLogRecord *record; + XLogRecPtr replayed_up_to; + + /* + * See if it's time to reset the prefetching machinery, because a relevant + * GUC was changed. 
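+ *
+ * As a concrete illustration (assuming the stock default of
+ * maintenance_io_concurrency = 10), the queue rebuilt below allows up to
+ * 10 prefetch I/Os in flight while looking ahead across as many as
+ * 10 * XLOGPREFETCHER_DISTANCE_MULTIPLIER = 40 decoded block references.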
+ */ + if (unlikely(XLogPrefetchReconfigureCount != prefetcher->reconfigure_count)) + { + uint32 max_distance; + uint32 max_inflight; + + if (prefetcher->streaming_read) + lrq_free(prefetcher->streaming_read); + + if (RecoveryPrefetchEnabled()) + { + Assert(maintenance_io_concurrency > 0); + max_inflight = maintenance_io_concurrency; + max_distance = max_inflight * XLOGPREFETCHER_DISTANCE_MULTIPLIER; + } + else + { + max_inflight = 1; + max_distance = 1; + } + + prefetcher->streaming_read = lrq_alloc(max_distance, + max_inflight, + (uintptr_t) prefetcher, + XLogPrefetcherNextBlock); + + prefetcher->reconfigure_count = XLogPrefetchReconfigureCount; + } + + /* + * Release last returned record, if there is one, as it's now been + * replayed. + */ + replayed_up_to = XLogReleasePreviousRecord(prefetcher->reader); + + /* + * Can we drop any filters yet? If we were waiting for a relation to be + * created or extended, it is now OK to access blocks in the covered + * range. + */ + XLogPrefetcherCompleteFilters(prefetcher, replayed_up_to); + + /* + * All IO initiated by earlier WAL is now completed. This might trigger + * further prefetching. + */ + lrq_complete_lsn(prefetcher->streaming_read, replayed_up_to); + + /* + * If there's nothing queued yet, then start prefetching to cause at least + * one record to be queued. + */ + if (!XLogReaderHasQueuedRecordOrError(prefetcher->reader)) + { + Assert(lrq_inflight(prefetcher->streaming_read) == 0); + Assert(lrq_completed(prefetcher->streaming_read) == 0); + lrq_prefetch(prefetcher->streaming_read); + } + + /* Read the next record. */ + record = XLogNextRecord(prefetcher->reader, errmsg); + if (!record) + return NULL; + + /* + * The record we just got is the "current" one, for the benefit of the + * XLogRecXXX() macros. + */ + Assert(record == prefetcher->reader->record); + + /* + * If maintenance_io_concurrency is set very low, we might have started + * prefetching some but not all of the blocks referenced in the record + * we're about to return. Forget about the rest of the blocks in this + * record by dropping the prefetcher's reference to it. + */ + if (record == prefetcher->record) + prefetcher->record = NULL; + + /* + * See if it's time to compute some statistics, because enough WAL has + * been processed. + */ + if (unlikely(record->lsn >= prefetcher->next_stats_shm_lsn)) + XLogPrefetcherComputeStats(prefetcher); + + Assert(record == prefetcher->reader->record); + + return &record->header; +} + +bool +check_recovery_prefetch(int *new_value, void **extra, GucSource source) +{ +#ifndef USE_PREFETCH + if (*new_value == RECOVERY_PREFETCH_ON) + { + GUC_check_errdetail("recovery_prefetch is not supported on platforms that lack posix_fadvise()."); + return false; + } +#endif + + return true; +} + +void +assign_recovery_prefetch(int new_value, void *extra) +{ + /* Reconfigure prefetching, because a setting it depends on changed. 
*/ + recovery_prefetch = new_value; + if (AmStartupProcess()) + XLogPrefetchReconfigure(); +} diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c new file mode 100644 index 0000000..c15da9d --- /dev/null +++ b/src/backend/access/transam/xlogreader.c @@ -0,0 +1,2165 @@ +/*------------------------------------------------------------------------- + * + * xlogreader.c + * Generic XLog reading facility + * + * Portions Copyright (c) 2013-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/transam/xlogreader.c + * + * NOTES + * See xlogreader.h for more notes on this facility. + * + * This file is compiled as both front-end and backend code, so it + * may not use ereport, server-defined static variables, etc. + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <unistd.h> +#ifdef USE_LZ4 +#include <lz4.h> +#endif +#ifdef USE_ZSTD +#include <zstd.h> +#endif + +#include "access/transam.h" +#include "access/xlog_internal.h" +#include "access/xlogreader.h" +#include "access/xlogrecord.h" +#include "catalog/pg_control.h" +#include "common/pg_lzcompress.h" +#include "replication/origin.h" + +#ifndef FRONTEND +#include "miscadmin.h" +#include "pgstat.h" +#include "utils/memutils.h" +#else +#include "common/logging.h" +#endif + +static void report_invalid_record(XLogReaderState *state, const char *fmt,...) + pg_attribute_printf(2, 3); +static void allocate_recordbuf(XLogReaderState *state, uint32 reclength); +static int ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, + int reqLen); +static void XLogReaderInvalReadState(XLogReaderState *state); +static XLogPageReadResult XLogDecodeNextRecord(XLogReaderState *state, bool non_blocking); +static bool ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, + XLogRecPtr PrevRecPtr, XLogRecord *record, bool randAccess); +static bool ValidXLogRecord(XLogReaderState *state, XLogRecord *record, + XLogRecPtr recptr); +static void ResetDecoder(XLogReaderState *state); +static void WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, + int segsize, const char *waldir); + +/* size of the buffer allocated for error message. */ +#define MAX_ERRORMSG_LEN 1000 + +/* + * Default size; large enough that typical users of XLogReader won't often need + * to use the 'oversized' memory allocation code path. + */ +#define DEFAULT_DECODE_BUFFER_SIZE (64 * 1024) + +/* + * Construct a string in state->errormsg_buf explaining what's wrong with + * the current record being read. + */ +static void +report_invalid_record(XLogReaderState *state, const char *fmt,...) +{ + va_list args; + + fmt = _(fmt); + + va_start(args, fmt); + vsnprintf(state->errormsg_buf, MAX_ERRORMSG_LEN, fmt, args); + va_end(args); + + state->errormsg_deferred = true; +} + +/* + * Set the size of the decoding buffer. A pointer to a caller supplied memory + * region may also be passed in, in which case non-oversized records will be + * decoded there. + */ +void +XLogReaderSetDecodeBuffer(XLogReaderState *state, void *buffer, size_t size) +{ + Assert(state->decode_buffer == NULL); + + state->decode_buffer = buffer; + state->decode_buffer_size = size; + state->decode_buffer_tail = buffer; + state->decode_buffer_head = buffer; +} + +/* + * Allocate and initialize a new XLogReader. + * + * Returns NULL if the xlogreader couldn't be allocated. 
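+ *
+ * A minimal usage sketch (the callback and variable names here are only
+ * placeholders):
+ *
+ *		reader = XLogReaderAllocate(wal_segment_size, NULL,
+ *									XL_ROUTINE(.page_read = &my_page_read,
+ *											   .segment_open = &my_segment_open,
+ *											   .segment_close = &my_segment_close),
+ *									my_private_data);
+ *		XLogBeginRead(reader, start_lsn);
+ *		while ((record = XLogReadRecord(reader, &errormsg)) != NULL)
+ *			... examine the record ...
+ *		XLogReaderFree(reader);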
+ */ +XLogReaderState * +XLogReaderAllocate(int wal_segment_size, const char *waldir, + XLogReaderRoutine *routine, void *private_data) +{ + XLogReaderState *state; + + state = (XLogReaderState *) + palloc_extended(sizeof(XLogReaderState), + MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO); + if (!state) + return NULL; + + /* initialize caller-provided support functions */ + state->routine = *routine; + + /* + * Permanently allocate readBuf. We do it this way, rather than just + * making a static array, for two reasons: (1) no need to waste the + * storage in most instantiations of the backend; (2) a static char array + * isn't guaranteed to have any particular alignment, whereas + * palloc_extended() will provide MAXALIGN'd storage. + */ + state->readBuf = (char *) palloc_extended(XLOG_BLCKSZ, + MCXT_ALLOC_NO_OOM); + if (!state->readBuf) + { + pfree(state); + return NULL; + } + + /* Initialize segment info. */ + WALOpenSegmentInit(&state->seg, &state->segcxt, wal_segment_size, + waldir); + + /* system_identifier initialized to zeroes above */ + state->private_data = private_data; + /* ReadRecPtr, EndRecPtr and readLen initialized to zeroes above */ + state->errormsg_buf = palloc_extended(MAX_ERRORMSG_LEN + 1, + MCXT_ALLOC_NO_OOM); + if (!state->errormsg_buf) + { + pfree(state->readBuf); + pfree(state); + return NULL; + } + state->errormsg_buf[0] = '\0'; + + /* + * Allocate an initial readRecordBuf of minimal size, which can later be + * enlarged if necessary. + */ + allocate_recordbuf(state, 0); + return state; +} + +void +XLogReaderFree(XLogReaderState *state) +{ + if (state->seg.ws_file != -1) + state->routine.segment_close(state); + + if (state->decode_buffer && state->free_decode_buffer) + pfree(state->decode_buffer); + + pfree(state->errormsg_buf); + if (state->readRecordBuf) + pfree(state->readRecordBuf); + pfree(state->readBuf); + pfree(state); +} + +/* + * Allocate readRecordBuf to fit a record of at least the given length. + * + * readRecordBufSize is set to the new buffer size. + * + * To avoid useless small increases, round its size to a multiple of + * XLOG_BLCKSZ, and make sure it's at least 5*Max(BLCKSZ, XLOG_BLCKSZ) to start + * with. (That is enough for all "normal" records, but very large commit or + * abort records might need more space.) + * + * Note: This routine should *never* be called for xl_tot_len until the header + * of the record has been fully validated. + */ +static void +allocate_recordbuf(XLogReaderState *state, uint32 reclength) +{ + uint32 newSize = reclength; + + newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ); + newSize = Max(newSize, 5 * Max(BLCKSZ, XLOG_BLCKSZ)); + + if (state->readRecordBuf) + pfree(state->readRecordBuf); + state->readRecordBuf = (char *) palloc(newSize); + state->readRecordBufSize = newSize; +} + +/* + * Initialize the passed segment structs. + */ +static void +WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, + int segsize, const char *waldir) +{ + seg->ws_file = -1; + seg->ws_segno = 0; + seg->ws_tli = 0; + + segcxt->ws_segsize = segsize; + if (waldir) + snprintf(segcxt->ws_dir, MAXPGPATH, "%s", waldir); +} + +/* + * Begin reading WAL at 'RecPtr'. + * + * 'RecPtr' should point to the beginning of a valid WAL record. Pointing at + * the beginning of a page is also OK, if there is a new record right after + * the page header, i.e. not a continuation. + * + * This does not make any attempt to read the WAL yet, and hence cannot fail. 
+ * If the starting address is not correct, the first call to XLogReadRecord() + * will error out. + */ +void +XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr) +{ + Assert(!XLogRecPtrIsInvalid(RecPtr)); + + ResetDecoder(state); + + /* Begin at the passed-in record pointer. */ + state->EndRecPtr = RecPtr; + state->NextRecPtr = RecPtr; + state->ReadRecPtr = InvalidXLogRecPtr; + state->DecodeRecPtr = InvalidXLogRecPtr; +} + +/* + * Release the last record that was returned by XLogNextRecord(), if any, to + * free up space. Returns the LSN past the end of the record. + */ +XLogRecPtr +XLogReleasePreviousRecord(XLogReaderState *state) +{ + DecodedXLogRecord *record; + XLogRecPtr next_lsn; + + if (!state->record) + return InvalidXLogRecPtr; + + /* + * Remove it from the decoded record queue. It must be the oldest item + * decoded, decode_queue_head. + */ + record = state->record; + next_lsn = record->next_lsn; + Assert(record == state->decode_queue_head); + state->record = NULL; + state->decode_queue_head = record->next; + + /* It might also be the newest item decoded, decode_queue_tail. */ + if (state->decode_queue_tail == record) + state->decode_queue_tail = NULL; + + /* Release the space. */ + if (unlikely(record->oversized)) + { + /* It's not in the decode buffer, so free it to release space. */ + pfree(record); + } + else + { + /* It must be the head (oldest) record in the decode buffer. */ + Assert(state->decode_buffer_head == (char *) record); + + /* + * We need to update head to point to the next record that is in the + * decode buffer, if any, being careful to skip oversized ones + * (they're not in the decode buffer). + */ + record = record->next; + while (unlikely(record && record->oversized)) + record = record->next; + + if (record) + { + /* Adjust head to release space up to the next record. */ + state->decode_buffer_head = (char *) record; + } + else + { + /* + * Otherwise we might as well just reset head and tail to the + * start of the buffer space, because we're empty. This means + * we'll keep overwriting the same piece of memory if we're not + * doing any prefetching. + */ + state->decode_buffer_head = state->decode_buffer; + state->decode_buffer_tail = state->decode_buffer; + } + } + + return next_lsn; +} + +/* + * Attempt to read an XLOG record. + * + * XLogBeginRead() or XLogFindNextRecord() and then XLogReadAhead() must be + * called before the first call to XLogNextRecord(). This functions returns + * records and errors that were put into an internal queue by XLogReadAhead(). + * + * On success, a record is returned. + * + * The returned record (or *errormsg) points to an internal buffer that's + * valid until the next call to XLogNextRecord. + */ +DecodedXLogRecord * +XLogNextRecord(XLogReaderState *state, char **errormsg) +{ + /* Release the last record returned by XLogNextRecord(). */ + XLogReleasePreviousRecord(state); + + if (state->decode_queue_head == NULL) + { + *errormsg = NULL; + if (state->errormsg_deferred) + { + if (state->errormsg_buf[0] != '\0') + *errormsg = state->errormsg_buf; + state->errormsg_deferred = false; + } + + /* + * state->EndRecPtr is expected to have been set by the last call to + * XLogBeginRead() or XLogNextRecord(), and is the location of the + * error. + */ + Assert(!XLogRecPtrIsInvalid(state->EndRecPtr)); + + return NULL; + } + + /* + * Record this as the most recent record returned, so that we'll release + * it next time. 
This also exposes it to the traditional + * XLogRecXXX(xlogreader) macros, which work with the decoder rather than + * the record for historical reasons. + */ + state->record = state->decode_queue_head; + + /* + * Update the pointers to the beginning and one-past-the-end of this + * record, again for the benefit of historical code that expected the + * decoder to track this rather than accessing these fields of the record + * itself. + */ + state->ReadRecPtr = state->record->lsn; + state->EndRecPtr = state->record->next_lsn; + + *errormsg = NULL; + + return state->record; +} + +/* + * Attempt to read an XLOG record. + * + * XLogBeginRead() or XLogFindNextRecord() must be called before the first call + * to XLogReadRecord(). + * + * If the page_read callback fails to read the requested data, NULL is + * returned. The callback is expected to have reported the error; errormsg + * is set to NULL. + * + * If the reading fails for some other reason, NULL is also returned, and + * *errormsg is set to a string with details of the failure. + * + * The returned pointer (or *errormsg) points to an internal buffer that's + * valid until the next call to XLogReadRecord. + */ +XLogRecord * +XLogReadRecord(XLogReaderState *state, char **errormsg) +{ + DecodedXLogRecord *decoded; + + /* + * Release last returned record, if there is one. We need to do this so + * that we can check for empty decode queue accurately. + */ + XLogReleasePreviousRecord(state); + + /* + * Call XLogReadAhead() in blocking mode to make sure there is something + * in the queue, though we don't use the result. + */ + if (!XLogReaderHasQueuedRecordOrError(state)) + XLogReadAhead(state, false /* nonblocking */ ); + + /* Consume the head record or error. */ + decoded = XLogNextRecord(state, errormsg); + if (decoded) + { + /* + * This function returns a pointer to the record's header, not the + * actual decoded record. The caller will access the decoded record + * through the XLogRecGetXXX() macros, which reach the decoded + * recorded as xlogreader->record. + */ + Assert(state->record == decoded); + return &decoded->header; + } + + return NULL; +} + +/* + * Allocate space for a decoded record. The only member of the returned + * object that is initialized is the 'oversized' flag, indicating that the + * decoded record wouldn't fit in the decode buffer and must eventually be + * freed explicitly. + * + * The caller is responsible for adjusting decode_buffer_tail with the real + * size after successfully decoding a record into this space. This way, if + * decoding fails, then there is nothing to undo unless the 'oversized' flag + * was set and pfree() must be called. + * + * Return NULL if there is no space in the decode buffer and allow_oversized + * is false, or if memory allocation fails for an oversized buffer. + */ +static DecodedXLogRecord * +XLogReadRecordAlloc(XLogReaderState *state, size_t xl_tot_len, bool allow_oversized) +{ + size_t required_space = DecodeXLogRecordRequiredSpace(xl_tot_len); + DecodedXLogRecord *decoded = NULL; + + /* Allocate a circular decode buffer if we don't have one already. */ + if (unlikely(state->decode_buffer == NULL)) + { + if (state->decode_buffer_size == 0) + state->decode_buffer_size = DEFAULT_DECODE_BUFFER_SIZE; + state->decode_buffer = palloc(state->decode_buffer_size); + state->decode_buffer_head = state->decode_buffer; + state->decode_buffer_tail = state->decode_buffer; + state->free_decode_buffer = true; + } + + /* Try to allocate space in the circular decode buffer. 
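+ * A rough picture of the ring handled by the cases below ('#' is space
+ * holding decoded records, '.' is free space):
+ *
+ *		tail >= head:	[....head####tail........]
+ *						try the free space after tail, else wrap around
+ *						and try the free space before head;
+ *		tail <  head:	[####tail........head####]
+ *						try the free space between tail and head.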
*/ + if (state->decode_buffer_tail >= state->decode_buffer_head) + { + /* Empty, or tail is to the right of head. */ + if (state->decode_buffer_tail + required_space <= + state->decode_buffer + state->decode_buffer_size) + { + /* There is space between tail and end. */ + decoded = (DecodedXLogRecord *) state->decode_buffer_tail; + decoded->oversized = false; + return decoded; + } + else if (state->decode_buffer + required_space < + state->decode_buffer_head) + { + /* There is space between start and head. */ + decoded = (DecodedXLogRecord *) state->decode_buffer; + decoded->oversized = false; + return decoded; + } + } + else + { + /* Tail is to the left of head. */ + if (state->decode_buffer_tail + required_space < + state->decode_buffer_head) + { + /* There is space between tail and head. */ + decoded = (DecodedXLogRecord *) state->decode_buffer_tail; + decoded->oversized = false; + return decoded; + } + } + + /* Not enough space in the decode buffer. Are we allowed to allocate? */ + if (allow_oversized) + { + decoded = palloc(required_space); + decoded->oversized = true; + return decoded; + } + + return NULL; +} + +static XLogPageReadResult +XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) +{ + XLogRecPtr RecPtr; + XLogRecord *record; + XLogRecPtr targetPagePtr; + bool randAccess; + uint32 len, + total_len; + uint32 targetRecOff; + uint32 pageHeaderSize; + bool assembled; + bool gotheader; + int readOff; + DecodedXLogRecord *decoded; + char *errormsg; /* not used */ + + /* + * randAccess indicates whether to verify the previous-record pointer of + * the record we're reading. We only do this if we're reading + * sequentially, which is what we initially assume. + */ + randAccess = false; + + /* reset error state */ + state->errormsg_buf[0] = '\0'; + decoded = NULL; + + state->abortedRecPtr = InvalidXLogRecPtr; + state->missingContrecPtr = InvalidXLogRecPtr; + + RecPtr = state->NextRecPtr; + + if (state->DecodeRecPtr != InvalidXLogRecPtr) + { + /* read the record after the one we just read */ + + /* + * NextRecPtr is pointing to end+1 of the previous WAL record. If + * we're at a page boundary, no more records can fit on the current + * page. We must skip over the page header, but we can't do that until + * we've read in the page, since the header size is variable. + */ + } + else + { + /* + * Caller supplied a position to start at. + * + * In this case, NextRecPtr should already be pointing to a valid + * record starting position. + */ + Assert(XRecOffIsValid(RecPtr)); + randAccess = true; + } + +restart: + state->nonblocking = nonblocking; + state->currRecPtr = RecPtr; + assembled = false; + + targetPagePtr = RecPtr - (RecPtr % XLOG_BLCKSZ); + targetRecOff = RecPtr % XLOG_BLCKSZ; + + /* + * Read the page containing the record into state->readBuf. Request enough + * byte to cover the whole record header, or at least the part of it that + * fits on the same page. + */ + readOff = ReadPageInternal(state, targetPagePtr, + Min(targetRecOff + SizeOfXLogRecord, XLOG_BLCKSZ)); + if (readOff == XLREAD_WOULDBLOCK) + return XLREAD_WOULDBLOCK; + else if (readOff < 0) + goto err; + + /* + * ReadPageInternal always returns at least the page header, so we can + * examine it now. + */ + pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf); + if (targetRecOff == 0) + { + /* + * At page start, so skip over page header. 
+ */ + RecPtr += pageHeaderSize; + targetRecOff = pageHeaderSize; + } + else if (targetRecOff < pageHeaderSize) + { + report_invalid_record(state, "invalid record offset at %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + if ((((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && + targetRecOff == pageHeaderSize) + { + report_invalid_record(state, "contrecord is requested by %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + /* ReadPageInternal has verified the page header */ + Assert(pageHeaderSize <= readOff); + + /* + * Read the record length. + * + * NB: Even though we use an XLogRecord pointer here, the whole record + * header might not fit on this page. xl_tot_len is the first field of the + * struct, so it must be on this page (the records are MAXALIGNed), but we + * cannot access any other fields until we've verified that we got the + * whole header. + */ + record = (XLogRecord *) (state->readBuf + RecPtr % XLOG_BLCKSZ); + total_len = record->xl_tot_len; + + /* + * If the whole record header is on this page, validate it immediately. + * Otherwise do just a basic sanity check on xl_tot_len, and validate the + * rest of the header after reading it from the next page. The xl_tot_len + * check is necessary here to ensure that we enter the "Need to reassemble + * record" code path below; otherwise we might fail to apply + * ValidXLogRecordHeader at all. + */ + if (targetRecOff <= XLOG_BLCKSZ - SizeOfXLogRecord) + { + if (!ValidXLogRecordHeader(state, RecPtr, state->DecodeRecPtr, record, + randAccess)) + goto err; + gotheader = true; + } + else + { + /* There may be no next page if it's too small. */ + if (total_len < SizeOfXLogRecord) + { + report_invalid_record(state, + "invalid record length at %X/%X: wanted %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + (uint32) SizeOfXLogRecord, total_len); + goto err; + } + /* We'll validate the header once we have the next page. */ + gotheader = false; + } + + /* + * Try to find space to decode this record, if we can do so without + * calling palloc. If we can't, we'll try again below after we've + * validated that total_len isn't garbage bytes from a recycled WAL page. + */ + decoded = XLogReadRecordAlloc(state, + total_len, + false /* allow_oversized */ ); + if (decoded == NULL && nonblocking) + { + /* + * There is no space in the circular decode buffer, and the caller is + * only reading ahead. The caller should consume existing records to + * make space. + */ + return XLREAD_WOULDBLOCK; + } + + len = XLOG_BLCKSZ - RecPtr % XLOG_BLCKSZ; + if (total_len > len) + { + /* Need to reassemble record */ + char *contdata; + XLogPageHeader pageHeader; + char *buffer; + uint32 gotlen; + + assembled = true; + + /* + * We always have space for a couple of pages, enough to validate a + * boundary-spanning record header. + */ + Assert(state->readRecordBufSize >= XLOG_BLCKSZ * 2); + Assert(state->readRecordBufSize >= len); + + /* Copy the first fragment of the record from the first page. 
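+ * The loop below then appends the record's continuation fragments from the
+ * following page(s) into readRecordBuf, growing that buffer if the total
+ * record length exceeds its current size, until the whole record has been
+ * reassembled and its header and CRC can be validated.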
*/ + memcpy(state->readRecordBuf, + state->readBuf + RecPtr % XLOG_BLCKSZ, len); + buffer = state->readRecordBuf + len; + gotlen = len; + + do + { + /* Calculate pointer to beginning of next page */ + targetPagePtr += XLOG_BLCKSZ; + + /* Wait for the next page to become available */ + readOff = ReadPageInternal(state, targetPagePtr, + Min(total_len - gotlen + SizeOfXLogShortPHD, + XLOG_BLCKSZ)); + + if (readOff == XLREAD_WOULDBLOCK) + return XLREAD_WOULDBLOCK; + else if (readOff < 0) + goto err; + + Assert(SizeOfXLogShortPHD <= readOff); + + pageHeader = (XLogPageHeader) state->readBuf; + + /* + * If we were expecting a continuation record and got an + * "overwrite contrecord" flag, that means the continuation record + * was overwritten with a different record. Restart the read by + * assuming the address to read is the location where we found + * this flag; but keep track of the LSN of the record we were + * reading, for later verification. + */ + if (pageHeader->xlp_info & XLP_FIRST_IS_OVERWRITE_CONTRECORD) + { + state->overwrittenRecPtr = RecPtr; + RecPtr = targetPagePtr; + goto restart; + } + + /* Check that the continuation on next page looks valid */ + if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) + { + report_invalid_record(state, + "there is no contrecord flag at %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + /* + * Cross-check that xlp_rem_len agrees with how much of the record + * we expect there to be left. + */ + if (pageHeader->xlp_rem_len == 0 || + total_len != (pageHeader->xlp_rem_len + gotlen)) + { + report_invalid_record(state, + "invalid contrecord length %u (expected %lld) at %X/%X", + pageHeader->xlp_rem_len, + ((long long) total_len) - gotlen, + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + /* Append the continuation from this page to the buffer */ + pageHeaderSize = XLogPageHeaderSize(pageHeader); + + if (readOff < pageHeaderSize) + readOff = ReadPageInternal(state, targetPagePtr, + pageHeaderSize); + + Assert(pageHeaderSize <= readOff); + + contdata = (char *) state->readBuf + pageHeaderSize; + len = XLOG_BLCKSZ - pageHeaderSize; + if (pageHeader->xlp_rem_len < len) + len = pageHeader->xlp_rem_len; + + if (readOff < pageHeaderSize + len) + readOff = ReadPageInternal(state, targetPagePtr, + pageHeaderSize + len); + + memcpy(buffer, (char *) contdata, len); + buffer += len; + gotlen += len; + + /* If we just reassembled the record header, validate it. */ + if (!gotheader) + { + record = (XLogRecord *) state->readRecordBuf; + if (!ValidXLogRecordHeader(state, RecPtr, state->DecodeRecPtr, + record, randAccess)) + goto err; + gotheader = true; + } + + /* + * We might need a bigger buffer. We have validated the record + * header, in the case that it split over a page boundary. We've + * also cross-checked total_len against xlp_rem_len on the second + * page, and verified xlp_pageaddr on both. + */ + if (total_len > state->readRecordBufSize) + { + char save_copy[XLOG_BLCKSZ * 2]; + + /* + * Save and restore the data we already had. It can't be more + * than two pages. 
+ */
+ Assert(gotlen <= lengthof(save_copy));
+ Assert(gotlen <= state->readRecordBufSize);
+ memcpy(save_copy, state->readRecordBuf, gotlen);
+ allocate_recordbuf(state, total_len);
+ memcpy(state->readRecordBuf, save_copy, gotlen);
+ buffer = state->readRecordBuf + gotlen;
+ }
+ } while (gotlen < total_len);
+ Assert(gotheader);
+
+ record = (XLogRecord *) state->readRecordBuf;
+ if (!ValidXLogRecord(state, record, RecPtr))
+ goto err;
+
+ pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf);
+ state->DecodeRecPtr = RecPtr;
+ state->NextRecPtr = targetPagePtr + pageHeaderSize +
+ MAXALIGN(pageHeader->xlp_rem_len);
+ }
+ else
+ {
+ /* Wait for the record data to become available */
+ readOff = ReadPageInternal(state, targetPagePtr,
+ Min(targetRecOff + total_len, XLOG_BLCKSZ));
+ if (readOff == XLREAD_WOULDBLOCK)
+ return XLREAD_WOULDBLOCK;
+ else if (readOff < 0)
+ goto err;
+
+ /* Record does not cross a page boundary */
+ if (!ValidXLogRecord(state, record, RecPtr))
+ goto err;
+
+ state->NextRecPtr = RecPtr + MAXALIGN(total_len);
+
+ state->DecodeRecPtr = RecPtr;
+ }
+
+ /*
+ * Special processing if it's an XLOG SWITCH record
+ */
+ if (record->xl_rmid == RM_XLOG_ID &&
+ (record->xl_info & ~XLR_INFO_MASK) == XLOG_SWITCH)
+ {
+ /* Pretend it extends to end of segment */
+ state->NextRecPtr += state->segcxt.ws_segsize - 1;
+ state->NextRecPtr -= XLogSegmentOffset(state->NextRecPtr, state->segcxt.ws_segsize);
+ }
+
+ /*
+ * If we got here without a DecodedXLogRecord, it means we needed to
+ * validate total_len before trusting it, but by now we've done that.
+ */
+ if (decoded == NULL)
+ {
+ Assert(!nonblocking);
+ decoded = XLogReadRecordAlloc(state,
+ total_len,
+ true /* allow_oversized */ );
+ /* allocation should always happen under allow_oversized */
+ Assert(decoded != NULL);
+ }
+
+ if (DecodeXLogRecord(state, decoded, record, RecPtr, &errormsg))
+ {
+ /* Record the location of the next record. */
+ decoded->next_lsn = state->NextRecPtr;
+
+ /*
+ * If it's in the decode buffer, mark the decode buffer space as
+ * occupied.
+ */
+ if (!decoded->oversized)
+ {
+ /* The new decode buffer head must be MAXALIGNed. */
+ Assert(decoded->size == MAXALIGN(decoded->size));
+ if ((char *) decoded == state->decode_buffer)
+ state->decode_buffer_tail = state->decode_buffer + decoded->size;
+ else
+ state->decode_buffer_tail += decoded->size;
+ }
+
+ /* Insert it into the queue of decoded records. */
+ Assert(state->decode_queue_tail != decoded);
+ if (state->decode_queue_tail)
+ state->decode_queue_tail->next = decoded;
+ state->decode_queue_tail = decoded;
+ if (!state->decode_queue_head)
+ state->decode_queue_head = decoded;
+ return XLREAD_SUCCESS;
+ }
+
+err:
+ if (assembled)
+ {
+ /*
+ * We get here when a record that spans multiple pages needs to be
+ * assembled, but something went wrong -- perhaps a contrecord piece
+ * was lost. If caller is WAL replay, it will know where the aborted
+ * record was and where to direct followup WAL to be written, marking
+ * the next piece with XLP_FIRST_IS_OVERWRITE_CONTRECORD, which will
+ * in turn signal downstream WAL consumers that the broken WAL record
+ * is to be ignored.
+ */
+ state->abortedRecPtr = RecPtr;
+ state->missingContrecPtr = targetPagePtr;
+
+ /*
+ * If we got here without reporting an error, make sure an error is
+ * queued so that XLogPrefetcherReadRecord() doesn't bring us back a
+ * second time and clobber the above state.
+ */ + state->errormsg_deferred = true; + } + + if (decoded && decoded->oversized) + pfree(decoded); + + /* + * Invalidate the read state. We might read from a different source after + * failure. + */ + XLogReaderInvalReadState(state); + + /* + * If an error was written to errmsg_buf, it'll be returned to the caller + * of XLogReadRecord() after all successfully decoded records from the + * read queue. + */ + + return XLREAD_FAIL; +} + +/* + * Try to decode the next available record, and return it. The record will + * also be returned to XLogNextRecord(), which must be called to 'consume' + * each record. + * + * If nonblocking is true, may return NULL due to lack of data or WAL decoding + * space. + */ +DecodedXLogRecord * +XLogReadAhead(XLogReaderState *state, bool nonblocking) +{ + XLogPageReadResult result; + + if (state->errormsg_deferred) + return NULL; + + result = XLogDecodeNextRecord(state, nonblocking); + if (result == XLREAD_SUCCESS) + { + Assert(state->decode_queue_tail != NULL); + return state->decode_queue_tail; + } + + return NULL; +} + +/* + * Read a single xlog page including at least [pageptr, reqLen] of valid data + * via the page_read() callback. + * + * Returns XLREAD_FAIL if the required page cannot be read for some + * reason; errormsg_buf is set in that case (unless the error occurs in the + * page_read callback). + * + * Returns XLREAD_WOULDBLOCK if the requested data can't be read without + * waiting. This can be returned only if the installed page_read callback + * respects the state->nonblocking flag, and cannot read the requested data + * immediately. + * + * We fetch the page from a reader-local cache if we know we have the required + * data and if there hasn't been any error since caching the data. + */ +static int +ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) +{ + int readLen; + uint32 targetPageOff; + XLogSegNo targetSegNo; + XLogPageHeader hdr; + + Assert((pageptr % XLOG_BLCKSZ) == 0); + + XLByteToSeg(pageptr, targetSegNo, state->segcxt.ws_segsize); + targetPageOff = XLogSegmentOffset(pageptr, state->segcxt.ws_segsize); + + /* check whether we have all the requested data already */ + if (targetSegNo == state->seg.ws_segno && + targetPageOff == state->segoff && reqLen <= state->readLen) + return state->readLen; + + /* + * Invalidate contents of internal buffer before read attempt. Just set + * the length to 0, rather than a full XLogReaderInvalReadState(), so we + * don't forget the segment we last successfully read. + */ + state->readLen = 0; + + /* + * Data is not in our buffer. + * + * Every time we actually read the segment, even if we looked at parts of + * it before, we need to do verification as the page_read callback might + * now be rereading data from a different source. + * + * Whenever switching to a new WAL segment, we read the first page of the + * file and validate its header, even if that's not where the target + * record is. This is so that we can check the additional identification + * info that is present in the first page's "long" header. 
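+ * (The long header carries the system identifier, the WAL segment size and
+ * XLOG_BLCKSZ, which XLogReaderValidatePageHeader() checks against this
+ * reader's expectations.)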
+ */ + if (targetSegNo != state->seg.ws_segno && targetPageOff != 0) + { + XLogRecPtr targetSegmentPtr = pageptr - targetPageOff; + + readLen = state->routine.page_read(state, targetSegmentPtr, XLOG_BLCKSZ, + state->currRecPtr, + state->readBuf); + if (readLen == XLREAD_WOULDBLOCK) + return XLREAD_WOULDBLOCK; + else if (readLen < 0) + goto err; + + /* we can be sure to have enough WAL available, we scrolled back */ + Assert(readLen == XLOG_BLCKSZ); + + if (!XLogReaderValidatePageHeader(state, targetSegmentPtr, + state->readBuf)) + goto err; + } + + /* + * First, read the requested data length, but at least a short page header + * so that we can validate it. + */ + readLen = state->routine.page_read(state, pageptr, Max(reqLen, SizeOfXLogShortPHD), + state->currRecPtr, + state->readBuf); + if (readLen == XLREAD_WOULDBLOCK) + return XLREAD_WOULDBLOCK; + else if (readLen < 0) + goto err; + + Assert(readLen <= XLOG_BLCKSZ); + + /* Do we have enough data to check the header length? */ + if (readLen <= SizeOfXLogShortPHD) + goto err; + + Assert(readLen >= reqLen); + + hdr = (XLogPageHeader) state->readBuf; + + /* still not enough */ + if (readLen < XLogPageHeaderSize(hdr)) + { + readLen = state->routine.page_read(state, pageptr, XLogPageHeaderSize(hdr), + state->currRecPtr, + state->readBuf); + if (readLen == XLREAD_WOULDBLOCK) + return XLREAD_WOULDBLOCK; + else if (readLen < 0) + goto err; + } + + /* + * Now that we know we have the full header, validate it. + */ + if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr)) + goto err; + + /* update read state information */ + state->seg.ws_segno = targetSegNo; + state->segoff = targetPageOff; + state->readLen = readLen; + + return readLen; + +err: + XLogReaderInvalReadState(state); + + return XLREAD_FAIL; +} + +/* + * Invalidate the xlogreader's read state to force a re-read. + */ +static void +XLogReaderInvalReadState(XLogReaderState *state) +{ + state->seg.ws_segno = 0; + state->segoff = 0; + state->readLen = 0; +} + +/* + * Validate an XLOG record header. + * + * This is just a convenience subroutine to avoid duplicated code in + * XLogReadRecord. It's not intended for use from anywhere else. + */ +static bool +ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, + XLogRecPtr PrevRecPtr, XLogRecord *record, + bool randAccess) +{ + if (record->xl_tot_len < SizeOfXLogRecord) + { + report_invalid_record(state, + "invalid record length at %X/%X: wanted %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + (uint32) SizeOfXLogRecord, record->xl_tot_len); + return false; + } + if (!RmgrIdIsValid(record->xl_rmid)) + { + report_invalid_record(state, + "invalid resource manager ID %u at %X/%X", + record->xl_rmid, LSN_FORMAT_ARGS(RecPtr)); + return false; + } + if (randAccess) + { + /* + * We can't exactly verify the prev-link, but surely it should be less + * than the record's own address. + */ + if (!(record->xl_prev < RecPtr)) + { + report_invalid_record(state, + "record with incorrect prev-link %X/%X at %X/%X", + LSN_FORMAT_ARGS(record->xl_prev), + LSN_FORMAT_ARGS(RecPtr)); + return false; + } + } + else + { + /* + * Record's prev-link should exactly match our previous location. This + * check guards against torn WAL pages where a stale but valid-looking + * WAL record starts on a sector boundary. 
+ */ + if (record->xl_prev != PrevRecPtr) + { + report_invalid_record(state, + "record with incorrect prev-link %X/%X at %X/%X", + LSN_FORMAT_ARGS(record->xl_prev), + LSN_FORMAT_ARGS(RecPtr)); + return false; + } + } + + return true; +} + + +/* + * CRC-check an XLOG record. We do not believe the contents of an XLOG + * record (other than to the minimal extent of computing the amount of + * data to read in) until we've checked the CRCs. + * + * We assume all of the record (that is, xl_tot_len bytes) has been read + * into memory at *record. Also, ValidXLogRecordHeader() has accepted the + * record's header, which means in particular that xl_tot_len is at least + * SizeOfXLogRecord. + */ +static bool +ValidXLogRecord(XLogReaderState *state, XLogRecord *record, XLogRecPtr recptr) +{ + pg_crc32c crc; + + Assert(record->xl_tot_len >= SizeOfXLogRecord); + + /* Calculate the CRC */ + INIT_CRC32C(crc); + COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord); + /* include the record header last */ + COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc)); + FIN_CRC32C(crc); + + if (!EQ_CRC32C(record->xl_crc, crc)) + { + report_invalid_record(state, + "incorrect resource manager data checksum in record at %X/%X", + LSN_FORMAT_ARGS(recptr)); + return false; + } + + return true; +} + +/* + * Validate a page header. + * + * Check if 'phdr' is valid as the header of the XLog page at position + * 'recptr'. + */ +bool +XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, + char *phdr) +{ + XLogRecPtr recaddr; + XLogSegNo segno; + int32 offset; + XLogPageHeader hdr = (XLogPageHeader) phdr; + + Assert((recptr % XLOG_BLCKSZ) == 0); + + XLByteToSeg(recptr, segno, state->segcxt.ws_segsize); + offset = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); + + XLogSegNoOffsetToRecPtr(segno, offset, state->segcxt.ws_segsize, recaddr); + + if (hdr->xlp_magic != XLOG_PAGE_MAGIC) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + + report_invalid_record(state, + "invalid magic number %04X in log segment %s, offset %u", + hdr->xlp_magic, + fname, + offset); + return false; + } + + if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + + report_invalid_record(state, + "invalid info bits %04X in log segment %s, offset %u", + hdr->xlp_info, + fname, + offset); + return false; + } + + if (hdr->xlp_info & XLP_LONG_HEADER) + { + XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr; + + if (state->system_identifier && + longhdr->xlp_sysid != state->system_identifier) + { + report_invalid_record(state, + "WAL file is from different database system: WAL file database system identifier is %llu, pg_control database system identifier is %llu", + (unsigned long long) longhdr->xlp_sysid, + (unsigned long long) state->system_identifier); + return false; + } + else if (longhdr->xlp_seg_size != state->segcxt.ws_segsize) + { + report_invalid_record(state, + "WAL file is from different database system: incorrect segment size in page header"); + return false; + } + else if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ) + { + report_invalid_record(state, + "WAL file is from different database system: incorrect XLOG_BLCKSZ in page header"); + return false; + } + } + else if (offset == 0) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + + /* hmm, first page of file 
doesn't have a long header? */ + report_invalid_record(state, + "invalid info bits %04X in log segment %s, offset %u", + hdr->xlp_info, + fname, + offset); + return false; + } + + /* + * Check that the address on the page agrees with what we expected. This + * check typically fails when an old WAL segment is recycled, and hasn't + * yet been overwritten with new data yet. + */ + if (hdr->xlp_pageaddr != recaddr) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + + report_invalid_record(state, + "unexpected pageaddr %X/%X in log segment %s, offset %u", + LSN_FORMAT_ARGS(hdr->xlp_pageaddr), + fname, + offset); + return false; + } + + /* + * Since child timelines are always assigned a TLI greater than their + * immediate parent's TLI, we should never see TLI go backwards across + * successive pages of a consistent WAL sequence. + * + * Sometimes we re-read a segment that's already been (partially) read. So + * we only verify TLIs for pages that are later than the last remembered + * LSN. + */ + if (recptr > state->latestPagePtr) + { + if (hdr->xlp_tli < state->latestPageTLI) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + + report_invalid_record(state, + "out-of-sequence timeline ID %u (after %u) in log segment %s, offset %u", + hdr->xlp_tli, + state->latestPageTLI, + fname, + offset); + return false; + } + } + state->latestPagePtr = recptr; + state->latestPageTLI = hdr->xlp_tli; + + return true; +} + +/* + * Forget about an error produced by XLogReaderValidatePageHeader(). + */ +void +XLogReaderResetError(XLogReaderState *state) +{ + state->errormsg_buf[0] = '\0'; + state->errormsg_deferred = false; +} + +/* + * Find the first record with an lsn >= RecPtr. + * + * This is different from XLogBeginRead() in that RecPtr doesn't need to point + * to a valid record boundary. Useful for checking whether RecPtr is a valid + * xlog address for reading, and to find the first valid address after some + * address when dumping records for debugging purposes. + * + * This positions the reader, like XLogBeginRead(), so that the next call to + * XLogReadRecord() will read the next valid record. + */ +XLogRecPtr +XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr) +{ + XLogRecPtr tmpRecPtr; + XLogRecPtr found = InvalidXLogRecPtr; + XLogPageHeader header; + char *errormsg; + + Assert(!XLogRecPtrIsInvalid(RecPtr)); + + /* Make sure ReadPageInternal() can't return XLREAD_WOULDBLOCK. */ + state->nonblocking = false; + + /* + * skip over potential continuation data, keeping in mind that it may span + * multiple pages + */ + tmpRecPtr = RecPtr; + while (true) + { + XLogRecPtr targetPagePtr; + int targetRecOff; + uint32 pageHeaderSize; + int readLen; + + /* + * Compute targetRecOff. It should typically be equal or greater than + * short page-header since a valid record can't start anywhere before + * that, except when caller has explicitly specified the offset that + * falls somewhere there or when we are skipping multi-page + * continuation record. 
It doesn't matter though because + * ReadPageInternal() is prepared to handle that and will read at + * least short page-header worth of data + */ + targetRecOff = tmpRecPtr % XLOG_BLCKSZ; + + /* scroll back to page boundary */ + targetPagePtr = tmpRecPtr - targetRecOff; + + /* Read the page containing the record */ + readLen = ReadPageInternal(state, targetPagePtr, targetRecOff); + if (readLen < 0) + goto err; + + header = (XLogPageHeader) state->readBuf; + + pageHeaderSize = XLogPageHeaderSize(header); + + /* make sure we have enough data for the page header */ + readLen = ReadPageInternal(state, targetPagePtr, pageHeaderSize); + if (readLen < 0) + goto err; + + /* skip over potential continuation data */ + if (header->xlp_info & XLP_FIRST_IS_CONTRECORD) + { + /* + * If the length of the remaining continuation data is more than + * what can fit in this page, the continuation record crosses over + * this page. Read the next page and try again. xlp_rem_len in the + * next page header will contain the remaining length of the + * continuation data + * + * Note that record headers are MAXALIGN'ed + */ + if (MAXALIGN(header->xlp_rem_len) >= (XLOG_BLCKSZ - pageHeaderSize)) + tmpRecPtr = targetPagePtr + XLOG_BLCKSZ; + else + { + /* + * The previous continuation record ends in this page. Set + * tmpRecPtr to point to the first valid record + */ + tmpRecPtr = targetPagePtr + pageHeaderSize + + MAXALIGN(header->xlp_rem_len); + break; + } + } + else + { + tmpRecPtr = targetPagePtr + pageHeaderSize; + break; + } + } + + /* + * we know now that tmpRecPtr is an address pointing to a valid XLogRecord + * because either we're at the first record after the beginning of a page + * or we just jumped over the remaining data of a continuation. + */ + XLogBeginRead(state, tmpRecPtr); + while (XLogReadRecord(state, &errormsg) != NULL) + { + /* past the record we've found, break out */ + if (RecPtr <= state->ReadRecPtr) + { + /* Rewind the reader to the beginning of the last record. */ + found = state->ReadRecPtr; + XLogBeginRead(state, found); + return found; + } + } + +err: + XLogReaderInvalReadState(state); + + return InvalidXLogRecPtr; +} + +/* + * Helper function to ease writing of XLogRoutine->page_read callbacks. + * If this function is used, caller must supply a segment_open callback in + * 'state', as that is used here. + * + * Read 'count' bytes into 'buf', starting at location 'startptr', from WAL + * fetched from timeline 'tli'. + * + * Returns true if succeeded, false if an error occurs, in which case + * 'errinfo' receives error details. + * + * XXX probably this should be improved to suck data directly from the + * WAL buffers when possible. + */ +bool +WALRead(XLogReaderState *state, + char *buf, XLogRecPtr startptr, Size count, TimeLineID tli, + WALReadError *errinfo) +{ + char *p; + XLogRecPtr recptr; + Size nbytes; + + p = buf; + recptr = startptr; + nbytes = count; + + while (nbytes > 0) + { + uint32 startoff; + int segbytes; + int readbytes; + + startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); + + /* + * If the data we want is not in a segment we have open, close what we + * have (if anything) and open the next one, using the caller's + * provided openSegment callback. 
+ */ + if (state->seg.ws_file < 0 || + !XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) || + tli != state->seg.ws_tli) + { + XLogSegNo nextSegNo; + + if (state->seg.ws_file >= 0) + state->routine.segment_close(state); + + XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize); + state->routine.segment_open(state, nextSegNo, &tli); + + /* This shouldn't happen -- indicates a bug in segment_open */ + Assert(state->seg.ws_file >= 0); + + /* Update the current segment info. */ + state->seg.ws_tli = tli; + state->seg.ws_segno = nextSegNo; + } + + /* How many bytes are within this segment? */ + if (nbytes > (state->segcxt.ws_segsize - startoff)) + segbytes = state->segcxt.ws_segsize - startoff; + else + segbytes = nbytes; + +#ifndef FRONTEND + pgstat_report_wait_start(WAIT_EVENT_WAL_READ); +#endif + + /* Reset errno first; eases reporting non-errno-affecting errors */ + errno = 0; + readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff); + +#ifndef FRONTEND + pgstat_report_wait_end(); +#endif + + if (readbytes <= 0) + { + errinfo->wre_errno = errno; + errinfo->wre_req = segbytes; + errinfo->wre_read = readbytes; + errinfo->wre_off = startoff; + errinfo->wre_seg = state->seg; + return false; + } + + /* Update state for read */ + recptr += readbytes; + nbytes -= readbytes; + p += readbytes; + } + + return true; +} + +/* ---------------------------------------- + * Functions for decoding the data and block references in a record. + * ---------------------------------------- + */ + +/* + * Private function to reset the state, forgetting all decoded records, if we + * are asked to move to a new read position. + */ +static void +ResetDecoder(XLogReaderState *state) +{ + DecodedXLogRecord *r; + + /* Reset the decoded record queue, freeing any oversized records. */ + while ((r = state->decode_queue_head) != NULL) + { + state->decode_queue_head = r->next; + if (r->oversized) + pfree(r); + } + state->decode_queue_tail = NULL; + state->decode_queue_head = NULL; + state->record = NULL; + + /* Reset the decode buffer to empty. */ + state->decode_buffer_tail = state->decode_buffer; + state->decode_buffer_head = state->decode_buffer; + + /* Clear error state. */ + state->errormsg_buf[0] = '\0'; + state->errormsg_deferred = false; +} + +/* + * Compute the maximum possible amount of padding that could be required to + * decode a record, given xl_tot_len from the record's header. This is the + * amount of output buffer space that we need to decode a record, though we + * might not finish up using it all. + * + * This computation is pessimistic and assumes the maximum possible number of + * blocks, due to lack of better information. + */ +size_t +DecodeXLogRecordRequiredSpace(size_t xl_tot_len) +{ + size_t size = 0; + + /* Account for the fixed size part of the decoded record struct. */ + size += offsetof(DecodedXLogRecord, blocks[0]); + /* Account for the flexible blocks array of maximum possible size. */ + size += sizeof(DecodedBkpBlock) * (XLR_MAX_BLOCK_ID + 1); + /* Account for all the raw main and block data. */ + size += xl_tot_len; + /* We might insert padding before main_data. */ + size += (MAXIMUM_ALIGNOF - 1); + /* We might insert padding before each block's data. */ + size += (MAXIMUM_ALIGNOF - 1) * (XLR_MAX_BLOCK_ID + 1); + /* We might insert padding at the end. */ + size += (MAXIMUM_ALIGNOF - 1); + + return size; +} + +/* + * Decode a record. 
"decoded" must point to a MAXALIGNed memory area that has + * space for at least DecodeXLogRecordRequiredSpace(record) bytes. On + * success, decoded->size contains the actual space occupied by the decoded + * record, which may turn out to be less. + * + * Only decoded->oversized member must be initialized already, and will not be + * modified. Other members will be initialized as required. + * + * On error, a human-readable error message is returned in *errormsg, and + * the return value is false. + */ +bool +DecodeXLogRecord(XLogReaderState *state, + DecodedXLogRecord *decoded, + XLogRecord *record, + XLogRecPtr lsn, + char **errormsg) +{ + /* + * read next _size bytes from record buffer, but check for overrun first. + */ +#define COPY_HEADER_FIELD(_dst, _size) \ + do { \ + if (remaining < _size) \ + goto shortdata_err; \ + memcpy(_dst, ptr, _size); \ + ptr += _size; \ + remaining -= _size; \ + } while(0) + + char *ptr; + char *out; + uint32 remaining; + uint32 datatotal; + RelFileNode *rnode = NULL; + uint8 block_id; + + decoded->header = *record; + decoded->lsn = lsn; + decoded->next = NULL; + decoded->record_origin = InvalidRepOriginId; + decoded->toplevel_xid = InvalidTransactionId; + decoded->main_data = NULL; + decoded->main_data_len = 0; + decoded->max_block_id = -1; + ptr = (char *) record; + ptr += SizeOfXLogRecord; + remaining = record->xl_tot_len - SizeOfXLogRecord; + + /* Decode the headers */ + datatotal = 0; + while (remaining > datatotal) + { + COPY_HEADER_FIELD(&block_id, sizeof(uint8)); + + if (block_id == XLR_BLOCK_ID_DATA_SHORT) + { + /* XLogRecordDataHeaderShort */ + uint8 main_data_len; + + COPY_HEADER_FIELD(&main_data_len, sizeof(uint8)); + + decoded->main_data_len = main_data_len; + datatotal += main_data_len; + break; /* by convention, the main data fragment is + * always last */ + } + else if (block_id == XLR_BLOCK_ID_DATA_LONG) + { + /* XLogRecordDataHeaderLong */ + uint32 main_data_len; + + COPY_HEADER_FIELD(&main_data_len, sizeof(uint32)); + decoded->main_data_len = main_data_len; + datatotal += main_data_len; + break; /* by convention, the main data fragment is + * always last */ + } + else if (block_id == XLR_BLOCK_ID_ORIGIN) + { + COPY_HEADER_FIELD(&decoded->record_origin, sizeof(RepOriginId)); + } + else if (block_id == XLR_BLOCK_ID_TOPLEVEL_XID) + { + COPY_HEADER_FIELD(&decoded->toplevel_xid, sizeof(TransactionId)); + } + else if (block_id <= XLR_MAX_BLOCK_ID) + { + /* XLogRecordBlockHeader */ + DecodedBkpBlock *blk; + uint8 fork_flags; + + /* mark any intervening block IDs as not in use */ + for (int i = decoded->max_block_id + 1; i < block_id; ++i) + decoded->blocks[i].in_use = false; + + if (block_id <= decoded->max_block_id) + { + report_invalid_record(state, + "out-of-order block_id %u at %X/%X", + block_id, + LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + decoded->max_block_id = block_id; + + blk = &decoded->blocks[block_id]; + blk->in_use = true; + blk->apply_image = false; + + COPY_HEADER_FIELD(&fork_flags, sizeof(uint8)); + blk->forknum = fork_flags & BKPBLOCK_FORK_MASK; + blk->flags = fork_flags; + blk->has_image = ((fork_flags & BKPBLOCK_HAS_IMAGE) != 0); + blk->has_data = ((fork_flags & BKPBLOCK_HAS_DATA) != 0); + + blk->prefetch_buffer = InvalidBuffer; + + COPY_HEADER_FIELD(&blk->data_len, sizeof(uint16)); + /* cross-check that the HAS_DATA flag is set iff data_length > 0 */ + if (blk->has_data && blk->data_len == 0) + { + report_invalid_record(state, + "BKPBLOCK_HAS_DATA set, but no data included at %X/%X", + 
LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+ if (!blk->has_data && blk->data_len != 0)
+ {
+ report_invalid_record(state,
+ "BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%X",
+ (unsigned int) blk->data_len,
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+ datatotal += blk->data_len;
+
+ if (blk->has_image)
+ {
+ COPY_HEADER_FIELD(&blk->bimg_len, sizeof(uint16));
+ COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16));
+ COPY_HEADER_FIELD(&blk->bimg_info, sizeof(uint8));
+
+ blk->apply_image = ((blk->bimg_info & BKPIMAGE_APPLY) != 0);
+
+ if (BKPIMAGE_COMPRESSED(blk->bimg_info))
+ {
+ if (blk->bimg_info & BKPIMAGE_HAS_HOLE)
+ COPY_HEADER_FIELD(&blk->hole_length, sizeof(uint16));
+ else
+ blk->hole_length = 0;
+ }
+ else
+ blk->hole_length = BLCKSZ - blk->bimg_len;
+ datatotal += blk->bimg_len;
+
+ /*
+ * cross-check that hole_offset > 0, hole_length > 0 and
+ * bimg_len < BLCKSZ if the HAS_HOLE flag is set.
+ */
+ if ((blk->bimg_info & BKPIMAGE_HAS_HOLE) &&
+ (blk->hole_offset == 0 ||
+ blk->hole_length == 0 ||
+ blk->bimg_len == BLCKSZ))
+ {
+ report_invalid_record(state,
+ "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X",
+ (unsigned int) blk->hole_offset,
+ (unsigned int) blk->hole_length,
+ (unsigned int) blk->bimg_len,
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+
+ /*
+ * cross-check that hole_offset == 0 and hole_length == 0 if
+ * the HAS_HOLE flag is not set.
+ */
+ if (!(blk->bimg_info & BKPIMAGE_HAS_HOLE) &&
+ (blk->hole_offset != 0 || blk->hole_length != 0))
+ {
+ report_invalid_record(state,
+ "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X",
+ (unsigned int) blk->hole_offset,
+ (unsigned int) blk->hole_length,
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+
+ /*
+ * Cross-check that bimg_len < BLCKSZ if it is compressed.
+ */
+ if (BKPIMAGE_COMPRESSED(blk->bimg_info) &&
+ blk->bimg_len == BLCKSZ)
+ {
+ report_invalid_record(state,
+ "BKPIMAGE_COMPRESSED set, but block image length %u at %X/%X",
+ (unsigned int) blk->bimg_len,
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+
+ /*
+ * cross-check that bimg_len = BLCKSZ if neither HAS_HOLE is
+ * set nor COMPRESSED().
+ */
+ if (!(blk->bimg_info & BKPIMAGE_HAS_HOLE) &&
+ !BKPIMAGE_COMPRESSED(blk->bimg_info) &&
+ blk->bimg_len != BLCKSZ)
+ {
+ report_invalid_record(state,
+ "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_COMPRESSED set, but block image length is %u at %X/%X",
+ (unsigned int) blk->bimg_len,
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+ }
+ if (!(fork_flags & BKPBLOCK_SAME_REL))
+ {
+ COPY_HEADER_FIELD(&blk->rnode, sizeof(RelFileNode));
+ rnode = &blk->rnode;
+ }
+ else
+ {
+ if (rnode == NULL)
+ {
+ report_invalid_record(state,
+ "BKPBLOCK_SAME_REL set but no previous rel at %X/%X",
+ LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+
+ blk->rnode = *rnode;
+ }
+ COPY_HEADER_FIELD(&blk->blkno, sizeof(BlockNumber));
+ }
+ else
+ {
+ report_invalid_record(state,
+ "invalid block_id %u at %X/%X",
+ block_id, LSN_FORMAT_ARGS(state->ReadRecPtr));
+ goto err;
+ }
+ }
+
+ if (remaining != datatotal)
+ goto shortdata_err;
+
+ /*
+ * Ok, we've parsed the fragment headers, and verified that the total
+ * length of the payload in the fragments is equal to the amount of data
+ * left. Copy the data of each fragment to contiguous space after the
+ * blocks array, inserting alignment padding before the data fragments so
+ * they can be cast to struct pointers by REDO routines.
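+ * The resulting layout is: the DecodedXLogRecord struct with its blocks[]
+ * array, then for each in-use block its full-page image (if any) followed
+ * by its MAXALIGNed block data (if any), and finally the MAXALIGNed main
+ * data.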
+ */ + out = ((char *) decoded) + + offsetof(DecodedXLogRecord, blocks) + + sizeof(decoded->blocks[0]) * (decoded->max_block_id + 1); + + /* block data first */ + for (block_id = 0; block_id <= decoded->max_block_id; block_id++) + { + DecodedBkpBlock *blk = &decoded->blocks[block_id]; + + if (!blk->in_use) + continue; + + Assert(blk->has_image || !blk->apply_image); + + if (blk->has_image) + { + /* no need to align image */ + blk->bkp_image = out; + memcpy(out, ptr, blk->bimg_len); + ptr += blk->bimg_len; + out += blk->bimg_len; + } + if (blk->has_data) + { + out = (char *) MAXALIGN(out); + blk->data = out; + memcpy(blk->data, ptr, blk->data_len); + ptr += blk->data_len; + out += blk->data_len; + } + } + + /* and finally, the main data */ + if (decoded->main_data_len > 0) + { + out = (char *) MAXALIGN(out); + decoded->main_data = out; + memcpy(decoded->main_data, ptr, decoded->main_data_len); + ptr += decoded->main_data_len; + out += decoded->main_data_len; + } + + /* Report the actual size we used. */ + decoded->size = MAXALIGN(out - (char *) decoded); + Assert(DecodeXLogRecordRequiredSpace(record->xl_tot_len) >= + decoded->size); + + return true; + +shortdata_err: + report_invalid_record(state, + "record with invalid length at %X/%X", + LSN_FORMAT_ARGS(state->ReadRecPtr)); +err: + *errormsg = state->errormsg_buf; + + return false; +} + +/* + * Returns information about the block that a block reference refers to. + * + * This is like XLogRecGetBlockTagExtended, except that the block reference + * must exist and there's no access to prefetch_buffer. + */ +void +XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id, + RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum) +{ + if (!XLogRecGetBlockTagExtended(record, block_id, rnode, forknum, blknum, + NULL)) + { +#ifndef FRONTEND + elog(ERROR, "could not locate backup block with ID %d in WAL record", + block_id); +#else + pg_fatal("could not locate backup block with ID %d in WAL record", + block_id); +#endif + } +} + +/* + * Returns information about the block that a block reference refers to, + * optionally including the buffer that the block may already be in. + * + * If the WAL record contains a block reference with the given ID, *rnode, + * *forknum, *blknum and *prefetch_buffer are filled in (if not NULL), and + * returns true. Otherwise returns false. + */ +bool +XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id, + RelFileNode *rnode, ForkNumber *forknum, + BlockNumber *blknum, + Buffer *prefetch_buffer) +{ + DecodedBkpBlock *bkpb; + + if (!XLogRecHasBlockRef(record, block_id)) + return false; + + bkpb = &record->record->blocks[block_id]; + if (rnode) + *rnode = bkpb->rnode; + if (forknum) + *forknum = bkpb->forknum; + if (blknum) + *blknum = bkpb->blkno; + if (prefetch_buffer) + *prefetch_buffer = bkpb->prefetch_buffer; + return true; +} + +/* + * Returns the data associated with a block reference, or NULL if there is + * no data (e.g. because a full-page image was taken instead). The returned + * pointer points to a MAXALIGNed buffer. 
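+ *
+ * A redo-side caller typically uses it along these lines (an illustrative
+ * sketch with hypothetical variable names):
+ *
+ *		Size	len;
+ *		char   *data = XLogRecGetBlockData(record, 0, &len);
+ *
+ *		if (data != NULL)
+ *			... apply the 'len' bytes at 'data' to the target page ...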
+ */ +char * +XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len) +{ + DecodedBkpBlock *bkpb; + + if (block_id > record->record->max_block_id || + !record->record->blocks[block_id].in_use) + return NULL; + + bkpb = &record->record->blocks[block_id]; + + if (!bkpb->has_data) + { + if (len) + *len = 0; + return NULL; + } + else + { + if (len) + *len = bkpb->data_len; + return bkpb->data; + } +} + +/* + * Restore a full-page image from a backup block attached to an XLOG record. + * + * Returns true if a full-page image is restored, and false on failure with + * an error to be consumed by the caller. + */ +bool +RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) +{ + DecodedBkpBlock *bkpb; + char *ptr; + PGAlignedBlock tmp; + + if (block_id > record->record->max_block_id || + !record->record->blocks[block_id].in_use) + { + report_invalid_record(record, + "could not restore image at %X/%X with invalid block %d specified", + LSN_FORMAT_ARGS(record->ReadRecPtr), + block_id); + return false; + } + if (!record->record->blocks[block_id].has_image) + { + report_invalid_record(record, "could not restore image at %X/%X with invalid state, block %d", + LSN_FORMAT_ARGS(record->ReadRecPtr), + block_id); + return false; + } + + bkpb = &record->record->blocks[block_id]; + ptr = bkpb->bkp_image; + + if (BKPIMAGE_COMPRESSED(bkpb->bimg_info)) + { + /* If a backup block image is compressed, decompress it */ + bool decomp_success = true; + + if ((bkpb->bimg_info & BKPIMAGE_COMPRESS_PGLZ) != 0) + { + if (pglz_decompress(ptr, bkpb->bimg_len, tmp.data, + BLCKSZ - bkpb->hole_length, true) < 0) + decomp_success = false; + } + else if ((bkpb->bimg_info & BKPIMAGE_COMPRESS_LZ4) != 0) + { +#ifdef USE_LZ4 + if (LZ4_decompress_safe(ptr, tmp.data, + bkpb->bimg_len, BLCKSZ - bkpb->hole_length) <= 0) + decomp_success = false; +#else + report_invalid_record(record, "could not restore image at %X/%X compressed with %s not supported by build, block %d", + LSN_FORMAT_ARGS(record->ReadRecPtr), + "LZ4", + block_id); + return false; +#endif + } + else if ((bkpb->bimg_info & BKPIMAGE_COMPRESS_ZSTD) != 0) + { +#ifdef USE_ZSTD + size_t decomp_result = ZSTD_decompress(tmp.data, + BLCKSZ - bkpb->hole_length, + ptr, bkpb->bimg_len); + + if (ZSTD_isError(decomp_result)) + decomp_success = false; +#else + report_invalid_record(record, "could not restore image at %X/%X compressed with %s not supported by build, block %d", + LSN_FORMAT_ARGS(record->ReadRecPtr), + "zstd", + block_id); + return false; +#endif + } + else + { + report_invalid_record(record, "could not restore image at %X/%X compressed with unknown method, block %d", + LSN_FORMAT_ARGS(record->ReadRecPtr), + block_id); + return false; + } + + if (!decomp_success) + { + report_invalid_record(record, "could not decompress image at %X/%X, block %d", + LSN_FORMAT_ARGS(record->ReadRecPtr), + block_id); + return false; + } + + ptr = tmp.data; + } + + /* generate page, taking into account hole if necessary */ + if (bkpb->hole_length == 0) + { + memcpy(page, ptr, BLCKSZ); + } + else + { + memcpy(page, ptr, bkpb->hole_offset); + /* must zero-fill the hole */ + MemSet(page + bkpb->hole_offset, 0, bkpb->hole_length); + memcpy(page + (bkpb->hole_offset + bkpb->hole_length), + ptr + bkpb->hole_offset, + BLCKSZ - (bkpb->hole_offset + bkpb->hole_length)); + } + + return true; +} + +#ifndef FRONTEND + +/* + * Extract the FullTransactionId from a WAL record. 
+ */ +FullTransactionId +XLogRecGetFullXid(XLogReaderState *record) +{ + TransactionId xid, + next_xid; + uint32 epoch; + + /* + * This function is only safe during replay, because it depends on the + * replay state. See AdvanceNextFullTransactionIdPastXid() for more. + */ + Assert(AmStartupProcess() || !IsUnderPostmaster); + + xid = XLogRecGetXid(record); + next_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + epoch = EpochFromFullTransactionId(ShmemVariableCache->nextXid); + + /* + * If xid is numerically greater than next_xid, it has to be from the last + * epoch. + */ + if (unlikely(xid > next_xid)) + --epoch; + + return FullTransactionIdFromEpochAndXid(epoch, xid); +} + +#endif diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c new file mode 100644 index 0000000..166f7b7 --- /dev/null +++ b/src/backend/access/transam/xlogrecovery.c @@ -0,0 +1,4699 @@ +/*------------------------------------------------------------------------- + * + * xlogrecovery.c + * Functions for WAL recovery, standby mode + * + * This source file contains functions controlling WAL recovery. + * InitWalRecovery() initializes the system for crash or archive recovery, + * or standby mode, depending on configuration options and the state of + * the control file and possible backup label file. PerformWalRecovery() + * performs the actual WAL replay, calling the rmgr-specific redo routines. + * EndWalRecovery() performs end-of-recovery checks and cleanup actions, + * and prepares information needed to initialize the WAL for writes. In + * addition to these three main functions, there are a bunch of functions + * for interrogating recovery state and controlling the recovery process. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/xlogrecovery.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <ctype.h> +#include <math.h> +#include <time.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <unistd.h> + +#include "access/timeline.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "access/xlogarchive.h" +#include "access/xlogprefetcher.h" +#include "access/xlogreader.h" +#include "access/xlogrecovery.h" +#include "access/xlogutils.h" +#include "backup/basebackup.h" +#include "catalog/pg_control.h" +#include "commands/tablespace.h" +#include "common/file_utils.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgwriter.h" +#include "postmaster/startup.h" +#include "replication/walreceiver.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/spin.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/ps_status.h" +#include "utils/pg_rusage.h" + +/* Unsupported old recovery command file names (relative to $PGDATA) */ +#define RECOVERY_COMMAND_FILE "recovery.conf" +#define RECOVERY_COMMAND_DONE "recovery.done" + +/* + * GUC support + */ +const struct config_enum_entry recovery_target_action_options[] = { + {"pause", RECOVERY_TARGET_ACTION_PAUSE, false}, + {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false}, + {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false}, + {NULL, 0, false} +}; + +/* options formerly taken from 
recovery.conf for archive recovery */ +char *recoveryRestoreCommand = NULL; +char *recoveryEndCommand = NULL; +char *archiveCleanupCommand = NULL; +RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET; +bool recoveryTargetInclusive = true; +int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE; +TransactionId recoveryTargetXid; +char *recovery_target_time_string; +TimestampTz recoveryTargetTime; +const char *recoveryTargetName; +XLogRecPtr recoveryTargetLSN; +int recovery_min_apply_delay = 0; + +/* options formerly taken from recovery.conf for XLOG streaming */ +char *PrimaryConnInfo = NULL; +char *PrimarySlotName = NULL; +char *PromoteTriggerFile = NULL; +bool wal_receiver_create_temp_slot = false; + +/* + * recoveryTargetTimeLineGoal: what the user requested, if any + * + * recoveryTargetTLIRequested: numeric value of requested timeline, if constant + * + * recoveryTargetTLI: the currently understood target timeline; changes + * + * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and + * the timelines of its known parents, newest first (so recoveryTargetTLI is + * always the first list member). Only these TLIs are expected to be seen in + * the WAL segments we read, and indeed only these TLIs will be considered as + * candidate WAL files to open at all. + * + * curFileTLI: the TLI appearing in the name of the current input WAL file. + * (This is not necessarily the same as the timeline from which we are + * replaying WAL, which StartupXLOG calls replayTLI, because we could be + * scanning data that was copied from an ancestor timeline when the current + * file was created.) During a sequential scan we do not allow this value + * to decrease. + */ +RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST; +TimeLineID recoveryTargetTLIRequested = 0; +TimeLineID recoveryTargetTLI = 0; +static List *expectedTLEs; +static TimeLineID curFileTLI; + +/* + * When ArchiveRecoveryRequested is set, archive recovery was requested, + * ie. signal files were present. When InArchiveRecovery is set, we are + * currently recovering using offline XLOG archives. These variables are only + * valid in the startup process. + * + * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're + * currently performing crash recovery using only XLOG files in pg_wal, but + * will switch to using offline XLOG archives as soon as we reach the end of + * WAL in pg_wal. +*/ +bool ArchiveRecoveryRequested = false; +bool InArchiveRecovery = false; + +/* + * When StandbyModeRequested is set, standby mode was requested, i.e. + * standby.signal file was present. When StandbyMode is set, we are currently + * in standby mode. These variables are only valid in the startup process. + * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery. + */ +static bool StandbyModeRequested = false; +bool StandbyMode = false; + +/* was a signal file present at startup? */ +static bool standby_signal_file_found = false; +static bool recovery_signal_file_found = false; + +/* + * CheckPointLoc is the position of the checkpoint record that determines + * where to start the replay. It comes from the backup label file or the + * control file. + * + * RedoStartLSN is the checkpoint's REDO location, also from the backup label + * file or the control file. In standby mode, XLOG streaming usually starts + * from the position where an invalid record was found. 
But if we fail to + * read even the initial checkpoint record, we use the REDO location instead + * of the checkpoint location as the start position of XLOG streaming. + * Otherwise we would have to jump backwards to the REDO location after + * reading the checkpoint record, because the REDO record can precede the + * checkpoint record. + */ +static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr; +static TimeLineID CheckPointTLI = 0; +static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr; +static TimeLineID RedoStartTLI = 0; + +/* + * Local copy of SharedHotStandbyActive variable. False actually means "not + * known, need to check the shared state". + */ +static bool LocalHotStandbyActive = false; + +/* + * Local copy of SharedPromoteIsTriggered variable. False actually means "not + * known, need to check the shared state". + */ +static bool LocalPromoteIsTriggered = false; + +/* Has the recovery code requested a walreceiver wakeup? */ +static bool doRequestWalReceiverReply; + +/* XLogReader object used to parse the WAL records */ +static XLogReaderState *xlogreader = NULL; + +/* XLogPrefetcher object used to consume WAL records with read-ahead */ +static XLogPrefetcher *xlogprefetcher = NULL; + +/* Parameters passed down from ReadRecord to the XLogPageRead callback. */ +typedef struct XLogPageReadPrivate +{ + int emode; + bool fetching_ckpt; /* are we fetching a checkpoint record? */ + bool randAccess; + TimeLineID replayTLI; +} XLogPageReadPrivate; + +/* flag to tell XLogPageRead that we have started replaying */ +static bool InRedo = false; + +/* + * Codes indicating where we got a WAL file from during recovery, or where + * to attempt to get one. + */ +typedef enum +{ + XLOG_FROM_ANY = 0, /* request to read WAL from any source */ + XLOG_FROM_ARCHIVE, /* restored using restore_command */ + XLOG_FROM_PG_WAL, /* existing file in pg_wal */ + XLOG_FROM_STREAM /* streamed from primary */ +} XLogSource; + +/* human-readable names for XLogSources, for debugging output */ +static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"}; + +/* + * readFile is -1 or a kernel FD for the log file segment that's currently + * open for reading. readSegNo identifies the segment. readOff is the offset + * of the page just read, readLen indicates how much of it has been read into + * readBuf, and readSource indicates where we got the currently open file from. + * + * Note: we could use Reserve/ReleaseExternalFD to track consumption of this + * FD too (like for openLogFile in xlog.c); but it doesn't currently seem + * worthwhile, since the XLOG is not read by general-purpose sessions. + */ +static int readFile = -1; +static XLogSegNo readSegNo = 0; +static uint32 readOff = 0; +static uint32 readLen = 0; +static XLogSource readSource = XLOG_FROM_ANY; + +/* + * Keeps track of which source we're currently reading from. This is + * different from readSource in that this is always set, even when we don't + * currently have a WAL file open. If lastSourceFailed is set, our last + * attempt to read from currentSource failed, and we should try another source + * next. + * + * pendingWalRcvRestart is set when a config change occurs that requires a + * walreceiver restart. This is only valid in XLOG_FROM_STREAM state. + */ +static XLogSource currentSource = XLOG_FROM_ANY; +static bool lastSourceFailed = false; +static bool pendingWalRcvRestart = false; + +/* + * These variables track when we last obtained some WAL data to process, + * and where we got it from. 
(XLogReceiptSource is initially the same as + * readSource, but readSource gets reset to zero when we don't have data + * to process right now. It is also different from currentSource, which + * also changes when we try to read from a source and fail, while + * XLogReceiptSource tracks where we last successfully read some WAL.) + */ +static TimestampTz XLogReceiptTime = 0; +static XLogSource XLogReceiptSource = XLOG_FROM_ANY; + +/* Local copy of WalRcv->flushedUpto */ +static XLogRecPtr flushedUpto = 0; +static TimeLineID receiveTLI = 0; + +/* + * Copy of minRecoveryPoint and backupEndPoint from the control file. + * + * In order to reach consistency, we must replay the WAL up to + * minRecoveryPoint. If backupEndRequired is true, we must also reach + * backupEndPoint, or if it's invalid, an end-of-backup record corresponding + * to backupStartPoint. + * + * Note: In archive recovery, after consistency has been reached, the + * functions in xlog.c will start updating minRecoveryPoint in the control + * file. But this copy of minRecoveryPoint variable reflects the value at the + * beginning of recovery, and is *not* updated after consistency is reached. + */ +static XLogRecPtr minRecoveryPoint; +static TimeLineID minRecoveryPointTLI; + +static XLogRecPtr backupStartPoint; +static XLogRecPtr backupEndPoint; +static bool backupEndRequired = false; + +/* + * Have we reached a consistent database state? In crash recovery, we have + * to replay all the WAL, so reachedConsistency is never set. During archive + * recovery, the database is consistent once minRecoveryPoint is reached. + * + * Consistent state means that the system is internally consistent, all + * the WAL has been replayed up to a certain point, and importantly, there + * is no trace of later actions on disk. + */ +bool reachedConsistency = false; + +/* Buffers dedicated to consistency checks of size BLCKSZ */ +static char *replay_image_masked = NULL; +static char *primary_image_masked = NULL; + + +/* + * Shared-memory state for WAL recovery. + */ +typedef struct XLogRecoveryCtlData +{ + /* + * SharedHotStandbyActive indicates if we allow hot standby queries to be + * run. Protected by info_lck. + */ + bool SharedHotStandbyActive; + + /* + * SharedPromoteIsTriggered indicates if a standby promotion has been + * triggered. Protected by info_lck. + */ + bool SharedPromoteIsTriggered; + + /* + * recoveryWakeupLatch is used to wake up the startup process to continue + * WAL replay, if it is waiting for WAL to arrive or failover trigger file + * to appear. + * + * Note that the startup process also uses another latch, its procLatch, + * to wait for recovery conflict. If we get rid of recoveryWakeupLatch for + * signaling the startup process in favor of using its procLatch, which + * comports better with possible generic signal handlers using that latch. + * But we should not do that because the startup process doesn't assume + * that it's waken up by walreceiver process or SIGHUP signal handler + * while it's waiting for recovery conflict. The separate latches, + * recoveryWakeupLatch and procLatch, should be used for inter-process + * communication for WAL replay and recovery conflict, respectively. + */ + Latch recoveryWakeupLatch; + + /* + * Last record successfully replayed. + */ + XLogRecPtr lastReplayedReadRecPtr; /* start position */ + XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */ + TimeLineID lastReplayedTLI; /* timeline */ + + /* + * When we're currently replaying a record, ie. 
in a redo function, + * replayEndRecPtr points to the end+1 of the record being replayed, + * otherwise it's equal to lastReplayedEndRecPtr. + */ + XLogRecPtr replayEndRecPtr; + TimeLineID replayEndTLI; + /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */ + TimestampTz recoveryLastXTime; + + /* + * timestamp of when we started replaying the current chunk of WAL data, + * only relevant for replication or archive recovery + */ + TimestampTz currentChunkStartTime; + /* Recovery pause state */ + RecoveryPauseState recoveryPauseState; + ConditionVariable recoveryNotPausedCV; + + slock_t info_lck; /* locks shared variables shown above */ +} XLogRecoveryCtlData; + +static XLogRecoveryCtlData *XLogRecoveryCtl = NULL; + +/* + * abortedRecPtr is the start pointer of a broken record at end of WAL when + * recovery completes; missingContrecPtr is the location of the first + * contrecord that went missing. See CreateOverwriteContrecordRecord for + * details. + */ +static XLogRecPtr abortedRecPtr; +static XLogRecPtr missingContrecPtr; + +/* + * if recoveryStopsBefore/After returns true, it saves information of the stop + * point here + */ +static TransactionId recoveryStopXid; +static TimestampTz recoveryStopTime; +static XLogRecPtr recoveryStopLSN; +static char recoveryStopName[MAXFNAMELEN]; +static bool recoveryStopAfter; + +/* prototypes for local functions */ +static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI); + +static void EnableStandbyMode(void); +static void readRecoverySignalFile(void); +static void validateRecoveryParameters(void); +static bool read_backup_label(XLogRecPtr *checkPointLoc, + TimeLineID *backupLabelTLI, + bool *backupEndRequired, bool *backupFromStandby); +static bool read_tablespace_map(List **tablespaces); + +static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI); +static void CheckRecoveryConsistency(void); +static void rm_redo_error_callback(void *arg); +#ifdef WAL_DEBUG +static void xlog_outrec(StringInfo buf, XLogReaderState *record); +#endif +static void xlog_block_info(StringInfo buf, XLogReaderState *record); +static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, + TimeLineID prevTLI, TimeLineID replayTLI); +static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime); +static void verifyBackupPageConsistency(XLogReaderState *record); + +static bool recoveryStopsBefore(XLogReaderState *record); +static bool recoveryStopsAfter(XLogReaderState *record); +static char *getRecoveryStopReason(void); +static void recoveryPausesHere(bool endOfRecovery); +static bool recoveryApplyDelay(XLogReaderState *record); +static void ConfirmRecoveryPaused(void); + +static XLogRecord *ReadRecord(XLogPrefetcher *xlogprefetcher, + int emode, bool fetching_ckpt, + TimeLineID replayTLI); + +static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, char *readBuf); +static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, + bool randAccess, + bool fetching_ckpt, + XLogRecPtr tliRecPtr, + TimeLineID replayTLI, + XLogRecPtr replayLSN, + bool nonblocking); +static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr); +static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr, + int whichChkpt, bool report, TimeLineID replayTLI); +static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN); +static int XLogFileRead(XLogSegNo segno, int emode, 
TimeLineID tli, + XLogSource source, bool notfoundOk); +static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source); + +static bool CheckForStandbyTrigger(void); +static void SetPromoteIsTriggered(void); +static bool HotStandbyActiveInReplay(void); + +static void SetCurrentChunkStartTime(TimestampTz xtime); +static void SetLatestXTime(TimestampTz xtime); + +/* + * Initialization of shared memory for WAL recovery + */ +Size +XLogRecoveryShmemSize(void) +{ + Size size; + + /* XLogRecoveryCtl */ + size = sizeof(XLogRecoveryCtlData); + + return size; +} + +void +XLogRecoveryShmemInit(void) +{ + bool found; + + XLogRecoveryCtl = (XLogRecoveryCtlData *) + ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found); + if (found) + return; + memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData)); + + SpinLockInit(&XLogRecoveryCtl->info_lck); + InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch); + ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV); +} + +/* + * A thin wrapper to enable StandbyMode and do other preparatory work as + * needed. + */ +static void +EnableStandbyMode(void) +{ + StandbyMode = true; + + /* + * To avoid server log bloat, we don't report recovery progress in a + * standby as it will always be in recovery unless promoted. We disable + * startup progress timeout in standby mode to avoid calling + * startup_progress_timeout_handler() unnecessarily. + */ + disable_startup_progress_timeout(); +} + +/* + * Prepare the system for WAL recovery, if needed. + * + * This is called by StartupXLOG() which coordinates the server startup + * sequence. This function analyzes the control file and the backup label + * file, if any, and figures out whether we need to perform crash recovery or + * archive recovery, and how far we need to replay the WAL to reach a + * consistent state. + * + * This doesn't yet change the on-disk state, except for creating the symlinks + * from table space map file if any, and for fetching WAL files needed to find + * the checkpoint record. On entry, the caller has already read the control + * file into memory, and passes it as argument. This function updates it to + * reflect the recovery state, and the caller is expected to write it back to + * disk does after initializing other subsystems, but before calling + * PerformWalRecovery(). + * + * This initializes some global variables like ArchiveModeRequested, and + * StandbyModeRequested and InRecovery. + */ +void +InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, + bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr) +{ + XLogPageReadPrivate *private; + struct stat st; + bool wasShutdown; + XLogRecord *record; + DBState dbstate_at_startup; + bool haveTblspcMap = false; + bool haveBackupLabel = false; + CheckPoint checkPoint; + bool backupFromStandby = false; + + dbstate_at_startup = ControlFile->state; + + /* + * Initialize on the assumption we want to recover to the latest timeline + * that's active according to pg_control. 
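+	 *
+	 * (Hypothetical illustration: if the checkpoint in pg_control is on
+	 * timeline 2 but minRecoveryPointTLI is 3, earlier recovery already
+	 * advanced onto timeline 3, so we start out assuming timeline 3 as
+	 * the recovery target.)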
+ */ + if (ControlFile->minRecoveryPointTLI > + ControlFile->checkPointCopy.ThisTimeLineID) + recoveryTargetTLI = ControlFile->minRecoveryPointTLI; + else + recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID; + + /* + * Check for signal files, and if so set up state for offline recovery + */ + readRecoverySignalFile(); + validateRecoveryParameters(); + + if (ArchiveRecoveryRequested) + { + if (StandbyModeRequested) + ereport(LOG, + (errmsg("entering standby mode"))); + else if (recoveryTarget == RECOVERY_TARGET_XID) + ereport(LOG, + (errmsg("starting point-in-time recovery to XID %u", + recoveryTargetXid))); + else if (recoveryTarget == RECOVERY_TARGET_TIME) + ereport(LOG, + (errmsg("starting point-in-time recovery to %s", + timestamptz_to_str(recoveryTargetTime)))); + else if (recoveryTarget == RECOVERY_TARGET_NAME) + ereport(LOG, + (errmsg("starting point-in-time recovery to \"%s\"", + recoveryTargetName))); + else if (recoveryTarget == RECOVERY_TARGET_LSN) + ereport(LOG, + (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"", + LSN_FORMAT_ARGS(recoveryTargetLSN)))); + else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) + ereport(LOG, + (errmsg("starting point-in-time recovery to earliest consistent point"))); + else + ereport(LOG, + (errmsg("starting archive recovery"))); + } + + /* + * Take ownership of the wakeup latch if we're going to sleep during + * recovery. + */ + if (ArchiveRecoveryRequested) + OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch); + + private = palloc0(sizeof(XLogPageReadPrivate)); + xlogreader = + XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(.page_read = &XLogPageRead, + .segment_open = NULL, + .segment_close = wal_segment_close), + private); + if (!xlogreader) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed while allocating a WAL reading processor."))); + xlogreader->system_identifier = ControlFile->system_identifier; + + /* + * Set the WAL decode buffer size. This limits how far ahead we can read + * in the WAL. + */ + XLogReaderSetDecodeBuffer(xlogreader, NULL, wal_decode_buffer_size); + + /* Create a WAL prefetcher. */ + xlogprefetcher = XLogPrefetcherAllocate(xlogreader); + + /* + * Allocate two page buffers dedicated to WAL consistency checks. We do + * it this way, rather than just making static arrays, for two reasons: + * (1) no need to waste the storage in most instantiations of the backend; + * (2) a static char array isn't guaranteed to have any particular + * alignment, whereas palloc() will provide MAXALIGN'd storage. + */ + replay_image_masked = (char *) palloc(BLCKSZ); + primary_image_masked = (char *) palloc(BLCKSZ); + + if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired, + &backupFromStandby)) + { + List *tablespaces = NIL; + + /* + * Archive recovery was requested, and thanks to the backup label + * file, we know how far we need to replay to reach consistency. Enter + * archive recovery directly. + */ + InArchiveRecovery = true; + if (StandbyModeRequested) + EnableStandbyMode(); + + /* + * When a backup_label file is present, we want to roll forward from + * the checkpoint it identifies, rather than using pg_control. 
+ */ + record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, 0, true, + CheckPointTLI); + if (record != NULL) + { + memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); + wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); + ereport(DEBUG1, + (errmsg_internal("checkpoint record is at %X/%X", + LSN_FORMAT_ARGS(CheckPointLoc)))); + InRecovery = true; /* force recovery even if SHUTDOWNED */ + + /* + * Make sure that REDO location exists. This may not be the case + * if there was a crash during an online backup, which left a + * backup_label around that references a WAL segment that's + * already been archived. + */ + if (checkPoint.redo < CheckPointLoc) + { + XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo); + if (!ReadRecord(xlogprefetcher, LOG, false, + checkPoint.ThisTimeLineID)) + ereport(FATAL, + (errmsg("could not find redo location referenced by checkpoint record"), + errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n" + "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" + "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", + DataDir, DataDir, DataDir))); + } + } + else + { + ereport(FATAL, + (errmsg("could not locate required checkpoint record"), + errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n" + "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" + "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", + DataDir, DataDir, DataDir))); + wasShutdown = false; /* keep compiler quiet */ + } + + /* Read the tablespace_map file if present and create symlinks. */ + if (read_tablespace_map(&tablespaces)) + { + ListCell *lc; + + foreach(lc, tablespaces) + { + tablespaceinfo *ti = lfirst(lc); + char *linkloc; + + linkloc = psprintf("pg_tblspc/%s", ti->oid); + + /* + * Remove the existing symlink if any and Create the symlink + * under PGDATA. + */ + remove_tablespace_symlink(linkloc); + + if (symlink(ti->path, linkloc) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create symbolic link \"%s\": %m", + linkloc))); + + pfree(ti->oid); + pfree(ti->path); + pfree(ti); + } + + /* tell the caller to delete it later */ + haveTblspcMap = true; + } + + /* tell the caller to delete it later */ + haveBackupLabel = true; + } + else + { + /* + * If tablespace_map file is present without backup_label file, there + * is no use of such file. There is no harm in retaining it, but it + * is better to get rid of the map file so that we don't have any + * redundant file in data directory and it will avoid any sort of + * confusion. It seems prudent though to just rename the file out of + * the way rather than delete it completely, also we ignore any error + * that occurs in rename operation as even if map file is present + * without backup_label file, it is harmless. 
+ */ + if (stat(TABLESPACE_MAP, &st) == 0) + { + unlink(TABLESPACE_MAP_OLD); + if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0) + ereport(LOG, + (errmsg("ignoring file \"%s\" because no file \"%s\" exists", + TABLESPACE_MAP, BACKUP_LABEL_FILE), + errdetail("File \"%s\" was renamed to \"%s\".", + TABLESPACE_MAP, TABLESPACE_MAP_OLD))); + else + ereport(LOG, + (errmsg("ignoring file \"%s\" because no file \"%s\" exists", + TABLESPACE_MAP, BACKUP_LABEL_FILE), + errdetail("Could not rename file \"%s\" to \"%s\": %m.", + TABLESPACE_MAP, TABLESPACE_MAP_OLD))); + } + + /* + * It's possible that archive recovery was requested, but we don't + * know how far we need to replay the WAL before we reach consistency. + * This can happen for example if a base backup is taken from a + * running server using an atomic filesystem snapshot, without calling + * pg_backup_start/stop. Or if you just kill a running primary server + * and put it into archive recovery by creating a recovery signal + * file. + * + * Our strategy in that case is to perform crash recovery first, + * replaying all the WAL present in pg_wal, and only enter archive + * recovery after that. + * + * But usually we already know how far we need to replay the WAL (up + * to minRecoveryPoint, up to backupEndPoint, or until we see an + * end-of-backup record), and we can enter archive recovery directly. + */ + if (ArchiveRecoveryRequested && + (ControlFile->minRecoveryPoint != InvalidXLogRecPtr || + ControlFile->backupEndRequired || + ControlFile->backupEndPoint != InvalidXLogRecPtr || + ControlFile->state == DB_SHUTDOWNED)) + { + InArchiveRecovery = true; + if (StandbyModeRequested) + EnableStandbyMode(); + } + + /* Get the last valid checkpoint record. */ + CheckPointLoc = ControlFile->checkPoint; + CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID; + RedoStartLSN = ControlFile->checkPointCopy.redo; + RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID; + record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, 1, true, + CheckPointTLI); + if (record != NULL) + { + ereport(DEBUG1, + (errmsg_internal("checkpoint record is at %X/%X", + LSN_FORMAT_ARGS(CheckPointLoc)))); + } + else + { + /* + * We used to attempt to go back to a secondary checkpoint record + * here, but only when not in standby mode. We now just fail if we + * can't read the last checkpoint because this allows us to + * simplify processing around checkpoints. + */ + ereport(PANIC, + (errmsg("could not locate a valid checkpoint record"))); + } + memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); + wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); + } + + /* + * If the location of the checkpoint record is not on the expected + * timeline in the history of the requested timeline, we cannot proceed: + * the backup is not part of the history of the requested timeline. + */ + Assert(expectedTLEs); /* was initialized by reading checkpoint + * record */ + if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) != + CheckPointTLI) + { + XLogRecPtr switchpoint; + + /* + * tliSwitchPoint will throw an error if the checkpoint's timeline is + * not in expectedTLEs at all. 
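+		 *
+		 * (Hypothetical example: the requested timeline 3 forked off from
+		 * timeline 2 at 0/5000000, but the latest checkpoint sits at
+		 * 0/6000000 on timeline 2; that checkpoint is not part of
+		 * timeline 3's history, so recovery cannot proceed from it.)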
+ */ + switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL); + ereport(FATAL, + (errmsg("requested timeline %u is not a child of this server's history", + recoveryTargetTLI), + errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.", + LSN_FORMAT_ARGS(ControlFile->checkPoint), + ControlFile->checkPointCopy.ThisTimeLineID, + LSN_FORMAT_ARGS(switchpoint)))); + } + + /* + * The min recovery point should be part of the requested timeline's + * history, too. + */ + if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) && + tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) != + ControlFile->minRecoveryPointTLI) + ereport(FATAL, + (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u", + recoveryTargetTLI, + LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint), + ControlFile->minRecoveryPointTLI))); + + ereport(DEBUG1, + (errmsg_internal("redo record is at %X/%X; shutdown %s", + LSN_FORMAT_ARGS(checkPoint.redo), + wasShutdown ? "true" : "false"))); + ereport(DEBUG1, + (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u", + U64FromFullTransactionId(checkPoint.nextXid), + checkPoint.nextOid))); + ereport(DEBUG1, + (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u", + checkPoint.nextMulti, checkPoint.nextMultiOffset))); + ereport(DEBUG1, + (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u", + checkPoint.oldestXid, checkPoint.oldestXidDB))); + ereport(DEBUG1, + (errmsg_internal("oldest MultiXactId: %u, in database %u", + checkPoint.oldestMulti, checkPoint.oldestMultiDB))); + ereport(DEBUG1, + (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u", + checkPoint.oldestCommitTsXid, + checkPoint.newestCommitTsXid))); + if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid))) + ereport(PANIC, + (errmsg("invalid next transaction ID"))); + + /* sanity check */ + if (checkPoint.redo > CheckPointLoc) + ereport(PANIC, + (errmsg("invalid redo in checkpoint record"))); + + /* + * Check whether we need to force recovery from WAL. If it appears to + * have been a clean shutdown and we did not have a recovery signal file, + * then assume no recovery needed. + */ + if (checkPoint.redo < CheckPointLoc) + { + if (wasShutdown) + ereport(PANIC, + (errmsg("invalid redo record in shutdown checkpoint"))); + InRecovery = true; + } + else if (ControlFile->state != DB_SHUTDOWNED) + InRecovery = true; + else if (ArchiveRecoveryRequested) + { + /* force recovery due to presence of recovery signal file */ + InRecovery = true; + } + + /* + * If recovery is needed, update our in-memory copy of pg_control to show + * that we are recovering and to show the selected checkpoint as the place + * we are starting from. We also mark pg_control with any minimum recovery + * stop point obtained from a backup history file. + * + * We don't write the changes to disk yet, though. Only do that after + * initializing various subsystems. 
+ */ + if (InRecovery) + { + if (InArchiveRecovery) + { + ControlFile->state = DB_IN_ARCHIVE_RECOVERY; + } + else + { + ereport(LOG, + (errmsg("database system was not properly shut down; " + "automatic recovery in progress"))); + if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID) + ereport(LOG, + (errmsg("crash recovery starts in timeline %u " + "and has target timeline %u", + ControlFile->checkPointCopy.ThisTimeLineID, + recoveryTargetTLI))); + ControlFile->state = DB_IN_CRASH_RECOVERY; + } + ControlFile->checkPoint = CheckPointLoc; + ControlFile->checkPointCopy = checkPoint; + if (InArchiveRecovery) + { + /* initialize minRecoveryPoint if not set yet */ + if (ControlFile->minRecoveryPoint < checkPoint.redo) + { + ControlFile->minRecoveryPoint = checkPoint.redo; + ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID; + } + } + + /* + * Set backupStartPoint if we're starting recovery from a base backup. + * + * Also set backupEndPoint and use minRecoveryPoint as the backup end + * location if we're starting recovery from a base backup which was + * taken from a standby. In this case, the database system status in + * pg_control must indicate that the database was already in recovery. + * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be + * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted + * before reaching this point; e.g. because restore_command or + * primary_conninfo were faulty. + * + * Any other state indicates that the backup somehow became corrupted + * and we can't sensibly continue with recovery. + */ + if (haveBackupLabel) + { + ControlFile->backupStartPoint = checkPoint.redo; + ControlFile->backupEndRequired = backupEndRequired; + + if (backupFromStandby) + { + if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY && + dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY) + ereport(FATAL, + (errmsg("backup_label contains data inconsistent with control file"), + errhint("This means that the backup is corrupted and you will " + "have to use another backup for recovery."))); + ControlFile->backupEndPoint = ControlFile->minRecoveryPoint; + } + } + } + + /* remember these, so that we know when we have reached consistency */ + backupStartPoint = ControlFile->backupStartPoint; + backupEndRequired = ControlFile->backupEndRequired; + backupEndPoint = ControlFile->backupEndPoint; + if (InArchiveRecovery) + { + minRecoveryPoint = ControlFile->minRecoveryPoint; + minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + } + else + { + minRecoveryPoint = InvalidXLogRecPtr; + minRecoveryPointTLI = 0; + } + + /* + * Start recovery assuming that the final record isn't lost. + */ + abortedRecPtr = InvalidXLogRecPtr; + missingContrecPtr = InvalidXLogRecPtr; + + *wasShutdown_ptr = wasShutdown; + *haveBackupLabel_ptr = haveBackupLabel; + *haveTblspcMap_ptr = haveTblspcMap; +} + +/* + * See if there are any recovery signal files and if so, set state for + * recovery. + * + * See if there is a recovery command file (recovery.conf), and if so + * throw an ERROR since as of PG12 we no longer recognize that. + */ +static void +readRecoverySignalFile(void) +{ + struct stat stat_buf; + + if (IsBootstrapProcessingMode()) + return; + + /* + * Check for old recovery API file: recovery.conf + */ + if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("using recovery command file \"%s\" is not supported", + RECOVERY_COMMAND_FILE))); + + /* + * Remove unused .done file, if present. Ignore if absent. 
+ */ + unlink(RECOVERY_COMMAND_DONE); + + /* + * Check for recovery signal files and if found, fsync them since they + * represent server state information. We don't sweat too much about the + * possibility of fsync failure, however. + * + * If present, standby signal file takes precedence. If neither is present + * then we won't enter archive recovery. + */ + if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0) + { + int fd; + + fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY, + S_IRUSR | S_IWUSR); + if (fd >= 0) + { + (void) pg_fsync(fd); + close(fd); + } + standby_signal_file_found = true; + } + else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0) + { + int fd; + + fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY, + S_IRUSR | S_IWUSR); + if (fd >= 0) + { + (void) pg_fsync(fd); + close(fd); + } + recovery_signal_file_found = true; + } + + StandbyModeRequested = false; + ArchiveRecoveryRequested = false; + if (standby_signal_file_found) + { + StandbyModeRequested = true; + ArchiveRecoveryRequested = true; + } + else if (recovery_signal_file_found) + { + StandbyModeRequested = false; + ArchiveRecoveryRequested = true; + } + else + return; + + /* + * We don't support standby mode in standalone backends; that requires + * other processes such as the WAL receiver to be alive. + */ + if (StandbyModeRequested && !IsUnderPostmaster) + ereport(FATAL, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("standby mode is not supported by single-user servers"))); +} + +static void +validateRecoveryParameters(void) +{ + if (!ArchiveRecoveryRequested) + return; + + /* + * Check for compulsory parameters + */ + if (StandbyModeRequested) + { + if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) && + (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0)) + ereport(WARNING, + (errmsg("specified neither primary_conninfo nor restore_command"), + errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there."))); + } + else + { + if (recoveryRestoreCommand == NULL || + strcmp(recoveryRestoreCommand, "") == 0) + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("must specify restore_command when standby mode is not enabled"))); + } + + /* + * Override any inconsistent requests. Note that this is a change of + * behaviour in 9.5; prior to this we simply ignored a request to pause if + * hot_standby = off, which was surprising behaviour. + */ + if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE && + !EnableHotStandby) + recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN; + + /* + * Final parsing of recovery_target_time string; see also + * check_recovery_target_time(). + */ + if (recoveryTarget == RECOVERY_TARGET_TIME) + { + recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in, + CStringGetDatum(recovery_target_time_string), + ObjectIdGetDatum(InvalidOid), + Int32GetDatum(-1))); + } + + /* + * If user specified recovery_target_timeline, validate it or compute the + * "latest" value. We can't do this until after we've gotten the restore + * command and set InArchiveRecovery, because we need to fetch timeline + * history files from the archive. 
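+	 *
+	 * (For illustration: recovery_target_timeline = 'latest' triggers the
+	 * search for the newest timeline below, while a numeric setting such
+	 * as '3' requires that the history file 00000003.history can be
+	 * found; timeline 1 is special in that it never has a history file.)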
+ */ + if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC) + { + TimeLineID rtli = recoveryTargetTLIRequested; + + /* Timeline 1 does not have a history file, all else should */ + if (rtli != 1 && !existsTimeLineHistory(rtli)) + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("recovery target timeline %u does not exist", + rtli))); + recoveryTargetTLI = rtli; + } + else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST) + { + /* We start the "latest" search from pg_control's timeline */ + recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI); + } + else + { + /* + * else we just use the recoveryTargetTLI as already read from + * ControlFile + */ + Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE); + } +} + +/* + * read_backup_label: check to see if a backup_label file is present + * + * If we see a backup_label during recovery, we assume that we are recovering + * from a backup dump file, and we therefore roll forward from the checkpoint + * identified by the label file, NOT what pg_control says. This avoids the + * problem that pg_control might have been archived one or more checkpoints + * later than the start of the dump, and so if we rely on it as the start + * point, we will fail to restore a consistent database state. + * + * Returns true if a backup_label was found (and fills the checkpoint + * location and TLI into *checkPointLoc and *backupLabelTLI, respectively); + * returns false if not. If this backup_label came from a streamed backup, + * *backupEndRequired is set to true. If this backup_label was created during + * recovery, *backupFromStandby is set to true. + * + * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN + * and TLI read from the backup file. + */ +static bool +read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, + bool *backupEndRequired, bool *backupFromStandby) +{ + char startxlogfilename[MAXFNAMELEN]; + TimeLineID tli_from_walseg, + tli_from_file; + FILE *lfp; + char ch; + char backuptype[20]; + char backupfrom[20]; + char backuplabel[MAXPGPATH]; + char backuptime[128]; + uint32 hi, + lo; + + /* suppress possible uninitialized-variable warnings */ + *checkPointLoc = InvalidXLogRecPtr; + *backupLabelTLI = 0; + *backupEndRequired = false; + *backupFromStandby = false; + + /* + * See if label file is present + */ + lfp = AllocateFile(BACKUP_LABEL_FILE, "r"); + if (!lfp) + { + if (errno != ENOENT) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + BACKUP_LABEL_FILE))); + return false; /* it's not there, all is fine */ + } + + /* + * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code + * is pretty crude, but we are not expecting any variability in the file + * format). 
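+	 *
+	 * For illustration, a backup_label file looks roughly like this (all
+	 * values hypothetical):
+	 *
+	 *   START WAL LOCATION: 0/9000028 (file 000000010000000000000009)
+	 *   CHECKPOINT LOCATION: 0/9000060
+	 *   BACKUP METHOD: streamed
+	 *   BACKUP FROM: primary
+	 *   START TIME: 2023-01-01 00:00:00 UTC
+	 *   LABEL: pg_basebackup base backup
+	 *   START TIMELINE: 1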
+ */ + if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c", + &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n') + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); + RedoStartLSN = ((uint64) hi) << 32 | lo; + RedoStartTLI = tli_from_walseg; + if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c", + &hi, &lo, &ch) != 3 || ch != '\n') + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); + *checkPointLoc = ((uint64) hi) << 32 | lo; + *backupLabelTLI = tli_from_walseg; + + /* + * BACKUP METHOD lets us know if this was a typical backup ("streamed", + * which could mean either pg_basebackup or the pg_backup_start/stop + * method was used) or if this label came from somewhere else (the only + * other option today being from pg_rewind). If this was a streamed + * backup then we know that we need to play through until we get to the + * end of the WAL which was generated during the backup (at which point we + * will have reached consistency and backupEndRequired will be reset to be + * false). + */ + if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1) + { + if (strcmp(backuptype, "streamed") == 0) + *backupEndRequired = true; + } + + /* + * BACKUP FROM lets us know if this was from a primary or a standby. If + * it was from a standby, we'll double-check that the control file state + * matches that of a standby. + */ + if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1) + { + if (strcmp(backupfrom, "standby") == 0) + *backupFromStandby = true; + } + + /* + * Parse START TIME and LABEL. Those are not mandatory fields for recovery + * but checking for their presence is useful for debugging and the next + * sanity checks. Cope also with the fact that the result buffers have a + * pre-allocated size, hence if the backup_label file has been generated + * with strings longer than the maximum assumed here an incorrect parsing + * happens. That's fine as only minor consistency checks are done + * afterwards. + */ + if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1) + ereport(DEBUG1, + (errmsg_internal("backup time %s in file \"%s\"", + backuptime, BACKUP_LABEL_FILE))); + + if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1) + ereport(DEBUG1, + (errmsg_internal("backup label %s in file \"%s\"", + backuplabel, BACKUP_LABEL_FILE))); + + /* + * START TIMELINE is new as of 11. Its parsing is not mandatory, still use + * it as a sanity check if present. + */ + if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1) + { + if (tli_from_walseg != tli_from_file) + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE), + errdetail("Timeline ID parsed is %u, but expected %u.", + tli_from_file, tli_from_walseg))); + + ereport(DEBUG1, + (errmsg_internal("backup timeline %u in file \"%s\"", + tli_from_file, BACKUP_LABEL_FILE))); + } + + if (ferror(lfp) || FreeFile(lfp)) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + BACKUP_LABEL_FILE))); + + return true; +} + +/* + * read_tablespace_map: check to see if a tablespace_map file is present + * + * If we see a tablespace_map file during recovery, we assume that we are + * recovering from a backup dump file, and we therefore need to create symlinks + * as per the information present in tablespace_map file. 
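+ *
+ * For illustration, each line of tablespace_map has the form
+ * "<tablespace OID> <absolute path>", e.g. (hypothetical values):
+ *
+ *   16385 /mnt/tblspc/ts1
+ *
+ * with backslashes used to escape special characters in the path.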
+ * + * Returns true if a tablespace_map file was found (and fills *tablespaces + * with a tablespaceinfo struct for each tablespace listed in the file); + * returns false if not. + */ +static bool +read_tablespace_map(List **tablespaces) +{ + tablespaceinfo *ti; + FILE *lfp; + char str[MAXPGPATH]; + int ch, + i, + n; + bool was_backslash; + + /* + * See if tablespace_map file is present + */ + lfp = AllocateFile(TABLESPACE_MAP, "r"); + if (!lfp) + { + if (errno != ENOENT) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + TABLESPACE_MAP))); + return false; /* it's not there, all is fine */ + } + + /* + * Read and parse the link name and path lines from tablespace_map file + * (this code is pretty crude, but we are not expecting any variability in + * the file format). De-escape any backslashes that were inserted. + */ + i = 0; + was_backslash = false; + while ((ch = fgetc(lfp)) != EOF) + { + if (!was_backslash && (ch == '\n' || ch == '\r')) + { + if (i == 0) + continue; /* \r immediately followed by \n */ + + /* + * The de-escaped line should contain an OID followed by exactly + * one space followed by a path. The path might start with + * spaces, so don't be too liberal about parsing. + */ + str[i] = '\0'; + n = 0; + while (str[n] && str[n] != ' ') + n++; + if (n < 1 || n >= i - 1) + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", TABLESPACE_MAP))); + str[n++] = '\0'; + + ti = palloc0(sizeof(tablespaceinfo)); + ti->oid = pstrdup(str); + ti->path = pstrdup(str + n); + *tablespaces = lappend(*tablespaces, ti); + + i = 0; + continue; + } + else if (!was_backslash && ch == '\\') + was_backslash = true; + else + { + if (i < sizeof(str) - 1) + str[i++] = ch; + was_backslash = false; + } + } + + if (i != 0 || was_backslash) /* last line not terminated? */ + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", TABLESPACE_MAP))); + + if (ferror(lfp) || FreeFile(lfp)) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + TABLESPACE_MAP))); + + return true; +} + +/* + * Finish WAL recovery. + * + * This does not close the 'xlogreader' yet, because in some cases the caller + * still wants to re-read the last checkpoint record by calling + * ReadCheckPointRecord(). + * + * Returns the position of the last valid or applied record, after which new + * WAL should be appended, information about why recovery was ended, and some + * other things. See the WalRecoveryResult struct for details. + */ +EndOfWalRecoveryInfo * +FinishWalRecovery(void) +{ + EndOfWalRecoveryInfo *result = palloc(sizeof(EndOfWalRecoveryInfo)); + XLogRecPtr lastRec; + TimeLineID lastRecTLI; + XLogRecPtr endOfLog; + + /* + * Kill WAL receiver, if it's still running, before we continue to write + * the startup checkpoint and aborted-contrecord records. It will trump + * over these records and subsequent ones if it's still alive when we + * start writing WAL. + */ + XLogShutdownWalRcv(); + + /* + * We are now done reading the xlog from stream. Turn off streaming + * recovery to force fetching the files (which would be required at end of + * recovery, e.g., timeline history file) from archive or pg_wal. + * + * Note that standby mode must be turned off after killing WAL receiver, + * i.e., calling XLogShutdownWalRcv(). + */ + Assert(!WalRcvStreaming()); + StandbyMode = false; + + /* + * Determine where to start writing WAL next. 
+ * + * Re-fetch the last valid or last applied record, so we can identify the + * exact endpoint of what we consider the valid portion of WAL. There may + * be an incomplete continuation record after that, in which case + * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will + * write a special OVERWRITE_CONTRECORD message to mark that the rest of + * it is intentionally missing. See CreateOverwriteContrecordRecord(). + * + * An important side-effect of this is to load the last page into + * xlogreader. The caller uses it to initialize the WAL for writing. + */ + if (!InRecovery) + { + lastRec = CheckPointLoc; + lastRecTLI = CheckPointTLI; + } + else + { + lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr; + lastRecTLI = XLogRecoveryCtl->lastReplayedTLI; + } + XLogPrefetcherBeginRead(xlogprefetcher, lastRec); + (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI); + endOfLog = xlogreader->EndRecPtr; + + /* + * Remember the TLI in the filename of the XLOG segment containing the + * end-of-log. It could be different from the timeline that endOfLog + * nominally belongs to, if there was a timeline switch in that segment, + * and we were reading the old WAL from a segment belonging to a higher + * timeline. + */ + result->endOfLogTLI = xlogreader->seg.ws_tli; + + if (ArchiveRecoveryRequested) + { + /* + * We are no longer in archive recovery state. + * + * We are now done reading the old WAL. Turn off archive fetching if + * it was active. + */ + Assert(InArchiveRecovery); + InArchiveRecovery = false; + + /* + * If the ending log segment is still open, close it (to avoid + * problems on Windows with trying to rename or delete an open file). + */ + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + } + + /* + * Copy the last partial block to the caller, for initializing the WAL + * buffer for appending new WAL. + */ + if (endOfLog % XLOG_BLCKSZ != 0) + { + char *page; + int len; + XLogRecPtr pageBeginPtr; + + pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ); + Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size)); + + /* Copy the valid part of the last block */ + len = endOfLog % XLOG_BLCKSZ; + page = palloc(len); + memcpy(page, xlogreader->readBuf, len); + + result->lastPageBeginPtr = pageBeginPtr; + result->lastPage = page; + } + else + { + /* There is no partial block to copy. */ + result->lastPageBeginPtr = endOfLog; + result->lastPage = NULL; + } + + /* + * Create a comment for the history file to explain why and where timeline + * changed. + */ + result->recoveryStopReason = getRecoveryStopReason(); + + result->lastRec = lastRec; + result->lastRecTLI = lastRecTLI; + result->endOfLog = endOfLog; + + result->abortedRecPtr = abortedRecPtr; + result->missingContrecPtr = missingContrecPtr; + + result->standby_signal_file_found = standby_signal_file_found; + result->recovery_signal_file_found = recovery_signal_file_found; + + return result; +} + +/* + * Clean up the WAL reader and leftovers from restoring WAL from archive + */ +void +ShutdownWalRecovery(void) +{ + char recoveryPath[MAXPGPATH]; + + /* Final update of pg_stat_recovery_prefetch. */ + XLogPrefetcherComputeStats(xlogprefetcher); + + /* Shut down xlogreader */ + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + XLogReaderFree(xlogreader); + XLogPrefetcherFree(xlogprefetcher); + + if (ArchiveRecoveryRequested) + { + /* + * Since there might be a partial WAL segment named RECOVERYXLOG, get + * rid of it. 
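+		 *
+		 * (RECOVERYXLOG and RECOVERYHISTORY are the temporary names under
+		 * which files fetched via restore_command are staged; see
+		 * RestoreArchivedFile() in xlogarchive.c.)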
+ */ + snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG"); + unlink(recoveryPath); /* ignore any error */ + + /* Get rid of any remaining recovered timeline-history file, too */ + snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY"); + unlink(recoveryPath); /* ignore any error */ + } + + /* + * We don't need the latch anymore. It's not strictly necessary to disown + * it, but let's do it for the sake of tidiness. + */ + if (ArchiveRecoveryRequested) + DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch); +} + +/* + * Perform WAL recovery. + * + * If the system was shut down cleanly, this is never called. + */ +void +PerformWalRecovery(void) +{ + XLogRecord *record; + bool reachedRecoveryTarget = false; + TimeLineID replayTLI; + + /* + * Initialize shared variables for tracking progress of WAL replay, as if + * we had just replayed the record before the REDO location (or the + * checkpoint record itself, if it's a shutdown checkpoint). + */ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + if (RedoStartLSN < CheckPointLoc) + { + XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr; + XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN; + XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI; + } + else + { + XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr; + XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr; + XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI; + } + XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr; + XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI; + XLogRecoveryCtl->recoveryLastXTime = 0; + XLogRecoveryCtl->currentChunkStartTime = 0; + XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + /* Also ensure XLogReceiptTime has a sane value */ + XLogReceiptTime = GetCurrentTimestamp(); + + /* + * Let postmaster know we've started redo now, so that it can launch the + * archiver if necessary. + */ + if (IsUnderPostmaster) + SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED); + + /* + * Allow read-only connections immediately if we're consistent already. + */ + CheckRecoveryConsistency(); + + /* + * Find the first record that logically follows the checkpoint --- it + * might physically precede it, though. + */ + if (RedoStartLSN < CheckPointLoc) + { + /* back up to find the record */ + replayTLI = RedoStartTLI; + XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN); + record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI); + } + else + { + /* just have to read next record after CheckPoint */ + Assert(xlogreader->ReadRecPtr == CheckPointLoc); + replayTLI = CheckPointTLI; + record = ReadRecord(xlogprefetcher, LOG, false, replayTLI); + } + + if (record != NULL) + { + TimestampTz xtime; + PGRUsage ru0; + + pg_rusage_init(&ru0); + + InRedo = true; + + RmgrStartup(); + + ereport(LOG, + (errmsg("redo starts at %X/%X", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)))); + + /* Prepare to report progress of the redo phase. 
*/ + if (!StandbyMode) + begin_startup_progress_phase(); + + /* + * main redo apply loop + */ + do + { + if (!StandbyMode) + ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)); + +#ifdef WAL_DEBUG + if (XLOG_DEBUG || + (record->xl_rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) || + (record->xl_rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3)) + { + StringInfoData buf; + + initStringInfo(&buf); + appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), + LSN_FORMAT_ARGS(xlogreader->EndRecPtr)); + xlog_outrec(&buf, xlogreader); + appendStringInfoString(&buf, " - "); + xlog_outdesc(&buf, xlogreader); + elog(LOG, "%s", buf.data); + pfree(buf.data); + } +#endif + + /* Handle interrupt signals of startup process */ + HandleStartupProcInterrupts(); + + /* + * Pause WAL replay, if requested by a hot-standby session via + * SetRecoveryPause(). + * + * Note that we intentionally don't take the info_lck spinlock + * here. We might therefore read a slightly stale value of the + * recoveryPause flag, but it can't be very stale (no worse than + * the last spinlock we did acquire). Since a pause request is a + * pretty asynchronous thing anyway, possibly responding to it one + * WAL record later than we otherwise would is a minor issue, so + * it doesn't seem worth adding another spinlock cycle to prevent + * that. + */ + if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState != + RECOVERY_NOT_PAUSED) + recoveryPausesHere(false); + + /* + * Have we reached our recovery target? + */ + if (recoveryStopsBefore(xlogreader)) + { + reachedRecoveryTarget = true; + break; + } + + /* + * If we've been asked to lag the primary, wait on latch until + * enough time has passed. + */ + if (recoveryApplyDelay(xlogreader)) + { + /* + * We test for paused recovery again here. If user sets + * delayed apply, it may be because they expect to pause + * recovery in case of problems, so we must test again here + * otherwise pausing during the delay-wait wouldn't work. + */ + if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState != + RECOVERY_NOT_PAUSED) + recoveryPausesHere(false); + } + + /* + * Apply the record + */ + ApplyWalRecord(xlogreader, record, &replayTLI); + + /* Exit loop if we reached inclusive recovery target */ + if (recoveryStopsAfter(xlogreader)) + { + reachedRecoveryTarget = true; + break; + } + + /* Else, try to fetch the next WAL record */ + record = ReadRecord(xlogprefetcher, LOG, false, replayTLI); + } while (record != NULL); + + /* + * end of main redo apply loop + */ + + if (reachedRecoveryTarget) + { + if (!reachedConsistency) + ereport(FATAL, + (errmsg("requested recovery stop point is before consistent recovery point"))); + + /* + * This is the last point where we can restart recovery with a new + * recovery target, if we shutdown and begin again. After this, + * Resource Managers may choose to do permanent corrective actions + * at end of recovery. + */ + switch (recoveryTargetAction) + { + case RECOVERY_TARGET_ACTION_SHUTDOWN: + + /* + * exit with special return code to request shutdown of + * postmaster. Log messages issued from postmaster. 
+ */ + proc_exit(3); + + case RECOVERY_TARGET_ACTION_PAUSE: + SetRecoveryPause(true); + recoveryPausesHere(true); + + /* drop into promote */ + + case RECOVERY_TARGET_ACTION_PROMOTE: + break; + } + } + + RmgrCleanup(); + + ereport(LOG, + (errmsg("redo done at %X/%X system usage: %s", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), + pg_rusage_show(&ru0)))); + xtime = GetLatestXTime(); + if (xtime) + ereport(LOG, + (errmsg("last completed transaction was at log time %s", + timestamptz_to_str(xtime)))); + + InRedo = false; + } + else + { + /* there are no WAL records following the checkpoint */ + ereport(LOG, + (errmsg("redo is not required"))); + } + + /* + * This check is intentionally after the above log messages that indicate + * how far recovery went. + */ + if (ArchiveRecoveryRequested && + recoveryTarget != RECOVERY_TARGET_UNSET && + !reachedRecoveryTarget) + ereport(FATAL, + (errmsg("recovery ended before configured recovery target was reached"))); +} + +/* + * Subroutine of PerformWalRecovery, to apply one WAL record. + */ +static void +ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI) +{ + ErrorContextCallback errcallback; + bool switchedTLI = false; + + /* Setup error traceback support for ereport() */ + errcallback.callback = rm_redo_error_callback; + errcallback.arg = (void *) xlogreader; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* + * ShmemVariableCache->nextXid must be beyond record's xid. + */ + AdvanceNextFullTransactionIdPastXid(record->xl_xid); + + /* + * Before replaying this record, check if this record causes the current + * timeline to change. The record is already considered to be part of the + * new timeline, so we update replayTLI before replaying it. That's + * important so that replayEndTLI, which is recorded as the minimum + * recovery point's TLI if recovery stops after this record, is set + * correctly. + */ + if (record->xl_rmid == RM_XLOG_ID) + { + TimeLineID newReplayTLI = *replayTLI; + TimeLineID prevReplayTLI = *replayTLI; + uint8 info = record->xl_info & ~XLR_INFO_MASK; + + if (info == XLOG_CHECKPOINT_SHUTDOWN) + { + CheckPoint checkPoint; + + memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); + newReplayTLI = checkPoint.ThisTimeLineID; + prevReplayTLI = checkPoint.PrevTimeLineID; + } + else if (info == XLOG_END_OF_RECOVERY) + { + xl_end_of_recovery xlrec; + + memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery)); + newReplayTLI = xlrec.ThisTimeLineID; + prevReplayTLI = xlrec.PrevTimeLineID; + } + + if (newReplayTLI != *replayTLI) + { + /* Check that it's OK to switch to this TLI */ + checkTimeLineSwitch(xlogreader->EndRecPtr, + newReplayTLI, prevReplayTLI, *replayTLI); + + /* Following WAL records should be run with new TLI */ + *replayTLI = newReplayTLI; + switchedTLI = true; + } + } + + /* + * Update shared replayEndRecPtr before replaying this record, so that + * XLogFlush will update minRecoveryPoint correctly. 
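+	 *
+	 * (During recovery, XLogFlush() does not flush WAL; it advances
+	 * minRecoveryPoint instead, based on the shared replay position, so
+	 * that position must already cover this record.  See
+	 * UpdateMinRecoveryPoint() in xlog.c.)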
+ */ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr; + XLogRecoveryCtl->replayEndTLI = *replayTLI; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + /* + * If we are attempting to enter Hot Standby mode, process XIDs we see + */ + if (standbyState >= STANDBY_INITIALIZED && + TransactionIdIsValid(record->xl_xid)) + RecordKnownAssignedTransactionIds(record->xl_xid); + + /* + * Some XLOG record types that are related to recovery are processed + * directly here, rather than in xlog_redo() + */ + if (record->xl_rmid == RM_XLOG_ID) + xlogrecovery_redo(xlogreader, *replayTLI); + + /* Now apply the WAL record itself */ + GetRmgr(record->xl_rmid).rm_redo(xlogreader); + + /* + * After redo, check whether the backup pages associated with the WAL + * record are consistent with the existing pages. This check is done only + * if consistency check is enabled for this record. + */ + if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0) + verifyBackupPageConsistency(xlogreader); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + + /* + * Update lastReplayedEndRecPtr after this record has been successfully + * replayed. + */ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr; + XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr; + XLogRecoveryCtl->lastReplayedTLI = *replayTLI; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + /* + * If rm_redo called XLogRequestWalReceiverReply, then we wake up the + * receiver so that it notices the updated lastReplayedEndRecPtr and sends + * a reply to the primary. + */ + if (doRequestWalReceiverReply) + { + doRequestWalReceiverReply = false; + WalRcvForceReply(); + } + + /* Allow read-only connections if we're consistent now */ + CheckRecoveryConsistency(); + + /* Is this a timeline switch? */ + if (switchedTLI) + { + /* + * Before we continue on the new timeline, clean up any (possibly + * bogus) future WAL segments on the old timeline. + */ + RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI); + + /* + * Wake up any walsenders to notice that we are on a new timeline. + */ + if (AllowCascadeReplication()) + WalSndWakeup(); + + /* Reset the prefetcher. */ + XLogPrefetchReconfigure(); + } +} + +/* + * Some XLOG RM record types that are directly related to WAL recovery are + * handled here rather than in the xlog_redo() + */ +static void +xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + XLogRecPtr lsn = record->EndRecPtr; + + Assert(XLogRecGetRmid(record) == RM_XLOG_ID); + + if (info == XLOG_OVERWRITE_CONTRECORD) + { + /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. 
*/ + xl_overwrite_contrecord xlrec; + + memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord)); + if (xlrec.overwritten_lsn != record->overwrittenRecPtr) + elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X", + LSN_FORMAT_ARGS(xlrec.overwritten_lsn), + LSN_FORMAT_ARGS(record->overwrittenRecPtr)); + + /* We have safely skipped the aborted record */ + abortedRecPtr = InvalidXLogRecPtr; + missingContrecPtr = InvalidXLogRecPtr; + + ereport(LOG, + (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s", + LSN_FORMAT_ARGS(xlrec.overwritten_lsn), + timestamptz_to_str(xlrec.overwrite_time)))); + + /* Verifying the record should only happen once */ + record->overwrittenRecPtr = InvalidXLogRecPtr; + } + else if (info == XLOG_BACKUP_END) + { + XLogRecPtr startpoint; + + memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint)); + + if (backupStartPoint == startpoint) + { + /* + * We have reached the end of base backup, the point where + * pg_backup_stop() was done. The data on disk is now consistent + * (assuming we have also reached minRecoveryPoint). Set + * backupEndPoint to the current LSN, so that the next call to + * CheckRecoveryConsistency() will notice it and do the + * end-of-backup processing. + */ + elog(DEBUG1, "end of backup record reached"); + + backupEndPoint = lsn; + } + else + elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X", + LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint)); + } +} + +/* + * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real + * directories. + * + * Replay of database creation XLOG records for databases that were later + * dropped can create fake directories in pg_tblspc. By the time consistency + * is reached these directories should have been removed; here we verify + * that this did indeed happen. This is to be called at the point where + * consistent state is reached. + * + * allow_in_place_tablespaces turns the PANIC into a WARNING, which is + * useful for testing purposes, and also allows for an escape hatch in case + * things go south. + */ +static void +CheckTablespaceDirectory(void) +{ + DIR *dir; + struct dirent *de; + + dir = AllocateDir("pg_tblspc"); + while ((de = ReadDir(dir, "pg_tblspc")) != NULL) + { + char path[MAXPGPATH + 10]; + + /* Skip entries of non-oid names */ + if (strspn(de->d_name, "0123456789") != strlen(de->d_name)) + continue; + + snprintf(path, sizeof(path), "pg_tblspc/%s", de->d_name); + + if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK) + ereport(allow_in_place_tablespaces ? WARNING : PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("unexpected directory entry \"%s\" found in %s", + de->d_name, "pg_tblspc/"), + errdetail("All directory entries in pg_tblspc/ should be symbolic links."), + errhint("Remove those directories, or set allow_in_place_tablespaces to ON transiently to let recovery complete."))); + } +} + +/* + * Checks if recovery has reached a consistent state. When consistency is + * reached and we have a valid starting standby snapshot, tell postmaster + * that it can start accepting read-only connections. + */ +static void +CheckRecoveryConsistency(void) +{ + XLogRecPtr lastReplayedEndRecPtr; + TimeLineID lastReplayedTLI; + + /* + * During crash recovery, we don't reach a consistent state until we've + * replayed all the WAL. 
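+	 * In that case InitWalRecovery() left the local minRecoveryPoint set
+	 * to InvalidXLogRecPtr, so the check below returns immediately.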
+ */ + if (XLogRecPtrIsInvalid(minRecoveryPoint)) + return; + + Assert(InArchiveRecovery); + + /* + * assume that we are called in the startup process, and hence don't need + * a lock to read lastReplayedEndRecPtr + */ + lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr; + lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI; + + /* + * Have we reached the point where our base backup was completed? + */ + if (!XLogRecPtrIsInvalid(backupEndPoint) && + backupEndPoint <= lastReplayedEndRecPtr) + { + elog(DEBUG1, "end of backup reached"); + + /* + * We have reached the end of base backup, as indicated by pg_control. + * Update the control file accordingly. + */ + ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI); + backupStartPoint = InvalidXLogRecPtr; + backupEndPoint = InvalidXLogRecPtr; + backupEndRequired = false; + } + + /* + * Have we passed our safe starting point? Note that minRecoveryPoint is + * known to be incorrectly set if recovering from a backup, until the + * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint. + * All we know prior to that is that we're not consistent yet. + */ + if (!reachedConsistency && !backupEndRequired && + minRecoveryPoint <= lastReplayedEndRecPtr) + { + /* + * Check to see if the XLOG sequence contained any unresolved + * references to uninitialized pages. + */ + XLogCheckInvalidPages(); + + /* + * Check that pg_tblspc doesn't contain any real directories. Replay + * of Database/CREATE_* records may have created ficticious tablespace + * directories that should have been removed by the time consistency + * was reached. + */ + CheckTablespaceDirectory(); + + reachedConsistency = true; + ereport(LOG, + (errmsg("consistent recovery state reached at %X/%X", + LSN_FORMAT_ARGS(lastReplayedEndRecPtr)))); + } + + /* + * Have we got a valid starting snapshot that will allow queries to be + * run? If so, we can tell postmaster that the database is consistent now, + * enabling connections. + */ + if (standbyState == STANDBY_SNAPSHOT_READY && + !LocalHotStandbyActive && + reachedConsistency && + IsUnderPostmaster) + { + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + XLogRecoveryCtl->SharedHotStandbyActive = true; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + LocalHotStandbyActive = true; + + SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY); + } +} + +/* + * Error context callback for errors occurring during rm_redo(). + */ +static void +rm_redo_error_callback(void *arg) +{ + XLogReaderState *record = (XLogReaderState *) arg; + StringInfoData buf; + + initStringInfo(&buf); + xlog_outdesc(&buf, record); + xlog_block_info(&buf, record); + + /* translator: %s is a WAL record description */ + errcontext("WAL redo at %X/%X for %s", + LSN_FORMAT_ARGS(record->ReadRecPtr), + buf.data); + + pfree(buf.data); +} + +/* + * Returns a string describing an XLogRecord, consisting of its identity + * optionally followed by a colon, a space, and a further description. 
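+ *
+ * (Illustrative output, with hypothetical values: "Heap/INSERT: off 3",
+ * i.e. the resource manager name, a slash, the record identity, then the
+ * rmgr-specific description.)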
+ */ +void +xlog_outdesc(StringInfo buf, XLogReaderState *record) +{ + RmgrData rmgr = GetRmgr(XLogRecGetRmid(record)); + uint8 info = XLogRecGetInfo(record); + const char *id; + + appendStringInfoString(buf, rmgr.rm_name); + appendStringInfoChar(buf, '/'); + + id = rmgr.rm_identify(info); + if (id == NULL) + appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK); + else + appendStringInfo(buf, "%s: ", id); + + rmgr.rm_desc(buf, record); +} + +#ifdef WAL_DEBUG + +static void +xlog_outrec(StringInfo buf, XLogReaderState *record) +{ + appendStringInfo(buf, "prev %X/%X; xid %u", + LSN_FORMAT_ARGS(XLogRecGetPrev(record)), + XLogRecGetXid(record)); + + appendStringInfo(buf, "; len %u", + XLogRecGetDataLen(record)); + + xlog_block_info(buf, record); +} +#endif /* WAL_DEBUG */ + +/* + * Returns a string giving information about all the blocks in an + * XLogRecord. + */ +static void +xlog_block_info(StringInfo buf, XLogReaderState *record) +{ + int block_id; + + /* decode block references */ + for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + { + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blk; + + if (!XLogRecGetBlockTagExtended(record, block_id, + &rnode, &forknum, &blk, NULL)) + continue; + + if (forknum != MAIN_FORKNUM) + appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u", + block_id, + rnode.spcNode, rnode.dbNode, rnode.relNode, + forknum, + blk); + else + appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u", + block_id, + rnode.spcNode, rnode.dbNode, rnode.relNode, + blk); + if (XLogRecHasBlockImage(record, block_id)) + appendStringInfoString(buf, " FPW"); + } +} + + +/* + * Check that it's OK to switch to new timeline during recovery. + * + * 'lsn' is the address of the shutdown checkpoint record we're about to + * replay. (Currently, timeline can only change at a shutdown checkpoint). + */ +static void +checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, + TimeLineID replayTLI) +{ + /* Check that the record agrees on what the current (old) timeline is */ + if (prevTLI != replayTLI) + ereport(PANIC, + (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record", + prevTLI, replayTLI))); + + /* + * The new timeline better be in the list of timelines we expect to see, + * according to the timeline history. It should also not decrease. + */ + if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs)) + ereport(PANIC, + (errmsg("unexpected timeline ID %u (after %u) in checkpoint record", + newTLI, replayTLI))); + + /* + * If we have not yet reached min recovery point, and we're about to + * switch to a timeline greater than the timeline of the min recovery + * point: trouble. After switching to the new timeline, we could not + * possibly visit the min recovery point on the correct timeline anymore. + * This can happen if there is a newer timeline in the archive that + * branched before the timeline the min recovery point is on, and you + * attempt to do PITR to the new timeline. + */ + if (!XLogRecPtrIsInvalid(minRecoveryPoint) && + lsn < minRecoveryPoint && + newTLI > minRecoveryPointTLI) + ereport(PANIC, + (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u", + newTLI, + LSN_FORMAT_ARGS(minRecoveryPoint), + minRecoveryPointTLI))); + + /* Looks good */ +} + + +/* + * Extract timestamp from WAL record. + * + * If the record contains a timestamp, returns true, and saves the timestamp + * in *recordXtime. 
If the record type has no timestamp, returns false. + * Currently, only transaction commit/abort records and restore points contain + * timestamps. + */ +static bool +getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + uint8 xact_info = info & XLOG_XACT_OPMASK; + uint8 rmid = XLogRecGetRmid(record); + + if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT) + { + *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time; + return true; + } + if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT || + xact_info == XLOG_XACT_COMMIT_PREPARED)) + { + *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time; + return true; + } + if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT || + xact_info == XLOG_XACT_ABORT_PREPARED)) + { + *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time; + return true; + } + return false; +} + +/* + * Checks whether the current buffer page and backup page stored in the + * WAL record are consistent or not. Before comparing the two pages, a + * masking can be applied to the pages to ignore certain areas like hint bits, + * unused space between pd_lower and pd_upper among other things. This + * function should be called once WAL replay has been completed for a + * given record. + */ +static void +verifyBackupPageConsistency(XLogReaderState *record) +{ + RmgrData rmgr = GetRmgr(XLogRecGetRmid(record)); + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + int block_id; + + /* Records with no backup blocks have no need for consistency checks. */ + if (!XLogRecHasAnyBlockRefs(record)) + return; + + Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0); + + for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + { + Buffer buf; + Page page; + + if (!XLogRecGetBlockTagExtended(record, block_id, + &rnode, &forknum, &blkno, NULL)) + { + /* + * WAL record doesn't contain a block reference with the given id. + * Do nothing. + */ + continue; + } + + Assert(XLogRecHasBlockImage(record, block_id)); + + if (XLogRecBlockImageApply(record, block_id)) + { + /* + * WAL record has already applied the page, so bypass the + * consistency check as that would result in comparing the full + * page stored in the record with itself. + */ + continue; + } + + /* + * Read the contents from the current buffer and store it in a + * temporary page. + */ + buf = XLogReadBufferExtended(rnode, forknum, blkno, + RBM_NORMAL_NO_LOG, + InvalidBuffer); + if (!BufferIsValid(buf)) + continue; + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buf); + + /* + * Take a copy of the local page where WAL has been applied to have a + * comparison base before masking it... + */ + memcpy(replay_image_masked, page, BLCKSZ); + + /* No need for this page anymore now that a copy is in. */ + UnlockReleaseBuffer(buf); + + /* + * If the block LSN is already ahead of this WAL record, we can't + * expect contents to match. This can happen if recovery is + * restarted. + */ + if (PageGetLSN(replay_image_masked) > record->EndRecPtr) + continue; + + /* + * Read the contents from the backup copy, stored in WAL record and + * store it in a temporary page. There is no need to allocate a new + * page here, a local buffer is fine to hold its contents and a mask + * can be directly applied on it. 
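+ *
+ * Masking is only done for resource managers that provide an rm_mask
+ * callback in their RmgrData.  A minimal sketch of such a callback,
+ * assuming the helpers from bufmask.h (this is not any rmgr's actual
+ * routine):
+ *
+ *     static void
+ *     example_mask(char *pagedata, BlockNumber blkno)
+ *     {
+ *         mask_page_lsn_and_checksum((Page) pagedata);
+ *         mask_unused_space((Page) pagedata);
+ *     }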
+ */ + if (!RestoreBlockImage(record, block_id, primary_image_masked)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg_internal("%s", record->errormsg_buf))); + + /* + * If masking function is defined, mask both the primary and replay + * images + */ + if (rmgr.rm_mask != NULL) + { + rmgr.rm_mask(replay_image_masked, blkno); + rmgr.rm_mask(primary_image_masked, blkno); + } + + /* Time to compare the primary and replay images. */ + if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0) + { + elog(FATAL, + "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u", + rnode.spcNode, rnode.dbNode, rnode.relNode, + forknum, blkno); + } + } +} + +/* + * For point-in-time recovery, this function decides whether we want to + * stop applying the XLOG before the current record. + * + * Returns true if we are stopping, false otherwise. If stopping, some + * information is saved in recoveryStopXid et al for use in annotating the + * new timeline's history file. + */ +static bool +recoveryStopsBefore(XLogReaderState *record) +{ + bool stopsHere = false; + uint8 xact_info; + bool isCommit; + TimestampTz recordXtime = 0; + TransactionId recordXid; + + /* + * Ignore recovery target settings when not in archive recovery (meaning + * we are in crash recovery). + */ + if (!ArchiveRecoveryRequested) + return false; + + /* Check if we should stop as soon as reaching consistency */ + if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency) + { + ereport(LOG, + (errmsg("recovery stopping after reaching consistency"))); + + recoveryStopAfter = false; + recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = InvalidXLogRecPtr; + recoveryStopTime = 0; + recoveryStopName[0] = '\0'; + return true; + } + + /* Check if target LSN has been reached */ + if (recoveryTarget == RECOVERY_TARGET_LSN && + !recoveryTargetInclusive && + record->ReadRecPtr >= recoveryTargetLSN) + { + recoveryStopAfter = false; + recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = record->ReadRecPtr; + recoveryStopTime = 0; + recoveryStopName[0] = '\0'; + ereport(LOG, + (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"", + LSN_FORMAT_ARGS(recoveryStopLSN)))); + return true; + } + + /* Otherwise we only consider stopping before COMMIT or ABORT records. */ + if (XLogRecGetRmid(record) != RM_XACT_ID) + return false; + + xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; + + if (xact_info == XLOG_XACT_COMMIT) + { + isCommit = true; + recordXid = XLogRecGetXid(record); + } + else if (xact_info == XLOG_XACT_COMMIT_PREPARED) + { + xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); + xl_xact_parsed_commit parsed; + + isCommit = true; + ParseCommitRecord(XLogRecGetInfo(record), + xlrec, + &parsed); + recordXid = parsed.twophase_xid; + } + else if (xact_info == XLOG_XACT_ABORT) + { + isCommit = false; + recordXid = XLogRecGetXid(record); + } + else if (xact_info == XLOG_XACT_ABORT_PREPARED) + { + xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); + xl_xact_parsed_abort parsed; + + isCommit = false; + ParseAbortRecord(XLogRecGetInfo(record), + xlrec, + &parsed); + recordXid = parsed.twophase_xid; + } + else + return false; + + if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive) + { + /* + * There can be only one transaction end record with this exact + * transactionid + * + * when testing for an xid, we MUST test for equality only, since + * transactions are numbered in the order they start, not the order + * they complete. 
A higher numbered xid will complete before you about + * 50% of the time... + */ + stopsHere = (recordXid == recoveryTargetXid); + } + + /* + * Note: we must fetch recordXtime regardless of recoveryTarget setting. + * We don't expect getRecordTimestamp ever to fail, since we already know + * this is a commit or abort record; but test its result anyway. + */ + if (getRecordTimestamp(record, &recordXtime) && + recoveryTarget == RECOVERY_TARGET_TIME) + { + /* + * There can be many transactions that share the same commit time, so + * we stop after the last one, if we are inclusive, or stop at the + * first one if we are exclusive + */ + if (recoveryTargetInclusive) + stopsHere = (recordXtime > recoveryTargetTime); + else + stopsHere = (recordXtime >= recoveryTargetTime); + } + + if (stopsHere) + { + recoveryStopAfter = false; + recoveryStopXid = recordXid; + recoveryStopTime = recordXtime; + recoveryStopLSN = InvalidXLogRecPtr; + recoveryStopName[0] = '\0'; + + if (isCommit) + { + ereport(LOG, + (errmsg("recovery stopping before commit of transaction %u, time %s", + recoveryStopXid, + timestamptz_to_str(recoveryStopTime)))); + } + else + { + ereport(LOG, + (errmsg("recovery stopping before abort of transaction %u, time %s", + recoveryStopXid, + timestamptz_to_str(recoveryStopTime)))); + } + } + + return stopsHere; +} + +/* + * Same as recoveryStopsBefore, but called after applying the record. + * + * We also track the timestamp of the latest applied COMMIT/ABORT + * record in XLogRecoveryCtl->recoveryLastXTime. + */ +static bool +recoveryStopsAfter(XLogReaderState *record) +{ + uint8 info; + uint8 xact_info; + uint8 rmid; + TimestampTz recordXtime = 0; + + /* + * Ignore recovery target settings when not in archive recovery (meaning + * we are in crash recovery). + */ + if (!ArchiveRecoveryRequested) + return false; + + info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + rmid = XLogRecGetRmid(record); + + /* + * There can be many restore points that share the same name; we stop at + * the first one. 
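+ *
+ * Restore points are created on the primary with the SQL function
+ * pg_create_restore_point(), which reduces to roughly this (simplified
+ * sketch; the name is made up):
+ *
+ *     XLogRecPtr rp_lsn = XLogRestorePoint("before_schema_change");
+ *
+ * Recovery is then aimed at that record by setting recovery_target_name.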
+ */ + if (recoveryTarget == RECOVERY_TARGET_NAME && + rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT) + { + xl_restore_point *recordRestorePointData; + + recordRestorePointData = (xl_restore_point *) XLogRecGetData(record); + + if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0) + { + recoveryStopAfter = true; + recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = InvalidXLogRecPtr; + (void) getRecordTimestamp(record, &recoveryStopTime); + strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN); + + ereport(LOG, + (errmsg("recovery stopping at restore point \"%s\", time %s", + recoveryStopName, + timestamptz_to_str(recoveryStopTime)))); + return true; + } + } + + /* Check if the target LSN has been reached */ + if (recoveryTarget == RECOVERY_TARGET_LSN && + recoveryTargetInclusive && + record->ReadRecPtr >= recoveryTargetLSN) + { + recoveryStopAfter = true; + recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = record->ReadRecPtr; + recoveryStopTime = 0; + recoveryStopName[0] = '\0'; + ereport(LOG, + (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"", + LSN_FORMAT_ARGS(recoveryStopLSN)))); + return true; + } + + if (rmid != RM_XACT_ID) + return false; + + xact_info = info & XLOG_XACT_OPMASK; + + if (xact_info == XLOG_XACT_COMMIT || + xact_info == XLOG_XACT_COMMIT_PREPARED || + xact_info == XLOG_XACT_ABORT || + xact_info == XLOG_XACT_ABORT_PREPARED) + { + TransactionId recordXid; + + /* Update the last applied transaction timestamp */ + if (getRecordTimestamp(record, &recordXtime)) + SetLatestXTime(recordXtime); + + /* Extract the XID of the committed/aborted transaction */ + if (xact_info == XLOG_XACT_COMMIT_PREPARED) + { + xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); + xl_xact_parsed_commit parsed; + + ParseCommitRecord(XLogRecGetInfo(record), + xlrec, + &parsed); + recordXid = parsed.twophase_xid; + } + else if (xact_info == XLOG_XACT_ABORT_PREPARED) + { + xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); + xl_xact_parsed_abort parsed; + + ParseAbortRecord(XLogRecGetInfo(record), + xlrec, + &parsed); + recordXid = parsed.twophase_xid; + } + else + recordXid = XLogRecGetXid(record); + + /* + * There can be only one transaction end record with this exact + * transactionid + * + * when testing for an xid, we MUST test for equality only, since + * transactions are numbered in the order they start, not the order + * they complete. A higher numbered xid will complete before you about + * 50% of the time... 
+ */ + if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive && + recordXid == recoveryTargetXid) + { + recoveryStopAfter = true; + recoveryStopXid = recordXid; + recoveryStopTime = recordXtime; + recoveryStopLSN = InvalidXLogRecPtr; + recoveryStopName[0] = '\0'; + + if (xact_info == XLOG_XACT_COMMIT || + xact_info == XLOG_XACT_COMMIT_PREPARED) + { + ereport(LOG, + (errmsg("recovery stopping after commit of transaction %u, time %s", + recoveryStopXid, + timestamptz_to_str(recoveryStopTime)))); + } + else if (xact_info == XLOG_XACT_ABORT || + xact_info == XLOG_XACT_ABORT_PREPARED) + { + ereport(LOG, + (errmsg("recovery stopping after abort of transaction %u, time %s", + recoveryStopXid, + timestamptz_to_str(recoveryStopTime)))); + } + return true; + } + } + + /* Check if we should stop as soon as reaching consistency */ + if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency) + { + ereport(LOG, + (errmsg("recovery stopping after reaching consistency"))); + + recoveryStopAfter = true; + recoveryStopXid = InvalidTransactionId; + recoveryStopTime = 0; + recoveryStopLSN = InvalidXLogRecPtr; + recoveryStopName[0] = '\0'; + return true; + } + + return false; +} + +/* + * Create a comment for the history file to explain why and where + * timeline changed. + */ +static char * +getRecoveryStopReason(void) +{ + char reason[200]; + + if (recoveryTarget == RECOVERY_TARGET_XID) + snprintf(reason, sizeof(reason), + "%s transaction %u", + recoveryStopAfter ? "after" : "before", + recoveryStopXid); + else if (recoveryTarget == RECOVERY_TARGET_TIME) + snprintf(reason, sizeof(reason), + "%s %s\n", + recoveryStopAfter ? "after" : "before", + timestamptz_to_str(recoveryStopTime)); + else if (recoveryTarget == RECOVERY_TARGET_LSN) + snprintf(reason, sizeof(reason), + "%s LSN %X/%X\n", + recoveryStopAfter ? "after" : "before", + LSN_FORMAT_ARGS(recoveryStopLSN)); + else if (recoveryTarget == RECOVERY_TARGET_NAME) + snprintf(reason, sizeof(reason), + "at restore point \"%s\"", + recoveryStopName); + else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) + snprintf(reason, sizeof(reason), "reached consistency"); + else + snprintf(reason, sizeof(reason), "no recovery target specified"); + + return pstrdup(reason); +} + +/* + * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED. + * + * endOfRecovery is true if the recovery target is reached and + * the paused state starts at the end of recovery because of + * recovery_target_action=pause, and false otherwise. + */ +static void +recoveryPausesHere(bool endOfRecovery) +{ + /* Don't pause unless users can connect! */ + if (!LocalHotStandbyActive) + return; + + /* Don't pause after standby promotion has been triggered */ + if (LocalPromoteIsTriggered) + return; + + if (endOfRecovery) + ereport(LOG, + (errmsg("pausing at the end of recovery"), + errhint("Execute pg_wal_replay_resume() to promote."))); + else + ereport(LOG, + (errmsg("recovery has paused"), + errhint("Execute pg_wal_replay_resume() to continue."))); + + /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */ + while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED) + { + HandleStartupProcInterrupts(); + if (CheckForStandbyTrigger()) + return; + + /* + * If recovery pause is requested then set it paused. While we are in + * the loop, user might resume and pause again so set this every time. 
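+ *
+ * The pause request itself comes from SetRecoveryPause(), which is what
+ * the SQL functions pg_wal_replay_pause() and pg_wal_replay_resume()
+ * call.  A sketch of the state transitions (not literal code from any one
+ * call site):
+ *
+ *     SetRecoveryPause(true);    -> RECOVERY_PAUSE_REQUESTED
+ *     ConfirmRecoveryPaused();   -> RECOVERY_PAUSED   (startup process)
+ *     SetRecoveryPause(false);   -> RECOVERY_NOT_PAUSED, CV is broadcast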
+ */ + ConfirmRecoveryPaused(); + + /* + * We wait on a condition variable that will wake us as soon as the + * pause ends, but we use a timeout so we can check the above exit + * condition periodically too. + */ + ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000, + WAIT_EVENT_RECOVERY_PAUSE); + } + ConditionVariableCancelSleep(); +} + +/* + * When recovery_min_apply_delay is set, we wait long enough to make sure + * certain record types are applied at least that interval behind the primary. + * + * Returns true if we waited. + * + * Note that the delay is calculated between the WAL record log time and + * the current time on standby. We would prefer to keep track of when this + * standby received each WAL record, which would allow a more consistent + * approach and one not affected by time synchronisation issues, but that + * is significantly more effort and complexity for little actual gain in + * usability. + */ +static bool +recoveryApplyDelay(XLogReaderState *record) +{ + uint8 xact_info; + TimestampTz xtime; + TimestampTz delayUntil; + long msecs; + + /* nothing to do if no delay configured */ + if (recovery_min_apply_delay <= 0) + return false; + + /* no delay is applied on a database not yet consistent */ + if (!reachedConsistency) + return false; + + /* nothing to do if crash recovery is requested */ + if (!ArchiveRecoveryRequested) + return false; + + /* + * Is it a COMMIT record? + * + * We deliberately choose not to delay aborts since they have no effect on + * MVCC. We already allow replay of records that don't have a timestamp, + * so there is already opportunity for issues caused by early conflicts on + * standbys. + */ + if (XLogRecGetRmid(record) != RM_XACT_ID) + return false; + + xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; + + if (xact_info != XLOG_XACT_COMMIT && + xact_info != XLOG_XACT_COMMIT_PREPARED) + return false; + + if (!getRecordTimestamp(record, &xtime)) + return false; + + delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay); + + /* + * Exit without arming the latch if it's already past time to apply this + * record + */ + msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil); + if (msecs <= 0) + return false; + + while (true) + { + ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch); + + /* + * This might change recovery_min_apply_delay or the trigger file's + * location. + */ + HandleStartupProcInterrupts(); + + if (CheckForStandbyTrigger()) + break; + + /* + * Recalculate delayUntil as recovery_min_apply_delay could have + * changed while waiting in this loop. + */ + delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay); + + /* + * Wait for difference between GetCurrentTimestamp() and delayUntil. + */ + msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), + delayUntil); + + if (msecs <= 0) + break; + + elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs); + + (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + msecs, + WAIT_EVENT_RECOVERY_APPLY_DELAY); + } + return true; +} + +/* + * Get the current state of the recovery pause request. + */ +RecoveryPauseState +GetRecoveryPauseState(void) +{ + RecoveryPauseState state; + + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + state = XLogRecoveryCtl->recoveryPauseState; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + return state; +} + +/* + * Set the recovery pause state. 
+ * + * If recovery pause is requested then sets the recovery pause state to + * 'pause requested' if it is not already 'paused'. Otherwise, sets it + * to 'not paused' to resume the recovery. The recovery pause will be + * confirmed by the ConfirmRecoveryPaused. + */ +void +SetRecoveryPause(bool recoveryPause) +{ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + + if (!recoveryPause) + XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED; + else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED) + XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED; + + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + if (!recoveryPause) + ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV); +} + +/* + * Confirm the recovery pause by setting the recovery pause state to + * RECOVERY_PAUSED. + */ +static void +ConfirmRecoveryPaused(void) +{ + /* If recovery pause is requested then set it paused */ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED) + XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED; + SpinLockRelease(&XLogRecoveryCtl->info_lck); +} + + +/* + * Attempt to read the next XLOG record. + * + * Before first call, the reader needs to be positioned to the first record + * by calling XLogPrefetcherBeginRead(). + * + * If no valid record is available, returns NULL, or fails if emode is PANIC. + * (emode must be either PANIC, LOG). In standby mode, retries until a valid + * record is available. + */ +static XLogRecord * +ReadRecord(XLogPrefetcher *xlogprefetcher, int emode, + bool fetching_ckpt, TimeLineID replayTLI) +{ + XLogRecord *record; + XLogReaderState *xlogreader = XLogPrefetcherGetReader(xlogprefetcher); + XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data; + + /* Pass through parameters to XLogPageRead */ + private->fetching_ckpt = fetching_ckpt; + private->emode = emode; + private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr); + private->replayTLI = replayTLI; + + /* This is the first attempt to read this page. */ + lastSourceFailed = false; + + for (;;) + { + char *errormsg; + + record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg); + if (record == NULL) + { + /* + * When we find that WAL ends in an incomplete record, keep track + * of that record. After recovery is done, we'll write a record to + * indicate to downstream WAL readers that that portion is to be + * ignored. + * + * However, when ArchiveRecoveryRequested = true, we're going to + * switch to a new timeline at the end of recovery. We will only + * copy WAL over to the new timeline up to the end of the last + * complete record, so if we did this, we would later create an + * overwrite contrecord in the wrong place, breaking everything. + */ + if (!ArchiveRecoveryRequested && + !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr)) + { + abortedRecPtr = xlogreader->abortedRecPtr; + missingContrecPtr = xlogreader->missingContrecPtr; + } + + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + + /* + * We only end up here without a message when XLogPageRead() + * failed - in that case we already logged something. In + * StandbyMode that only happens if we have been triggered, so we + * shouldn't loop anymore in that case. + */ + if (errormsg) + ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr), + (errmsg_internal("%s", errormsg) /* already translated */ )); + } + + /* + * Check page TLI is one of the expected values. 
+ */ + else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs)) + { + char fname[MAXFNAMELEN]; + XLogSegNo segno; + int32 offset; + + XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size); + offset = XLogSegmentOffset(xlogreader->latestPagePtr, + wal_segment_size); + XLogFileName(fname, xlogreader->seg.ws_tli, segno, + wal_segment_size); + ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr), + (errmsg("unexpected timeline ID %u in log segment %s, offset %u", + xlogreader->latestPageTLI, + fname, + offset))); + record = NULL; + } + + if (record) + { + /* Great, got a record */ + return record; + } + else + { + /* No valid record available from this source */ + lastSourceFailed = true; + + /* + * If archive recovery was requested, but we were still doing + * crash recovery, switch to archive recovery and retry using the + * offline archive. We have now replayed all the valid WAL in + * pg_wal, so we are presumably now consistent. + * + * We require that there's at least some valid WAL present in + * pg_wal, however (!fetching_ckpt). We could recover using the + * WAL from the archive, even if pg_wal is completely empty, but + * we'd have no idea how far we'd have to replay to reach + * consistency. So err on the safe side and give up. + */ + if (!InArchiveRecovery && ArchiveRecoveryRequested && + !fetching_ckpt) + { + ereport(DEBUG1, + (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery"))); + InArchiveRecovery = true; + if (StandbyModeRequested) + EnableStandbyMode(); + + SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI); + minRecoveryPoint = xlogreader->EndRecPtr; + minRecoveryPointTLI = replayTLI; + + CheckRecoveryConsistency(); + + /* + * Before we retry, reset lastSourceFailed and currentSource + * so that we will check the archive next. + */ + lastSourceFailed = false; + currentSource = XLOG_FROM_ANY; + + continue; + } + + /* In standby mode, loop back to retry. Otherwise, give up. */ + if (StandbyMode && !CheckForStandbyTrigger()) + continue; + else + return NULL; + } + } +} + +/* + * Read the XLOG page containing RecPtr into readBuf (if not read already). + * Returns number of bytes read, if the page is read successfully, or + * XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed, but + * only if they have not been previously reported. + * + * While prefetching, xlogreader->nonblocking may be set. In that case, + * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL. + * + * This is responsible for restoring files from archive as needed, as well + * as for waiting for the requested WAL record to arrive in standby mode. + * + * 'emode' specifies the log level used for reporting "file not found" or + * "end of WAL" situations in archive recovery, or in standby mode when a + * trigger file is found. If set to WARNING or below, XLogPageRead() returns + * XLREAD_FAIL in those situations, on higher log levels the ereport() won't + * return. + * + * In standby mode, if after a successful return of XLogPageRead() the + * caller finds the record it's interested in to be broken, it should + * ereport the error with the level determined by + * emode_for_corrupt_record(), and then set lastSourceFailed + * and call XLogPageRead() again with the same arguments. This lets + * XLogPageRead() to try fetching the record from another source, or to + * sleep and retry. 
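+ *
+ * XLogPageRead() is not called directly; it is installed as the
+ * xlogreader's page_read callback.  A sketch of that wiring (the exact
+ * arguments here are illustrative, not copied from the initialization
+ * code):
+ *
+ *     xlogreader = XLogReaderAllocate(wal_segment_size, NULL,
+ *                                     XL_ROUTINE(.page_read = &XLogPageRead,
+ *                                                .segment_open = NULL,
+ *                                                .segment_close = wal_segment_close),
+ *                                     private_data);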
+ */ +static int +XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, + XLogRecPtr targetRecPtr, char *readBuf) +{ + XLogPageReadPrivate *private = + (XLogPageReadPrivate *) xlogreader->private_data; + int emode = private->emode; + uint32 targetPageOff; + XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY; + int r; + + XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size); + targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size); + + /* + * See if we need to switch to a new segment because the requested record + * is not in the currently open one. + */ + if (readFile >= 0 && + !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size)) + { + /* + * Request a restartpoint if we've replayed too much xlog since the + * last one. + */ + if (ArchiveRecoveryRequested && IsUnderPostmaster) + { + if (XLogCheckpointNeeded(readSegNo)) + { + (void) GetRedoRecPtr(); + if (XLogCheckpointNeeded(readSegNo)) + RequestCheckpoint(CHECKPOINT_CAUSE_XLOG); + } + } + + close(readFile); + readFile = -1; + readSource = XLOG_FROM_ANY; + } + + XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size); + +retry: + /* See if we need to retrieve more data */ + if (readFile < 0 || + (readSource == XLOG_FROM_STREAM && + flushedUpto < targetPagePtr + reqLen)) + { + if (readFile >= 0 && + xlogreader->nonblocking && + readSource == XLOG_FROM_STREAM && + flushedUpto < targetPagePtr + reqLen) + return XLREAD_WOULDBLOCK; + + switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen, + private->randAccess, + private->fetching_ckpt, + targetRecPtr, + private->replayTLI, + xlogreader->EndRecPtr, + xlogreader->nonblocking)) + { + case XLREAD_WOULDBLOCK: + return XLREAD_WOULDBLOCK; + case XLREAD_FAIL: + if (readFile >= 0) + close(readFile); + readFile = -1; + readLen = 0; + readSource = XLOG_FROM_ANY; + return XLREAD_FAIL; + case XLREAD_SUCCESS: + break; + } + } + + /* + * At this point, we have the right segment open and if we're streaming we + * know the requested record is in it. + */ + Assert(readFile != -1); + + /* + * If the current segment is being streamed from the primary, calculate + * how much of the current page we have received already. We know the + * requested record has been received, but this is for the benefit of + * future calls, to allow quick exit at the top of this function. 
+ */ + if (readSource == XLOG_FROM_STREAM) + { + if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ)) + readLen = XLOG_BLCKSZ; + else + readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) - + targetPageOff; + } + else + readLen = XLOG_BLCKSZ; + + /* Read the requested page */ + readOff = targetPageOff; + + pgstat_report_wait_start(WAIT_EVENT_WAL_READ); + r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff); + if (r != XLOG_BLCKSZ) + { + char fname[MAXFNAMELEN]; + int save_errno = errno; + + pgstat_report_wait_end(); + XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size); + if (r < 0) + { + errno = save_errno; + ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), + (errcode_for_file_access(), + errmsg("could not read from log segment %s, offset %u: %m", + fname, readOff))); + } + else + ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read from log segment %s, offset %u: read %d of %zu", + fname, readOff, r, (Size) XLOG_BLCKSZ))); + goto next_record_is_invalid; + } + pgstat_report_wait_end(); + + Assert(targetSegNo == readSegNo); + Assert(targetPageOff == readOff); + Assert(reqLen <= readLen); + + xlogreader->seg.ws_tli = curFileTLI; + + /* + * Check the page header immediately, so that we can retry immediately if + * it's not valid. This may seem unnecessary, because ReadPageInternal() + * validates the page header anyway, and would propagate the failure up to + * ReadRecord(), which would retry. However, there's a corner case with + * continuation records, if a record is split across two pages such that + * we would need to read the two pages from different sources. For + * example, imagine a scenario where a streaming replica is started up, + * and replay reaches a record that's split across two WAL segments. The + * first page is only available locally, in pg_wal, because it's already + * been recycled on the primary. The second page, however, is not present + * in pg_wal, and we should stream it from the primary. There is a + * recycled WAL segment present in pg_wal, with garbage contents, however. + * We would read the first page from the local WAL segment, but when + * reading the second page, we would read the bogus, recycled, WAL + * segment. If we didn't catch that case here, we would never recover, + * because ReadRecord() would retry reading the whole record from the + * beginning. + * + * Of course, this only catches errors in the page header, which is what + * happens in the case of a recycled WAL segment. Other kinds of errors or + * corruption still has the same problem. But this at least fixes the + * common case, which can happen as part of normal operation. + * + * Validating the page header is cheap enough that doing it twice + * shouldn't be a big deal from a performance point of view. + * + * When not in standby mode, an invalid page header should cause recovery + * to end, not retry reading the page, so we don't need to validate the + * page header here for the retry. Instead, ReadPageInternal() is + * responsible for the validation. + */ + if (StandbyMode && + !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf)) + { + /* + * Emit this error right now then retry this page immediately. Use + * errmsg_internal() because the message was already translated. 
+ */ + if (xlogreader->errormsg_buf[0]) + ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr), + (errmsg_internal("%s", xlogreader->errormsg_buf))); + + /* reset any error XLogReaderValidatePageHeader() might have set */ + XLogReaderResetError(xlogreader); + goto next_record_is_invalid; + } + + return readLen; + +next_record_is_invalid: + + /* + * If we're reading ahead, give up fast. Retries and error reporting will + * be handled by a later read when recovery catches up to this point. + */ + if (xlogreader->nonblocking) + return XLREAD_WOULDBLOCK; + + lastSourceFailed = true; + + if (readFile >= 0) + close(readFile); + readFile = -1; + readLen = 0; + readSource = XLOG_FROM_ANY; + + /* In standby-mode, keep trying */ + if (StandbyMode) + goto retry; + else + return XLREAD_FAIL; +} + +/* + * Open the WAL segment containing WAL location 'RecPtr'. + * + * The segment can be fetched via restore_command, or via walreceiver having + * streamed the record, or it can already be present in pg_wal. Checking + * pg_wal is mainly for crash recovery, but it will be polled in standby mode + * too, in case someone copies a new segment directly to pg_wal. That is not + * documented or recommended, though. + * + * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should + * prepare to read WAL starting from RedoStartLSN after this. + * + * 'RecPtr' might not point to the beginning of the record we're interested + * in, it might also point to the page or segment header. In that case, + * 'tliRecPtr' is the position of the WAL record we're interested in. It is + * used to decide which timeline to stream the requested WAL from. + * + * 'replayLSN' is the current replay LSN, so that if we scan for new + * timelines, we can reject a switch to a timeline that branched off before + * this point. + * + * If the record is not immediately available, the function returns false + * if we're not in standby mode. In standby mode, waits for it to become + * available. + * + * When the requested record becomes available, the function opens the file + * containing it (if not open already), and returns XLREAD_SUCCESS. When end + * of standby mode is triggered by the user, and there is no more WAL + * available, returns XLREAD_FAIL. + * + * If nonblocking is true, then give up immediately if we can't satisfy the + * request, returning XLREAD_WOULDBLOCK instead of waiting. + */ +static XLogPageReadResult +WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, + bool fetching_ckpt, XLogRecPtr tliRecPtr, + TimeLineID replayTLI, XLogRecPtr replayLSN, + bool nonblocking) +{ + static TimestampTz last_fail_time = 0; + TimestampTz now; + bool streaming_reply_sent = false; + + /*------- + * Standby mode is implemented by a state machine: + * + * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just + * pg_wal (XLOG_FROM_PG_WAL) + * 2. Check trigger file + * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM) + * 4. Rescan timelines + * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1. + * + * Failure to read from the current source advances the state machine to + * the next state. + * + * 'currentSource' indicates the current state. There are no currentSource + * values for "check trigger", "rescan timelines", and "sleep" states, + * those actions are taken when reading from the previous source fails, as + * part of advancing to the next state. 
+ * + * If standby mode is turned off while reading WAL from stream, we move + * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching + * the files (which would be required at end of recovery, e.g., timeline + * history file) from archive or pg_wal. We don't need to kill WAL receiver + * here because it's already stopped when standby mode is turned off at + * the end of recovery. + *------- + */ + if (!InArchiveRecovery) + currentSource = XLOG_FROM_PG_WAL; + else if (currentSource == XLOG_FROM_ANY || + (!StandbyMode && currentSource == XLOG_FROM_STREAM)) + { + lastSourceFailed = false; + currentSource = XLOG_FROM_ARCHIVE; + } + + for (;;) + { + XLogSource oldSource = currentSource; + bool startWalReceiver = false; + + /* + * First check if we failed to read from the current source, and + * advance the state machine if so. The failure to read might've + * happened outside this function, e.g when a CRC check fails on a + * record, or within this loop. + */ + if (lastSourceFailed) + { + /* + * Don't allow any retry loops to occur during nonblocking + * readahead. Let the caller process everything that has been + * decoded already first. + */ + if (nonblocking) + return XLREAD_WOULDBLOCK; + + switch (currentSource) + { + case XLOG_FROM_ARCHIVE: + case XLOG_FROM_PG_WAL: + + /* + * Check to see if the trigger file exists. Note that we + * do this only after failure, so when you create the + * trigger file, we still finish replaying as much as we + * can from archive and pg_wal before failover. + */ + if (StandbyMode && CheckForStandbyTrigger()) + { + XLogShutdownWalRcv(); + return XLREAD_FAIL; + } + + /* + * Not in standby mode, and we've now tried the archive + * and pg_wal. + */ + if (!StandbyMode) + return XLREAD_FAIL; + + /* + * Move to XLOG_FROM_STREAM state, and set to start a + * walreceiver if necessary. + */ + currentSource = XLOG_FROM_STREAM; + startWalReceiver = true; + break; + + case XLOG_FROM_STREAM: + + /* + * Failure while streaming. Most likely, we got here + * because streaming replication was terminated, or + * promotion was triggered. But we also get here if we + * find an invalid record in the WAL streamed from the + * primary, in which case something is seriously wrong. + * There's little chance that the problem will just go + * away, but PANIC is not good for availability either, + * especially in hot standby mode. So, we treat that the + * same as disconnection, and retry from archive/pg_wal + * again. The WAL in the archive should be identical to + * what was streamed, so it's unlikely that it helps, but + * one can hope... + */ + + /* + * We should be able to move to XLOG_FROM_STREAM only in + * standby mode. + */ + Assert(StandbyMode); + + /* + * Before we leave XLOG_FROM_STREAM state, make sure that + * walreceiver is not active, so that it won't overwrite + * WAL that we restore from archive. + */ + XLogShutdownWalRcv(); + + /* + * Before we sleep, re-scan for possible new timelines if + * we were requested to recover to the latest timeline. + */ + if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST) + { + if (rescanLatestTimeLine(replayTLI, replayLSN)) + { + currentSource = XLOG_FROM_ARCHIVE; + break; + } + } + + /* + * XLOG_FROM_STREAM is the last state in our state + * machine, so we've exhausted all the options for + * obtaining the requested WAL. We're going to loop back + * and retry from the archive, but if it hasn't been long + * since last attempt, sleep wal_retrieve_retry_interval + * milliseconds to avoid busy-waiting. 
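+ *
+ * Worked example with illustrative numbers: if
+ * wal_retrieve_retry_interval is 5000 ms and 1200 ms have already
+ * passed since last_fail_time, the latch wait below uses
+ *
+ *     wait_time = 5000 - 1200 = 3800 ms.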
+ */ + now = GetCurrentTimestamp(); + if (!TimestampDifferenceExceeds(last_fail_time, now, + wal_retrieve_retry_interval)) + { + long wait_time; + + wait_time = wal_retrieve_retry_interval - + TimestampDifferenceMilliseconds(last_fail_time, now); + + elog(LOG, "waiting for WAL to become available at %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + + /* Do background tasks that might benefit us later. */ + KnownAssignedTransactionIdsIdleMaintenance(); + + (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch, + WL_LATCH_SET | WL_TIMEOUT | + WL_EXIT_ON_PM_DEATH, + wait_time, + WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL); + ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch); + now = GetCurrentTimestamp(); + + /* Handle interrupt signals of startup process */ + HandleStartupProcInterrupts(); + } + last_fail_time = now; + currentSource = XLOG_FROM_ARCHIVE; + break; + + default: + elog(ERROR, "unexpected WAL source %d", currentSource); + } + } + else if (currentSource == XLOG_FROM_PG_WAL) + { + /* + * We just successfully read a file in pg_wal. We prefer files in + * the archive over ones in pg_wal, so try the next file again + * from the archive first. + */ + if (InArchiveRecovery) + currentSource = XLOG_FROM_ARCHIVE; + } + + if (currentSource != oldSource) + elog(DEBUG2, "switched WAL source from %s to %s after %s", + xlogSourceNames[oldSource], xlogSourceNames[currentSource], + lastSourceFailed ? "failure" : "success"); + + /* + * We've now handled possible failure. Try to read from the chosen + * source. + */ + lastSourceFailed = false; + + switch (currentSource) + { + case XLOG_FROM_ARCHIVE: + case XLOG_FROM_PG_WAL: + + /* + * WAL receiver must not be running when reading WAL from + * archive or pg_wal. + */ + Assert(!WalRcvStreaming()); + + /* Close any old file we might have open. */ + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + /* Reset curFileTLI if random fetch. */ + if (randAccess) + curFileTLI = 0; + + /* + * Try to restore the file from archive, or read an existing + * file from pg_wal. + */ + readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, + currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY : + currentSource); + if (readFile >= 0) + return XLREAD_SUCCESS; /* success! */ + + /* + * Nope, not found in archive or pg_wal. + */ + lastSourceFailed = true; + break; + + case XLOG_FROM_STREAM: + { + bool havedata; + + /* + * We should be able to move to XLOG_FROM_STREAM only in + * standby mode. + */ + Assert(StandbyMode); + + /* + * First, shutdown walreceiver if its restart has been + * requested -- but no point if we're already slated for + * starting it. + */ + if (pendingWalRcvRestart && !startWalReceiver) + { + XLogShutdownWalRcv(); + + /* + * Re-scan for possible new timelines if we were + * requested to recover to the latest timeline. + */ + if (recoveryTargetTimeLineGoal == + RECOVERY_TARGET_TIMELINE_LATEST) + rescanLatestTimeLine(replayTLI, replayLSN); + + startWalReceiver = true; + } + pendingWalRcvRestart = false; + + /* + * Launch walreceiver if needed. + * + * If fetching_ckpt is true, RecPtr points to the initial + * checkpoint location. In that case, we use RedoStartLSN + * as the streaming start position instead of RecPtr, so + * that when we later jump backwards to start redo at + * RedoStartLSN, we will have the logs streamed already. 
+ */ + if (startWalReceiver && + PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0) + { + XLogRecPtr ptr; + TimeLineID tli; + + if (fetching_ckpt) + { + ptr = RedoStartLSN; + tli = RedoStartTLI; + } + else + { + ptr = RecPtr; + + /* + * Use the record begin position to determine the + * TLI, rather than the position we're reading. + */ + tli = tliOfPointInHistory(tliRecPtr, expectedTLEs); + + if (curFileTLI > 0 && tli < curFileTLI) + elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u", + LSN_FORMAT_ARGS(tliRecPtr), + tli, curFileTLI); + } + curFileTLI = tli; + SetInstallXLogFileSegmentActive(); + RequestXLogStreaming(tli, ptr, PrimaryConnInfo, + PrimarySlotName, + wal_receiver_create_temp_slot); + flushedUpto = 0; + } + + /* + * Check if WAL receiver is active or wait to start up. + */ + if (!WalRcvStreaming()) + { + lastSourceFailed = true; + break; + } + + /* + * Walreceiver is active, so see if new data has arrived. + * + * We only advance XLogReceiptTime when we obtain fresh + * WAL from walreceiver and observe that we had already + * processed everything before the most recent "chunk" + * that it flushed to disk. In steady state where we are + * keeping up with the incoming data, XLogReceiptTime will + * be updated on each cycle. When we are behind, + * XLogReceiptTime will not advance, so the grace time + * allotted to conflicting queries will decrease. + */ + if (RecPtr < flushedUpto) + havedata = true; + else + { + XLogRecPtr latestChunkStart; + + flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI); + if (RecPtr < flushedUpto && receiveTLI == curFileTLI) + { + havedata = true; + if (latestChunkStart <= RecPtr) + { + XLogReceiptTime = GetCurrentTimestamp(); + SetCurrentChunkStartTime(XLogReceiptTime); + } + } + else + havedata = false; + } + if (havedata) + { + /* + * Great, streamed far enough. Open the file if it's + * not open already. Also read the timeline history + * file if we haven't initialized timeline history + * yet; it should be streamed over and present in + * pg_wal by now. Use XLOG_FROM_STREAM so that source + * info is set correctly and XLogReceiptTime isn't + * changed. + * + * NB: We must set readTimeLineHistory based on + * recoveryTargetTLI, not receiveTLI. Normally they'll + * be the same, but if recovery_target_timeline is + * 'latest' and archiving is configured, then it's + * possible that we managed to retrieve one or more + * new timeline history files from the archive, + * updating recoveryTargetTLI. + */ + if (readFile < 0) + { + if (!expectedTLEs) + expectedTLEs = readTimeLineHistory(recoveryTargetTLI); + readFile = XLogFileRead(readSegNo, PANIC, + receiveTLI, + XLOG_FROM_STREAM, false); + Assert(readFile >= 0); + } + else + { + /* just make sure source info is correct... */ + readSource = XLOG_FROM_STREAM; + XLogReceiptSource = XLOG_FROM_STREAM; + return XLREAD_SUCCESS; + } + break; + } + + /* In nonblocking mode, return rather than sleeping. */ + if (nonblocking) + return XLREAD_WOULDBLOCK; + + /* + * Data not here yet. Check for trigger, then wait for + * walreceiver to wake us up when new WAL arrives. + */ + if (CheckForStandbyTrigger()) + { + /* + * Note that we don't return XLREAD_FAIL immediately + * here. After being triggered, we still want to + * replay all the WAL that was already streamed. 
It's + * in pg_wal now, so we just treat this as a failure, + * and the state machine will move on to replay the + * streamed WAL from pg_wal, and then recheck the + * trigger and exit replay. + */ + lastSourceFailed = true; + break; + } + + /* + * Since we have replayed everything we have received so + * far and are about to start waiting for more WAL, let's + * tell the upstream server our replay location now so + * that pg_stat_replication doesn't show stale + * information. + */ + if (!streaming_reply_sent) + { + WalRcvForceReply(); + streaming_reply_sent = true; + } + + /* Do any background tasks that might benefit us later. */ + KnownAssignedTransactionIdsIdleMaintenance(); + + /* Update pg_stat_recovery_prefetch before sleeping. */ + XLogPrefetcherComputeStats(xlogprefetcher); + + /* + * Wait for more WAL to arrive. Time out after 5 seconds + * to react to a trigger file promptly and to check if the + * WAL receiver is still active. + */ + (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch, + WL_LATCH_SET | WL_TIMEOUT | + WL_EXIT_ON_PM_DEATH, + 5000L, WAIT_EVENT_RECOVERY_WAL_STREAM); + ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch); + break; + } + + default: + elog(ERROR, "unexpected WAL source %d", currentSource); + } + + /* + * Check for recovery pause here so that we can confirm more quickly + * that a requested pause has actually taken effect. + */ + if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState != + RECOVERY_NOT_PAUSED) + recoveryPausesHere(false); + + /* + * This possibly-long loop needs to handle interrupts of startup + * process. + */ + HandleStartupProcInterrupts(); + } + + return XLREAD_FAIL; /* not reached */ +} + + +/* + * Determine what log level should be used to report a corrupt WAL record + * in the current WAL page, previously read by XLogPageRead(). + * + * 'emode' is the error mode that would be used to report a file-not-found + * or legitimate end-of-WAL situation. Generally, we use it as-is, but if + * we're retrying the exact same record that we've tried previously, only + * complain the first time to keep the noise down. However, we only do when + * reading from pg_wal, because we don't expect any invalid records in archive + * or in records streamed from the primary. Files in the archive should be complete, + * and we should never hit the end of WAL because we stop and wait for more WAL + * to arrive before replaying it. + * + * NOTE: This function remembers the RecPtr value it was last called with, + * to suppress repeated messages about the same record. Only call this when + * you are about to ereport(), or you might cause a later message to be + * erroneously suppressed. + */ +static int +emode_for_corrupt_record(int emode, XLogRecPtr RecPtr) +{ + static XLogRecPtr lastComplaint = 0; + + if (readSource == XLOG_FROM_PG_WAL && emode == LOG) + { + if (RecPtr == lastComplaint) + emode = DEBUG1; + else + lastComplaint = RecPtr; + } + return emode; +} + + +/* + * Subroutine to try to fetch and validate a prior checkpoint record. + * + * whichChkpt identifies the checkpoint (merely for reporting purposes). 
+ * 1 for "primary", 0 for "other" (backup_label) + */ +static XLogRecord * +ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr, + int whichChkpt, bool report, TimeLineID replayTLI) +{ + XLogRecord *record; + uint8 info; + + Assert(xlogreader != NULL); + + if (!XRecOffIsValid(RecPtr)) + { + if (!report) + return NULL; + + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid primary checkpoint link in control file"))); + break; + default: + ereport(LOG, + (errmsg("invalid checkpoint link in backup_label file"))); + break; + } + return NULL; + } + + XLogPrefetcherBeginRead(xlogprefetcher, RecPtr); + record = ReadRecord(xlogprefetcher, LOG, true, replayTLI); + + if (record == NULL) + { + if (!report) + return NULL; + + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid checkpoint record"))); + break; + } + return NULL; + } + if (record->xl_rmid != RM_XLOG_ID) + { + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid resource manager ID in primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid resource manager ID in checkpoint record"))); + break; + } + return NULL; + } + info = record->xl_info & ~XLR_INFO_MASK; + if (info != XLOG_CHECKPOINT_SHUTDOWN && + info != XLOG_CHECKPOINT_ONLINE) + { + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid xl_info in primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid xl_info in checkpoint record"))); + break; + } + return NULL; + } + if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint)) + { + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid length of primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid length of checkpoint record"))); + break; + } + return NULL; + } + return record; +} + +/* + * Scan for new timelines that might have appeared in the archive since we + * started recovery. + * + * If there are any, the function changes recovery target TLI to the latest + * one and returns 'true'. + */ +static bool +rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN) +{ + List *newExpectedTLEs; + bool found; + ListCell *cell; + TimeLineID newtarget; + TimeLineID oldtarget = recoveryTargetTLI; + TimeLineHistoryEntry *currentTle = NULL; + + newtarget = findNewestTimeLine(recoveryTargetTLI); + if (newtarget == recoveryTargetTLI) + { + /* No new timelines found */ + return false; + } + + /* + * Determine the list of expected TLIs for the new TLI + */ + + newExpectedTLEs = readTimeLineHistory(newtarget); + + /* + * If the current timeline is not part of the history of the new timeline, + * we cannot proceed to it. + */ + found = false; + foreach(cell, newExpectedTLEs) + { + currentTle = (TimeLineHistoryEntry *) lfirst(cell); + + if (currentTle->tli == recoveryTargetTLI) + { + found = true; + break; + } + } + if (!found) + { + ereport(LOG, + (errmsg("new timeline %u is not a child of database system timeline %u", + newtarget, + replayTLI))); + return false; + } + + /* + * The current timeline was found in the history file, but check that the + * next timeline was forked off from it *after* the current recovery + * location. 
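+ *
+ * Worked example (made-up LSNs): if the history file says the current
+ * timeline ends at 0/5000000, i.e. the new timeline forked off there,
+ * but replay has already progressed to 0/6000000, then
+ * currentTle->end < replayLSN and the switch is refused below.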
+ */ + if (currentTle->end < replayLSN) + { + ereport(LOG, + (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X", + newtarget, + replayTLI, + LSN_FORMAT_ARGS(replayLSN)))); + return false; + } + + /* The new timeline history seems valid. Switch target */ + recoveryTargetTLI = newtarget; + list_free_deep(expectedTLEs); + expectedTLEs = newExpectedTLEs; + + /* + * As in StartupXLOG(), try to ensure we have all the history files + * between the old target and new target in pg_wal. + */ + restoreTimeLineHistoryFiles(oldtarget + 1, newtarget); + + ereport(LOG, + (errmsg("new target timeline is %u", + recoveryTargetTLI))); + + return true; +} + + +/* + * Open a logfile segment for reading (during recovery). + * + * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive. + * Otherwise, it's assumed to be already available in pg_wal. + */ +static int +XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, + XLogSource source, bool notfoundOk) +{ + char xlogfname[MAXFNAMELEN]; + char activitymsg[MAXFNAMELEN + 16]; + char path[MAXPGPATH]; + int fd; + + XLogFileName(xlogfname, tli, segno, wal_segment_size); + + switch (source) + { + case XLOG_FROM_ARCHIVE: + /* Report recovery progress in PS display */ + snprintf(activitymsg, sizeof(activitymsg), "waiting for %s", + xlogfname); + set_ps_display(activitymsg); + + if (!RestoreArchivedFile(path, xlogfname, + "RECOVERYXLOG", + wal_segment_size, + InRedo)) + return -1; + break; + + case XLOG_FROM_PG_WAL: + case XLOG_FROM_STREAM: + XLogFilePath(path, tli, segno, wal_segment_size); + break; + + default: + elog(ERROR, "invalid XLogFileRead source %d", source); + } + + /* + * If the segment was fetched from archival storage, replace the existing + * xlog segment (if any) with the archival version. + */ + if (source == XLOG_FROM_ARCHIVE) + { + Assert(!IsInstallXLogFileSegmentActive()); + KeepFileRestoredFromArchive(path, xlogfname); + + /* + * Set path to point at the new file in pg_wal. + */ + snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname); + } + + fd = BasicOpenFile(path, O_RDONLY | PG_BINARY); + if (fd >= 0) + { + /* Success! */ + curFileTLI = tli; + + /* Report recovery progress in PS display */ + snprintf(activitymsg, sizeof(activitymsg), "recovering %s", + xlogfname); + set_ps_display(activitymsg); + + /* Track source of data in assorted state variables */ + readSource = source; + XLogReceiptSource = source; + /* In FROM_STREAM case, caller tracks receipt time, not me */ + if (source != XLOG_FROM_STREAM) + XLogReceiptTime = GetCurrentTimestamp(); + + return fd; + } + if (errno != ENOENT || !notfoundOk) /* unexpected failure? */ + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + return -1; +} + +/* + * Open a logfile segment for reading (during recovery). + * + * This version searches for the segment with any TLI listed in expectedTLEs. + */ +static int +XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source) +{ + char path[MAXPGPATH]; + ListCell *cell; + int fd; + List *tles; + + /* + * Loop looking for a suitable timeline ID: we might need to read any of + * the timelines listed in expectedTLEs. + * + * We expect curFileTLI on entry to be the TLI of the preceding file in + * sequence, or 0 if there was no predecessor. We do not allow curFileTLI + * to go backwards; this prevents us from picking up the wrong file when a + * parent timeline extends to higher segment numbers than the child we + * want to read. 
+ * + * If we haven't read the timeline history file yet, read it now, so that + * we know which TLIs to scan. We don't save the list in expectedTLEs, + * however, unless we actually find a valid segment. That way if there is + * neither a timeline history file nor a WAL segment in the archive, and + * streaming replication is set up, we'll read the timeline history file + * streamed from the primary when we start streaming, instead of + * recovering with a dummy history generated here. + */ + if (expectedTLEs) + tles = expectedTLEs; + else + tles = readTimeLineHistory(recoveryTargetTLI); + + foreach(cell, tles) + { + TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell); + TimeLineID tli = hent->tli; + + if (tli < curFileTLI) + break; /* don't bother looking at too-old TLIs */ + + /* + * Skip scanning the timeline ID that the logfile segment to read + * doesn't belong to + */ + if (hent->begin != InvalidXLogRecPtr) + { + XLogSegNo beginseg = 0; + + XLByteToSeg(hent->begin, beginseg, wal_segment_size); + + /* + * The logfile segment that doesn't belong to the timeline is + * older or newer than the segment that the timeline started or + * ended at, respectively. It's sufficient to check only the + * starting segment of the timeline here. Since the timelines are + * scanned in descending order in this loop, any segments newer + * than the ending segment should belong to newer timeline and + * have already been read before. So it's not necessary to check + * the ending segment of the timeline here. + */ + if (segno < beginseg) + continue; + } + + if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE) + { + fd = XLogFileRead(segno, emode, tli, + XLOG_FROM_ARCHIVE, true); + if (fd != -1) + { + elog(DEBUG1, "got WAL segment from archive"); + if (!expectedTLEs) + expectedTLEs = tles; + return fd; + } + } + + if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL) + { + fd = XLogFileRead(segno, emode, tli, + XLOG_FROM_PG_WAL, true); + if (fd != -1) + { + if (!expectedTLEs) + expectedTLEs = tles; + return fd; + } + } + } + + /* Couldn't find it. For simplicity, complain about front timeline */ + XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size); + errno = ENOENT; + ereport(emode, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + return -1; +} + +/* + * Set flag to signal the walreceiver to restart. (The startup process calls + * this on noticing a relevant configuration change.) + */ +void +StartupRequestWalReceiverRestart(void) +{ + if (currentSource == XLOG_FROM_STREAM && WalRcvRunning()) + { + ereport(LOG, + (errmsg("WAL receiver process shutdown requested"))); + + pendingWalRcvRestart = true; + } +} + + +/* + * Has a standby promotion already been triggered? + * + * Unlike CheckForStandbyTrigger(), this works in any process + * that's connected to shared memory. + */ +bool +PromoteIsTriggered(void) +{ + /* + * We check shared state each time only until a standby promotion is + * triggered. We can't trigger a promotion again, so there's no need to + * keep checking after the shared variable has once been seen true. 
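+ *
+ * For context: a promotion request normally arrives via "pg_ctl promote"
+ * or the SQL function pg_promote(); both create PROMOTE_SIGNAL_FILE and
+ * wake the startup process, after which CheckForStandbyTrigger() below
+ * calls SetPromoteIsTriggered().  A hypothetical caller might do no more
+ * than:
+ *
+ *     if (PromoteIsTriggered())
+ *         return;    /* promotion under way, skip standby-only work */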
+ */ + if (LocalPromoteIsTriggered) + return true; + + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + return LocalPromoteIsTriggered; +} + +static void +SetPromoteIsTriggered(void) +{ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + XLogRecoveryCtl->SharedPromoteIsTriggered = true; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + /* + * Mark the recovery pause state as 'not paused' because the paused state + * ends and promotion continues if a promotion is triggered while recovery + * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly + * return 'paused' while a promotion is ongoing. + */ + SetRecoveryPause(false); + + LocalPromoteIsTriggered = true; +} + +/* + * Check to see whether the user-specified trigger file exists and whether a + * promote request has arrived. If either condition holds, return true. + */ +static bool +CheckForStandbyTrigger(void) +{ + struct stat stat_buf; + + if (LocalPromoteIsTriggered) + return true; + + if (IsPromoteSignaled() && CheckPromoteSignal()) + { + ereport(LOG, (errmsg("received promote request"))); + RemovePromoteSignalFiles(); + ResetPromoteSignaled(); + SetPromoteIsTriggered(); + return true; + } + + if (PromoteTriggerFile == NULL || strcmp(PromoteTriggerFile, "") == 0) + return false; + + if (stat(PromoteTriggerFile, &stat_buf) == 0) + { + ereport(LOG, + (errmsg("promote trigger file found: %s", PromoteTriggerFile))); + unlink(PromoteTriggerFile); + SetPromoteIsTriggered(); + return true; + } + else if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat promote trigger file \"%s\": %m", + PromoteTriggerFile))); + + return false; +} + +/* + * Remove the files signaling a standby promotion request. + */ +void +RemovePromoteSignalFiles(void) +{ + unlink(PROMOTE_SIGNAL_FILE); +} + +/* + * Check to see if a promote request has arrived. + */ +bool +CheckPromoteSignal(void) +{ + struct stat stat_buf; + + if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0) + return true; + + return false; +} + +/* + * Wake up startup process to replay newly arrived WAL, or to notice that + * failover has been requested. + */ +void +WakeupRecovery(void) +{ + SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch); +} + +/* + * Schedule a walreceiver wakeup in the main recovery loop. + */ +void +XLogRequestWalReceiverReply(void) +{ + doRequestWalReceiverReply = true; +} + +/* + * Is HotStandby active yet? This is only important in special backends + * since normal backends won't ever be able to connect until this returns + * true. Postmaster knows this by way of signal, not via shared memory. + * + * Unlike testing standbyState, this works in any process that's connected to + * shared memory. (And note that standbyState alone doesn't tell the truth + * anyway.) + */ +bool +HotStandbyActive(void) +{ + /* + * We check shared state each time only until Hot Standby is active. We + * can't de-activate Hot Standby, so there's no need to keep checking + * after the shared variable has once been seen true. + */ + if (LocalHotStandbyActive) + return true; + else + { + /* spinlock is essential on machines with weak memory ordering! 
*/ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + return LocalHotStandbyActive; + } +} + +/* + * Like HotStandbyActive(), but to be used only in WAL replay code, + * where we don't need to ask any other process what the state is. + */ +static bool +HotStandbyActiveInReplay(void) +{ + Assert(AmStartupProcess() || !IsPostmasterEnvironment); + return LocalHotStandbyActive; +} + +/* + * Get latest redo apply position. + * + * Exported to allow WALReceiver to read the pointer directly. + */ +XLogRecPtr +GetXLogReplayRecPtr(TimeLineID *replayTLI) +{ + XLogRecPtr recptr; + TimeLineID tli; + + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + recptr = XLogRecoveryCtl->lastReplayedEndRecPtr; + tli = XLogRecoveryCtl->lastReplayedTLI; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + if (replayTLI) + *replayTLI = tli; + return recptr; +} + + +/* + * Get position of last applied, or the record being applied. + * + * This is different from GetXLogReplayRecPtr() in that if a WAL + * record is currently being applied, this includes that record. + */ +XLogRecPtr +GetCurrentReplayRecPtr(TimeLineID *replayEndTLI) +{ + XLogRecPtr recptr; + TimeLineID tli; + + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + recptr = XLogRecoveryCtl->replayEndRecPtr; + tli = XLogRecoveryCtl->replayEndTLI; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + if (replayEndTLI) + *replayEndTLI = tli; + return recptr; +} + +/* + * Save timestamp of latest processed commit/abort record. + * + * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be + * seen by processes other than the startup process. Note in particular + * that CreateRestartPoint is executed in the checkpointer. + */ +static void +SetLatestXTime(TimestampTz xtime) +{ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + XLogRecoveryCtl->recoveryLastXTime = xtime; + SpinLockRelease(&XLogRecoveryCtl->info_lck); +} + +/* + * Fetch timestamp of latest processed commit/abort record. + */ +TimestampTz +GetLatestXTime(void) +{ + TimestampTz xtime; + + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + xtime = XLogRecoveryCtl->recoveryLastXTime; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + return xtime; +} + +/* + * Save timestamp of the next chunk of WAL records to apply. + * + * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be + * seen by all backends. + */ +static void +SetCurrentChunkStartTime(TimestampTz xtime) +{ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + XLogRecoveryCtl->currentChunkStartTime = xtime; + SpinLockRelease(&XLogRecoveryCtl->info_lck); +} + +/* + * Fetch timestamp of latest processed commit/abort record. + * Startup process maintains an accurate local copy in XLogReceiptTime + */ +TimestampTz +GetCurrentChunkReplayStartTime(void) +{ + TimestampTz xtime; + + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + xtime = XLogRecoveryCtl->currentChunkStartTime; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + return xtime; +} + +/* + * Returns time of receipt of current chunk of XLOG data, as well as + * whether it was received from streaming replication or from archives. + */ +void +GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream) +{ + /* + * This must be executed in the startup process, since we don't export the + * relevant state to shared memory. 
+ */ + Assert(InRecovery); + + *rtime = XLogReceiptTime; + *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM); +} + +/* + * Note that text field supplied is a parameter name and does not require + * translation + */ +void +RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue) +{ + if (currValue < minValue) + { + if (HotStandbyActiveInReplay()) + { + bool warned_for_promote = false; + + ereport(WARNING, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("hot standby is not possible because of insufficient parameter settings"), + errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", + param_name, + currValue, + minValue))); + + SetRecoveryPause(true); + + ereport(LOG, + (errmsg("recovery has paused"), + errdetail("If recovery is unpaused, the server will shut down."), + errhint("You can then restart the server after making the necessary configuration changes."))); + + while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED) + { + HandleStartupProcInterrupts(); + + if (CheckForStandbyTrigger()) + { + if (!warned_for_promote) + ereport(WARNING, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("promotion is not possible because of insufficient parameter settings"), + + /* + * Repeat the detail from above so it's easy to find + * in the log. + */ + errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", + param_name, + currValue, + minValue), + errhint("Restart the server after making the necessary configuration changes."))); + warned_for_promote = true; + } + + /* + * If recovery pause is requested then set it paused. While + * we are in the loop, user might resume and pause again so + * set this every time. + */ + ConfirmRecoveryPaused(); + + /* + * We wait on a condition variable that will wake us as soon + * as the pause ends, but we use a timeout so we can check the + * above conditions periodically too. + */ + ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000, + WAIT_EVENT_RECOVERY_PAUSE); + } + ConditionVariableCancelSleep(); + } + + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("recovery aborted because of insufficient parameter settings"), + /* Repeat the detail from above so it's easy to find in the log. */ + errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", + param_name, + currValue, + minValue), + errhint("You can restart the server after making the necessary configuration changes."))); + } +} diff --git a/src/backend/access/transam/xlogstats.c b/src/backend/access/transam/xlogstats.c new file mode 100644 index 0000000..5141817 --- /dev/null +++ b/src/backend/access/transam/xlogstats.c @@ -0,0 +1,96 @@ +/*------------------------------------------------------------------------- + * + * xlogstats.c + * Functions for WAL Statitstics + * + * Copyright (c) 2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/transam/xlogstats.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xlogreader.h" +#include "access/xlogstats.h" + +/* + * Calculate the size of a record, split into !FPI and FPI parts. + */ +void +XLogRecGetLen(XLogReaderState *record, uint32 *rec_len, + uint32 *fpi_len) +{ + int block_id; + + /* + * Calculate the amount of FPI data in the record. + * + * XXX: We peek into xlogreader's private decoded backup blocks for the + * bimg_len indicating the length of FPI data. 
+ */ + *fpi_len = 0; + for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) + { + if (!XLogRecHasBlockRef(record, block_id)) + continue; + + if (XLogRecHasBlockImage(record, block_id)) + *fpi_len += XLogRecGetBlock(record, block_id)->bimg_len; + } + + /* + * Calculate the length of the record as the total length - the length of + * all the block images. + */ + *rec_len = XLogRecGetTotalLen(record) - *fpi_len; +} + +/* + * Store per-rmgr and per-record statistics for a given record. + */ +void +XLogRecStoreStats(XLogStats *stats, XLogReaderState *record) +{ + RmgrId rmid; + uint8 recid; + uint32 rec_len; + uint32 fpi_len; + + Assert(stats != NULL && record != NULL); + + stats->count++; + + rmid = XLogRecGetRmid(record); + + XLogRecGetLen(record, &rec_len, &fpi_len); + + /* Update per-rmgr statistics */ + + stats->rmgr_stats[rmid].count++; + stats->rmgr_stats[rmid].rec_len += rec_len; + stats->rmgr_stats[rmid].fpi_len += fpi_len; + + /* + * Update per-record statistics, where the record is identified by a + * combination of the RmgrId and the four bits of the xl_info field that + * are the rmgr's domain (resulting in sixteen possible entries per + * RmgrId). + */ + + recid = XLogRecGetInfo(record) >> 4; + + /* + * XACT records need to be handled differently. Those records use the + * first bit of those four bits for an optional flag variable and the + * following three bits for the opcode. We filter opcode out of xl_info + * and use it as the identifier of the record. + */ + if (rmid == RM_XACT_ID) + recid &= 0x07; + + stats->record_stats[rmid][recid].count++; + stats->record_stats[rmid][recid].rec_len += rec_len; + stats->record_stats[rmid][recid].fpi_len += fpi_len; +} diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c new file mode 100644 index 0000000..702c8c1 --- /dev/null +++ b/src/backend/access/transam/xlogutils.c @@ -0,0 +1,1064 @@ +/*------------------------------------------------------------------------- + * + * xlogutils.c + * + * PostgreSQL write-ahead log manager utility routines + * + * This file contains support routines that are used by XLOG replay functions. + * None of this code is used during normal system operation. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/xlogutils.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <unistd.h> + +#include "access/timeline.h" +#include "access/xlogrecovery.h" +#include "access/xlog_internal.h" +#include "access/xlogprefetcher.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/fd.h" +#include "storage/smgr.h" +#include "utils/guc.h" +#include "utils/hsearch.h" +#include "utils/rel.h" + + +/* GUC variable */ +bool ignore_invalid_pages = false; + +/* + * Are we doing recovery from XLOG? + * + * This is only ever true in the startup process; it should be read as meaning + * "this process is replaying WAL records", rather than "the system is in + * recovery mode". It should be examined primarily by functions that need + * to act differently when called from a WAL redo function (e.g., to skip WAL + * logging). To check whether the system is in recovery regardless of which + * process you're running in, use RecoveryInProgress() but only after shared + * memory startup and lock initialization. 
+ * + * This is updated from xlog.c and xlogrecovery.c, but lives here because + * it's mostly read by WAL redo functions. + */ +bool InRecovery = false; + +/* Are we in Hot Standby mode? Only valid in startup process, see xlogutils.h */ +HotStandbyState standbyState = STANDBY_DISABLED; + +/* + * During XLOG replay, we may see XLOG records for incremental updates of + * pages that no longer exist, because their relation was later dropped or + * truncated. (Note: this is only possible when full_page_writes = OFF, + * since when it's ON, the first reference we see to a page should always + * be a full-page rewrite not an incremental update.) Rather than simply + * ignoring such records, we make a note of the referenced page, and then + * complain if we don't actually see a drop or truncate covering the page + * later in replay. + */ +typedef struct xl_invalid_page_key +{ + RelFileNode node; /* the relation */ + ForkNumber forkno; /* the fork number */ + BlockNumber blkno; /* the page */ +} xl_invalid_page_key; + +typedef struct xl_invalid_page +{ + xl_invalid_page_key key; /* hash key ... must be first */ + bool present; /* page existed but contained zeroes */ +} xl_invalid_page; + +static HTAB *invalid_page_tab = NULL; + +static int read_local_xlog_page_guts(XLogReaderState *state, XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, + char *cur_page, bool wait_for_wal); + +/* Report a reference to an invalid page */ +static void +report_invalid_page(int elevel, RelFileNode node, ForkNumber forkno, + BlockNumber blkno, bool present) +{ + char *path = relpathperm(node, forkno); + + if (present) + elog(elevel, "page %u of relation %s is uninitialized", + blkno, path); + else + elog(elevel, "page %u of relation %s does not exist", + blkno, path); + pfree(path); +} + +/* Log a reference to an invalid page */ +static void +log_invalid_page(RelFileNode node, ForkNumber forkno, BlockNumber blkno, + bool present) +{ + xl_invalid_page_key key; + xl_invalid_page *hentry; + bool found; + + /* + * Once recovery has reached a consistent state, the invalid-page table + * should be empty and remain so. If a reference to an invalid page is + * found after consistency is reached, PANIC immediately. This might seem + * aggressive, but it's better than letting the invalid reference linger + * in the hash table until the end of recovery and PANIC there, which + * might come only much later if this is a standby server. + */ + if (reachedConsistency) + { + report_invalid_page(WARNING, node, forkno, blkno, present); + elog(ignore_invalid_pages ? WARNING : PANIC, + "WAL contains references to invalid pages"); + } + + /* + * Log references to invalid pages at DEBUG1 level. This allows some + * tracing of the cause (note the elog context mechanism will tell us + * something about the XLOG record that generated the reference). 
+ */ + if (message_level_is_interesting(DEBUG1)) + report_invalid_page(DEBUG1, node, forkno, blkno, present); + + if (invalid_page_tab == NULL) + { + /* create hash table when first needed */ + HASHCTL ctl; + + ctl.keysize = sizeof(xl_invalid_page_key); + ctl.entrysize = sizeof(xl_invalid_page); + + invalid_page_tab = hash_create("XLOG invalid-page table", + 100, + &ctl, + HASH_ELEM | HASH_BLOBS); + } + + /* we currently assume xl_invalid_page_key contains no padding */ + key.node = node; + key.forkno = forkno; + key.blkno = blkno; + hentry = (xl_invalid_page *) + hash_search(invalid_page_tab, (void *) &key, HASH_ENTER, &found); + + if (!found) + { + /* hash_search already filled in the key */ + hentry->present = present; + } + else + { + /* repeat reference ... leave "present" as it was */ + } +} + +/* Forget any invalid pages >= minblkno, because they've been dropped */ +static void +forget_invalid_pages(RelFileNode node, ForkNumber forkno, BlockNumber minblkno) +{ + HASH_SEQ_STATUS status; + xl_invalid_page *hentry; + + if (invalid_page_tab == NULL) + return; /* nothing to do */ + + hash_seq_init(&status, invalid_page_tab); + + while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL) + { + if (RelFileNodeEquals(hentry->key.node, node) && + hentry->key.forkno == forkno && + hentry->key.blkno >= minblkno) + { + if (message_level_is_interesting(DEBUG2)) + { + char *path = relpathperm(hentry->key.node, forkno); + + elog(DEBUG2, "page %u of relation %s has been dropped", + hentry->key.blkno, path); + pfree(path); + } + + if (hash_search(invalid_page_tab, + (void *) &hentry->key, + HASH_REMOVE, NULL) == NULL) + elog(ERROR, "hash table corrupted"); + } + } +} + +/* Forget any invalid pages in a whole database */ +static void +forget_invalid_pages_db(Oid dbid) +{ + HASH_SEQ_STATUS status; + xl_invalid_page *hentry; + + if (invalid_page_tab == NULL) + return; /* nothing to do */ + + hash_seq_init(&status, invalid_page_tab); + + while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL) + { + if (hentry->key.node.dbNode == dbid) + { + if (message_level_is_interesting(DEBUG2)) + { + char *path = relpathperm(hentry->key.node, hentry->key.forkno); + + elog(DEBUG2, "page %u of relation %s has been dropped", + hentry->key.blkno, path); + pfree(path); + } + + if (hash_search(invalid_page_tab, + (void *) &hentry->key, + HASH_REMOVE, NULL) == NULL) + elog(ERROR, "hash table corrupted"); + } + } +} + +/* Are there any unresolved references to invalid pages? */ +bool +XLogHaveInvalidPages(void) +{ + if (invalid_page_tab != NULL && + hash_get_num_entries(invalid_page_tab) > 0) + return true; + return false; +} + +/* Complain about any remaining invalid-page entries */ +void +XLogCheckInvalidPages(void) +{ + HASH_SEQ_STATUS status; + xl_invalid_page *hentry; + bool foundone = false; + + if (invalid_page_tab == NULL) + return; /* nothing to do */ + + hash_seq_init(&status, invalid_page_tab); + + /* + * Our strategy is to emit WARNING messages for all remaining entries and + * only PANIC after we've dumped all the available info. + */ + while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL) + { + report_invalid_page(WARNING, hentry->key.node, hentry->key.forkno, + hentry->key.blkno, hentry->present); + foundone = true; + } + + if (foundone) + elog(ignore_invalid_pages ? 
WARNING : PANIC, + "WAL contains references to invalid pages"); + + hash_destroy(invalid_page_tab); + invalid_page_tab = NULL; +} + + +/* + * XLogReadBufferForRedo + * Read a page during XLOG replay + * + * Reads a block referenced by a WAL record into shared buffer cache, and + * determines what needs to be done to redo the changes to it. If the WAL + * record includes a full-page image of the page, it is restored. + * + * 'record.EndRecPtr' is compared to the page's LSN to determine if the record + * has already been replayed. 'block_id' is the ID number the block was + * registered with, when the WAL record was created. + * + * Returns one of the following: + * + * BLK_NEEDS_REDO - changes from the WAL record need to be applied + * BLK_DONE - block doesn't need replaying + * BLK_RESTORED - block was restored from a full-page image included in + * the record + * BLK_NOTFOUND - block was not found (because it was truncated away by + * an operation later in the WAL stream) + * + * On return, the buffer is locked in exclusive-mode, and returned in *buf. + * Note that the buffer is locked and returned even if it doesn't need + * replaying. (Getting the buffer lock is not really necessary during + * single-process crash recovery, but some subroutines such as MarkBufferDirty + * will complain if we don't have the lock. In hot standby mode it's + * definitely necessary.) + * + * Note: when a backup block is available in XLOG with the BKPIMAGE_APPLY flag + * set, we restore it, even if the page in the database appears newer. This + * is to protect ourselves against database pages that were partially or + * incorrectly written during a crash. We assume that the XLOG data must be + * good because it has passed a CRC check, while the database page might not + * be. This will force us to replay all subsequent modifications of the page + * that appear in XLOG, rather than possibly ignoring them as already + * applied, but that's not a huge drawback. + */ +XLogRedoAction +XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id, + Buffer *buf) +{ + return XLogReadBufferForRedoExtended(record, block_id, RBM_NORMAL, + false, buf); +} + +/* + * Pin and lock a buffer referenced by a WAL record, for the purpose of + * re-initializing it. + */ +Buffer +XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id) +{ + Buffer buf; + + XLogReadBufferForRedoExtended(record, block_id, RBM_ZERO_AND_LOCK, false, + &buf); + return buf; +} + +/* + * XLogReadBufferForRedoExtended + * Like XLogReadBufferForRedo, but with extra options. + * + * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended + * with all-zeroes pages up to the referenced block number. In + * RBM_ZERO_AND_LOCK and RBM_ZERO_AND_CLEANUP_LOCK modes, the return value + * is always BLK_NEEDS_REDO. + * + * (The RBM_ZERO_AND_CLEANUP_LOCK mode is redundant with the get_cleanup_lock + * parameter. Do not use an inconsistent combination!) + * + * If 'get_cleanup_lock' is true, a "cleanup lock" is acquired on the buffer + * using LockBufferForCleanup(), instead of a regular exclusive lock. 
+ */ +XLogRedoAction +XLogReadBufferForRedoExtended(XLogReaderState *record, + uint8 block_id, + ReadBufferMode mode, bool get_cleanup_lock, + Buffer *buf) +{ + XLogRecPtr lsn = record->EndRecPtr; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + Buffer prefetch_buffer; + Page page; + bool zeromode; + bool willinit; + + if (!XLogRecGetBlockTagExtended(record, block_id, &rnode, &forknum, &blkno, + &prefetch_buffer)) + { + /* Caller specified a bogus block_id */ + elog(PANIC, "failed to locate backup block with ID %d in WAL record", + block_id); + } + + /* + * Make sure that if the block is marked with WILL_INIT, the caller is + * going to initialize it. And vice versa. + */ + zeromode = (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK); + willinit = (XLogRecGetBlock(record, block_id)->flags & BKPBLOCK_WILL_INIT) != 0; + if (willinit && !zeromode) + elog(PANIC, "block with WILL_INIT flag in WAL record must be zeroed by redo routine"); + if (!willinit && zeromode) + elog(PANIC, "block to be initialized in redo routine must be marked with WILL_INIT flag in the WAL record"); + + /* If it has a full-page image and it should be restored, do it. */ + if (XLogRecBlockImageApply(record, block_id)) + { + Assert(XLogRecHasBlockImage(record, block_id)); + *buf = XLogReadBufferExtended(rnode, forknum, blkno, + get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK, + prefetch_buffer); + page = BufferGetPage(*buf); + if (!RestoreBlockImage(record, block_id, page)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg_internal("%s", record->errormsg_buf))); + + /* + * The page may be uninitialized. If so, we can't set the LSN because + * that would corrupt the page. + */ + if (!PageIsNew(page)) + { + PageSetLSN(page, lsn); + } + + MarkBufferDirty(*buf); + + /* + * At the end of crash recovery the init forks of unlogged relations + * are copied, without going through shared buffers. So we need to + * force the on-disk state of init forks to always be in sync with the + * state in shared buffers. + */ + if (forknum == INIT_FORKNUM) + FlushOneBuffer(*buf); + + return BLK_RESTORED; + } + else + { + *buf = XLogReadBufferExtended(rnode, forknum, blkno, mode, prefetch_buffer); + if (BufferIsValid(*buf)) + { + if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK) + { + if (get_cleanup_lock) + LockBufferForCleanup(*buf); + else + LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE); + } + if (lsn <= PageGetLSN(BufferGetPage(*buf))) + return BLK_DONE; + else + return BLK_NEEDS_REDO; + } + else + return BLK_NOTFOUND; + } +} + +/* + * XLogReadBufferExtended + * Read a page during XLOG replay + * + * This is functionally comparable to ReadBufferExtended. There's some + * differences in the behavior wrt. the "mode" argument: + * + * In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we + * return InvalidBuffer. In this case the caller should silently skip the + * update on this page. (In this situation, we expect that the page was later + * dropped or truncated. If we don't see evidence of that later in the WAL + * sequence, we'll complain at the end of WAL replay.) + * + * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended + * with all-zeroes pages up to the given block number. + * + * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't + * exist, and we don't check for all-zeroes. Thus, no log entry is made + * to imply that the page should be dropped or truncated later. 
+ * + * Optionally, recent_buffer can be used to provide a hint about the location + * of the page in the buffer pool; it does not have to be correct, but avoids + * a buffer mapping table probe if it is. + * + * NB: A redo function should normally not call this directly. To get a page + * to modify, use XLogReadBufferForRedoExtended instead. It is important that + * all pages modified by a WAL record are registered in the WAL records, or + * they will be invisible to tools that need to know which pages are modified. + */ +Buffer +XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, + BlockNumber blkno, ReadBufferMode mode, + Buffer recent_buffer) +{ + BlockNumber lastblock; + Buffer buffer; + SMgrRelation smgr; + + Assert(blkno != P_NEW); + + /* Do we have a clue where the buffer might be already? */ + if (BufferIsValid(recent_buffer) && + mode == RBM_NORMAL && + ReadRecentBuffer(rnode, forknum, blkno, recent_buffer)) + { + buffer = recent_buffer; + goto recent_buffer_fast_path; + } + + /* Open the relation at smgr level */ + smgr = smgropen(rnode, InvalidBackendId); + + /* + * Create the target file if it doesn't already exist. This lets us cope + * if the replay sequence contains writes to a relation that is later + * deleted. (The original coding of this routine would instead suppress + * the writes, but that seems like it risks losing valuable data if the + * filesystem loses an inode during a crash. Better to write the data + * until we are actually told to delete the file.) + */ + smgrcreate(smgr, forknum, true); + + lastblock = smgrnblocks(smgr, forknum); + + if (blkno < lastblock) + { + /* page exists in file */ + buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno, + mode, NULL, true); + } + else + { + /* hm, page doesn't exist in file */ + if (mode == RBM_NORMAL) + { + log_invalid_page(rnode, forknum, blkno, false); + return InvalidBuffer; + } + if (mode == RBM_NORMAL_NO_LOG) + return InvalidBuffer; + /* OK to extend the file */ + /* we do this in recovery only - no rel-extension lock needed */ + Assert(InRecovery); + buffer = InvalidBuffer; + do + { + if (buffer != InvalidBuffer) + { + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + } + buffer = ReadBufferWithoutRelcache(rnode, forknum, + P_NEW, mode, NULL, true); + } + while (BufferGetBlockNumber(buffer) < blkno); + /* Handle the corner case that P_NEW returns non-consecutive pages */ + if (BufferGetBlockNumber(buffer) != blkno) + { + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno, + mode, NULL, true); + } + } + +recent_buffer_fast_path: + if (mode == RBM_NORMAL) + { + /* check that page has been initialized */ + Page page = (Page) BufferGetPage(buffer); + + /* + * We assume that PageIsNew is safe without a lock. During recovery, + * there should be no other backends that could modify the buffer at + * the same time. + */ + if (PageIsNew(page)) + { + ReleaseBuffer(buffer); + log_invalid_page(rnode, forknum, blkno, true); + return InvalidBuffer; + } + } + + return buffer; +} + +/* + * Struct actually returned by CreateFakeRelcacheEntry, though the declared + * return type is Relation. 
+ */ +typedef struct +{ + RelationData reldata; /* Note: this must be first */ + FormData_pg_class pgc; +} FakeRelCacheEntryData; + +typedef FakeRelCacheEntryData *FakeRelCacheEntry; + +/* + * Create a fake relation cache entry for a physical relation + * + * It's often convenient to use the same functions in XLOG replay as in the + * main codepath, but those functions typically work with a relcache entry. + * We don't have a working relation cache during XLOG replay, but this + * function can be used to create a fake relcache entry instead. Only the + * fields related to physical storage, like rd_rel, are initialized, so the + * fake entry is only usable in low-level operations like ReadBuffer(). + * + * This is also used for syncing WAL-skipped files. + * + * Caller must free the returned entry with FreeFakeRelcacheEntry(). + */ +Relation +CreateFakeRelcacheEntry(RelFileNode rnode) +{ + FakeRelCacheEntry fakeentry; + Relation rel; + + /* Allocate the Relation struct and all related space in one block. */ + fakeentry = palloc0(sizeof(FakeRelCacheEntryData)); + rel = (Relation) fakeentry; + + rel->rd_rel = &fakeentry->pgc; + rel->rd_node = rnode; + + /* + * We will never be working with temp rels during recovery or while + * syncing WAL-skipped files. + */ + rel->rd_backend = InvalidBackendId; + + /* It must be a permanent table here */ + rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT; + + /* We don't know the name of the relation; use relfilenode instead */ + sprintf(RelationGetRelationName(rel), "%u", rnode.relNode); + + /* + * We set up the lockRelId in case anything tries to lock the dummy + * relation. Note that this is fairly bogus since relNode may be + * different from the relation's OID. It shouldn't really matter though. + * In recovery, we are running by ourselves and can't have any lock + * conflicts. While syncing, we already hold AccessExclusiveLock. + */ + rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode; + rel->rd_lockInfo.lockRelId.relId = rnode.relNode; + + rel->rd_smgr = NULL; + + return rel; +} + +/* + * Free a fake relation cache entry. + */ +void +FreeFakeRelcacheEntry(Relation fakerel) +{ + /* make sure the fakerel is not referenced by the SmgrRelation anymore */ + if (fakerel->rd_smgr != NULL) + smgrclearowner(&fakerel->rd_smgr, fakerel->rd_smgr); + pfree(fakerel); +} + +/* + * Drop a relation during XLOG replay + * + * This is called when the relation is about to be deleted; we need to remove + * any open "invalid-page" records for the relation. + */ +void +XLogDropRelation(RelFileNode rnode, ForkNumber forknum) +{ + forget_invalid_pages(rnode, forknum, 0); +} + +/* + * Drop a whole database during XLOG replay + * + * As above, but for DROP DATABASE instead of dropping a single rel + */ +void +XLogDropDatabase(Oid dbid) +{ + /* + * This is unnecessarily heavy-handed, as it will close SMgrRelation + * objects for other databases as well. DROP DATABASE occurs seldom enough + * that it's not worth introducing a variant of smgrclose for just this + * purpose. XXX: Or should we rather leave the smgr entries dangling? + */ + smgrcloseall(); + + forget_invalid_pages_db(dbid); +} + +/* + * Truncate a relation during XLOG replay + * + * We need to clean up any open "invalid-page" records for the dropped pages. 
+ */ +void +XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum, + BlockNumber nblocks) +{ + forget_invalid_pages(rnode, forkNum, nblocks); +} + +/* + * Determine which timeline to read an xlog page from and set the + * XLogReaderState's currTLI to that timeline ID. + * + * We care about timelines in xlogreader when we might be reading xlog + * generated prior to a promotion, either if we're currently a standby in + * recovery or if we're a promoted primary reading xlogs generated by the old + * primary before our promotion. + * + * wantPage must be set to the start address of the page to read and + * wantLength to the amount of the page that will be read, up to + * XLOG_BLCKSZ. If the amount to be read isn't known, pass XLOG_BLCKSZ. + * + * The currTLI argument should be the system-wide current timeline. + * Note that this may be different from state->currTLI, which is the timeline + * from which the caller is currently reading previous xlog records. + * + * We switch to an xlog segment from the new timeline eagerly when on a + * historical timeline, as soon as we reach the start of the xlog segment + * containing the timeline switch. The server copied the segment to the new + * timeline so all the data up to the switch point is the same, but there's no + * guarantee the old segment will still exist. It may have been deleted or + * renamed with a .partial suffix so we can't necessarily keep reading from + * the old TLI even though tliSwitchPoint says it's OK. + * + * We can't just check the timeline when we read a page on a different segment + * to the last page. We could've received a timeline switch from a cascading + * upstream, so the current segment ends abruptly (possibly getting renamed to + * .partial) and we have to switch to a new one. Even in the middle of reading + * a page we could have to dump the cached page and switch to a new TLI. + * + * Because of this, callers MAY NOT assume that currTLI is the timeline that + * will be in a page's xlp_tli; the page may begin on an older timeline or we + * might be reading from historical timeline data on a segment that's been + * copied to a new timeline. + * + * The caller must also make sure it doesn't read past the current replay + * position (using GetXLogReplayRecPtr) if executing in recovery, so it + * doesn't fail to notice that the current timeline became historical. + */ +void +XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage, + uint32 wantLength, TimeLineID currTLI) +{ + const XLogRecPtr lastReadPage = (state->seg.ws_segno * + state->segcxt.ws_segsize + state->segoff); + + Assert(wantPage != InvalidXLogRecPtr && wantPage % XLOG_BLCKSZ == 0); + Assert(wantLength <= XLOG_BLCKSZ); + Assert(state->readLen == 0 || state->readLen <= XLOG_BLCKSZ); + Assert(currTLI != 0); + + /* + * If the desired page is currently read in and valid, we have nothing to + * do. + * + * The caller should've ensured that it didn't previously advance readOff + * past the valid limit of this timeline, so it doesn't matter if the + * current TLI has since become historical. + */ + if (lastReadPage == wantPage && + state->readLen != 0 && + lastReadPage + state->readLen >= wantPage + Min(wantLength, XLOG_BLCKSZ - 1)) + return; + + /* + * If we're reading from the current timeline, it hasn't become historical + * and the page we're reading is after the last page read, we can again + * just carry on. (Seeking backwards requires a check to make sure the + * older page isn't on a prior timeline). 
+ * + * currTLI might've become historical since the caller obtained the value, + * but the caller is required not to read past the flush limit it saw at + * the time it looked up the timeline. There's nothing we can do about it + * if StartupXLOG() renames it to .partial concurrently. + */ + if (state->currTLI == currTLI && wantPage >= lastReadPage) + { + Assert(state->currTLIValidUntil == InvalidXLogRecPtr); + return; + } + + /* + * If we're just reading pages from a previously validated historical + * timeline and the timeline we're reading from is valid until the end of + * the current segment we can just keep reading. + */ + if (state->currTLIValidUntil != InvalidXLogRecPtr && + state->currTLI != currTLI && + state->currTLI != 0 && + ((wantPage + wantLength) / state->segcxt.ws_segsize) < + (state->currTLIValidUntil / state->segcxt.ws_segsize)) + return; + + /* + * If we reach this point we're either looking up a page for random + * access, the current timeline just became historical, or we're reading + * from a new segment containing a timeline switch. In all cases we need + * to determine the newest timeline on the segment. + * + * If it's the current timeline we can just keep reading from here unless + * we detect a timeline switch that makes the current timeline historical. + * If it's a historical timeline we can read all the segment on the newest + * timeline because it contains all the old timelines' data too. So only + * one switch check is required. + */ + { + /* + * We need to re-read the timeline history in case it's been changed + * by a promotion or replay from a cascaded replica. + */ + List *timelineHistory = readTimeLineHistory(currTLI); + XLogRecPtr endOfSegment; + + endOfSegment = ((wantPage / state->segcxt.ws_segsize) + 1) * + state->segcxt.ws_segsize - 1; + Assert(wantPage / state->segcxt.ws_segsize == + endOfSegment / state->segcxt.ws_segsize); + + /* + * Find the timeline of the last LSN on the segment containing + * wantPage. + */ + state->currTLI = tliOfPointInHistory(endOfSegment, timelineHistory); + state->currTLIValidUntil = tliSwitchPoint(state->currTLI, timelineHistory, + &state->nextTLI); + + Assert(state->currTLIValidUntil == InvalidXLogRecPtr || + wantPage + wantLength < state->currTLIValidUntil); + + list_free_deep(timelineHistory); + + elog(DEBUG3, "switched to timeline %u valid until %X/%X", + state->currTLI, + LSN_FORMAT_ARGS(state->currTLIValidUntil)); + } +} + +/* XLogReaderRoutine->segment_open callback for local pg_wal files */ +void +wal_segment_open(XLogReaderState *state, XLogSegNo nextSegNo, + TimeLineID *tli_p) +{ + TimeLineID tli = *tli_p; + char path[MAXPGPATH]; + + XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize); + state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); + if (state->seg.ws_file >= 0) + return; + + if (errno == ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("requested WAL segment %s has already been removed", + path))); + else + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); +} + +/* stock XLogReaderRoutine->segment_close callback */ +void +wal_segment_close(XLogReaderState *state) +{ + close(state->seg.ws_file); + /* need to check errno? */ + state->seg.ws_file = -1; +} + +/* + * XLogReaderRoutine->page_read callback for reading local xlog files + * + * Public because it would likely be very helpful for someone writing another + * output method outside walsender, e.g. in a bgworker. 
+ * + * TODO: The walsender has its own version of this, but it relies on the + * walsender's latch being set whenever WAL is flushed. No such infrastructure + * exists for normal backends, so we have to do a check/sleep/repeat style of + * loop for now. + */ +int +read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, char *cur_page) +{ + return read_local_xlog_page_guts(state, targetPagePtr, reqLen, + targetRecPtr, cur_page, true); +} + +/* + * Same as read_local_xlog_page except that it doesn't wait for future WAL + * to be available. + */ +int +read_local_xlog_page_no_wait(XLogReaderState *state, XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, + char *cur_page) +{ + return read_local_xlog_page_guts(state, targetPagePtr, reqLen, + targetRecPtr, cur_page, false); +} + +/* + * Implementation of read_local_xlog_page and its no wait version. + */ +static int +read_local_xlog_page_guts(XLogReaderState *state, XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, + char *cur_page, bool wait_for_wal) +{ + XLogRecPtr read_upto, + loc; + TimeLineID tli; + int count; + WALReadError errinfo; + TimeLineID currTLI; + + loc = targetPagePtr + reqLen; + + /* Loop waiting for xlog to be available if necessary */ + while (1) + { + /* + * Determine the limit of xlog we can currently read to, and what the + * most recent timeline is. + */ + if (!RecoveryInProgress()) + read_upto = GetFlushRecPtr(&currTLI); + else + read_upto = GetXLogReplayRecPtr(&currTLI); + tli = currTLI; + + /* + * Check which timeline to get the record from. + * + * We have to do it each time through the loop because if we're in + * recovery as a cascading standby, the current timeline might've + * become historical. We can't rely on RecoveryInProgress() because in + * a standby configuration like + * + * A => B => C + * + * if we're a logical decoding session on C, and B gets promoted, our + * timeline will change while we remain in recovery. + * + * We can't just keep reading from the old timeline as the last WAL + * archive in the timeline will get renamed to .partial by + * StartupXLOG(). + * + * If that happens after our caller determined the TLI but before we + * actually read the xlog page, we might still try to read from the + * old (now renamed) segment and fail. There's not much we can do + * about this, but it can only happen when we're a leaf of a cascading + * standby whose primary gets promoted while we're decoding, so a + * one-off ERROR isn't too bad. + */ + XLogReadDetermineTimeline(state, targetPagePtr, reqLen, tli); + + if (state->currTLI == currTLI) + { + + if (loc <= read_upto) + break; + + /* If asked, let's not wait for future WAL. */ + if (!wait_for_wal) + { + ReadLocalXLogPageNoWaitPrivate *private_data; + + /* + * Inform the caller of read_local_xlog_page_no_wait that the + * end of WAL has been reached. + */ + private_data = (ReadLocalXLogPageNoWaitPrivate *) + state->private_data; + private_data->end_of_wal = true; + break; + } + + CHECK_FOR_INTERRUPTS(); + pg_usleep(1000L); + } + else + { + /* + * We're on a historical timeline, so limit reading to the switch + * point where we moved to the next timeline. + * + * We don't need to GetFlushRecPtr or GetXLogReplayRecPtr. We know + * about the new timeline, so we must've received past the end of + * it. 
+ */ + read_upto = state->currTLIValidUntil; + + /* + * Setting tli to our wanted record's TLI is slightly wrong; the + * page might begin on an older timeline if it contains a timeline + * switch, since its xlog segment will have been copied from the + * prior timeline. This is pretty harmless though, as nothing + * cares so long as the timeline doesn't go backwards. We should + * read the page header instead; FIXME someday. + */ + tli = state->currTLI; + + /* No need to wait on a historical timeline */ + break; + } + } + + if (targetPagePtr + XLOG_BLCKSZ <= read_upto) + { + /* + * more than one block available; read only that block, have caller + * come back if they need more. + */ + count = XLOG_BLCKSZ; + } + else if (targetPagePtr + reqLen > read_upto) + { + /* not enough data there */ + return -1; + } + else + { + /* enough bytes available to satisfy the request */ + count = read_upto - targetPagePtr; + } + + /* + * Even though we just determined how much of the page can be validly read + * as 'count', read the whole page anyway. It's guaranteed to be + * zero-padded up to the page boundary if it's incomplete. + */ + if (!WALRead(state, cur_page, targetPagePtr, XLOG_BLCKSZ, tli, + &errinfo)) + WALReadRaiseError(&errinfo); + + /* number of valid bytes in the buffer */ + return count; +} + +/* + * Backend-specific convenience code to handle read errors encountered by + * WALRead(). + */ +void +WALReadRaiseError(WALReadError *errinfo) +{ + WALOpenSegment *seg = &errinfo->wre_seg; + char fname[MAXFNAMELEN]; + + XLogFileName(fname, seg->ws_tli, seg->ws_segno, wal_segment_size); + + if (errinfo->wre_read < 0) + { + errno = errinfo->wre_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from log segment %s, offset %d: %m", + fname, errinfo->wre_off))); + } + else if (errinfo->wre_read == 0) + { + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read from log segment %s, offset %d: read %d of %d", + fname, errinfo->wre_off, errinfo->wre_read, + errinfo->wre_req))); + } +} |
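Editor's note: as a rough illustration of the XLogReadBufferForRedo() pattern that the comments in xlogutils.c above describe, the sketch below shows what a redo routine for a hypothetical resource manager might look like. It is not part of the patch; the function name xyz_redo_example, the choice of block ID 0, and the omitted record-specific page change are assumptions for illustration only.

#include "postgres.h"

#include "access/xlogutils.h"
#include "storage/bufmgr.h"

/*
 * Hypothetical redo routine following the pattern documented above.
 * Block 0 is assumed to have been registered when the WAL record was
 * originally created.
 */
void
xyz_redo_example(XLogReaderState *record)
{
	Buffer		buffer;

	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
	{
		Page		page = BufferGetPage(buffer);

		/* ... re-apply the page change described by the record here ... */

		/* Stamp the page with the record's end LSN and dirty it. */
		PageSetLSN(page, record->EndRecPtr);
		MarkBufferDirty(buffer);
	}

	/*
	 * BLK_DONE and BLK_RESTORED also return the buffer locked, as noted in
	 * the XLogReadBufferForRedo comments; only BLK_NOTFOUND leaves it
	 * invalid.  Release it in every valid case.
	 */
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);
}

A real redo routine would first decode the record's main data (for example via XLogRecGetData()) before touching the page; the unconditional unlock-and-release at the end mirrors the documented behaviour that the buffer comes back locked even when no replay is needed.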