From 46651ce6fe013220ed397add242004d764fc0153 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 4 May 2024 14:15:05 +0200 Subject: Adding upstream version 14.5. Signed-off-by: Daniel Baumann --- src/backend/replication/.gitignore | 4 + src/backend/replication/Makefile | 41 + src/backend/replication/README | 76 + src/backend/replication/backup_manifest.c | 408 ++ src/backend/replication/basebackup.c | 2034 ++++++++ src/backend/replication/libpqwalreceiver/Makefile | 37 + .../libpqwalreceiver/libpqwalreceiver.c | 1112 +++++ src/backend/replication/logical/Makefile | 31 + src/backend/replication/logical/decode.c | 1316 +++++ src/backend/replication/logical/launcher.c | 1024 ++++ src/backend/replication/logical/logical.c | 1840 +++++++ src/backend/replication/logical/logicalfuncs.c | 417 ++ src/backend/replication/logical/message.c | 88 + src/backend/replication/logical/origin.c | 1574 ++++++ src/backend/replication/logical/proto.c | 900 ++++ src/backend/replication/logical/relation.c | 705 +++ src/backend/replication/logical/reorderbuffer.c | 5156 ++++++++++++++++++++ src/backend/replication/logical/snapbuild.c | 1995 ++++++++ src/backend/replication/logical/tablesync.c | 1160 +++++ src/backend/replication/logical/worker.c | 3254 ++++++++++++ src/backend/replication/pgoutput/Makefile | 32 + src/backend/replication/pgoutput/pgoutput.c | 1346 +++++ src/backend/replication/repl_gram.c | 1895 +++++++ src/backend/replication/repl_gram.y | 414 ++ src/backend/replication/repl_scanner.c | 2615 ++++++++++ src/backend/replication/repl_scanner.l | 310 ++ src/backend/replication/slot.c | 1861 +++++++ src/backend/replication/slotfuncs.c | 950 ++++ src/backend/replication/syncrep.c | 1103 +++++ src/backend/replication/syncrep_gram.c | 1461 ++++++ src/backend/replication/syncrep_gram.y | 116 + src/backend/replication/syncrep_scanner.c | 2163 ++++++++ src/backend/replication/syncrep_scanner.l | 163 + src/backend/replication/walreceiver.c | 1460 ++++++ 
src/backend/replication/walreceiverfuncs.c | 407 ++ src/backend/replication/walsender.c | 3729 ++++++++++++++ 36 files changed, 43197 insertions(+) create mode 100644 src/backend/replication/.gitignore create mode 100644 src/backend/replication/Makefile create mode 100644 src/backend/replication/README create mode 100644 src/backend/replication/backup_manifest.c create mode 100644 src/backend/replication/basebackup.c create mode 100644 src/backend/replication/libpqwalreceiver/Makefile create mode 100644 src/backend/replication/libpqwalreceiver/libpqwalreceiver.c create mode 100644 src/backend/replication/logical/Makefile create mode 100644 src/backend/replication/logical/decode.c create mode 100644 src/backend/replication/logical/launcher.c create mode 100644 src/backend/replication/logical/logical.c create mode 100644 src/backend/replication/logical/logicalfuncs.c create mode 100644 src/backend/replication/logical/message.c create mode 100644 src/backend/replication/logical/origin.c create mode 100644 src/backend/replication/logical/proto.c create mode 100644 src/backend/replication/logical/relation.c create mode 100644 src/backend/replication/logical/reorderbuffer.c create mode 100644 src/backend/replication/logical/snapbuild.c create mode 100644 src/backend/replication/logical/tablesync.c create mode 100644 src/backend/replication/logical/worker.c create mode 100644 src/backend/replication/pgoutput/Makefile create mode 100644 src/backend/replication/pgoutput/pgoutput.c create mode 100644 src/backend/replication/repl_gram.c create mode 100644 src/backend/replication/repl_gram.y create mode 100644 src/backend/replication/repl_scanner.c create mode 100644 src/backend/replication/repl_scanner.l create mode 100644 src/backend/replication/slot.c create mode 100644 src/backend/replication/slotfuncs.c create mode 100644 src/backend/replication/syncrep.c create mode 100644 src/backend/replication/syncrep_gram.c create mode 100644 src/backend/replication/syncrep_gram.y 
create mode 100644 src/backend/replication/syncrep_scanner.c create mode 100644 src/backend/replication/syncrep_scanner.l create mode 100644 src/backend/replication/walreceiver.c create mode 100644 src/backend/replication/walreceiverfuncs.c create mode 100644 src/backend/replication/walsender.c (limited to 'src/backend/replication') diff --git a/src/backend/replication/.gitignore b/src/backend/replication/.gitignore new file mode 100644 index 0000000..d1df614 --- /dev/null +++ b/src/backend/replication/.gitignore @@ -0,0 +1,4 @@ +/repl_gram.c +/repl_scanner.c +/syncrep_gram.c +/syncrep_scanner.c diff --git a/src/backend/replication/Makefile b/src/backend/replication/Makefile new file mode 100644 index 0000000..a0381e5 --- /dev/null +++ b/src/backend/replication/Makefile @@ -0,0 +1,41 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for src/backend/replication +# +# IDENTIFICATION +# src/backend/replication/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/replication +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +override CPPFLAGS := -I. -I$(srcdir) $(CPPFLAGS) + +OBJS = \ + backup_manifest.o \ + basebackup.o \ + repl_gram.o \ + slot.o \ + slotfuncs.o \ + syncrep.o \ + syncrep_gram.o \ + walreceiver.o \ + walreceiverfuncs.o \ + walsender.o + +SUBDIRS = logical + +include $(top_srcdir)/src/backend/common.mk + +# repl_scanner is compiled as part of repl_gram +repl_gram.o: repl_scanner.c + +# syncrep_scanner is compiled as part of syncrep_gram +syncrep_gram.o: syncrep_scanner.c + +# repl_gram.c, repl_scanner.c, syncrep_gram.c and syncrep_scanner.c +# are in the distribution tarball, so they are not cleaned here. +# (Our parent Makefile takes care of them during maintainer-clean.) 
diff --git a/src/backend/replication/README b/src/backend/replication/README new file mode 100644 index 0000000..8fcd78d --- /dev/null +++ b/src/backend/replication/README @@ -0,0 +1,76 @@ +src/backend/replication/README + +Walreceiver - libpqwalreceiver API +---------------------------------- + +The transport-specific part of walreceiver, responsible for connecting to +the primary server, receiving WAL files and sending messages, is loaded +dynamically to avoid having to link the main server binary with libpq. +The dynamically loaded module is in the libpqwalreceiver subdirectory. + +The dynamically loaded module implements a set of functions with details +about each one of them provided in src/include/replication/walreceiver.h. + +This API should be considered internal at the moment, but we could open it +up for 3rd party replacements of libpqwalreceiver in the future, allowing +pluggable methods for receiving WAL. + +Walreceiver IPC +--------------- + +When WAL replay in the startup process has reached the end of archived WAL, +restorable using restore_command, it starts up the walreceiver process +to fetch more WAL (if streaming replication is configured). + +Walreceiver is a postmaster subprocess, so the startup process can't fork it +directly. Instead, it sends a signal to postmaster, asking postmaster to launch +it. Before that, however, the startup process fills in WalRcvData->conninfo +and WalRcvData->slotname, and initializes the starting point in +WalRcvData->receiveStart. + +As walreceiver receives WAL from the primary server, and writes and flushes +it to disk (in pg_wal), it updates WalRcvData->flushedUpto and signals +the startup process so that it knows how far WAL replay can advance. + +Walreceiver sends information about replication progress to the primary server +whenever it either writes or flushes new WAL, or the specified interval elapses. +This is used for reporting purposes.
+ +Walsender IPC +------------- + +At shutdown, postmaster handles walsender processes differently from regular +backends. It waits for regular backends to die before writing the +shutdown checkpoint and terminating pgarch and other auxiliary processes, but +that's not desirable for walsenders, because we want the standby servers to +receive all the WAL, including the shutdown checkpoint, before the primary +is shut down. Therefore postmaster treats walsenders like the pgarch process, +and instructs them to terminate at the PM_SHUTDOWN_2 phase, after all regular +backends have died and checkpointer has issued the shutdown checkpoint. + +When postmaster accepts a connection, it immediately forks a new process +to handle the handshake and authentication, and the process initializes to +become a backend. Postmaster doesn't know if the process becomes a regular +backend or a walsender process at that time - that's indicated in the +connection handshake - so we need some extra signaling to let postmaster +identify walsender processes. + +When a walsender process starts up, it marks itself as a walsender process in +the PMSignal array. That way postmaster can tell it apart from regular +backends. + +Note that no big harm is done if postmaster thinks that a walsender is a +regular backend; it will just terminate the walsender earlier in the shutdown +phase. A walsender will look like a regular backend until it's done with the +initialization and has marked itself in the PMSignal array, and at process +termination, after unmarking the PMSignal slot. + +Each walsender allocates an entry from the WalSndCtl array, and tracks +information about replication progress. Users can monitor them via the +statistics views. + + +Walsender - walreceiver protocol +-------------------------------- + +See the manual.
diff --git a/src/backend/replication/backup_manifest.c b/src/backend/replication/backup_manifest.c
new file mode 100644
index 0000000..04ca455
--- /dev/null
+++ b/src/backend/replication/backup_manifest.c
@@ -0,0 +1,408 @@
+/*-------------------------------------------------------------------------
+ *
+ * backup_manifest.c
+ *	  code for generating and sending a backup manifest
+ *
+ * Portions Copyright (c) 2010-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/replication/backup_manifest.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/timeline.h"
+#include "libpq/libpq.h"
+#include "libpq/pqformat.h"
+#include "mb/pg_wchar.h"
+#include "replication/backup_manifest.h"
+#include "utils/builtins.h"
+#include "utils/json.h"
+
+static void AppendStringToManifest(backup_manifest_info *manifest, char *s);
+
+/*
+ * Does the user want a backup manifest?
+ *
+ * It's simplest to always have a manifest_info object, so that we don't need
+ * checks for NULL pointers in too many places. However, if the user doesn't
+ * want a manifest, we set manifest->buffile to NULL.
+ */
+static inline bool
+IsManifestEnabled(backup_manifest_info *manifest)
+{
+	return (manifest->buffile != NULL);
+}
+
+/*
+ * Convenience macro: append psprintf-style formatted data to the manifest.
+ */
+#define AppendToManifest(manifest, ...) \
+	do { \
+		char *_manifest_s = psprintf(__VA_ARGS__);	\
+		AppendStringToManifest(manifest, _manifest_s);	\
+		pfree(_manifest_s);	\
+	} while (0)					/* single statement; caller supplies the ';' */
+
+/*
+ * Initialize state so that we can construct a backup manifest.
+ *
+ * NB: Although the checksum type for the data files is configurable, the
+ * checksum for the manifest itself always uses SHA-256. See comments in
+ * SendBackupManifest.
+ */
+void
+InitializeBackupManifest(backup_manifest_info *manifest,
+						 backup_manifest_option want_manifest,
+						 pg_checksum_type manifest_checksum_type)
+{
+	memset(manifest, 0, sizeof(backup_manifest_info));
+	manifest->checksum_type = manifest_checksum_type;
+
+	if (want_manifest == MANIFEST_OPTION_NO)
+		manifest->buffile = NULL;
+	else
+	{
+		manifest->buffile = BufFileCreateTemp(false);
+		manifest->manifest_ctx = pg_cryptohash_create(PG_SHA256);
+		if (pg_cryptohash_init(manifest->manifest_ctx) < 0)
+			elog(ERROR, "failed to initialize checksum of backup manifest");
+	}
+
+	manifest->manifest_size = UINT64CONST(0);
+	manifest->force_encode = (want_manifest == MANIFEST_OPTION_FORCE_ENCODE);
+	manifest->first_file = true;
+	manifest->still_checksumming = true;
+
+	if (want_manifest != MANIFEST_OPTION_NO)
+		AppendToManifest(manifest,
+						 "{ \"PostgreSQL-Backup-Manifest-Version\": 1,\n"
+						 "\"Files\": [");
+}
+
+/*
+ * Free the checksum context held by a backup manifest and reset the pointer.
+ */
+void
+FreeBackupManifest(backup_manifest_info *manifest)
+{
+	pg_cryptohash_free(manifest->manifest_ctx);
+	manifest->manifest_ctx = NULL;
+}
+
+/*
+ * Add an entry to the backup manifest for a file.
+ */
+void
+AddFileToBackupManifest(backup_manifest_info *manifest, const char *spcoid,
+						const char *pathname, size_t size, pg_time_t mtime,
+						pg_checksum_context *checksum_ctx)
+{
+	char		pathbuf[MAXPGPATH];
+	int			pathlen;
+	StringInfoData buf;
+
+	if (!IsManifestEnabled(manifest))
+		return;
+
+	/*
+	 * If this file is part of a tablespace, the pathname passed to this
+	 * function will be relative to the tar file that contains it. We want the
+	 * pathname relative to the data directory (ignoring the intermediate
+	 * symlink traversal).
+	 */
+	if (spcoid != NULL)
+	{
+		snprintf(pathbuf, sizeof(pathbuf), "pg_tblspc/%s/%s", spcoid,
+				 pathname);
+		pathname = pathbuf;
+	}
+
+	/*
+	 * Each file's entry needs to be separated from any entry that follows by
+	 * a comma, but there's no comma before the first one or after the last
+	 * one. To make that work, adding a file to the manifest starts by
+	 * terminating the most recently added line, with a comma if appropriate,
+	 * but does not terminate the line inserted for this file.
+	 */
+	initStringInfo(&buf);
+	if (manifest->first_file)
+	{
+		appendStringInfoChar(&buf, '\n');
+		manifest->first_file = false;
+	}
+	else
+		appendStringInfoString(&buf, ",\n");
+
+	/*
+	 * Write the relative pathname to this file out to the manifest. The
+	 * manifest is always stored in UTF-8, so we have to encode paths that are
+	 * not valid in that encoding.
+	 */
+	pathlen = strlen(pathname);
+	if (!manifest->force_encode &&
+		pg_verify_mbstr(PG_UTF8, pathname, pathlen, true))
+	{
+		appendStringInfoString(&buf, "{ \"Path\": ");
+		escape_json(&buf, pathname);
+		appendStringInfoString(&buf, ", ");
+	}
+	else
+	{
+		appendStringInfoString(&buf, "{ \"Encoded-Path\": \"");
+		enlargeStringInfo(&buf, 2 * pathlen);
+		buf.len += hex_encode(pathname, pathlen,
+							  &buf.data[buf.len]);
+		appendStringInfoString(&buf, "\", ");
+	}
+
+	appendStringInfo(&buf, "\"Size\": %zu, ", size);
+
+	/*
+	 * Convert last modification time to a string and append it to the
+	 * manifest. Since it's not clear what time zone to use and since time
+	 * zone definitions can change, possibly causing confusion, use GMT
+	 * always.
+	 */
+	appendStringInfoString(&buf, "\"Last-Modified\": \"");
+	enlargeStringInfo(&buf, 128);
+	buf.len += pg_strftime(&buf.data[buf.len], 128, "%Y-%m-%d %H:%M:%S %Z",
+						   pg_gmtime(&mtime));
+	appendStringInfoChar(&buf, '"');
+
+	/* Add checksum information. */
+	if (checksum_ctx->type != CHECKSUM_TYPE_NONE)
+	{
+		uint8		checksumbuf[PG_CHECKSUM_MAX_LENGTH];
+		int			checksumlen;
+
+		checksumlen = pg_checksum_final(checksum_ctx, checksumbuf);
+		if (checksumlen < 0)
+			elog(ERROR, "could not finalize checksum of file \"%s\"",
+				 pathname);
+
+		appendStringInfo(&buf,
+						 ", \"Checksum-Algorithm\": \"%s\", \"Checksum\": \"",
+						 pg_checksum_type_name(checksum_ctx->type));
+		enlargeStringInfo(&buf, 2 * checksumlen);
+		buf.len += hex_encode((char *) checksumbuf, checksumlen,
+							  &buf.data[buf.len]);
+		appendStringInfoChar(&buf, '"');
+	}
+
+	/* Close out the object. */
+	appendStringInfoString(&buf, " }");
+
+	/* OK, add it to the manifest. */
+	AppendStringToManifest(manifest, buf.data);
+
+	/* Avoid leaking memory. */
+	pfree(buf.data);
+}
+
+/*
+ * Add information about the WAL that will need to be replayed when restoring
+ * this backup to the manifest.
+ */
+void
+AddWALInfoToBackupManifest(backup_manifest_info *manifest, XLogRecPtr startptr,
+						   TimeLineID starttli, XLogRecPtr endptr,
+						   TimeLineID endtli)
+{
+	List	   *timelines;
+	ListCell   *lc;
+	bool		first_wal_range = true;
+	bool		found_start_timeline = false;
+
+	if (!IsManifestEnabled(manifest))
+		return;
+
+	/* Terminate the list of files. */
+	AppendStringToManifest(manifest, "\n],\n");
+
+	/* Read the timeline history for the ending timeline. */
+	timelines = readTimeLineHistory(endtli);
+
+	/* Start a list of LSN ranges. */
+	AppendStringToManifest(manifest, "\"WAL-Ranges\": [\n");
+
+	foreach(lc, timelines)
+	{
+		TimeLineHistoryEntry *entry = lfirst(lc);
+		XLogRecPtr	tl_beginptr;
+
+		/*
+		 * We only care about timelines that were active during the backup.
+		 * Skip any that ended before the backup started. (Note that if
+		 * entry->end is InvalidXLogRecPtr, it means that the timeline has not
+		 * yet ended.)
+		 */
+		if (!XLogRecPtrIsInvalid(entry->end) && entry->end < startptr)
+			continue;
+
+		/*
+		 * Because the timeline history file lists newer timelines before
+		 * older ones, the first timeline we encounter that is new enough to
+		 * matter ought to match the ending timeline of the backup.
+		 */
+		if (first_wal_range && endtli != entry->tli)
+			ereport(ERROR,
+					errmsg("expected end timeline %u but found timeline %u",
+						   endtli, entry->tli));
+
+		/*
+		 * If this timeline entry matches with the timeline on which the
+		 * backup started, WAL needs to be checked from the start LSN of the
+		 * backup. If this entry refers to a newer timeline, WAL needs to be
+		 * checked since the beginning of this timeline, so use the LSN where
+		 * the timeline began.
+		 */
+		if (starttli == entry->tli)
+			tl_beginptr = startptr;
+		else
+		{
+			tl_beginptr = entry->begin;
+
+			/*
+			 * If we reach a TLI that has no valid beginning LSN, there can't
+			 * be any more timelines in the history after this point, so we'd
+			 * better have arrived at the expected starting TLI. If not,
+			 * something's gone horribly wrong.
+			 */
+			if (XLogRecPtrIsInvalid(entry->begin))
+				ereport(ERROR,
+						errmsg("expected start timeline %u but found timeline %u",
+							   starttli, entry->tli));
+		}
+
+		AppendToManifest(manifest,
+						 "%s{ \"Timeline\": %u, \"Start-LSN\": \"%X/%X\", \"End-LSN\": \"%X/%X\" }",
+						 first_wal_range ? "" : ",\n",
+						 entry->tli,
+						 LSN_FORMAT_ARGS(tl_beginptr),
+						 LSN_FORMAT_ARGS(endptr));
+
+		if (starttli == entry->tli)
+		{
+			found_start_timeline = true;
+			break;
+		}
+
+		endptr = entry->begin;
+		first_wal_range = false;
+	}
+
+	/*
+	 * The last entry in the timeline history for the ending timeline should
+	 * be the ending timeline itself. Verify that this is what we observed.
+	 */
+	if (!found_start_timeline)
+		ereport(ERROR,
+				errmsg("start timeline %u not found in history of timeline %u",
+					   starttli, endtli));
+
+	/* Terminate the list of WAL ranges. */
+	AppendStringToManifest(manifest, "\n],\n");
+}
+
+/*
+ * Finalize the backup manifest, and send it to the client.
+ */
+void
+SendBackupManifest(backup_manifest_info *manifest)
+{
+	StringInfoData protobuf;
+	uint8		checksumbuf[PG_SHA256_DIGEST_LENGTH];
+	char		checksumstringbuf[PG_SHA256_DIGEST_STRING_LENGTH];
+	size_t		manifest_bytes_done = 0;
+
+	if (!IsManifestEnabled(manifest))
+		return;
+
+	/*
+	 * Append manifest checksum, so that the problems with the manifest itself
+	 * can be detected.
+	 *
+	 * We always use SHA-256 for this, regardless of what algorithm is chosen
+	 * for checksumming the files. If we ever want to make the checksum
+	 * algorithm used for the manifest file variable, the client will need a
+	 * way to figure out which algorithm to use as close to the beginning of
+	 * the manifest file as possible, to avoid having to read the whole thing
+	 * twice.
+	 */
+	manifest->still_checksumming = false;
+	if (pg_cryptohash_final(manifest->manifest_ctx, checksumbuf,
+							sizeof(checksumbuf)) < 0)
+		elog(ERROR, "failed to finalize checksum of backup manifest");
+	AppendStringToManifest(manifest, "\"Manifest-Checksum\": \"");
+
+	hex_encode((char *) checksumbuf, sizeof checksumbuf, checksumstringbuf);
+	checksumstringbuf[PG_SHA256_DIGEST_STRING_LENGTH - 1] = '\0';
+
+	AppendStringToManifest(manifest, checksumstringbuf);
+	AppendStringToManifest(manifest, "\"}\n");
+
+	/*
+	 * We've written all the data to the manifest file. Rewind the file so
+	 * that we can read it all back.
+	 */
+	if (BufFileSeek(manifest->buffile, 0, 0L, SEEK_SET))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not rewind temporary file")));
+
+	/* Send CopyOutResponse message */
+	pq_beginmessage(&protobuf, 'H');
+	pq_sendbyte(&protobuf, 0);	/* overall format */
+	pq_sendint16(&protobuf, 0); /* natts */
+	pq_endmessage(&protobuf);
+
+	/*
+	 * Send CopyData messages.
+	 *
+	 * We choose to read back the data from the temporary file in chunks of
+	 * size BLCKSZ; this isn't necessary, but buffile.c uses that as the I/O
+	 * size, so it seems to make sense to match that value here.
+	 */
+	while (manifest_bytes_done < manifest->manifest_size)
+	{
+		char		manifestbuf[BLCKSZ];
+		size_t		bytes_to_read;
+		size_t		rc;
+
+		bytes_to_read = Min(sizeof(manifestbuf),
+							manifest->manifest_size - manifest_bytes_done);
+		rc = BufFileRead(manifest->buffile, manifestbuf, bytes_to_read);
+		if (rc != bytes_to_read)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not read from temporary file: %m")));
+		pq_putmessage('d', manifestbuf, bytes_to_read);
+		manifest_bytes_done += bytes_to_read;
+	}
+
+	/* No more data, so send CopyDone message */
+	pq_putemptymessage('c');
+
+	/* Release resources */
+	BufFileClose(manifest->buffile);
+}
+
+/*
+ * Append a cstring to the manifest, and to its checksum while still enabled.
+ */
+static void
+AppendStringToManifest(backup_manifest_info *manifest, char *s)
+{
+	size_t		len = strlen(s);
+
+	Assert(manifest != NULL);
+	if (manifest->still_checksumming)
+	{
+		if (pg_cryptohash_update(manifest->manifest_ctx, (uint8 *) s, len) < 0)
+			elog(ERROR, "failed to update checksum of backup manifest");
+	}
+	BufFileWrite(manifest->buffile, s, len);
+	manifest->manifest_size += len;
+}
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
new file mode 100644
index 0000000..d142cc2
--- /dev/null
+++ b/src/backend/replication/basebackup.c
@@ -0,0 +1,2034 @@
+/*-------------------------------------------------------------------------
+ *
+ * basebackup.c
+ *	  code for taking a base backup and streaming it to a standby
+ *
+ * Portions Copyright (c) 2010-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/replication/basebackup.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include
+#include
+#include
+
+#include
"access/xlog_internal.h" /* for pg_start/stop_backup */ +#include "catalog/pg_type.h" +#include "common/file_perm.h" +#include "commands/progress.h" +#include "lib/stringinfo.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "miscadmin.h" +#include "nodes/pg_list.h" +#include "pgstat.h" +#include "pgtar.h" +#include "port.h" +#include "postmaster/syslogger.h" +#include "replication/basebackup.h" +#include "replication/backup_manifest.h" +#include "replication/walsender.h" +#include "replication/walsender_private.h" +#include "storage/bufpage.h" +#include "storage/checksum.h" +#include "storage/dsm_impl.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/reinit.h" +#include "utils/builtins.h" +#include "utils/ps_status.h" +#include "utils/relcache.h" +#include "utils/resowner.h" +#include "utils/timestamp.h" + +typedef struct +{ + const char *label; + bool progress; + bool fastcheckpoint; + bool nowait; + bool includewal; + uint32 maxrate; + bool sendtblspcmapfile; + backup_manifest_option manifest; + pg_checksum_type manifest_checksum_type; +} basebackup_options; + +static int64 sendTablespace(char *path, char *oid, bool sizeonly, + struct backup_manifest_info *manifest); +static int64 sendDir(const char *path, int basepathlen, bool sizeonly, + List *tablespaces, bool sendtblspclinks, + backup_manifest_info *manifest, const char *spcoid); +static bool sendFile(const char *readfilename, const char *tarfilename, + struct stat *statbuf, bool missing_ok, Oid dboid, + backup_manifest_info *manifest, const char *spcoid); +static void sendFileWithContent(const char *filename, const char *content, + backup_manifest_info *manifest); +static int64 _tarWriteHeader(const char *filename, const char *linktarget, + struct stat *statbuf, bool sizeonly); +static int64 _tarWriteDir(const char *pathbuf, int basepathlen, struct stat *statbuf, + bool sizeonly); +static void send_int8_string(StringInfoData *buf, int64 intval); +static void 
SendBackupHeader(List *tablespaces); +static void perform_base_backup(basebackup_options *opt); +static void parse_basebackup_options(List *options, basebackup_options *opt); +static void SendXlogRecPtrResult(XLogRecPtr ptr, TimeLineID tli); +static int compareWalFileNames(const ListCell *a, const ListCell *b); +static void throttle(size_t increment); +static void update_basebackup_progress(int64 delta); +static bool is_checksummed_file(const char *fullpath, const char *filename); +static int basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset, + const char *filename, bool partial_read_ok); + +/* Was the backup currently in-progress initiated in recovery mode? */ +static bool backup_started_in_recovery = false; + +/* Relative path of temporary statistics directory */ +static char *statrelpath = NULL; + +/* + * Size of each block sent into the tar stream for larger files. + */ +#define TAR_SEND_SIZE 32768 + +/* + * How frequently to throttle, as a fraction of the specified rate-second. + */ +#define THROTTLING_FREQUENCY 8 + +/* The actual number of bytes, transfer of which may cause sleep. */ +static uint64 throttling_sample; + +/* Amount of data already transferred but not yet throttled. */ +static int64 throttling_counter; + +/* The minimum time required to transfer throttling_sample bytes. */ +static TimeOffset elapsed_min_unit; + +/* The last check of the transfer rate. */ +static TimestampTz throttled_last; + +/* The starting XLOG position of the base backup. */ +static XLogRecPtr startptr; + +/* Total number of checksum failures during base backup. */ +static long long int total_checksum_failures; + +/* Do not verify checksums. */ +static bool noverify_checksums = false; + +/* + * Total amount of backup data that will be streamed. + * -1 means that the size is not estimated. 
+ */ +static int64 backup_total = 0; + +/* Amount of backup data already streamed */ +static int64 backup_streamed = 0; + +/* + * Definition of one element part of an exclusion list, used for paths part + * of checksum validation or base backups. "name" is the name of the file + * or path to check for exclusion. If "match_prefix" is true, any items + * matching the name as prefix are excluded. + */ +struct exclude_list_item +{ + const char *name; + bool match_prefix; +}; + +/* + * The contents of these directories are removed or recreated during server + * start so they are not included in backups. The directories themselves are + * kept and included as empty to preserve access permissions. + * + * Note: this list should be kept in sync with the filter lists in pg_rewind's + * filemap.c. + */ +static const char *const excludeDirContents[] = +{ + /* + * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped even + * when stats_temp_directory is set because PGSS_TEXT_FILE is always + * created there. + */ + PG_STAT_TMP_DIR, + + /* + * It is generally not useful to backup the contents of this directory + * even if the intention is to restore to another primary. See backup.sgml + * for a more detailed description. + */ + "pg_replslot", + + /* Contents removed on startup, see dsm_cleanup_for_mmap(). */ + PG_DYNSHMEM_DIR, + + /* Contents removed on startup, see AsyncShmemInit(). */ + "pg_notify", + + /* + * Old contents are loaded for possible debugging but are not required for + * normal operation, see SerialInit(). + */ + "pg_serial", + + /* Contents removed on startup, see DeleteAllExportedSnapshotFiles(). */ + "pg_snapshots", + + /* Contents zeroed on startup, see StartupSUBTRANS(). */ + "pg_subtrans", + + /* end of list */ + NULL +}; + +/* + * List of files excluded from backups. + */ +static const struct exclude_list_item excludeFiles[] = +{ + /* Skip auto conf temporary file. 
*/ + {PG_AUTOCONF_FILENAME ".tmp", false}, + + /* Skip current log file temporary file */ + {LOG_METAINFO_DATAFILE_TMP, false}, + + /* + * Skip relation cache because it is rebuilt on startup. This includes + * temporary files. + */ + {RELCACHE_INIT_FILENAME, true}, + + /* + * If there's a backup_label or tablespace_map file, it belongs to a + * backup started by the user with pg_start_backup(). It is *not* correct + * for this backup. Our backup_label/tablespace_map is injected into the + * tar separately. + */ + {BACKUP_LABEL_FILE, false}, + {TABLESPACE_MAP, false}, + + /* + * If there's a backup_manifest, it belongs to a backup that was used to + * start this server. It is *not* correct for this backup. Our + * backup_manifest is injected into the backup separately if users want + * it. + */ + {"backup_manifest", false}, + + {"postmaster.pid", false}, + {"postmaster.opts", false}, + + /* end of list */ + {NULL, false} +}; + +/* + * List of files excluded from checksum validation. + * + * Note: this list should be kept in sync with what pg_checksums.c + * includes. + */ +static const struct exclude_list_item noChecksumFiles[] = { + {"pg_control", false}, + {"pg_filenode.map", false}, + {"pg_internal.init", true}, + {"PG_VERSION", false}, +#ifdef EXEC_BACKEND + {"config_exec_params", true}, +#endif + {NULL, false} +}; + +/* + * Actually do a base backup for the specified tablespaces. + * + * This is split out mainly to avoid complaints about "variable might be + * clobbered by longjmp" from stupider versions of gcc. 
+ */
+static void
+perform_base_backup(basebackup_options *opt)
+{
+	TimeLineID	starttli;
+	XLogRecPtr	endptr;
+	TimeLineID	endtli;
+	StringInfo	labelfile;
+	StringInfo	tblspc_map_file;
+	backup_manifest_info manifest;
+	int			datadirpathlen;
+	List	   *tablespaces = NIL;
+	/*
+	 * Overview of the steps performed below, in order:
+	 *
+	 * 1. start progress reporting and call do_pg_start_backup();
+	 * 2. if opt->progress is set, estimate the total size;
+	 * 3. send the start position and the tablespace header;
+	 * 4. stream every tablespace, the base directory (path == NULL) last;
+	 * 5. call do_pg_stop_backup();
+	 * 6. if opt->includewal is set, append WAL segments and timeline history
+	 *    files from pg_wal to the still-open last tar stream;
+	 * 7. send the backup manifest and the end position;
+	 * 8. raise ERROR if any checksum failure was counted.
+	 */
+	backup_total = 0;
+	backup_streamed = 0;
+	pgstat_progress_start_command(PROGRESS_COMMAND_BASEBACKUP, InvalidOid);
+
+	/*
+	 * If the estimation of the total backup size is disabled, make the
+	 * backup_total column in the view return NULL by setting the parameter to
+	 * -1.
+	 */
+	if (!opt->progress)
+	{
+		backup_total = -1;
+		pgstat_progress_update_param(PROGRESS_BASEBACKUP_BACKUP_TOTAL,
+									 backup_total);
+	}
+
+	/* we're going to use a BufFile, so we need a ResourceOwner */
+	Assert(CurrentResourceOwner == NULL);
+	CurrentResourceOwner = ResourceOwnerCreate(NULL, "base backup");
+
+	datadirpathlen = strlen(DataDir);
+
+	backup_started_in_recovery = RecoveryInProgress();
+
+	labelfile = makeStringInfo();
+	tblspc_map_file = makeStringInfo();
+	InitializeBackupManifest(&manifest, opt->manifest,
+							 opt->manifest_checksum_type);
+
+	total_checksum_failures = 0;
+
+	pgstat_progress_update_param(PROGRESS_BASEBACKUP_PHASE,
+								 PROGRESS_BASEBACKUP_PHASE_WAIT_CHECKPOINT);
+	startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint, &starttli,
+								  labelfile, &tablespaces,
+								  tblspc_map_file);
+
+	/*
+	 * Once do_pg_start_backup has been called, ensure that any failure causes
+	 * us to abort the backup so we don't "leak" a backup counter. For this
+	 * reason, *all* functionality between do_pg_start_backup() and the end of
+	 * do_pg_stop_backup() should be inside the error cleanup block!
+	 */
+
+	PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));
+	{
+		ListCell   *lc;
+		tablespaceinfo *ti;
+		int			tblspc_streamed = 0;
+
+		/*
+		 * Calculate the relative path of temporary statistics directory in
+		 * order to skip the files which are located in that directory later.
+		 */
+		if (is_absolute_path(pgstat_stat_directory) &&
+			strncmp(pgstat_stat_directory, DataDir, datadirpathlen) == 0)
+			statrelpath = psprintf("./%s", pgstat_stat_directory + datadirpathlen + 1);
+		else if (strncmp(pgstat_stat_directory, "./", 2) != 0)
+			statrelpath = psprintf("./%s", pgstat_stat_directory);
+		else
+			statrelpath = pgstat_stat_directory;
+
+		/* Add a node for the base directory at the end */
+		ti = palloc0(sizeof(tablespaceinfo));
+		ti->size = -1;
+		tablespaces = lappend(tablespaces, ti);
+
+		/*
+		 * Calculate the total backup size by summing up the size of each
+		 * tablespace.  sendDir() and sendTablespace() are called with
+		 * sizeonly = true and a NULL manifest here, so this pass only
+		 * computes sizes.
+		 */
+		if (opt->progress)
+		{
+			pgstat_progress_update_param(PROGRESS_BASEBACKUP_PHASE,
+										 PROGRESS_BASEBACKUP_PHASE_ESTIMATE_BACKUP_SIZE);
+
+			foreach(lc, tablespaces)
+			{
+				tablespaceinfo *tmp = (tablespaceinfo *) lfirst(lc);
+
+				if (tmp->path == NULL)
+					tmp->size = sendDir(".", 1, true, tablespaces, true, NULL,
+										NULL);
+				else
+					tmp->size = sendTablespace(tmp->path, tmp->oid, true,
+											   NULL);
+				backup_total += tmp->size;
+			}
+		}
+
+		/* Report that we are now streaming database files as a base backup */
+		{
+			const int	index[] = {
+				PROGRESS_BASEBACKUP_PHASE,
+				PROGRESS_BASEBACKUP_BACKUP_TOTAL,
+				PROGRESS_BASEBACKUP_TBLSPC_TOTAL
+			};
+			const int64 val[] = {
+				PROGRESS_BASEBACKUP_PHASE_STREAM_BACKUP,
+				backup_total, list_length(tablespaces)
+			};
+
+			pgstat_progress_update_multi_param(3, index, val);
+		}
+
+		/* Send the starting position of the backup */
+		SendXlogRecPtrResult(startptr, starttli);
+
+		/* Send tablespace header */
+		SendBackupHeader(tablespaces);
+
+		/* Setup and activate network throttling, if client requested it */
+		if (opt->maxrate > 0)
+		{
+			throttling_sample =
+				(int64) opt->maxrate * (int64) 1024 / THROTTLING_FREQUENCY;
+
+			/*
+			 * The minimum amount of time for throttling_sample bytes to be
+			 * transferred.
+			 */
+			elapsed_min_unit = USECS_PER_SEC / THROTTLING_FREQUENCY;
+
+			/* Enable throttling. */
+			throttling_counter = 0;
+
+			/* The 'real data' starts now (header was ignored). */
+			throttled_last = GetCurrentTimestamp();
+		}
+		else
+		{
+			/* Disable throttling. */
+			throttling_counter = -1;
+		}
+
+		/* Send off our tablespaces one by one */
+		foreach(lc, tablespaces)
+		{
+			tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);	/* shadows the outer ti */
+			StringInfoData buf;
+
+			/* Send CopyOutResponse message */
+			pq_beginmessage(&buf, 'H');
+			pq_sendbyte(&buf, 0);	/* overall format */
+			pq_sendint16(&buf, 0);	/* natts */
+			pq_endmessage(&buf);
+
+			if (ti->path == NULL)
+			{
+				struct stat statbuf;
+				bool		sendtblspclinks = true;
+
+				/* In the main tar, include the backup_label first... */
+				sendFileWithContent(BACKUP_LABEL_FILE, labelfile->data,
+									&manifest);
+
+				/* Then the tablespace_map file, if required... */
+				if (opt->sendtblspcmapfile)
+				{
+					sendFileWithContent(TABLESPACE_MAP, tblspc_map_file->data,
+										&manifest);
+					sendtblspclinks = false;
+				}
+
+				/* Then the bulk of the files... */
+				sendDir(".", 1, false, tablespaces, sendtblspclinks,
+						&manifest, NULL);
+
+				/* ... and pg_control after everything else. */
+				if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0)
+					ereport(ERROR,
+							(errcode_for_file_access(),
+							 errmsg("could not stat file \"%s\": %m",
+									XLOG_CONTROL_FILE)));
+				sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf,
+						 false, InvalidOid, &manifest, NULL);
+			}
+			else
+				sendTablespace(ti->path, ti->oid, false, &manifest);
+
+			/*
+			 * If we're including WAL, and this is the main data directory we
+			 * don't terminate the tar stream here. Instead, we will append
+			 * the xlog files below and terminate it then. This is safe since
+			 * the main data directory is always sent *last*.
+			 */
+			if (opt->includewal && ti->path == NULL)
+			{
+				Assert(lnext(tablespaces, lc) == NULL);
+			}
+			else
+				pq_putemptymessage('c');	/* CopyDone */
+
+			tblspc_streamed++;
+			pgstat_progress_update_param(PROGRESS_BASEBACKUP_TBLSPC_STREAMED,
+										 tblspc_streamed);
+		}
+
+		pgstat_progress_update_param(PROGRESS_BASEBACKUP_PHASE,
+									 PROGRESS_BASEBACKUP_PHASE_WAIT_WAL_ARCHIVE);
+		endptr = do_pg_stop_backup(labelfile->data, !opt->nowait, &endtli);
+	}
+	PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));
+
+
+	if (opt->includewal)
+	{
+		/*
+		 * We've left the last tar file "open", so we can now append the
+		 * required WAL files to it.
+		 */
+		char		pathbuf[MAXPGPATH];
+		XLogSegNo	segno;
+		XLogSegNo	startsegno;
+		XLogSegNo	endsegno;
+		struct stat statbuf;
+		List	   *historyFileList = NIL;
+		List	   *walFileList = NIL;
+		char		firstoff[MAXFNAMELEN];
+		char		lastoff[MAXFNAMELEN];
+		DIR		   *dir;
+		struct dirent *de;
+		ListCell   *lc;
+		TimeLineID	tli;
+
+		pgstat_progress_update_param(PROGRESS_BASEBACKUP_PHASE,
+									 PROGRESS_BASEBACKUP_PHASE_TRANSFER_WAL);
+
+		/*
+		 * I'd rather not worry about timelines here, so scan pg_wal and
+		 * include all WAL files in the range between 'startptr' and 'endptr',
+		 * regardless of the timeline the file is stamped with. If there are
+		 * some spurious WAL files belonging to timelines that don't belong in
+		 * this server's history, they will be included too. Normally there
+		 * shouldn't be such files, but if there are, there's little harm in
+		 * including them.
+		 */
+		XLByteToSeg(startptr, startsegno, wal_segment_size);
+		XLogFileName(firstoff, ThisTimeLineID, startsegno, wal_segment_size);
+		XLByteToPrevSeg(endptr, endsegno, wal_segment_size);
+		XLogFileName(lastoff, ThisTimeLineID, endsegno, wal_segment_size);
+
+		dir = AllocateDir("pg_wal");
+		while ((de = ReadDir(dir, "pg_wal")) != NULL)
+		{
+			/*
+			 * Does it look like a WAL segment, and is it in the range?  The
+			 * comparisons skip the first 8 characters of each name, i.e. the
+			 * timeline part is ignored (same convention as
+			 * compareWalFileNames).
+			 */
+			if (IsXLogFileName(de->d_name) &&
+				strcmp(de->d_name + 8, firstoff + 8) >= 0 &&
+				strcmp(de->d_name + 8, lastoff + 8) <= 0)
+			{
+				walFileList = lappend(walFileList, pstrdup(de->d_name));
+			}
+			/* Does it look like a timeline history file? */
+			else if (IsTLHistoryFileName(de->d_name))
+			{
+				historyFileList = lappend(historyFileList, pstrdup(de->d_name));
+			}
+		}
+		FreeDir(dir);
+
+		/*
+		 * Before we go any further, check that none of the WAL segments we
+		 * need were removed.
+		 */
+		CheckXLogRemoved(startsegno, ThisTimeLineID);
+
+		/*
+		 * Sort the WAL filenames. We want to send the files in order from
+		 * oldest to newest, to reduce the chance that a file is recycled
+		 * before we get a chance to send it over.
+		 */
+		list_sort(walFileList, compareWalFileNames);
+
+		/*
+		 * There must be at least one xlog file in the pg_wal directory, since
+		 * we are doing backup-including-xlog.
+		 */
+		if (walFileList == NIL)
+			ereport(ERROR,
+					(errmsg("could not find any WAL files")));
+
+		/*
+		 * Sanity check: the first and last segment should cover startptr and
+		 * endptr, with no gaps in between.
+		 *
+		 * Inside the loop a repeated segment number is accepted as well
+		 * (currsegno == segno).  The list is sorted ignoring the timeline part
+		 * of the name, so presumably the same segment can be present for more
+		 * than one timeline.
+		 */
+		XLogFromFileName((char *) linitial(walFileList),
+						 &tli, &segno, wal_segment_size);
+		if (segno != startsegno)
+		{
+			char		startfname[MAXFNAMELEN];
+
+			XLogFileName(startfname, ThisTimeLineID, startsegno,
+						 wal_segment_size);
+			ereport(ERROR,
+					(errmsg("could not find WAL file \"%s\"", startfname)));
+		}
+		foreach(lc, walFileList)
+		{
+			char	   *walFileName = (char *) lfirst(lc);
+			XLogSegNo	currsegno = segno;
+			XLogSegNo	nextsegno = segno + 1;
+
+			XLogFromFileName(walFileName, &tli, &segno, wal_segment_size);
+			if (!(nextsegno == segno || currsegno == segno))
+			{
+				char		nextfname[MAXFNAMELEN];
+
+				XLogFileName(nextfname, ThisTimeLineID, nextsegno,
+							 wal_segment_size);
+				ereport(ERROR,
+						(errmsg("could not find WAL file \"%s\"", nextfname)));
+			}
+		}
+		if (segno != endsegno)
+		{
+			char		endfname[MAXFNAMELEN];
+
+			XLogFileName(endfname, ThisTimeLineID, endsegno, wal_segment_size);
+			ereport(ERROR,
+					(errmsg("could not find WAL file \"%s\"", endfname)));
+		}
+
+		/* Ok, we have everything we need. Send the WAL files. */
+		foreach(lc, walFileList)
+		{
+			char	   *walFileName = (char *) lfirst(lc);
+			int			fd;
+			char		buf[TAR_SEND_SIZE];
+			size_t		cnt;
+			pgoff_t		len = 0;
+
+			snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", walFileName);
+			XLogFromFileName(walFileName, &tli, &segno, wal_segment_size);
+
+			fd = OpenTransientFile(pathbuf, O_RDONLY | PG_BINARY);
+			if (fd < 0)
+			{
+				int			save_errno = errno;
+
+				/*
+				 * Most likely reason for this is that the file was already
+				 * removed by a checkpoint, so check for that to get a better
+				 * error message.
+				 */
+				CheckXLogRemoved(segno, tli);
+
+				errno = save_errno;	/* restore for the %m below */
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not open file \"%s\": %m", pathbuf)));
+			}
+
+			if (fstat(fd, &statbuf) != 0)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not stat file \"%s\": %m",
+								pathbuf)));
+			if (statbuf.st_size != wal_segment_size)
+			{
+				CheckXLogRemoved(segno, tli);
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("unexpected WAL file size \"%s\"", walFileName)));
+			}
+
+			/* send the WAL file itself */
+			_tarWriteHeader(pathbuf, NULL, &statbuf, false);
+
+			while ((cnt = basebackup_read_file(fd, buf,
+											   Min(sizeof(buf),
+												   wal_segment_size - len),
+											   len, pathbuf, true)) > 0)
+			{
+				CheckXLogRemoved(segno, tli);
+				/* Send the chunk as a CopyData message */
+				if (pq_putmessage('d', buf, cnt))
+					ereport(ERROR,
+							(errmsg("base backup could not send data, aborting backup")));
+				update_basebackup_progress(cnt);
+
+				len += cnt;
+				throttle(cnt);
+
+				if (len == wal_segment_size)
+					break;
+			}
+
+			/* a short read loop exit means the file was not a full segment */
+			if (len != wal_segment_size)
+			{
+				CheckXLogRemoved(segno, tli);
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("unexpected WAL file size \"%s\"", walFileName)));
+			}
+
+			/*
+			 * wal_segment_size is a multiple of TAR_BLOCK_SIZE, so no need
+			 * for padding.
+			 */
+			Assert(wal_segment_size % TAR_BLOCK_SIZE == 0);
+
+			CloseTransientFile(fd);
+
+			/*
+			 * Mark file as archived, otherwise files can get archived again
+			 * after promotion of a new node. This is in line with
+			 * walreceiver.c always doing an XLogArchiveForceDone() after a
+			 * complete segment.
+			 */
+			StatusFilePath(pathbuf, walFileName, ".done");
+			sendFileWithContent(pathbuf, "", &manifest);
+		}
+
+		/*
+		 * Send timeline history files too. Only the latest timeline history
+		 * file is required for recovery, and even that only if there happens
+		 * to be a timeline switch in the first WAL segment that contains the
+		 * checkpoint record, or if we're taking a base backup from a standby
+		 * server and the target timeline changes while the backup is taken.
+		 * But they are small and highly useful for debugging purposes, so
+		 * better include them all, always.
+		 */
+		foreach(lc, historyFileList)
+		{
+			char	   *fname = lfirst(lc);
+
+			snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", fname);
+
+			if (lstat(pathbuf, &statbuf) != 0)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not stat file \"%s\": %m", pathbuf)));
+
+			sendFile(pathbuf, pathbuf, &statbuf, false, InvalidOid,
+					 &manifest, NULL);
+
+			/* unconditionally mark file as archived */
+			StatusFilePath(pathbuf, fname, ".done");
+			sendFileWithContent(pathbuf, "", &manifest);
+		}
+
+		/* Send CopyDone message for the last tar file */
+		pq_putemptymessage('c');
+	}
+
+	AddWALInfoToBackupManifest(&manifest, startptr, starttli, endptr, endtli);
+
+	SendBackupManifest(&manifest);
+
+	SendXlogRecPtrResult(endptr, endtli);
+	/*
+	 * Any counted checksum failure makes the backup fail with ERROR.  The
+	 * WARNING carrying the total is only emitted when there was more than
+	 * one failure.
+	 */
+	if (total_checksum_failures)
+	{
+		if (total_checksum_failures > 1)
+			ereport(WARNING,
+					(errmsg_plural("%lld total checksum verification failure",
+								   "%lld total checksum verification failures",
+								   total_checksum_failures,
+								   total_checksum_failures)));
+
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("checksum verification failure during base backup")));
+	}
+
+	/*
+	 * Make sure to free the manifest before the resource owners as manifests
+	 * use cryptohash contexts that may depend on resource owners (like
+	 * OpenSSL).
+	 */
+	FreeBackupManifest(&manifest);
+
+	/* clean up the resource owner we created */
+	WalSndResourceCleanup(true);
+
+	pgstat_progress_end_command();
+}
+
+/*
+ * list_sort comparison function, to compare log/seg portion of WAL segment
+ * filenames, ignoring the timeline portion.
+ */
+static int
+compareWalFileNames(const ListCell *a, const ListCell *b)
+{
+	char	   *fna = (char *) lfirst(a);
+	char	   *fnb = (char *) lfirst(b);
+
+	return strcmp(fna + 8, fnb + 8);	/* + 8 skips the leading timeline part */
+}
+
+/*
+ * Parse the base backup options passed down by the parser
+ *
+ * *opt is zeroed first; manifest then defaults to MANIFEST_OPTION_NO and
+ * manifest_checksum_type to CHECKSUM_TYPE_CRC32C.  Each option may appear at
+ * most once: a repeated option raises ERROR with ERRCODE_SYNTAX_ERROR, and an
+ * unknown option name raises ERROR through elog().
+ *
+ * After the loop, a missing label defaults to "base backup".  If no manifest
+ * was requested, asking for manifest checksums is an error and the checksum
+ * type is set to CHECKSUM_TYPE_NONE.
+ */
+static void
+parse_basebackup_options(List *options, basebackup_options *opt)
+{
+	ListCell   *lopt;
+	bool		o_label = false;
+	bool		o_progress = false;
+	bool		o_fast = false;
+	bool		o_nowait = false;
+	bool		o_wal = false;
+	bool		o_maxrate = false;
+	bool		o_tablespace_map = false;
+	bool		o_noverify_checksums = false;
+	bool		o_manifest = false;
+	bool		o_manifest_checksums = false;
+
+	MemSet(opt, 0, sizeof(*opt));
+	opt->manifest = MANIFEST_OPTION_NO;
+	opt->manifest_checksum_type = CHECKSUM_TYPE_CRC32C;
+
+	foreach(lopt, options)
+	{
+		DefElem    *defel = (DefElem *) lfirst(lopt);
+
+		if (strcmp(defel->defname, "label") == 0)
+		{
+			if (o_label)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("duplicate option \"%s\"", defel->defname)));
+			opt->label = strVal(defel->arg);
+			o_label = true;
+		}
+		else if (strcmp(defel->defname, "progress") == 0)
+		{
+			if (o_progress)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("duplicate option \"%s\"", defel->defname)));
+			opt->progress = true;
+			o_progress = true;
+		}
+		else if (strcmp(defel->defname, "fast") == 0)
+		{
+			if (o_fast)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("duplicate option \"%s\"", defel->defname)));
+			opt->fastcheckpoint = true;
+			o_fast = true;
+		}
+		else if (strcmp(defel->defname, "nowait") == 0)
+		{
+			if (o_nowait)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("duplicate option \"%s\"", defel->defname)));
+			opt->nowait = true;
+			o_nowait = true;
+		}
+		else if (strcmp(defel->defname, "wal") == 0)
+		{
+			if (o_wal)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("duplicate option \"%s\"", defel->defname)));
+			opt->includewal = true;
+			o_wal = true;
+		}
+		else if (strcmp(defel->defname, "max_rate") == 0)
+		{
+			long		maxrate;
+
+			if (o_maxrate)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("duplicate option \"%s\"", defel->defname)));
+
+			maxrate = intVal(defel->arg);
+			/* reject values outside MAX_RATE_LOWER .. MAX_RATE_UPPER */
+			if (maxrate < MAX_RATE_LOWER || maxrate > MAX_RATE_UPPER)
+				ereport(ERROR,
+						(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+						 errmsg("%d is outside the valid range for parameter \"%s\" (%d .. %d)",
+								(int) maxrate, "MAX_RATE", MAX_RATE_LOWER, MAX_RATE_UPPER)));
+
+			opt->maxrate = (uint32) maxrate;
+			o_maxrate = true;
+		}
+		else if (strcmp(defel->defname, "tablespace_map") == 0)
+		{
+			if (o_tablespace_map)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("duplicate option \"%s\"", defel->defname)));
+			opt->sendtblspcmapfile = true;
+			o_tablespace_map = true;
+		}
+		else if (strcmp(defel->defname, "noverify_checksums") == 0)
+		{
+			if (o_noverify_checksums)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("duplicate option \"%s\"", defel->defname)));
+			noverify_checksums = true;	/* a variable outside *opt, not a field of it */
+			o_noverify_checksums = true;
+		}
+		else if (strcmp(defel->defname, "manifest") == 0)
+		{
+			char	   *optval = strVal(defel->arg);
+			bool		manifest_bool;
+
+			if (o_manifest)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("duplicate option \"%s\"", defel->defname)));
+			/* accept anything parse_bool() understands, or "force-encode" */
+			if (parse_bool(optval, &manifest_bool))
+			{
+				if (manifest_bool)
+					opt->manifest = MANIFEST_OPTION_YES;
+				else
+					opt->manifest = MANIFEST_OPTION_NO;
+			}
+			else if (pg_strcasecmp(optval, "force-encode") == 0)
+				opt->manifest = MANIFEST_OPTION_FORCE_ENCODE;
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("unrecognized manifest option: \"%s\"",
+								optval)));
+			o_manifest = true;
+		}
+		else if (strcmp(defel->defname, "manifest_checksums") == 0)
+		{
+			char	   *optval = strVal(defel->arg);
+
+			if (o_manifest_checksums)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("duplicate option \"%s\"", defel->defname)));
+			if (!pg_checksum_parse_type(optval,
+										&opt->manifest_checksum_type))
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("unrecognized checksum algorithm: \"%s\"",
+								optval)));
+			o_manifest_checksums = true;
+		}
+		else
+			elog(ERROR, "option \"%s\" not recognized",
+				 defel->defname);
+	}
+	if (opt->label == NULL)
+		opt->label = "base backup";
+	if (opt->manifest == MANIFEST_OPTION_NO)
+	{
+		if (o_manifest_checksums)
+			ereport(ERROR,
+					(errcode(ERRCODE_SYNTAX_ERROR),
+					 errmsg("manifest checksums require a backup manifest")));
+		opt->manifest_checksum_type = CHECKSUM_TYPE_NONE;
+	}
+}
+
+
+/*
+ * SendBaseBackup() - send a complete base backup.
+ *
+ * The function will put the system into backup mode like pg_start_backup()
+ * does, so that the backup is consistent even though we read directly from
+ * the filesystem, bypassing the buffer cache.
+ *
+ * Raises ERROR if get_backup_status() reports SESSION_BACKUP_NON_EXCLUSIVE.
+ * Otherwise the options in cmd->options are parsed, the walsender state is
+ * set to WALSNDSTATE_BACKUP, the process title is optionally updated, and
+ * perform_base_backup() does the actual work.
+ */
+void
+SendBaseBackup(BaseBackupCmd *cmd)
+{
+	basebackup_options opt;
+	SessionBackupState status = get_backup_status();
+
+	if (status == SESSION_BACKUP_NON_EXCLUSIVE)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("a backup is already in progress in this session")));
+
+	parse_basebackup_options(cmd->options, &opt);
+
+	WalSndSetState(WALSNDSTATE_BACKUP);
+
+	if (update_process_title)
+	{
+		char		activitymsg[50];
+
+		/* snprintf bounds the title to the buffer; long labels are cut off */
+		snprintf(activitymsg, sizeof(activitymsg), "sending backup \"%s\"",
+				 opt.label);
+		set_ps_display(activitymsg);
+	}
+
+	perform_base_backup(&opt);
+}
+/*
+ * Append intval to buf as decimal text: first the text length via
+ * pq_sendint32(), then the digits themselves without a terminating NUL.
+ */
+static void
+send_int8_string(StringInfoData *buf, int64 intval)
+{
+	char		is[32];
+
+	sprintf(is, INT64_FORMAT, intval);
+	pq_sendint32(buf, strlen(is));
+	pq_sendbytes(buf, is, strlen(is));
+}
+/*
+ * Send a result set describing the tablespaces to be backed up: a
+ * RowDescription with the columns spcoid, spclocation and size, one DataRow
+ * per entry of 'tablespaces', and a CommandComplete message.
+ *
+ * An entry with path == NULL sends NULL for spcoid and spclocation.  The size
+ * column carries ti->size / 1024, or NULL when ti->size is negative.
+ */
+static void
+SendBackupHeader(List *tablespaces)
+{
+	StringInfoData buf;
+	ListCell   *lc;
+
+	/* Construct and send the directory information */
+	pq_beginmessage(&buf, 'T'); /* RowDescription */
+	pq_sendint16(&buf, 3);		/* 3 fields */
+
+	/* First field - spcoid */
+	pq_sendstring(&buf, "spcoid");
+	pq_sendint32(&buf, 0);		/* table oid */
+	pq_sendint16(&buf, 0);		/* attnum */
+	pq_sendint32(&buf, OIDOID); /* type oid */
+	pq_sendint16(&buf, 4);		/* typlen */
+	pq_sendint32(&buf, 0);		/* typmod */
+	pq_sendint16(&buf, 0);		/* format code */
+
+	/* Second field - spclocation (same seven items as the first field) */
+	pq_sendstring(&buf, "spclocation");
+	pq_sendint32(&buf, 0);
+	pq_sendint16(&buf, 0);
+	pq_sendint32(&buf, TEXTOID);
+	pq_sendint16(&buf, -1);
+	pq_sendint32(&buf, 0);
+	pq_sendint16(&buf, 0);
+
+	/* Third field - size */
+	pq_sendstring(&buf, "size");
+	pq_sendint32(&buf, 0);
+	pq_sendint16(&buf, 0);
+	pq_sendint32(&buf, INT8OID);
+	pq_sendint16(&buf, 8);
+	pq_sendint32(&buf, 0);
+	pq_sendint16(&buf, 0);
+	pq_endmessage(&buf);
+
+	foreach(lc, tablespaces)
+	{
+		tablespaceinfo *ti = lfirst(lc);
+
+		/* Send one datarow message */
+		pq_beginmessage(&buf, 'D');
+		pq_sendint16(&buf, 3);	/* number of columns */
+		if (ti->path == NULL)
+		{
+			pq_sendint32(&buf, -1); /* Length = -1 ==> NULL */
+			pq_sendint32(&buf, -1);
+		}
+		else
+		{
+			Size		len;
+
+			len = strlen(ti->oid);
+			pq_sendint32(&buf, len);
+			pq_sendbytes(&buf, ti->oid, len);
+
+			len = strlen(ti->path);
+			pq_sendint32(&buf, len);
+			pq_sendbytes(&buf, ti->path, len);
+		}
+		if (ti->size >= 0)
+			send_int8_string(&buf, ti->size / 1024);
+		else
+			pq_sendint32(&buf, -1); /* NULL */
+
+		pq_endmessage(&buf);
+	}
+
+	/* Send a CommandComplete message */
+	pq_puttextmessage('C', "SELECT");
+}
+
+/*
+ * Send a single resultset containing just a single
+ * XLogRecPtr record (in text format)
+ *
+ * The row has two columns: "recptr", formatted with "%X/%X", and "tli",
+ * formatted with "%u".  A CommandComplete message follows the row.
+ */
+static void
+SendXlogRecPtrResult(XLogRecPtr ptr, TimeLineID tli)
+{
+	StringInfoData buf;
+	char		str[MAXFNAMELEN];
+	Size		len;
+
+	pq_beginmessage(&buf, 'T'); /* RowDescription */
+	pq_sendint16(&buf, 2);		/* 2 fields */
+
+	/* Field headers */
+	pq_sendstring(&buf, "recptr");
+	pq_sendint32(&buf, 0);		/* table oid */
+	pq_sendint16(&buf, 0);		/* attnum */
+	pq_sendint32(&buf, TEXTOID);	/* type oid */
+	pq_sendint16(&buf, -1);
+	pq_sendint32(&buf, 0);
+	pq_sendint16(&buf, 0);
+
+	pq_sendstring(&buf, "tli");
+	pq_sendint32(&buf, 0);		/* table oid */
+	pq_sendint16(&buf, 0);		/* attnum */
+
+	/*
+	 * int8 may seem like a surprising data type for this, but in theory int4
+	 * would not be wide enough for this, as TimeLineID is unsigned.
+	 */
+	pq_sendint32(&buf, INT8OID);	/* type oid */
+	pq_sendint16(&buf, -1);		/* NOTE(review): SendBackupHeader sends 8 here for INT8OID */
+	pq_sendint32(&buf, 0);
+	pq_sendint16(&buf, 0);
+	pq_endmessage(&buf);
+
+	/* Data row */
+	pq_beginmessage(&buf, 'D');
+	pq_sendint16(&buf, 2);		/* number of columns */
+
+	len = snprintf(str, sizeof(str),
+				   "%X/%X", LSN_FORMAT_ARGS(ptr));
+	pq_sendint32(&buf, len);
+	pq_sendbytes(&buf, str, len);
+
+	len = snprintf(str, sizeof(str), "%u", tli);
+	pq_sendint32(&buf, len);
+	pq_sendbytes(&buf, str, len);
+
+	pq_endmessage(&buf);
+
+	/* Send a CommandComplete message */
+	pq_puttextmessage('C', "SELECT");
+}
+
+/*
+ * Inject a file with given name and content in the output tar stream.
+ *
+ * 'content' is treated as a NUL-terminated string; its strlen() is the file
+ * size.  A tar header is written, then the content, then zero padding up to
+ * the tar block size.  The file is also added to the backup manifest together
+ * with a checksum of the content.
+ */
+static void
+sendFileWithContent(const char *filename, const char *content,
+					backup_manifest_info *manifest)
+{
+	struct stat statbuf;
+	int			pad,
+				len;
+	pg_checksum_context checksum_ctx;
+
+	if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0)
+		elog(ERROR, "could not initialize checksum of file \"%s\"",
+			 filename);
+
+	len = strlen(content);
+
+	/*
+	 * Construct a stat struct for the file we're injecting in the tar.  Only
+	 * the fields assigned below are set; the rest of statbuf is left
+	 * uninitialized.
+	 */
+	/* Windows doesn't have the concept of uid and gid */
+#ifdef WIN32
+	statbuf.st_uid = 0;
+	statbuf.st_gid = 0;
+#else
+	statbuf.st_uid = geteuid();
+	statbuf.st_gid = getegid();
+#endif
+	statbuf.st_mtime = time(NULL);
+	statbuf.st_mode = pg_file_create_mode;
+	statbuf.st_size = len;
+
+	_tarWriteHeader(filename, NULL, &statbuf, false);
+	/*
+	 * Send the contents as a CopyData message.  NOTE(review): the result of
+	 * pq_putmessage() is not checked here, unlike in the WAL-sending loop of
+	 * perform_base_backup().
+	 */
+	pq_putmessage('d', content, len);
+	update_basebackup_progress(len);
+
+	/* Pad to a multiple of the tar block size. */
+	pad = tarPaddingBytesRequired(len);
+	if (pad > 0)
+	{
+		char		buf[TAR_BLOCK_SIZE];
+
+		MemSet(buf, 0, pad);
+		pq_putmessage('d', buf, pad);
+		update_basebackup_progress(pad);
+	}
+
+	/* the padding is not part of the checksum or the manifest size */
+	if (pg_checksum_update(&checksum_ctx, (uint8 *) content, len) < 0)
+		elog(ERROR, "could not update checksum of file \"%s\"",
+			 filename);
+
+	AddFileToBackupManifest(manifest, NULL, filename, len,
+							(pg_time_t) statbuf.st_mtime, &checksum_ctx);
+}
+
+/*
+ * Include the tablespace directory pointed to by 'path' in the output tar
+ * stream.  If 'sizeonly' is true, we just calculate a total length and return
+ * it, without actually sending anything.
+ *
+ * Returns 0 if the version directory under 'path' does not exist (ENOENT);
+ * any other lstat() failure raises ERROR.
+ *
+ * Only used to send auxiliary tablespaces, not PGDATA.
+ */
+static int64
+sendTablespace(char *path, char *spcoid, bool sizeonly,
+			   backup_manifest_info *manifest)
+{
+	int64		size;
+	char		pathbuf[MAXPGPATH];
+	struct stat statbuf;
+
+	/*
+	 * 'path' points to the tablespace location, but we only want to include
+	 * the version directory in it that belongs to us.
+	 */
+	snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path,
+			 TABLESPACE_VERSION_DIRECTORY);
+
+	/*
+	 * Store a directory entry in the tar file so we get the permissions
+	 * right.
+	 */
+	if (lstat(pathbuf, &statbuf) != 0)
+	{
+		if (errno != ENOENT)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not stat file or directory \"%s\": %m",
+							pathbuf)));
+
+		/* If the tablespace went away while scanning, it's no error. */
+		return 0;
+	}
+
+	size = _tarWriteHeader(TABLESPACE_VERSION_DIRECTORY, NULL, &statbuf,
+						   sizeonly);
+
+	/*
+	 * Send all the files in the tablespace version directory.  strlen(path)
+	 * is passed as basepathlen, and an empty tablespace list (NIL) is used.
+	 */
+	size += sendDir(pathbuf, strlen(path), sizeonly, NIL, true, manifest,
+					spcoid);
+
+	return size;
+}
+
+/*
+ * Include all files from the given directory in the output tar stream. If
+ * 'sizeonly' is true, we just calculate a total length and return it, without
+ * actually sending anything.
+ *
+ * Omit any directory in the tablespaces list, to avoid backing up
+ * tablespaces twice when they were created inside PGDATA.
+ *
+ * If sendtblspclinks is true, we need to include symlink
+ * information in the tar file. If not, we can skip that
+ * as it will be sent separately in the tablespace_map file.
+ */
+static int64
+sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces,
+		bool sendtblspclinks, backup_manifest_info *manifest,
+		const char *spcoid)
+{
+	DIR		   *dir;
+	struct dirent *de;
+	char		pathbuf[MAXPGPATH * 2];
+	struct stat statbuf;
+	int64		size = 0;
+	const char *lastDir;		/* Split last dir from parent path. */
+	bool		isDbDir = false;	/* Does this directory contain relations? */
+
+	/*
+	 * Determine if the current path is a database directory that can contain
+	 * relations.
+	 *
+	 * Start by finding the location of the delimiter between the parent path
+	 * and the current path.
+	 */
+	lastDir = last_dir_separator(path);
+
+	/* Does this path look like a database path (i.e. all digits)? */
+	if (lastDir != NULL &&
+		strspn(lastDir + 1, "0123456789") == strlen(lastDir + 1))
+	{
+		/* Part of path that contains the parent directory. */
+		int			parentPathLen = lastDir - path;
+
+		/*
+		 * Mark path as a database directory if the parent path is either
+		 * $PGDATA/base or a tablespace version path.
+		 *
+		 * NOTE(review): the first test compares only parentPathLen bytes, so
+		 * it also succeeds when the parent is a shorter prefix of "./base"
+		 * (for example "."); confirm that this is intended.
+		 */
+		if (strncmp(path, "./base", parentPathLen) == 0 ||
+			(parentPathLen >= (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) &&
+			 strncmp(lastDir - (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1),
+					 TABLESPACE_VERSION_DIRECTORY,
+					 sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) == 0))
+			isDbDir = true;
+	}
+
+	dir = AllocateDir(path);
+	while ((de = ReadDir(dir, path)) != NULL)
+	{
+		int			excludeIdx;
+		bool		excludeFound;
+		ForkNumber	relForkNum; /* Type of fork if file is a relation */
+		int			relOidChars;	/* Chars in filename that are the rel oid */
+
+		/* Skip special stuff */
+		if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+			continue;
+
+		/* Skip temporary files */
+		if (strncmp(de->d_name,
+					PG_TEMP_FILE_PREFIX,
+					strlen(PG_TEMP_FILE_PREFIX)) == 0)
+			continue;
+
+		/*
+		 * Check if the postmaster has signaled us to exit, and abort with an
+		 * error in that case. The error handler further up will call
+		 * do_pg_abort_backup() for us. Also check that if the backup was
+		 * started while still in recovery, the server wasn't promoted.
+		 * do_pg_stop_backup() will check that too, but it's better to stop
+		 * the backup early than continue to the end and fail there.
+		 */
+		CHECK_FOR_INTERRUPTS();
+		if (RecoveryInProgress() != backup_started_in_recovery)
+			ereport(ERROR,
+					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+					 errmsg("the standby was promoted during online backup"),
+					 errhint("This means that the backup being taken is corrupt "
+							 "and should not be used. "
+							 "Try taking another online backup.")));
+
+		/* Scan for files that should be excluded */
+		excludeFound = false;
+		for (excludeIdx = 0; excludeFiles[excludeIdx].name != NULL; excludeIdx++)
+		{
+			int			cmplen = strlen(excludeFiles[excludeIdx].name);
+
+			if (!excludeFiles[excludeIdx].match_prefix)
+				cmplen++;		/* also compare the NUL, i.e. exact match only */
+			if (strncmp(de->d_name, excludeFiles[excludeIdx].name, cmplen) == 0)
+			{
+				elog(DEBUG1, "file \"%s\" excluded from backup", de->d_name);
+				excludeFound = true;
+				break;
+			}
+		}
+
+		if (excludeFound)
+			continue;
+
+		/* Exclude all forks for unlogged tables except the init fork */
+		if (isDbDir &&
+			parse_filename_for_nontemp_relation(de->d_name, &relOidChars,
+												&relForkNum))
+		{
+			/* Never exclude init forks */
+			if (relForkNum != INIT_FORKNUM)
+			{
+				char		initForkFile[MAXPGPATH];
+				char		relOid[OIDCHARS + 1];
+
+				/*
+				 * If any other type of fork, check if there is an init fork
+				 * with the same OID. If so, the file can be excluded.
+				 */
+				memcpy(relOid, de->d_name, relOidChars);
+				relOid[relOidChars] = '\0';
+				snprintf(initForkFile, sizeof(initForkFile), "%s/%s_init",
+						 path, relOid);
+
+				if (lstat(initForkFile, &statbuf) == 0)
+				{
+					elog(DEBUG2,
+						 "unlogged relation file \"%s\" excluded from backup",
+						 de->d_name);
+
+					continue;
+				}
+			}
+		}
+
+		/* Exclude temporary relations */
+		if (isDbDir && looks_like_temp_rel_name(de->d_name))
+		{
+			elog(DEBUG2,
+				 "temporary relation file \"%s\" excluded from backup",
+				 de->d_name);
+
+			continue;
+		}
+
+		snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path, de->d_name);
+
+		/* Skip pg_control here so that it can be backed up last */
+		if (strcmp(pathbuf, "./global/pg_control") == 0)
+			continue;
+
+		if (lstat(pathbuf, &statbuf) != 0)
+		{
+			if (errno != ENOENT)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not stat file or directory \"%s\": %m",
+								pathbuf)));
+
+			/* If the file went away while scanning, it's not an error. */
+			continue;
+		}
+
+		/*
+		 * Scan for directories whose contents should be excluded.  The
+		 * directory entry itself is still written via _tarWriteDir().
+		 */
+		excludeFound = false;
+		for (excludeIdx = 0; excludeDirContents[excludeIdx] != NULL; excludeIdx++)
+		{
+			if (strcmp(de->d_name, excludeDirContents[excludeIdx]) == 0)
+			{
+				elog(DEBUG1, "contents of directory \"%s\" excluded from backup", de->d_name);
+				size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly);
+				excludeFound = true;
+				break;
+			}
+		}
+
+		if (excludeFound)
+			continue;
+
+		/*
+		 * Exclude contents of directory specified by statrelpath if not set
+		 * to the default (pg_stat_tmp) which is caught in the loop above.
+		 */
+		if (statrelpath != NULL && strcmp(pathbuf, statrelpath) == 0)
+		{
+			elog(DEBUG1, "contents of directory \"%s\" excluded from backup", statrelpath);
+			size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly);
+			continue;
+		}
+
+		/*
+		 * We can skip pg_wal, the WAL segments need to be fetched from the
+		 * WAL archive anyway. But include it as an empty directory anyway, so
+		 * we get permissions right.
+		 */
+		if (strcmp(pathbuf, "./pg_wal") == 0)
+		{
+			/* If pg_wal is a symlink, write it as a directory anyway */
+			size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly);
+
+			/*
+			 * Also send archive_status directory (by hackishly reusing
+			 * statbuf from above ...).
+			 */
+			size += _tarWriteHeader("./pg_wal/archive_status", NULL, &statbuf,
+									sizeonly);
+
+			continue;			/* don't recurse into pg_wal */
+		}
+
+		/* Allow symbolic links in pg_tblspc only */
+		if (strcmp(path, "./pg_tblspc") == 0 &&
+#ifndef WIN32
+			S_ISLNK(statbuf.st_mode)
+#else
+			pgwin32_is_junction(pathbuf)
+#endif
+			)
+		{
+#if defined(HAVE_READLINK) || defined(WIN32)
+			char		linkpath[MAXPGPATH];
+			int			rllen;
+
+			rllen = readlink(pathbuf, linkpath, sizeof(linkpath));
+			if (rllen < 0)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not read symbolic link \"%s\": %m",
+								pathbuf)));
+			/* a completely filled buffer leaves no room for the NUL below */
+			if (rllen >= sizeof(linkpath))
+				ereport(ERROR,
+						(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+						 errmsg("symbolic link \"%s\" target is too long",
+								pathbuf)));
+			linkpath[rllen] = '\0';
+
+			size += _tarWriteHeader(pathbuf + basepathlen + 1, linkpath,
+									&statbuf, sizeonly);
+#else
+
+			/*
+			 * If the platform does not have symbolic links, it should not be
+			 * possible to have tablespaces - clearly somebody else created
+			 * them. Warn about it and ignore.
+			 */
+			ereport(WARNING,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("tablespaces are not supported on this platform")));
+			continue;
+#endif							/* HAVE_READLINK */
+		}
+		else if (S_ISDIR(statbuf.st_mode))
+		{
+			bool		skip_this_dir = false;
+			ListCell   *lc;
+
+			/*
+			 * Store a directory entry in the tar file so we can get the
+			 * permissions right.
+			 */
+			size += _tarWriteHeader(pathbuf + basepathlen + 1, NULL, &statbuf,
+									sizeonly);
+
+			/*
+			 * Call ourselves recursively for a directory, unless it happens
+			 * to be a separate tablespace located within PGDATA.
+			 */
+			foreach(lc, tablespaces)
+			{
+				tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
+
+				/*
+				 * ti->rpath is the tablespace relative path within PGDATA, or
+				 * NULL if the tablespace has been properly located somewhere
+				 * else.
+				 *
+				 * Skip past the leading "./" in pathbuf when comparing.
+				 */
+				if (ti->rpath && strcmp(ti->rpath, pathbuf + 2) == 0)
+				{
+					skip_this_dir = true;
+					break;
+				}
+			}
+
+			/*
+			 * skip sending directories inside pg_tblspc, if not required.
+			 */
+			if (strcmp(pathbuf, "./pg_tblspc") == 0 && !sendtblspclinks)
+				skip_this_dir = true;
+
+			if (!skip_this_dir)
+				size += sendDir(pathbuf, basepathlen, sizeonly, tablespaces,
+								sendtblspclinks, manifest, spcoid);
+		}
+		else if (S_ISREG(statbuf.st_mode))
+		{
+			bool		sent = false;
+
+			/* in a database directory the directory name is passed as dboid */
+			if (!sizeonly)
+				sent = sendFile(pathbuf, pathbuf + basepathlen + 1, &statbuf,
+								true, isDbDir ? atooid(lastDir + 1) : InvalidOid,
+								manifest, spcoid);
+
+			/* a file that sendFile() did not send is not counted */
+			if (sent || sizeonly)
+			{
+				/* Add size. */
+				size += statbuf.st_size;
+
+				/* Pad to a multiple of the tar block size. */
+				size += tarPaddingBytesRequired(statbuf.st_size);
+
+				/* Size of the header for the file. */
+				size += TAR_BLOCK_SIZE;
+			}
+		}
+		else
+			ereport(WARNING,
+					(errmsg("skipping special file \"%s\"", pathbuf)));
+	}
+	FreeDir(dir);
+	return size;
+}
+
+/*
+ * Check if a file should have its checksum validated.
+ * We validate checksums on files in regular tablespaces
+ * (including global and default) only, and in those there
+ * are some files that are explicitly excluded.
+ */
+static bool
+is_checksummed_file(const char *fullpath, const char *filename)
+{
+	int			i;
+
+	/*
+	 * Anything that is not under ./global/ or ./base/ and is not given as an
+	 * absolute path is not in a tablespace, so it is never verified.
+	 */
+	if (strncmp(fullpath, "./global/", 9) != 0 &&
+		strncmp(fullpath, "./base/", 7) != 0 &&
+		strncmp(fullpath, "/", 1) != 0)
+		return false;
+
+	/* Walk the noChecksumFiles skip list up to its NULL-name terminator */
+	i = 0;
+	while (noChecksumFiles[i].name != NULL)
+	{
+		const char *skipname = noChecksumFiles[i].name;
+		int			ncmp = strlen(skipname);
+
+		/* Non-prefix entries compare the terminating NUL too (exact match) */
+		if (!noChecksumFiles[i].match_prefix)
+			ncmp += 1;
+
+		if (strncmp(filename, skipname, ncmp) == 0)
+			return false;
+
+		i++;
+	}
+
+	return true;
+}
+
+/*****
+ * Functions for handling tar file format
+ *
+ * Copied from pg_dump, but modified to work with libpq for sending
+ */
+
+
+/*
+ * Given the member, write the TAR header & send the file.
+ *
+ * If 'missing_ok' is true, will not throw an error if the file is not found.
+ *
+ * If dboid is anything other than InvalidOid then any checksum failures detected
+ * will get reported to the stats collector.
+ *
+ * Returns true if the file was successfully sent, false if 'missing_ok',
+ * and the file did not exist.
/*
 * Given the member, write the TAR header & send the file.
 *
 * readfilename is the path used to open and read the file; tarfilename is
 * the name written into the tar header and the backup manifest.  statbuf
 * supplies the size the caller expects (st_size) and the mtime recorded in
 * the manifest.
 *
 * If 'missing_ok' is true, will not throw an error if the file is not found.
 *
 * dboid is passed to pgstat_report_checksum_failures_in_db() when checksum
 * failures are reported to the stats machinery.
 * NOTE(review): as written, that report is only made when more than one
 * failure was found in the file (checksum_failures > 1), and dboid is not
 * compared against InvalidOid here -- confirm both are intended.
 *
 * Exactly statbuf->st_size bytes of file content are emitted: a file that
 * turns out shorter is padded with zeros, and data beyond st_size is ignored.
 *
 * Returns true if the file was successfully sent, false if 'missing_ok',
 * and the file did not exist.
 */
static bool
sendFile(const char *readfilename, const char *tarfilename,
		 struct stat *statbuf, bool missing_ok, Oid dboid,
		 backup_manifest_info *manifest, const char *spcoid)
{
	int			fd;
	BlockNumber blkno = 0;
	bool		block_retry = false;
	char		buf[TAR_SEND_SIZE];
	uint16		checksum;
	int			checksum_failures = 0;
	off_t		cnt;
	int			i;
	pgoff_t		len = 0;
	char	   *page;
	size_t		pad;
	PageHeader	phdr;
	int			segmentno = 0;
	char	   *segmentpath;
	bool		verify_checksum = false;
	pg_checksum_context checksum_ctx;

	/* Manifest checksum over the file contents (not the page checksums). */
	if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0)
		elog(ERROR, "could not initialize checksum of file \"%s\"",
			 readfilename);

	fd = OpenTransientFile(readfilename, O_RDONLY | PG_BINARY);
	if (fd < 0)
	{
		/* A vanished file is tolerated only when the caller allows it. */
		if (errno == ENOENT && missing_ok)
			return false;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", readfilename)));
	}

	_tarWriteHeader(tarfilename, NULL, statbuf, false);

	if (!noverify_checksums && DataChecksumsEnabled())
	{
		char	   *filename;

		/*
		 * Get the filename (excluding path).  As last_dir_separator()
		 * includes the last directory separator, we chop that off by
		 * incrementing the pointer.
		 */
		filename = last_dir_separator(readfilename) + 1;

		if (is_checksummed_file(readfilename, filename))
		{
			verify_checksum = true;

			/*
			 * Cut off at the segment boundary (".") to get the segment number
			 * in order to mix it into the checksum.
			 */
			segmentpath = strstr(filename, ".");
			if (segmentpath != NULL)
			{
				segmentno = atoi(segmentpath + 1);
				/* atoi() yields 0 for non-numeric text; ".0" is rejected too */
				if (segmentno == 0)
					ereport(ERROR,
							(errmsg("invalid segment number %d in file \"%s\"",
									segmentno, filename)));
			}
		}
	}

	/*
	 * Loop until we read the amount of data the caller told us to expect. The
	 * file could be longer, if it was extended while we were sending it, but
	 * for a base backup we can ignore such extended data. It will be restored
	 * from WAL.
	 */
	while (len < statbuf->st_size)
	{
		/* Try to read some more data. */
		cnt = basebackup_read_file(fd, buf,
								   Min(sizeof(buf), statbuf->st_size - len),
								   len, readfilename, true);

		/*
		 * If we hit end-of-file, a concurrent truncation must have occurred.
		 * That's not an error condition, because WAL replay will fix things
		 * up.
		 */
		if (cnt == 0)
			break;

		/*
		 * The checksums are verified at block level, so we iterate over the
		 * buffer in chunks of BLCKSZ, after making sure that
		 * TAR_SEND_SIZE/buf is divisible by BLCKSZ and we read a multiple of
		 * BLCKSZ bytes.
		 */
		Assert(TAR_SEND_SIZE % BLCKSZ == 0);

		/*
		 * A read that is not a whole number of blocks disables verification
		 * for the remainder of this file; the data is still sent.
		 */
		if (verify_checksum && (cnt % BLCKSZ != 0))
		{
			ereport(WARNING,
					(errmsg("could not verify checksum in file \"%s\", block "
							"%u: read buffer size %d and page size %d "
							"differ",
							readfilename, blkno, (int) cnt, BLCKSZ)));
			verify_checksum = false;
		}

		if (verify_checksum)
		{
			for (i = 0; i < cnt / BLCKSZ; i++)
			{
				page = buf + BLCKSZ * i;

				/*
				 * Only check pages which have not been modified since the
				 * start of the base backup. Otherwise, they might have been
				 * written only halfway and the checksum would not be valid.
				 * However, replaying WAL would reinstate the correct page in
				 * this case. We also skip completely new pages, since they
				 * don't have a checksum yet.
				 */
				if (!PageIsNew(page) && PageGetLSN(page) < startptr)
				{
					/* The block number is relative to the whole relation. */
					checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE);
					phdr = (PageHeader) page;
					if (phdr->pd_checksum != checksum)
					{
						/*
						 * Retry the block on the first failure.  It's
						 * possible that we read the first 4K page of the
						 * block just before postgres updated the entire block
						 * so it ends up looking torn to us.  We only need to
						 * retry once because the LSN should be updated to
						 * something we can ignore on the next pass.  If the
						 * error happens again then it is a true validation
						 * failure.
						 */
						if (block_retry == false)
						{
							int			reread_cnt;

							/* Reread the failed block */
							reread_cnt =
								basebackup_read_file(fd, buf + BLCKSZ * i,
													 BLCKSZ, len + BLCKSZ * i,
													 readfilename,
													 false);
							if (reread_cnt == 0)
							{
								/*
								 * If we hit end-of-file, a concurrent
								 * truncation must have occurred, so break out
								 * of this loop just as if the initial fread()
								 * returned 0. We'll drop through to the same
								 * code that handles that case. (We must fix
								 * up cnt first, though.)
								 */
								cnt = BLCKSZ * i;
								break;
							}

							/* Set flag so we know a retry was attempted */
							block_retry = true;

							/*
							 * Reset loop to validate the block again: the
							 * decrement cancels the i++ that "continue"
							 * triggers, and blkno is left untouched.
							 */
							i--;
							continue;
						}

						checksum_failures++;

						/* Report the first five failures individually. */
						if (checksum_failures <= 5)
							ereport(WARNING,
									(errmsg("checksum verification failed in "
											"file \"%s\", block %u: calculated "
											"%X but expected %X",
											readfilename, blkno, checksum,
											phdr->pd_checksum)));
						if (checksum_failures == 5)
							ereport(WARNING,
									(errmsg("further checksum verification "
											"failures in file \"%s\" will not "
											"be reported", readfilename)));
					}
				}
				/* Done with this block; the next one gets a fresh retry. */
				block_retry = false;
				blkno++;
			}
		}

		/* Send the chunk as a CopyData message */
		if (pq_putmessage('d', buf, cnt))
			ereport(ERROR,
					(errmsg("base backup could not send data, aborting backup")));
		update_basebackup_progress(cnt);

		/* Also feed it to the checksum machinery. */
		if (pg_checksum_update(&checksum_ctx, (uint8 *) buf, cnt) < 0)
			elog(ERROR, "could not update checksum of base backup");

		len += cnt;
		throttle(cnt);
	}

	/* If the file was truncated while we were sending it, pad it with zeros */
	if (len < statbuf->st_size)
	{
		MemSet(buf, 0, sizeof(buf));
		while (len < statbuf->st_size)
		{
			cnt = Min(sizeof(buf), statbuf->st_size - len);
			/*
			 * NOTE(review): unlike the main loop above, the result of
			 * pq_putmessage() is not checked here -- confirm that is
			 * intentional.
			 */
			pq_putmessage('d', buf, cnt);
			if (pg_checksum_update(&checksum_ctx, (uint8 *) buf, cnt) < 0)
				elog(ERROR, "could not update checksum of base backup");
			update_basebackup_progress(cnt);
			len += cnt;
			throttle(cnt);
		}
	}

	/*
	 * Pad to a block boundary, per tar format requirements.  (This small piece
	 * of data is probably not worth throttling, and is not checksummed
	 * because it's not actually part of the file.)
	 */
	pad = tarPaddingBytesRequired(len);
	if (pad > 0)
	{
		MemSet(buf, 0, pad);
		/* NOTE(review): send result is ignored here as well. */
		pq_putmessage('d', buf, pad);
		update_basebackup_progress(pad);
	}

	CloseTransientFile(fd);

	/*
	 * Emit a per-file summary and report to the stats machinery, but only
	 * when more than one failure was counted for this file.
	 */
	if (checksum_failures > 1)
	{
		ereport(WARNING,
				(errmsg_plural("file \"%s\" has a total of %d checksum verification failure",
							   "file \"%s\" has a total of %d checksum verification failures",
							   checksum_failures,
							   readfilename, checksum_failures)));

		pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
	}

	/* Every failure, including a single one, counts toward the backup total. */
	total_checksum_failures += checksum_failures;

	AddFileToBackupManifest(manifest, spcoid, tarfilename, statbuf->st_size,
							(pg_time_t) statbuf->st_mtime, &checksum_ctx);

	return true;
}
format: \"%s\"", + filename))); + break; + case TAR_SYMLINK_TOO_LONG: + ereport(ERROR, + (errmsg("symbolic link target too long for tar format: " + "file name \"%s\", target \"%s\"", + filename, linktarget))); + break; + default: + elog(ERROR, "unrecognized tar error: %d", rc); + } + + pq_putmessage('d', h, sizeof(h)); + update_basebackup_progress(sizeof(h)); + } + + return sizeof(h); +} + +/* + * Write tar header for a directory. If the entry in statbuf is a link then + * write it as a directory anyway. + */ +static int64 +_tarWriteDir(const char *pathbuf, int basepathlen, struct stat *statbuf, + bool sizeonly) +{ + /* If symlink, write it as a directory anyway */ +#ifndef WIN32 + if (S_ISLNK(statbuf->st_mode)) +#else + if (pgwin32_is_junction(pathbuf)) +#endif + statbuf->st_mode = S_IFDIR | pg_dir_create_mode; + + return _tarWriteHeader(pathbuf + basepathlen + 1, NULL, statbuf, sizeonly); +} + +/* + * Increment the network transfer counter by the given number of bytes, + * and sleep if necessary to comply with the requested network transfer + * rate. + */ +static void +throttle(size_t increment) +{ + TimeOffset elapsed_min; + + if (throttling_counter < 0) + return; + + throttling_counter += increment; + if (throttling_counter < throttling_sample) + return; + + /* How much time should have elapsed at minimum? */ + elapsed_min = elapsed_min_unit * + (throttling_counter / throttling_sample); + + /* + * Since the latch could be set repeatedly because of concurrently WAL + * activity, sleep in a loop to ensure enough time has passed. + */ + for (;;) + { + TimeOffset elapsed, + sleep; + int wait_result; + + /* Time elapsed since the last measurement (and possible wake up). 
*/ + elapsed = GetCurrentTimestamp() - throttled_last; + + /* sleep if the transfer is faster than it should be */ + sleep = elapsed_min - elapsed; + if (sleep <= 0) + break; + + ResetLatch(MyLatch); + + /* We're eating a potentially set latch, so check for interrupts */ + CHECK_FOR_INTERRUPTS(); + + /* + * (TAR_SEND_SIZE / throttling_sample * elapsed_min_unit) should be + * the maximum time to sleep. Thus the cast to long is safe. + */ + wait_result = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + (long) (sleep / 1000), + WAIT_EVENT_BASE_BACKUP_THROTTLE); + + if (wait_result & WL_LATCH_SET) + CHECK_FOR_INTERRUPTS(); + + /* Done waiting? */ + if (wait_result & WL_TIMEOUT) + break; + } + + /* + * As we work with integers, only whole multiple of throttling_sample was + * processed. The rest will be done during the next call of this function. + */ + throttling_counter %= throttling_sample; + + /* + * Time interval for the remaining amount and possible next increments + * starts now. + */ + throttled_last = GetCurrentTimestamp(); +} + +/* + * Increment the counter for the amount of data already streamed + * by the given number of bytes, and update the progress report for + * pg_stat_progress_basebackup. + */ +static void +update_basebackup_progress(int64 delta) +{ + const int index[] = { + PROGRESS_BASEBACKUP_BACKUP_STREAMED, + PROGRESS_BASEBACKUP_BACKUP_TOTAL + }; + int64 val[2]; + int nparam = 0; + + backup_streamed += delta; + val[nparam++] = backup_streamed; + + /* + * Avoid overflowing past 100% or the full size. This may make the total + * size number change as we approach the end of the backup (the estimate + * will always be wrong if WAL is included), but that's better than having + * the done column be bigger than the total. 
+ */ + if (backup_total > -1 && backup_streamed > backup_total) + { + backup_total = backup_streamed; + val[nparam++] = backup_total; + } + + pgstat_progress_update_multi_param(nparam, index, val); +} + +/* + * Read some data from a file, setting a wait event and reporting any error + * encountered. + * + * If partial_read_ok is false, also report an error if the number of bytes + * read is not equal to the number of bytes requested. + * + * Returns the number of bytes read. + */ +static int +basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset, + const char *filename, bool partial_read_ok) +{ + int rc; + + pgstat_report_wait_start(WAIT_EVENT_BASEBACKUP_READ); + rc = pg_pread(fd, buf, nbytes, offset); + pgstat_report_wait_end(); + + if (rc < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", filename))); + if (!partial_read_ok && rc > 0 && rc != nbytes) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": read %d of %zu", + filename, rc, nbytes))); + + return rc; +} diff --git a/src/backend/replication/libpqwalreceiver/Makefile b/src/backend/replication/libpqwalreceiver/Makefile new file mode 100644 index 0000000..f26daa1 --- /dev/null +++ b/src/backend/replication/libpqwalreceiver/Makefile @@ -0,0 +1,37 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for src/backend/replication/libpqwalreceiver +# +# IDENTIFICATION +# src/backend/replication/libpqwalreceiver/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/replication/libpqwalreceiver +top_builddir = ../../../.. 
+include $(top_builddir)/src/Makefile.global + +override CPPFLAGS := -I$(srcdir) -I$(libpq_srcdir) $(CPPFLAGS) + +OBJS = \ + $(WIN32RES) \ + libpqwalreceiver.o +SHLIB_LINK_INTERNAL = $(libpq) +SHLIB_LINK = $(filter -lintl, $(LIBS)) +SHLIB_PREREQS = submake-libpq +PGFILEDESC = "libpqwalreceiver - receive WAL during streaming replication" +NAME = libpqwalreceiver + +all: all-shared-lib + +include $(top_srcdir)/src/Makefile.shlib + +install: all installdirs install-lib + +installdirs: installdirs-lib + +uninstall: uninstall-lib + +clean distclean maintainer-clean: clean-lib + rm -f $(OBJS) diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c new file mode 100644 index 0000000..6eaa84a --- /dev/null +++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c @@ -0,0 +1,1112 @@ +/*------------------------------------------------------------------------- + * + * libpqwalreceiver.c + * + * This file contains the libpq-specific parts of walreceiver. It's + * loaded as a dynamic module to avoid linking the main server binary with + * libpq. 
+ * + * Portions Copyright (c) 2010-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include + +#include "access/xlog.h" +#include "catalog/pg_type.h" +#include "common/connect.h" +#include "funcapi.h" +#include "libpq-fe.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "pqexpbuffer.h" +#include "replication/walreceiver.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/pg_lsn.h" +#include "utils/tuplestore.h" + +PG_MODULE_MAGIC; + +void _PG_init(void); + +struct WalReceiverConn +{ + /* Current connection to the primary, if any */ + PGconn *streamConn; + /* Used to remember if the connection is logical or physical */ + bool logical; + /* Buffer for currently read records */ + char *recvBuf; +}; + +/* Prototypes for interface functions */ +static WalReceiverConn *libpqrcv_connect(const char *conninfo, + bool logical, const char *appname, + char **err); +static void libpqrcv_check_conninfo(const char *conninfo); +static char *libpqrcv_get_conninfo(WalReceiverConn *conn); +static void libpqrcv_get_senderinfo(WalReceiverConn *conn, + char **sender_host, int *sender_port); +static char *libpqrcv_identify_system(WalReceiverConn *conn, + TimeLineID *primary_tli); +static int libpqrcv_server_version(WalReceiverConn *conn); +static void libpqrcv_readtimelinehistoryfile(WalReceiverConn *conn, + TimeLineID tli, char **filename, + char **content, int *len); +static bool libpqrcv_startstreaming(WalReceiverConn *conn, + const WalRcvStreamOptions *options); +static void libpqrcv_endstreaming(WalReceiverConn *conn, + TimeLineID *next_tli); +static int libpqrcv_receive(WalReceiverConn *conn, char **buffer, + pgsocket *wait_fd); +static void libpqrcv_send(WalReceiverConn *conn, const char *buffer, + int nbytes); 
+static char *libpqrcv_create_slot(WalReceiverConn *conn, + const char *slotname, + bool temporary, + CRSSnapshotAction snapshot_action, + XLogRecPtr *lsn); +static pid_t libpqrcv_get_backend_pid(WalReceiverConn *conn); +static WalRcvExecResult *libpqrcv_exec(WalReceiverConn *conn, + const char *query, + const int nRetTypes, + const Oid *retTypes); +static void libpqrcv_disconnect(WalReceiverConn *conn); + +static WalReceiverFunctionsType PQWalReceiverFunctions = { + libpqrcv_connect, + libpqrcv_check_conninfo, + libpqrcv_get_conninfo, + libpqrcv_get_senderinfo, + libpqrcv_identify_system, + libpqrcv_server_version, + libpqrcv_readtimelinehistoryfile, + libpqrcv_startstreaming, + libpqrcv_endstreaming, + libpqrcv_receive, + libpqrcv_send, + libpqrcv_create_slot, + libpqrcv_get_backend_pid, + libpqrcv_exec, + libpqrcv_disconnect +}; + +/* Prototypes for private functions */ +static PGresult *libpqrcv_PQexec(PGconn *streamConn, const char *query); +static PGresult *libpqrcv_PQgetResult(PGconn *streamConn); +static char *stringlist_to_identifierstr(PGconn *conn, List *strings); + +/* + * Module initialization function + */ +void +_PG_init(void) +{ + if (WalReceiverFunctions != NULL) + elog(ERROR, "libpqwalreceiver already loaded"); + WalReceiverFunctions = &PQWalReceiverFunctions; +} + +/* + * Establish the connection to the primary server for XLOG streaming + * + * Returns NULL on error and fills the err with palloc'ed error message. + */ +static WalReceiverConn * +libpqrcv_connect(const char *conninfo, bool logical, const char *appname, + char **err) +{ + WalReceiverConn *conn; + PostgresPollingStatusType status; + const char *keys[5]; + const char *vals[5]; + int i = 0; + + /* + * We use the expand_dbname parameter to process the connection string (or + * URI), and pass some extra options. + */ + keys[i] = "dbname"; + vals[i] = conninfo; + keys[++i] = "replication"; + vals[i] = logical ? 
"database" : "true"; + if (!logical) + { + /* + * The database name is ignored by the server in replication mode, but + * specify "replication" for .pgpass lookup. + */ + keys[++i] = "dbname"; + vals[i] = "replication"; + } + keys[++i] = "fallback_application_name"; + vals[i] = appname; + if (logical) + { + keys[++i] = "client_encoding"; + vals[i] = GetDatabaseEncodingName(); + } + keys[++i] = NULL; + vals[i] = NULL; + + Assert(i < sizeof(keys)); + + conn = palloc0(sizeof(WalReceiverConn)); + conn->streamConn = PQconnectStartParams(keys, vals, + /* expand_dbname = */ true); + if (PQstatus(conn->streamConn) == CONNECTION_BAD) + { + *err = pchomp(PQerrorMessage(conn->streamConn)); + return NULL; + } + + /* + * Poll connection until we have OK or FAILED status. + * + * Per spec for PQconnectPoll, first wait till socket is write-ready. + */ + status = PGRES_POLLING_WRITING; + do + { + int io_flag; + int rc; + + if (status == PGRES_POLLING_READING) + io_flag = WL_SOCKET_READABLE; +#ifdef WIN32 + /* Windows needs a different test while waiting for connection-made */ + else if (PQstatus(conn->streamConn) == CONNECTION_STARTED) + io_flag = WL_SOCKET_CONNECTED; +#endif + else + io_flag = WL_SOCKET_WRITEABLE; + + rc = WaitLatchOrSocket(MyLatch, + WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | io_flag, + PQsocket(conn->streamConn), + 0, + WAIT_EVENT_LIBPQWALRECEIVER_CONNECT); + + /* Interrupted? 
*/ + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + ProcessWalRcvInterrupts(); + } + + /* If socket is ready, advance the libpq state machine */ + if (rc & io_flag) + status = PQconnectPoll(conn->streamConn); + } while (status != PGRES_POLLING_OK && status != PGRES_POLLING_FAILED); + + if (PQstatus(conn->streamConn) != CONNECTION_OK) + { + *err = pchomp(PQerrorMessage(conn->streamConn)); + return NULL; + } + + if (logical) + { + PGresult *res; + + res = libpqrcv_PQexec(conn->streamConn, + ALWAYS_SECURE_SEARCH_PATH_SQL); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + PQclear(res); + ereport(ERROR, + (errmsg("could not clear search path: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + } + PQclear(res); + } + + conn->logical = logical; + + return conn; +} + +/* + * Validate connection info string (just try to parse it) + */ +static void +libpqrcv_check_conninfo(const char *conninfo) +{ + PQconninfoOption *opts = NULL; + char *err = NULL; + + opts = PQconninfoParse(conninfo, &err); + if (opts == NULL) + { + /* The error string is malloc'd, so we must free it explicitly */ + char *errcopy = err ? pstrdup(err) : "out of memory"; + + PQfreemem(err); + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid connection string syntax: %s", errcopy))); + } + + PQconninfoFree(opts); +} + +/* + * Return a user-displayable conninfo string. Any security-sensitive fields + * are obfuscated. 
+ */ +static char * +libpqrcv_get_conninfo(WalReceiverConn *conn) +{ + PQconninfoOption *conn_opts; + PQconninfoOption *conn_opt; + PQExpBufferData buf; + char *retval; + + Assert(conn->streamConn != NULL); + + initPQExpBuffer(&buf); + conn_opts = PQconninfo(conn->streamConn); + + if (conn_opts == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("could not parse connection string: %s", + _("out of memory")))); + + /* build a clean connection string from pieces */ + for (conn_opt = conn_opts; conn_opt->keyword != NULL; conn_opt++) + { + bool obfuscate; + + /* Skip debug and empty options */ + if (strchr(conn_opt->dispchar, 'D') || + conn_opt->val == NULL || + conn_opt->val[0] == '\0') + continue; + + /* Obfuscate security-sensitive options */ + obfuscate = strchr(conn_opt->dispchar, '*') != NULL; + + appendPQExpBuffer(&buf, "%s%s=%s", + buf.len == 0 ? "" : " ", + conn_opt->keyword, + obfuscate ? "********" : conn_opt->val); + } + + PQconninfoFree(conn_opts); + + retval = PQExpBufferDataBroken(buf) ? NULL : pstrdup(buf.data); + termPQExpBuffer(&buf); + return retval; +} + +/* + * Provides information of sender this WAL receiver is connected to. + */ +static void +libpqrcv_get_senderinfo(WalReceiverConn *conn, char **sender_host, + int *sender_port) +{ + char *ret = NULL; + + *sender_host = NULL; + *sender_port = 0; + + Assert(conn->streamConn != NULL); + + ret = PQhost(conn->streamConn); + if (ret && strlen(ret) != 0) + *sender_host = pstrdup(ret); + + ret = PQport(conn->streamConn); + if (ret && strlen(ret) != 0) + *sender_port = atoi(ret); +} + +/* + * Check that primary's system identifier matches ours, and fetch the current + * timeline ID of the primary. + */ +static char * +libpqrcv_identify_system(WalReceiverConn *conn, TimeLineID *primary_tli) +{ + PGresult *res; + char *primary_sysid; + + /* + * Get the system identifier and timeline ID as a DataRow message from the + * primary server. 
+ */ + res = libpqrcv_PQexec(conn->streamConn, "IDENTIFY_SYSTEM"); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("could not receive database system identifier and timeline ID from " + "the primary server: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + } + if (PQnfields(res) < 3 || PQntuples(res) != 1) + { + int ntuples = PQntuples(res); + int nfields = PQnfields(res); + + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid response from primary server"), + errdetail("Could not identify system: got %d rows and %d fields, expected %d rows and %d or more fields.", + ntuples, nfields, 3, 1))); + } + primary_sysid = pstrdup(PQgetvalue(res, 0, 0)); + *primary_tli = pg_strtoint32(PQgetvalue(res, 0, 1)); + PQclear(res); + + return primary_sysid; +} + +/* + * Thin wrapper around libpq to obtain server version. + */ +static int +libpqrcv_server_version(WalReceiverConn *conn) +{ + return PQserverVersion(conn->streamConn); +} + +/* + * Start streaming WAL data from given streaming options. + * + * Returns true if we switched successfully to copy-both mode. False + * means the server received the command and executed it successfully, but + * didn't switch to copy-mode. That means that there was no WAL on the + * requested timeline and starting point, because the server switched to + * another timeline at or before the requested starting point. On failure, + * throws an ERROR. + */ +static bool +libpqrcv_startstreaming(WalReceiverConn *conn, + const WalRcvStreamOptions *options) +{ + StringInfoData cmd; + PGresult *res; + + Assert(options->logical == conn->logical); + Assert(options->slotname || !options->logical); + + initStringInfo(&cmd); + + /* Build the command. 
*/ + appendStringInfoString(&cmd, "START_REPLICATION"); + if (options->slotname != NULL) + appendStringInfo(&cmd, " SLOT \"%s\"", + options->slotname); + + if (options->logical) + appendStringInfoString(&cmd, " LOGICAL"); + + appendStringInfo(&cmd, " %X/%X", LSN_FORMAT_ARGS(options->startpoint)); + + /* + * Additional options are different depending on if we are doing logical + * or physical replication. + */ + if (options->logical) + { + char *pubnames_str; + List *pubnames; + char *pubnames_literal; + + appendStringInfoString(&cmd, " ("); + + appendStringInfo(&cmd, "proto_version '%u'", + options->proto.logical.proto_version); + + if (options->proto.logical.streaming && + PQserverVersion(conn->streamConn) >= 140000) + appendStringInfoString(&cmd, ", streaming 'on'"); + + pubnames = options->proto.logical.publication_names; + pubnames_str = stringlist_to_identifierstr(conn->streamConn, pubnames); + if (!pubnames_str) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), /* likely guess */ + errmsg("could not start WAL streaming: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + pubnames_literal = PQescapeLiteral(conn->streamConn, pubnames_str, + strlen(pubnames_str)); + if (!pubnames_literal) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), /* likely guess */ + errmsg("could not start WAL streaming: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + appendStringInfo(&cmd, ", publication_names %s", pubnames_literal); + PQfreemem(pubnames_literal); + pfree(pubnames_str); + + if (options->proto.logical.binary && + PQserverVersion(conn->streamConn) >= 140000) + appendStringInfoString(&cmd, ", binary 'true'"); + + appendStringInfoChar(&cmd, ')'); + } + else + appendStringInfo(&cmd, " TIMELINE %u", + options->proto.physical.startpointTLI); + + /* Start streaming. 
*/ + res = libpqrcv_PQexec(conn->streamConn, cmd.data); + pfree(cmd.data); + + if (PQresultStatus(res) == PGRES_COMMAND_OK) + { + PQclear(res); + return false; + } + else if (PQresultStatus(res) != PGRES_COPY_BOTH) + { + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("could not start WAL streaming: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + } + PQclear(res); + return true; +} + +/* + * Stop streaming WAL data. Returns the next timeline's ID in *next_tli, as + * reported by the server, or 0 if it did not report it. + */ +static void +libpqrcv_endstreaming(WalReceiverConn *conn, TimeLineID *next_tli) +{ + PGresult *res; + + /* + * Send copy-end message. As in libpqrcv_PQexec, this could theoretically + * block, but the risk seems small. + */ + if (PQputCopyEnd(conn->streamConn, NULL) <= 0 || + PQflush(conn->streamConn)) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not send end-of-streaming message to primary: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + + *next_tli = 0; + + /* + * After COPY is finished, we should receive a result set indicating the + * next timeline's ID, or just CommandComplete if the server was shut + * down. + * + * If we had not yet received CopyDone from the backend, PGRES_COPY_OUT is + * also possible in case we aborted the copy in mid-stream. + */ + res = libpqrcv_PQgetResult(conn->streamConn); + if (PQresultStatus(res) == PGRES_TUPLES_OK) + { + /* + * Read the next timeline's ID. The server also sends the timeline's + * starting point, but it is ignored. 
+ */ + if (PQnfields(res) < 2 || PQntuples(res) != 1) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected result set after end-of-streaming"))); + *next_tli = pg_strtoint32(PQgetvalue(res, 0, 0)); + PQclear(res); + + /* the result set should be followed by CommandComplete */ + res = libpqrcv_PQgetResult(conn->streamConn); + } + else if (PQresultStatus(res) == PGRES_COPY_OUT) + { + PQclear(res); + + /* End the copy */ + if (PQendcopy(conn->streamConn)) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("error while shutting down streaming COPY: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + + /* CommandComplete should follow */ + res = libpqrcv_PQgetResult(conn->streamConn); + } + + if (PQresultStatus(res) != PGRES_COMMAND_OK) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("error reading result of streaming command: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + PQclear(res); + + /* Verify that there are no more results */ + res = libpqrcv_PQgetResult(conn->streamConn); + if (res != NULL) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected result after CommandComplete: %s", + pchomp(PQerrorMessage(conn->streamConn))))); +} + +/* + * Fetch the timeline history file for 'tli' from primary. + */ +static void +libpqrcv_readtimelinehistoryfile(WalReceiverConn *conn, + TimeLineID tli, char **filename, + char **content, int *len) +{ + PGresult *res; + char cmd[64]; + + Assert(!conn->logical); + + /* + * Request the primary to send over the history file for given timeline. 
+ */ + snprintf(cmd, sizeof(cmd), "TIMELINE_HISTORY %u", tli); + res = libpqrcv_PQexec(conn->streamConn, cmd); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("could not receive timeline history file from " + "the primary server: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + } + if (PQnfields(res) != 2 || PQntuples(res) != 1) + { + int ntuples = PQntuples(res); + int nfields = PQnfields(res); + + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid response from primary server"), + errdetail("Expected 1 tuple with 2 fields, got %d tuples with %d fields.", + ntuples, nfields))); + } + *filename = pstrdup(PQgetvalue(res, 0, 0)); + + *len = PQgetlength(res, 0, 1); + *content = palloc(*len); + memcpy(*content, PQgetvalue(res, 0, 1), *len); + PQclear(res); +} + +/* + * Send a query and wait for the results by using the asynchronous libpq + * functions and socket readiness events. + * + * We must not use the regular blocking libpq functions like PQexec() + * since they are uninterruptible by signals on some platforms, such as + * Windows. + * + * The function is modeled on PQexec() in libpq, but only implements + * those parts that are in use in the walreceiver api. + * + * May return NULL, rather than an error result, on failure. + */ +static PGresult * +libpqrcv_PQexec(PGconn *streamConn, const char *query) +{ + PGresult *lastResult = NULL; + + /* + * PQexec() silently discards any prior query results on the connection. + * This is not required for this function as it's expected that the caller + * (which is this library in all cases) will behave correctly and we don't + * have to be backwards compatible with old libpq. + */ + + /* + * Submit the query. Since we don't use non-blocking mode, this could + * theoretically block. In practice, since we don't send very long query + * strings, the risk seems negligible. 
+ */ + if (!PQsendQuery(streamConn, query)) + return NULL; + + for (;;) + { + /* Wait for, and collect, the next PGresult. */ + PGresult *result; + + result = libpqrcv_PQgetResult(streamConn); + if (result == NULL) + break; /* query is complete, or failure */ + + /* + * Emulate PQexec()'s behavior of returning the last result when there + * are many. We are fine with returning just last error message. + */ + PQclear(lastResult); + lastResult = result; + + if (PQresultStatus(lastResult) == PGRES_COPY_IN || + PQresultStatus(lastResult) == PGRES_COPY_OUT || + PQresultStatus(lastResult) == PGRES_COPY_BOTH || + PQstatus(streamConn) == CONNECTION_BAD) + break; + } + + return lastResult; +} + +/* + * Perform the equivalent of PQgetResult(), but watch for interrupts. + */ +static PGresult * +libpqrcv_PQgetResult(PGconn *streamConn) +{ + /* + * Collect data until PQgetResult is ready to get the result without + * blocking. + */ + while (PQisBusy(streamConn)) + { + int rc; + + /* + * We don't need to break down the sleep into smaller increments, + * since we'll get interrupted by signals and can handle any + * interrupts here. + */ + rc = WaitLatchOrSocket(MyLatch, + WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE | + WL_LATCH_SET, + PQsocket(streamConn), + 0, + WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE); + + /* Interrupted? */ + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + ProcessWalRcvInterrupts(); + } + + /* Consume whatever data is available from the socket */ + if (PQconsumeInput(streamConn) == 0) + { + /* trouble; return NULL */ + return NULL; + } + } + + /* Now we can collect and return the next PGresult */ + return PQgetResult(streamConn); +} + +/* + * Disconnect connection to primary, if any. + */ +static void +libpqrcv_disconnect(WalReceiverConn *conn) +{ + PQfinish(conn->streamConn); + if (conn->recvBuf != NULL) + PQfreemem(conn->recvBuf); + pfree(conn); +} + +/* + * Receive a message available from XLOG stream. 
+ *
+ * Returns:
+ *
+ *	 If data was received, returns the length of the data. *buffer is set to
+ *	 point to a buffer holding the received message. The buffer is only valid
+ *	 until the next libpqrcv_* call.
+ *
+ *	 If no data was available immediately, returns 0, and *wait_fd is set to a
+ *	 socket descriptor which can be waited on before trying again.
+ *
+ *	 -1 if the server ended the COPY.
+ *
+ * ereports on error.
+ */
+static int
+libpqrcv_receive(WalReceiverConn *conn, char **buffer,
+				 pgsocket *wait_fd)
+{
+	int			rawlen;
+
+	if (conn->recvBuf != NULL)
+		PQfreemem(conn->recvBuf);
+	conn->recvBuf = NULL;
+
+	/* Try to receive a CopyData message */
+	rawlen = PQgetCopyData(conn->streamConn, &conn->recvBuf, 1);
+	if (rawlen == 0)
+	{
+		/* Try consuming some data. */
+		if (PQconsumeInput(conn->streamConn) == 0)
+			ereport(ERROR,
+					(errcode(ERRCODE_CONNECTION_FAILURE),
+					 errmsg("could not receive data from WAL stream: %s",
+							pchomp(PQerrorMessage(conn->streamConn)))));
+
+		/* Now that we've consumed some input, try again */
+		rawlen = PQgetCopyData(conn->streamConn, &conn->recvBuf, 1);
+		if (rawlen == 0)
+		{
+			/* Tell caller to try again when our socket is ready. */
+			*wait_fd = PQsocket(conn->streamConn);
+			return 0;
+		}
+	}
+	if (rawlen == -1)			/* end-of-streaming or error */
+	{
+		PGresult   *res;
+
+		res = libpqrcv_PQgetResult(conn->streamConn);
+		if (PQresultStatus(res) == PGRES_COMMAND_OK)
+		{
+			PQclear(res);
+
+			/* Verify that there are no more results. */
+			res = libpqrcv_PQgetResult(conn->streamConn);
+			if (res != NULL)
+			{
+				PQclear(res);
+
+				/*
+				 * If the other side closed the connection orderly (otherwise
+				 * we'd have seen an error, or PGRES_COPY_IN) don't report an
+				 * error here, but let callers deal with it.
+				 */
+				if (PQstatus(conn->streamConn) == CONNECTION_BAD)
+					return -1;
+
+				ereport(ERROR,
+						(errcode(ERRCODE_PROTOCOL_VIOLATION),
+						 errmsg("unexpected result after CommandComplete: %s",
+								pchomp(PQerrorMessage(conn->streamConn)))));
+			}
+
+			return -1;
+		}
+		else if (PQresultStatus(res) == PGRES_COPY_IN)
+		{
+			PQclear(res);
+			return -1;
+		}
+		else
+		{
+			PQclear(res);
+			ereport(ERROR,
+					(errcode(ERRCODE_PROTOCOL_VIOLATION),
+					 errmsg("could not receive data from WAL stream: %s",
+							pchomp(PQerrorMessage(conn->streamConn)))));
+		}
+	}
+	if (rawlen < -1)
+		ereport(ERROR,
+				(errcode(ERRCODE_PROTOCOL_VIOLATION),
+				 errmsg("could not receive data from WAL stream: %s",
+						pchomp(PQerrorMessage(conn->streamConn)))));
+
+	/* Return received messages to caller */
+	*buffer = conn->recvBuf;
+	return rawlen;
+}
+
+/*
+ * Send a message to XLOG stream.
+ *
+ * ereports on error.
+ */
+static void
+libpqrcv_send(WalReceiverConn *conn, const char *buffer, int nbytes)
+{
+	if (PQputCopyData(conn->streamConn, buffer, nbytes) <= 0 ||
+		PQflush(conn->streamConn))
+		ereport(ERROR,
+				(errcode(ERRCODE_CONNECTION_FAILURE),
+				 errmsg("could not send data to WAL stream: %s",
+						pchomp(PQerrorMessage(conn->streamConn)))));
+}
+
+/*
+ * Create new replication slot.
+ * Returns the name of the exported snapshot for logical slot or NULL for
+ * physical slot.
+ */ +static char * +libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname, + bool temporary, CRSSnapshotAction snapshot_action, + XLogRecPtr *lsn) +{ + PGresult *res; + StringInfoData cmd; + char *snapshot; + + initStringInfo(&cmd); + + appendStringInfo(&cmd, "CREATE_REPLICATION_SLOT \"%s\"", slotname); + + if (temporary) + appendStringInfoString(&cmd, " TEMPORARY"); + + if (conn->logical) + { + appendStringInfoString(&cmd, " LOGICAL pgoutput"); + switch (snapshot_action) + { + case CRS_EXPORT_SNAPSHOT: + appendStringInfoString(&cmd, " EXPORT_SNAPSHOT"); + break; + case CRS_NOEXPORT_SNAPSHOT: + appendStringInfoString(&cmd, " NOEXPORT_SNAPSHOT"); + break; + case CRS_USE_SNAPSHOT: + appendStringInfoString(&cmd, " USE_SNAPSHOT"); + break; + } + } + else + { + appendStringInfoString(&cmd, " PHYSICAL RESERVE_WAL"); + } + + res = libpqrcv_PQexec(conn->streamConn, cmd.data); + pfree(cmd.data); + + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("could not create replication slot \"%s\": %s", + slotname, pchomp(PQerrorMessage(conn->streamConn))))); + } + + if (lsn) + *lsn = DatumGetLSN(DirectFunctionCall1Coll(pg_lsn_in, InvalidOid, + CStringGetDatum(PQgetvalue(res, 0, 1)))); + + if (!PQgetisnull(res, 0, 2)) + snapshot = pstrdup(PQgetvalue(res, 0, 2)); + else + snapshot = NULL; + + PQclear(res); + + return snapshot; +} + +/* + * Return PID of remote backend process. + */ +static pid_t +libpqrcv_get_backend_pid(WalReceiverConn *conn) +{ + return PQbackendPID(conn->streamConn); +} + +/* + * Convert tuple query result to tuplestore. + */ +static void +libpqrcv_processTuples(PGresult *pgres, WalRcvExecResult *walres, + const int nRetTypes, const Oid *retTypes) +{ + int tupn; + int coln; + int nfields = PQnfields(pgres); + HeapTuple tuple; + AttInMetadata *attinmeta; + MemoryContext rowcontext; + MemoryContext oldcontext; + + /* Make sure we got expected number of fields. 
*/ + if (nfields != nRetTypes) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid query response"), + errdetail("Expected %d fields, got %d fields.", + nRetTypes, nfields))); + + walres->tuplestore = tuplestore_begin_heap(true, false, work_mem); + + /* Create tuple descriptor corresponding to expected result. */ + walres->tupledesc = CreateTemplateTupleDesc(nRetTypes); + for (coln = 0; coln < nRetTypes; coln++) + TupleDescInitEntry(walres->tupledesc, (AttrNumber) coln + 1, + PQfname(pgres, coln), retTypes[coln], -1, 0); + attinmeta = TupleDescGetAttInMetadata(walres->tupledesc); + + /* No point in doing more here if there were no tuples returned. */ + if (PQntuples(pgres) == 0) + return; + + /* Create temporary context for local allocations. */ + rowcontext = AllocSetContextCreate(CurrentMemoryContext, + "libpqrcv query result context", + ALLOCSET_DEFAULT_SIZES); + + /* Process returned rows. */ + for (tupn = 0; tupn < PQntuples(pgres); tupn++) + { + char *cstrs[MaxTupleAttributeNumber]; + + ProcessWalRcvInterrupts(); + + /* Do the allocations in temporary context. */ + oldcontext = MemoryContextSwitchTo(rowcontext); + + /* + * Fill cstrs with null-terminated strings of column values. + */ + for (coln = 0; coln < nfields; coln++) + { + if (PQgetisnull(pgres, tupn, coln)) + cstrs[coln] = NULL; + else + cstrs[coln] = PQgetvalue(pgres, tupn, coln); + } + + /* Convert row to a tuple, and add it to the tuplestore */ + tuple = BuildTupleFromCStrings(attinmeta, cstrs); + tuplestore_puttuple(walres->tuplestore, tuple); + + /* Clean up */ + MemoryContextSwitchTo(oldcontext); + MemoryContextReset(rowcontext); + } + + MemoryContextDelete(rowcontext); +} + +/* + * Public interface for sending generic queries (and commands). + * + * This can only be called from process connected to database. 
+ */ +static WalRcvExecResult * +libpqrcv_exec(WalReceiverConn *conn, const char *query, + const int nRetTypes, const Oid *retTypes) +{ + PGresult *pgres = NULL; + WalRcvExecResult *walres = palloc0(sizeof(WalRcvExecResult)); + char *diag_sqlstate; + + if (MyDatabaseId == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("the query interface requires a database connection"))); + + pgres = libpqrcv_PQexec(conn->streamConn, query); + + switch (PQresultStatus(pgres)) + { + case PGRES_SINGLE_TUPLE: + case PGRES_TUPLES_OK: + walres->status = WALRCV_OK_TUPLES; + libpqrcv_processTuples(pgres, walres, nRetTypes, retTypes); + break; + + case PGRES_COPY_IN: + walres->status = WALRCV_OK_COPY_IN; + break; + + case PGRES_COPY_OUT: + walres->status = WALRCV_OK_COPY_OUT; + break; + + case PGRES_COPY_BOTH: + walres->status = WALRCV_OK_COPY_BOTH; + break; + + case PGRES_COMMAND_OK: + walres->status = WALRCV_OK_COMMAND; + break; + + /* Empty query is considered error. */ + case PGRES_EMPTY_QUERY: + walres->status = WALRCV_ERROR; + walres->err = _("empty query"); + break; + + case PGRES_PIPELINE_SYNC: + case PGRES_PIPELINE_ABORTED: + walres->status = WALRCV_ERROR; + walres->err = _("unexpected pipeline mode"); + break; + + case PGRES_NONFATAL_ERROR: + case PGRES_FATAL_ERROR: + case PGRES_BAD_RESPONSE: + walres->status = WALRCV_ERROR; + walres->err = pchomp(PQerrorMessage(conn->streamConn)); + diag_sqlstate = PQresultErrorField(pgres, PG_DIAG_SQLSTATE); + if (diag_sqlstate) + walres->sqlstate = MAKE_SQLSTATE(diag_sqlstate[0], + diag_sqlstate[1], + diag_sqlstate[2], + diag_sqlstate[3], + diag_sqlstate[4]); + break; + } + + PQclear(pgres); + + return walres; +} + +/* + * Given a List of strings, return it as single comma separated + * string, quoting identifiers as needed. + * + * This is essentially the reverse of SplitIdentifierString. + * + * The caller should free the result. 
+ */
+static char *
+stringlist_to_identifierstr(PGconn *conn, List *strings)
+{
+	ListCell   *lc;
+	StringInfoData res;
+	bool		first = true;
+
+	initStringInfo(&res);
+
+	foreach(lc, strings)
+	{
+		char	   *val = strVal(lfirst(lc));
+		char	   *val_escaped;
+
+		if (first)
+			first = false;
+		else
+			appendStringInfoChar(&res, ',');
+
+		val_escaped = PQescapeIdentifier(conn, val, strlen(val));
+		if (!val_escaped)
+		{
+			pfree(res.data);	/* StringInfo buffer: pfree(), not free() */
+			return NULL;
+		}
+		appendStringInfoString(&res, val_escaped);
+		PQfreemem(val_escaped);
+	}
+
+	return res.data;
+}
diff --git a/src/backend/replication/logical/Makefile b/src/backend/replication/logical/Makefile
new file mode 100644
index 0000000..c4e2fde
--- /dev/null
+++ b/src/backend/replication/logical/Makefile
@@ -0,0 +1,31 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for src/backend/replication/logical
+#
+# IDENTIFICATION
+#    src/backend/replication/logical/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/replication/logical
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global + +override CPPFLAGS := -I$(srcdir) $(CPPFLAGS) + +OBJS = \ + decode.o \ + launcher.o \ + logical.o \ + logicalfuncs.o \ + message.o \ + origin.o \ + proto.o \ + relation.o \ + reorderbuffer.o \ + snapbuild.o \ + tablesync.o \ + worker.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c new file mode 100644 index 0000000..92dfafc --- /dev/null +++ b/src/backend/replication/logical/decode.c @@ -0,0 +1,1316 @@ +/* ------------------------------------------------------------------------- + * + * decode.c + * This module decodes WAL records read using xlogreader.h's APIs for the + * purpose of logical decoding by passing information to the + * reorderbuffer module (containing the actual changes) and to the + * snapbuild module to build a fitting catalog snapshot (to be able to + * properly decode the changes in the reorderbuffer). + * + * NOTE: + * This basically tries to handle all low level xlog stuff for + * reorderbuffer.c and snapbuild.c. There's some minor leakage where a + * specific record's struct is used to pass data along, but those just + * happen to contain the right amount of data in a convenient + * format. There isn't and shouldn't be much intelligence about the + * contents of records in here except turning them into a more usable + * format. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/replication/logical/decode.c + * + * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "access/heapam_xlog.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "access/xlogreader.h" +#include "access/xlogrecord.h" +#include "access/xlogutils.h" +#include "catalog/pg_control.h" +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/message.h" +#include "replication/origin.h" +#include "replication/reorderbuffer.h" +#include "replication/snapbuild.h" +#include "storage/standby.h" + +typedef struct XLogRecordBuffer +{ + XLogRecPtr origptr; + XLogRecPtr endptr; + XLogReaderState *record; +} XLogRecordBuffer; + +/* RMGR Handlers */ +static void DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeStandbyOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeLogicalMsgOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); + +/* individual record(group)'s handlers */ +static void DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeTruncate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeSpecConfirm(LogicalDecodingContext *ctx, 
XLogRecordBuffer *buf); + +static void DecodeCommit(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + xl_xact_parsed_commit *parsed, TransactionId xid, + bool two_phase); +static void DecodeAbort(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + xl_xact_parsed_abort *parsed, TransactionId xid, + bool two_phase); +static void DecodePrepare(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + xl_xact_parsed_prepare *parsed); + + +/* common function to decode tuples */ +static void DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tup); + +/* helper functions for decoding transactions */ +static inline bool FilterPrepare(LogicalDecodingContext *ctx, + TransactionId xid, const char *gid); +static bool DecodeTXNNeedSkip(LogicalDecodingContext *ctx, + XLogRecordBuffer *buf, Oid dbId, + RepOriginId origin_id); + +/* + * Take every XLogReadRecord()ed record and perform the actions required to + * decode it using the output plugin already setup in the logical decoding + * context. + * + * NB: Note that every record's xid needs to be processed by reorderbuffer + * (xids contained in the content of records are not relevant for this rule). + * That means that for records which'd otherwise not go through the + * reorderbuffer ReorderBufferProcessXid() has to be called. We don't want to + * call ReorderBufferProcessXid for each record type by default, because + * e.g. empty xacts can be handled more efficiently if there's no previous + * state for them. + * + * We also support the ability to fast forward thru records, skipping some + * record types completely - see individual record types for details. 
+ */
+void
+LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogReaderState *record)
+{
+	XLogRecordBuffer buf;
+	TransactionId txid;
+
+	buf.origptr = ctx->reader->ReadRecPtr;
+	buf.endptr = ctx->reader->EndRecPtr;
+	buf.record = record;
+
+	txid = XLogRecGetTopXid(record);
+
+	/*
+	 * If the top-level xid is valid, we need to assign the subxact to the
+	 * top-level xact. We need to do this for all records, hence we do it
+	 * before the switch.
+	 */
+	if (TransactionIdIsValid(txid))
+	{
+		ReorderBufferAssignChild(ctx->reorder,
+								 txid,
+								 record->decoded_record->xl_xid,
+								 buf.origptr);
+	}
+
+	/* cast to the RmgrIds type so we get a warning when new rmgrs are added */
+	switch ((RmgrIds) XLogRecGetRmid(record))
+	{
+			/*
+			 * Rmgrs we care about for logical decoding. Add new rmgrs in
+			 * rmgrlist.h's order.
+			 */
+		case RM_XLOG_ID:
+			DecodeXLogOp(ctx, &buf);
+			break;
+
+		case RM_XACT_ID:
+			DecodeXactOp(ctx, &buf);
+			break;
+
+		case RM_STANDBY_ID:
+			DecodeStandbyOp(ctx, &buf);
+			break;
+
+		case RM_HEAP2_ID:
+			DecodeHeap2Op(ctx, &buf);
+			break;
+
+		case RM_HEAP_ID:
+			DecodeHeapOp(ctx, &buf);
+			break;
+
+		case RM_LOGICALMSG_ID:
+			DecodeLogicalMsgOp(ctx, &buf);
+			break;
+
+			/*
+			 * Rmgrs irrelevant for logical decoding; they describe stuff not
+			 * represented in logical decoding. Add new rmgrs in rmgrlist.h's
+			 * order.
+			 */
+		case RM_SMGR_ID:
+		case RM_CLOG_ID:
+		case RM_DBASE_ID:
+		case RM_TBLSPC_ID:
+		case RM_MULTIXACT_ID:
+		case RM_RELMAP_ID:
+		case RM_BTREE_ID:
+		case RM_HASH_ID:
+		case RM_GIN_ID:
+		case RM_GIST_ID:
+		case RM_SEQ_ID:
+		case RM_SPGIST_ID:
+		case RM_BRIN_ID:
+		case RM_COMMIT_TS_ID:
+		case RM_REPLORIGIN_ID:
+		case RM_GENERIC_ID:
+			/* just deal with xid, and done */
+			ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(record),
+									buf.origptr);
+			break;
+		case RM_NEXT_ID:
+			elog(ERROR, "unexpected RM_NEXT_ID rmgr_id: %u", (RmgrIds) XLogRecGetRmid(buf.record));
+	}
+}
+
+/*
+ * Handle rmgr XLOG_ID records for LogicalDecodingProcessRecord().
+ */ +static void +DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + SnapBuild *builder = ctx->snapshot_builder; + uint8 info = XLogRecGetInfo(buf->record) & ~XLR_INFO_MASK; + + ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(buf->record), + buf->origptr); + + switch (info) + { + /* this is also used in END_OF_RECOVERY checkpoints */ + case XLOG_CHECKPOINT_SHUTDOWN: + case XLOG_END_OF_RECOVERY: + SnapBuildSerializationPoint(builder, buf->origptr); + + break; + case XLOG_CHECKPOINT_ONLINE: + + /* + * a RUNNING_XACTS record will have been logged near to this, we + * can restart from there. + */ + break; + case XLOG_NOOP: + case XLOG_NEXTOID: + case XLOG_SWITCH: + case XLOG_BACKUP_END: + case XLOG_PARAMETER_CHANGE: + case XLOG_RESTORE_POINT: + case XLOG_FPW_CHANGE: + case XLOG_FPI_FOR_HINT: + case XLOG_FPI: + case XLOG_OVERWRITE_CONTRECORD: + break; + default: + elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info); + } +} + +/* + * Handle rmgr XACT_ID records for DecodeRecordIntoReorderBuffer(). + */ +static void +DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + SnapBuild *builder = ctx->snapshot_builder; + ReorderBuffer *reorder = ctx->reorder; + XLogReaderState *r = buf->record; + uint8 info = XLogRecGetInfo(r) & XLOG_XACT_OPMASK; + + /* + * If the snapshot isn't yet fully built, we cannot decode anything, so + * bail out. 
+ */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT) + return; + + switch (info) + { + case XLOG_XACT_COMMIT: + case XLOG_XACT_COMMIT_PREPARED: + { + xl_xact_commit *xlrec; + xl_xact_parsed_commit parsed; + TransactionId xid; + bool two_phase = false; + + xlrec = (xl_xact_commit *) XLogRecGetData(r); + ParseCommitRecord(XLogRecGetInfo(buf->record), xlrec, &parsed); + + if (!TransactionIdIsValid(parsed.twophase_xid)) + xid = XLogRecGetXid(r); + else + xid = parsed.twophase_xid; + + /* + * We would like to process the transaction in a two-phase + * manner iff output plugin supports two-phase commits and + * doesn't filter the transaction at prepare time. + */ + if (info == XLOG_XACT_COMMIT_PREPARED) + two_phase = !(FilterPrepare(ctx, xid, + parsed.twophase_gid)); + + DecodeCommit(ctx, buf, &parsed, xid, two_phase); + break; + } + case XLOG_XACT_ABORT: + case XLOG_XACT_ABORT_PREPARED: + { + xl_xact_abort *xlrec; + xl_xact_parsed_abort parsed; + TransactionId xid; + bool two_phase = false; + + xlrec = (xl_xact_abort *) XLogRecGetData(r); + ParseAbortRecord(XLogRecGetInfo(buf->record), xlrec, &parsed); + + if (!TransactionIdIsValid(parsed.twophase_xid)) + xid = XLogRecGetXid(r); + else + xid = parsed.twophase_xid; + + /* + * We would like to process the transaction in a two-phase + * manner iff output plugin supports two-phase commits and + * doesn't filter the transaction at prepare time. + */ + if (info == XLOG_XACT_ABORT_PREPARED) + two_phase = !(FilterPrepare(ctx, xid, + parsed.twophase_gid)); + + DecodeAbort(ctx, buf, &parsed, xid, two_phase); + break; + } + case XLOG_XACT_ASSIGNMENT: + + /* + * We assign subxact to the toplevel xact while processing each + * record if required. So, we don't need to do anything here. See + * LogicalDecodingProcessRecord. 
+ */ + break; + case XLOG_XACT_INVALIDATIONS: + { + TransactionId xid; + xl_xact_invals *invals; + + xid = XLogRecGetXid(r); + invals = (xl_xact_invals *) XLogRecGetData(r); + + /* + * Execute the invalidations for xid-less transactions, + * otherwise, accumulate them so that they can be processed at + * the commit time. + */ + if (TransactionIdIsValid(xid)) + { + if (!ctx->fast_forward) + ReorderBufferAddInvalidations(reorder, xid, + buf->origptr, + invals->nmsgs, + invals->msgs); + ReorderBufferXidSetCatalogChanges(ctx->reorder, xid, + buf->origptr); + } + else if ((!ctx->fast_forward)) + ReorderBufferImmediateInvalidation(ctx->reorder, + invals->nmsgs, + invals->msgs); + } + break; + case XLOG_XACT_PREPARE: + { + xl_xact_parsed_prepare parsed; + xl_xact_prepare *xlrec; + + /* ok, parse it */ + xlrec = (xl_xact_prepare *) XLogRecGetData(r); + ParsePrepareRecord(XLogRecGetInfo(buf->record), + xlrec, &parsed); + + /* + * We would like to process the transaction in a two-phase + * manner iff output plugin supports two-phase commits and + * doesn't filter the transaction at prepare time. + */ + if (FilterPrepare(ctx, parsed.twophase_xid, + parsed.twophase_gid)) + { + ReorderBufferProcessXid(reorder, parsed.twophase_xid, + buf->origptr); + break; + } + + /* + * Note that if the prepared transaction has locked [user] + * catalog tables exclusively then decoding prepare can block + * till the main transaction is committed because it needs to + * lock the catalog tables. + * + * XXX Now, this can even lead to a deadlock if the prepare + * transaction is waiting to get it logically replicated for + * distributed 2PC. Currently, we don't have an in-core + * implementation of prepares for distributed 2PC but some + * out-of-core logical replication solution can have such an + * implementation. They need to inform users to not have locks + * on catalog tables in such transactions. 
+ */ + DecodePrepare(ctx, buf, &parsed); + break; + } + default: + elog(ERROR, "unexpected RM_XACT_ID record type: %u", info); + } +} + +/* + * Handle rmgr STANDBY_ID records for DecodeRecordIntoReorderBuffer(). + */ +static void +DecodeStandbyOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + SnapBuild *builder = ctx->snapshot_builder; + XLogReaderState *r = buf->record; + uint8 info = XLogRecGetInfo(r) & ~XLR_INFO_MASK; + + ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(r), buf->origptr); + + switch (info) + { + case XLOG_RUNNING_XACTS: + { + xl_running_xacts *running = (xl_running_xacts *) XLogRecGetData(r); + + SnapBuildProcessRunningXacts(builder, buf->origptr, running); + + /* + * Abort all transactions that we keep track of, that are + * older than the record's oldestRunningXid. This is the most + * convenient spot for doing so since, in contrast to shutdown + * or end-of-recovery checkpoints, we have information about + * all running transactions which includes prepared ones, + * while shutdown checkpoints just know that no non-prepared + * transactions are in progress. + */ + ReorderBufferAbortOld(ctx->reorder, running->oldestRunningXid); + } + break; + case XLOG_STANDBY_LOCK: + break; + case XLOG_INVALIDATIONS: + + /* + * We are processing the invalidations at the command level via + * XLOG_XACT_INVALIDATIONS. So we don't need to do anything here. + */ + break; + default: + elog(ERROR, "unexpected RM_STANDBY_ID record type: %u", info); + } +} + +/* + * Handle rmgr HEAP2_ID records for DecodeRecordIntoReorderBuffer(). + */ +static void +DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + uint8 info = XLogRecGetInfo(buf->record) & XLOG_HEAP_OPMASK; + TransactionId xid = XLogRecGetXid(buf->record); + SnapBuild *builder = ctx->snapshot_builder; + + ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr); + + /* + * If we don't have snapshot or we are just fast-forwarding, there is no + * point in decoding changes. 
+ */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT || + ctx->fast_forward) + return; + + switch (info) + { + case XLOG_HEAP2_MULTI_INSERT: + if (!ctx->fast_forward && + SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeMultiInsert(ctx, buf); + break; + case XLOG_HEAP2_NEW_CID: + { + xl_heap_new_cid *xlrec; + + xlrec = (xl_heap_new_cid *) XLogRecGetData(buf->record); + SnapBuildProcessNewCid(builder, xid, buf->origptr, xlrec); + + break; + } + case XLOG_HEAP2_REWRITE: + + /* + * Although these records only exist to serve the needs of logical + * decoding, all the work happens as part of crash or archive + * recovery, so we don't need to do anything here. + */ + break; + + /* + * Everything else here is just low level physical stuff we're not + * interested in. + */ + case XLOG_HEAP2_FREEZE_PAGE: + case XLOG_HEAP2_PRUNE: + case XLOG_HEAP2_VACUUM: + case XLOG_HEAP2_VISIBLE: + case XLOG_HEAP2_LOCK_UPDATED: + break; + default: + elog(ERROR, "unexpected RM_HEAP2_ID record type: %u", info); + } +} + +/* + * Handle rmgr HEAP_ID records for DecodeRecordIntoReorderBuffer(). + */ +static void +DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + uint8 info = XLogRecGetInfo(buf->record) & XLOG_HEAP_OPMASK; + TransactionId xid = XLogRecGetXid(buf->record); + SnapBuild *builder = ctx->snapshot_builder; + + ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr); + + /* + * If we don't have snapshot or we are just fast-forwarding, there is no + * point in decoding data changes. + */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT || + ctx->fast_forward) + return; + + switch (info) + { + case XLOG_HEAP_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeInsert(ctx, buf); + break; + + /* + * Treat HOT update as normal updates. There is no useful + * information in the fact that we could make it a HOT update + * locally and the WAL layout is compatible. 
+ */ + case XLOG_HEAP_HOT_UPDATE: + case XLOG_HEAP_UPDATE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeUpdate(ctx, buf); + break; + + case XLOG_HEAP_DELETE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeDelete(ctx, buf); + break; + + case XLOG_HEAP_TRUNCATE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeTruncate(ctx, buf); + break; + + case XLOG_HEAP_INPLACE: + + /* + * Inplace updates are only ever performed on catalog tuples and + * can, per definition, not change tuple visibility. Since we + * don't decode catalog tuples, we're not interested in the + * record's contents. + * + * In-place updates can be used either by XID-bearing transactions + * (e.g. in CREATE INDEX CONCURRENTLY) or by XID-less + * transactions (e.g. VACUUM). In the former case, the commit + * record will include cache invalidations, so we mark the + * transaction as catalog modifying here. Currently that's + * redundant because the commit will do that as well, but once we + * support decoding in-progress relations, this will be important. + */ + if (!TransactionIdIsValid(xid)) + break; + + SnapBuildProcessChange(builder, xid, buf->origptr); + ReorderBufferXidSetCatalogChanges(ctx->reorder, xid, buf->origptr); + break; + + case XLOG_HEAP_CONFIRM: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeSpecConfirm(ctx, buf); + break; + + case XLOG_HEAP_LOCK: + /* we don't care about row level locks for now */ + break; + + default: + elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info); + break; + } +} + +/* + * Ask output plugin whether we want to skip this PREPARE and send + * this transaction as a regular commit later. + */ +static inline bool +FilterPrepare(LogicalDecodingContext *ctx, TransactionId xid, + const char *gid) +{ + /* + * Skip if decoding of two-phase transactions at PREPARE time is not + * enabled. 
In that case, all two-phase transactions are considered + * filtered out and will be applied as regular transactions at COMMIT + * PREPARED. + */ + if (!ctx->twophase) + return true; + + /* + * The filter_prepare callback is optional. When not supplied, all + * prepared transactions should go through. + */ + if (ctx->callbacks.filter_prepare_cb == NULL) + return false; + + return filter_prepare_cb_wrapper(ctx, xid, gid); +} + +static inline bool +FilterByOrigin(LogicalDecodingContext *ctx, RepOriginId origin_id) +{ + if (ctx->callbacks.filter_by_origin_cb == NULL) + return false; + + return filter_by_origin_cb_wrapper(ctx, origin_id); +} + +/* + * Handle rmgr LOGICALMSG_ID records for DecodeRecordIntoReorderBuffer(). + */ +static void +DecodeLogicalMsgOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + SnapBuild *builder = ctx->snapshot_builder; + XLogReaderState *r = buf->record; + TransactionId xid = XLogRecGetXid(r); + uint8 info = XLogRecGetInfo(r) & ~XLR_INFO_MASK; + RepOriginId origin_id = XLogRecGetOrigin(r); + Snapshot snapshot; + xl_logical_message *message; + + if (info != XLOG_LOGICAL_MESSAGE) + elog(ERROR, "unexpected RM_LOGICALMSG_ID record type: %u", info); + + ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(r), buf->origptr); + + /* + * If we don't have snapshot or we are just fast-forwarding, there is no + * point in decoding messages. 
+ */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT || + ctx->fast_forward) + return; + + message = (xl_logical_message *) XLogRecGetData(r); + + if (message->dbId != ctx->slot->data.database || + FilterByOrigin(ctx, origin_id)) + return; + + if (message->transactional && + !SnapBuildProcessChange(builder, xid, buf->origptr)) + return; + else if (!message->transactional && + (SnapBuildCurrentState(builder) != SNAPBUILD_CONSISTENT || + SnapBuildXactNeedsSkip(builder, buf->origptr))) + return; + + snapshot = SnapBuildGetOrBuildSnapshot(builder, xid); + ReorderBufferQueueMessage(ctx->reorder, xid, snapshot, buf->endptr, + message->transactional, + message->message, /* first part of message is + * prefix */ + message->message_size, + message->message + message->prefix_size); +} + +/* + * Consolidated commit record handling between the different form of commit + * records. + * + * 'two_phase' indicates that caller wants to process the transaction in two + * phases, first process prepare if not already done and then process + * commit_prepared. + */ +static void +DecodeCommit(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + xl_xact_parsed_commit *parsed, TransactionId xid, + bool two_phase) +{ + XLogRecPtr origin_lsn = InvalidXLogRecPtr; + TimestampTz commit_time = parsed->xact_time; + RepOriginId origin_id = XLogRecGetOrigin(buf->record); + int i; + + if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN) + { + origin_lsn = parsed->origin_lsn; + commit_time = parsed->origin_timestamp; + } + + SnapBuildCommitTxn(ctx->snapshot_builder, buf->origptr, xid, + parsed->nsubxacts, parsed->subxacts); + + /* ---- + * Check whether we are interested in this specific transaction, and tell + * the reorderbuffer to forget the content of the (sub-)transactions + * if not. + * + * We can't just use ReorderBufferAbort() here, because we need to execute + * the transaction's invalidations. 
This currently won't be needed if + * we're just skipping over the transaction because currently we only do + * so during startup, to get to the first transaction the client needs. As + * we have reset the catalog caches before starting to read WAL, and we + * haven't yet touched any catalogs, there can't be anything to invalidate. + * But if we're "forgetting" this commit because it happened in another + * database, the invalidations might be important, because they could be + * for shared catalogs and we might have loaded data into the relevant + * syscaches. + * --- + */ + if (DecodeTXNNeedSkip(ctx, buf, parsed->dbId, origin_id)) + { + for (i = 0; i < parsed->nsubxacts; i++) + { + ReorderBufferForget(ctx->reorder, parsed->subxacts[i], buf->origptr); + } + ReorderBufferForget(ctx->reorder, xid, buf->origptr); + + return; + } + + /* tell the reorderbuffer about the surviving subtransactions */ + for (i = 0; i < parsed->nsubxacts; i++) + { + ReorderBufferCommitChild(ctx->reorder, xid, parsed->subxacts[i], + buf->origptr, buf->endptr); + } + + /* + * Send the final commit record if the transaction data is already + * decoded, otherwise, process the entire transaction. + */ + if (two_phase) + { + ReorderBufferFinishPrepared(ctx->reorder, xid, buf->origptr, buf->endptr, + SnapBuildInitialConsistentPoint(ctx->snapshot_builder), + commit_time, origin_id, origin_lsn, + parsed->twophase_gid, true); + } + else + { + ReorderBufferCommit(ctx->reorder, xid, buf->origptr, buf->endptr, + commit_time, origin_id, origin_lsn); + } + + /* + * Update the decoding stats at transaction prepare/commit/abort. + * Additionally we send the stats when we spill or stream the changes to + * avoid losing them in case the decoding is interrupted. It is not clear + * that sending more or less frequently than this would be better. + */ + UpdateDecodingStats(ctx); +} + +/* + * Decode PREPARE record. Similar logic as in DecodeCommit. 
+ * + * Note that we don't skip prepare even if have detected concurrent abort + * because it is quite possible that we had already sent some changes before we + * detect abort in which case we need to abort those changes in the subscriber. + * To abort such changes, we do send the prepare and then the rollback prepared + * which is what happened on the publisher-side as well. Now, we can invent a + * new abort API wherein in such cases we send abort and skip sending prepared + * and rollback prepared but then it is not that straightforward because we + * might have streamed this transaction by that time in which case it is + * handled when the rollback is encountered. It is not impossible to optimize + * the concurrent abort case but it can introduce design complexity w.r.t + * handling different cases so leaving it for now as it doesn't seem worth it. + */ +static void +DecodePrepare(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + xl_xact_parsed_prepare *parsed) +{ + SnapBuild *builder = ctx->snapshot_builder; + XLogRecPtr origin_lsn = parsed->origin_lsn; + TimestampTz prepare_time = parsed->xact_time; + XLogRecPtr origin_id = XLogRecGetOrigin(buf->record); + int i; + TransactionId xid = parsed->twophase_xid; + + if (parsed->origin_timestamp != 0) + prepare_time = parsed->origin_timestamp; + + /* + * Remember the prepare info for a txn so that it can be used later in + * commit prepared if required. See ReorderBufferFinishPrepared. + */ + if (!ReorderBufferRememberPrepareInfo(ctx->reorder, xid, buf->origptr, + buf->endptr, prepare_time, origin_id, + origin_lsn)) + return; + + /* We can't start streaming unless a consistent state is reached. */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT) + { + ReorderBufferSkipPrepare(ctx->reorder, xid); + return; + } + + /* + * Check whether we need to process this transaction. See + * DecodeTXNNeedSkip for the reasons why we sometimes want to skip the + * transaction. 
+ * + * We can't call ReorderBufferForget as we did in DecodeCommit as the txn + * hasn't yet been committed, removing this txn before a commit might + * result in the computation of an incorrect restart_lsn. See + * SnapBuildProcessRunningXacts. But we need to process cache + * invalidations if there are any for the reasons mentioned in + * DecodeCommit. + */ + if (DecodeTXNNeedSkip(ctx, buf, parsed->dbId, origin_id)) + { + ReorderBufferSkipPrepare(ctx->reorder, xid); + ReorderBufferInvalidate(ctx->reorder, xid, buf->origptr); + return; + } + + /* Tell the reorderbuffer about the surviving subtransactions. */ + for (i = 0; i < parsed->nsubxacts; i++) + { + ReorderBufferCommitChild(ctx->reorder, xid, parsed->subxacts[i], + buf->origptr, buf->endptr); + } + + /* replay actions of all transaction + subtransactions in order */ + ReorderBufferPrepare(ctx->reorder, xid, parsed->twophase_gid); + + /* + * Update the decoding stats at transaction prepare/commit/abort. + * Additionally we send the stats when we spill or stream the changes to + * avoid losing them in case the decoding is interrupted. It is not clear + * that sending more or less frequently than this would be better. + */ + UpdateDecodingStats(ctx); +} + + +/* + * Get the data from the various forms of abort records and pass it on to + * snapbuild.c and reorderbuffer.c. + * + * 'two_phase' indicates to finish prepared transaction. + */ +static void +DecodeAbort(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + xl_xact_parsed_abort *parsed, TransactionId xid, + bool two_phase) +{ + int i; + XLogRecPtr origin_lsn = InvalidXLogRecPtr; + TimestampTz abort_time = parsed->xact_time; + XLogRecPtr origin_id = XLogRecGetOrigin(buf->record); + bool skip_xact; + + if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN) + { + origin_lsn = parsed->origin_lsn; + abort_time = parsed->origin_timestamp; + } + + /* + * Check whether we need to process this transaction. 
See + * DecodeTXNNeedSkip for the reasons why we sometimes want to skip the + * transaction. + */ + skip_xact = DecodeTXNNeedSkip(ctx, buf, parsed->dbId, origin_id); + + /* + * Send the final rollback record for a prepared transaction unless we + * need to skip it. For non-two-phase xacts, simply forget the xact. + */ + if (two_phase && !skip_xact) + { + ReorderBufferFinishPrepared(ctx->reorder, xid, buf->origptr, buf->endptr, + InvalidXLogRecPtr, + abort_time, origin_id, origin_lsn, + parsed->twophase_gid, false); + } + else + { + for (i = 0; i < parsed->nsubxacts; i++) + { + ReorderBufferAbort(ctx->reorder, parsed->subxacts[i], + buf->record->EndRecPtr); + } + + ReorderBufferAbort(ctx->reorder, xid, buf->record->EndRecPtr); + } + + /* update the decoding stats */ + UpdateDecodingStats(ctx); +} + +/* + * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs. + * + * Deletes can contain the new tuple. + */ +static void +DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + Size datalen; + char *tupledata; + Size tuplelen; + XLogReaderState *r = buf->record; + xl_heap_insert *xlrec; + ReorderBufferChange *change; + RelFileNode target_node; + + xlrec = (xl_heap_insert *) XLogRecGetData(r); + + /* + * Ignore insert records without new tuples (this does happen when + * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL). 
+ */ + if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) + return; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_node, NULL, NULL); + if (target_node.dbNode != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE)) + change->action = REORDER_BUFFER_CHANGE_INSERT; + else + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode)); + + tupledata = XLogRecGetBlockData(r, 0, &datalen); + tuplelen = datalen - SizeOfHeapHeader; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple); + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, + xlrec->flags & XLH_INSERT_ON_TOAST_RELATION); +} + +/* + * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout + * in the record, from wal into proper tuplebufs. + * + * Updates can possibly contain a new tuple and the old primary key. 
+ */
+static void
+DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	XLogReaderState *record = buf->record;
+	xl_heap_update *upd = (xl_heap_update *) XLogRecGetData(record);
+	RelFileNode rnode;
+	ReorderBufferChange *chg;
+
+	/* Records touching a relation of another database are ignored. */
+	XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
+	if (rnode.dbNode != ctx->slot->data.database)
+		return;
+
+	/* So are records whose origin the output plugin filters out. */
+	if (FilterByOrigin(ctx, XLogRecGetOrigin(record)))
+		return;
+
+	chg = ReorderBufferGetChange(ctx->reorder);
+	chg->action = REORDER_BUFFER_CHANGE_UPDATE;
+	chg->origin_id = XLogRecGetOrigin(record);
+	memcpy(&chg->data.tp.relnode, &rnode, sizeof(RelFileNode));
+
+	/* The new tuple, when logged, is taken from the data of block 0. */
+	if (upd->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE)
+	{
+		Size newlen;
+		char *newdata = XLogRecGetBlockData(record, 0, &newlen);
+
+		chg->data.tp.newtuple =
+			ReorderBufferGetTupleBuf(ctx->reorder, newlen - SizeOfHeapHeader);
+
+		DecodeXLogTuple(newdata, newlen, chg->data.tp.newtuple);
+	}
+
+	/* The old tuple, when logged, follows the fixed part of the main data. */
+	if (upd->flags & XLH_UPDATE_CONTAINS_OLD)
+	{
+		/* caution, remaining data in record is not aligned */
+		char *olddata = XLogRecGetData(record) + SizeOfHeapUpdate;
+		Size oldlen = XLogRecGetDataLen(record) - SizeOfHeapUpdate;
+
+		chg->data.tp.oldtuple =
+			ReorderBufferGetTupleBuf(ctx->reorder, oldlen - SizeOfHeapHeader);
+
+		DecodeXLogTuple(olddata, oldlen, chg->data.tp.oldtuple);
+	}
+
+	chg->data.tp.clear_toast_afterwards = true;
+
+	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(record), buf->origptr,
+							 chg, false);
+}
+
+/*
+ * Parse XLOG_HEAP_DELETE from wal into proper tuplebufs.
+ *
+ * Deletes can possibly contain the old primary key.
+ */
+static void
+DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	XLogReaderState *r = buf->record;
+	xl_heap_delete *xlrec;
+	ReorderBufferChange *change;
+	RelFileNode target_node;
+
+	xlrec = (xl_heap_delete *) XLogRecGetData(r);
+
+	/* only interested in our database */
+	XLogRecGetBlockTag(r, 0, &target_node, NULL, NULL);
+	if (target_node.dbNode != ctx->slot->data.database)
+		return;
+
+	/* output plugin doesn't look for this origin, no need to queue */
+	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
+		return;
+
+	change = ReorderBufferGetChange(ctx->reorder);
+
+	/* a "super" delete is queued as the abort of a speculative insertion */
+	if (xlrec->flags & XLH_DELETE_IS_SUPER)
+		change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT;
+	else
+		change->action = REORDER_BUFFER_CHANGE_DELETE;
+
+	change->origin_id = XLogRecGetOrigin(r);
+
+	memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode));
+
+	/* old primary key stored */
+	if (xlrec->flags & XLH_DELETE_CONTAINS_OLD)
+	{
+		/* the old tuple follows the fixed-size xl_heap_delete part */
+		Size datalen = XLogRecGetDataLen(r) - SizeOfHeapDelete;
+		Size tuplelen = datalen - SizeOfHeapHeader;
+
+		Assert(XLogRecGetDataLen(r) > (SizeOfHeapDelete + SizeOfHeapHeader));
+
+		change->data.tp.oldtuple =
+			ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
+
+		DecodeXLogTuple((char *) xlrec + SizeOfHeapDelete,
+						datalen, change->data.tp.oldtuple);
+	}
+
+	change->data.tp.clear_toast_afterwards = true;
+
+	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr,
+							 change, false);
+}
+
+/*
+ * Parse XLOG_HEAP_TRUNCATE from wal
+ *
+ * Queues a single truncate change that carries a copy of the record's relid
+ * array together with its cascade / restart_seqs flags.
+ */
+static void
+DecodeTruncate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	XLogReaderState *r = buf->record;
+	xl_heap_truncate *xlrec;
+	ReorderBufferChange *change;
+
+	xlrec = (xl_heap_truncate *) XLogRecGetData(r);
+
+	/* only interested in our database */
+	if (xlrec->dbId != ctx->slot->data.database)
+		return;
+
+	/* output plugin doesn't look for this origin, no need to queue */
+	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
+		return;
+
+	change = ReorderBufferGetChange(ctx->reorder);
+	change->action = REORDER_BUFFER_CHANGE_TRUNCATE;
+	change->origin_id = XLogRecGetOrigin(r);
+
+	/*
+	 * The flags are only ever set to true here.  NOTE(review): this assumes
+	 * ReorderBufferGetChange hands back a zeroed change -- confirm.
+	 */
+	if (xlrec->flags & XLH_TRUNCATE_CASCADE)
+		change->data.truncate.cascade = true;
+	if (xlrec->flags & XLH_TRUNCATE_RESTART_SEQS)
+		change->data.truncate.restart_seqs = true;
+	change->data.truncate.nrelids = xlrec->nrelids;
+	change->data.truncate.relids = ReorderBufferGetRelids(ctx->reorder,
+														  xlrec->nrelids);
+	memcpy(change->data.truncate.relids, xlrec->relids,
+		   xlrec->nrelids * sizeof(Oid));
+	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r),
+							 buf->origptr, change, false);
+}
+
+/*
+ * Decode XLOG_HEAP2_MULTI_INSERT record into multiple tuplebufs.
+ *
+ * Currently MULTI_INSERT will always contain the full tuples.
+ *
+ * One INSERT change is queued per tuple contained in the record.
+ */
+static void
+DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	XLogReaderState *r = buf->record;
+	xl_heap_multi_insert *xlrec;
+	int i;
+	char *data;
+	char *tupledata;
+	Size tuplelen;
+	RelFileNode rnode;
+
+	xlrec = (xl_heap_multi_insert *) XLogRecGetData(r);
+
+	/*
+	 * Ignore insert records without new tuples.  This happens when a
+	 * multi_insert is done on a catalog or on a non-persistent relation.
+	 */
+	if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE))
+		return;
+
+	/* only interested in our database */
+	XLogRecGetBlockTag(r, 0, &rnode, NULL, NULL);
+	if (rnode.dbNode != ctx->slot->data.database)
+		return;
+
+	/* output plugin doesn't look for this origin, no need to queue */
+	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
+		return;
+
+	/*
+	 * We know that this multi_insert isn't for a catalog, so the block should
+	 * always have data even if a full-page write of it is taken.
+	 */
+	tupledata = XLogRecGetBlockData(r, 0, &tuplelen);
+	Assert(tupledata != NULL);
+
+	/* 'data' walks over the per-tuple entries stored in the block data */
+	data = tupledata;
+	for (i = 0; i < xlrec->ntuples; i++)
+	{
+		ReorderBufferChange *change;
+		xl_multi_insert_tuple *xlhdr;
+		int datalen;
+		ReorderBufferTupleBuf *tuple;
+		HeapTupleHeader header;
+
+		change = ReorderBufferGetChange(ctx->reorder);
+		change->action = REORDER_BUFFER_CHANGE_INSERT;
+		change->origin_id = XLogRecGetOrigin(r);
+
+		memcpy(&change->data.tp.relnode, &rnode, sizeof(RelFileNode));
+
+		/*
+		 * Each entry is a SHORTALIGN'ed xl_multi_insert_tuple header
+		 * followed by xlhdr->datalen bytes of tuple data.
+		 */
+		xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(data);
+		data = ((char *) xlhdr) + SizeOfMultiInsertTuple;
+		datalen = xlhdr->datalen;
+
+		change->data.tp.newtuple =
+			ReorderBufferGetTupleBuf(ctx->reorder, datalen);
+
+		tuple = change->data.tp.newtuple;
+		header = tuple->tuple.t_data;
+
+		/* not a disk based tuple */
+		ItemPointerSetInvalid(&tuple->tuple.t_self);
+
+		/*
+		 * We can only figure this out after reassembling the transactions.
+		 */
+		tuple->tuple.t_tableOid = InvalidOid;
+
+		tuple->tuple.t_len = datalen + SizeofHeapTupleHeader;
+
+		/* zero the tuple header, then copy the tuple data right after it */
+		memset(header, 0, SizeofHeapTupleHeader);
+
+		memcpy((char *) tuple->tuple.t_data + SizeofHeapTupleHeader,
+			   (char *) data,
+			   datalen);
+		header->t_infomask = xlhdr->t_infomask;
+		header->t_infomask2 = xlhdr->t_infomask2;
+		header->t_hoff = xlhdr->t_hoff;
+
+		/*
+		 * Reset toast reassembly state only after the last row in the last
+		 * xl_multi_insert_tuple record emitted by one heap_multi_insert()
+		 * call.
+		 */
+		if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI &&
+			(i + 1) == xlrec->ntuples)
+			change->data.tp.clear_toast_afterwards = true;
+		else
+			change->data.tp.clear_toast_afterwards = false;
+
+		ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r),
+								 buf->origptr, change, false);
+
+		/* move to the next xl_multi_insert_tuple entry */
+		data += datalen;
+	}
+	/* all of the block data must have been consumed exactly */
+	Assert(data == tupledata + tuplelen);
+}
+
+/*
+ * Parse XLOG_HEAP_CONFIRM from wal into a confirmation change.
+ *
+ * This is pretty trivial, all the state is essentially already set up by the
+ * speculative insertion.
+ */
+static void
+DecodeSpecConfirm(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	XLogReaderState *record = buf->record;
+	RelFileNode rnode;
+	ReorderBufferChange *confirm;
+
+	XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
+
+	/*
+	 * Nothing to queue for another database's relation, nor for an origin
+	 * the output plugin filters out (only asked once the database matches).
+	 */
+	if (rnode.dbNode != ctx->slot->data.database ||
+		FilterByOrigin(ctx, XLogRecGetOrigin(record)))
+		return;
+
+	confirm = ReorderBufferGetChange(ctx->reorder);
+	confirm->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM;
+	confirm->origin_id = XLogRecGetOrigin(record);
+	memcpy(&confirm->data.tp.relnode, &rnode, sizeof(RelFileNode));
+	confirm->data.tp.clear_toast_afterwards = true;
+
+	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(record), buf->origptr,
+							 confirm, false);
+}
+
+
+/*
+ * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete
+ * (but not by heap_multi_insert) into a tuplebuf.
+ *
+ * The size 'len' and the pointer 'data' in the record need to be
+ * computed outside as they are record specific.
+ */
+static void
+DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple)
+{
+	HeapTupleHeader tuphdr = tuple->tuple.t_data;
+	int payload_len = len - SizeOfHeapHeader;
+	xl_heap_header walhdr;
+
+	Assert(payload_len >= 0);
+
+	tuple->tuple.t_len = payload_len + SizeofHeapTupleHeader;
+
+	/* not a disk based tuple */
+	ItemPointerSetInvalid(&tuple->tuple.t_self);
+
+	/* we can only figure this out after reassembling the transactions */
+	tuple->tuple.t_tableOid = InvalidOid;
+
+	/* data is not stored aligned, copy to aligned storage */
+	memcpy((char *) &walhdr, data, SizeOfHeapHeader);
+
+	/* start from a zeroed tuple header and place the tuple body behind it */
+	memset(tuphdr, 0, SizeofHeapTupleHeader);
+	memcpy((char *) tuphdr + SizeofHeapTupleHeader,
+		   data + SizeOfHeapHeader,
+		   payload_len);
+
+	/* only these header fields are taken over from the WAL record */
+	tuphdr->t_infomask = walhdr.t_infomask;
+	tuphdr->t_infomask2 = walhdr.t_infomask2;
+	tuphdr->t_hoff = walhdr.t_hoff;
+}
+
+/*
+ * Check whether we are interested in this specific transaction.
+ *
+ * There can be several reasons we might not be interested in this
+ * transaction:
+ * 1) We might not be interested in decoding transactions up to this
+ *	  LSN. This can happen because we previously decoded it and now just
+ *	  are restarting or if we haven't assembled a consistent snapshot yet.
+ * 2) The transaction happened in another database.
+ * 3) The output plugin is not interested in the origin.
+ * 4) We are doing fast-forwarding + */ +static bool +DecodeTXNNeedSkip(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + Oid txn_dbid, RepOriginId origin_id) +{ + return (SnapBuildXactNeedsSkip(ctx->snapshot_builder, buf->origptr) || + (txn_dbid != InvalidOid && txn_dbid != ctx->slot->data.database) || + ctx->fast_forward || FilterByOrigin(ctx, origin_id)); +} diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c new file mode 100644 index 0000000..3c69817 --- /dev/null +++ b/src/backend/replication/logical/launcher.c @@ -0,0 +1,1024 @@ +/*------------------------------------------------------------------------- + * launcher.c + * PostgreSQL logical replication worker launcher process + * + * Copyright (c) 2016-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/launcher.c + * + * NOTES + * This module contains the logical replication worker launcher which + * uses the background worker infrastructure to start the logical + * replication workers for every enabled subscription. 
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/htup.h"
+#include "access/htup_details.h"
+#include "access/tableam.h"
+#include "access/xact.h"
+#include "catalog/pg_subscription.h"
+#include "catalog/pg_subscription_rel.h"
+#include "funcapi.h"
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/bgworker.h"
+#include "postmaster/fork_process.h"
+#include "postmaster/interrupt.h"
+#include "postmaster/postmaster.h"
+#include "replication/logicallauncher.h"
+#include "replication/logicalworker.h"
+#include "replication/slot.h"
+#include "replication/walreceiver.h"
+#include "replication/worker_internal.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/procsignal.h"
+#include "tcop/tcopprot.h"
+#include "utils/memutils.h"
+#include "utils/pg_lsn.h"
+#include "utils/ps_status.h"
+#include "utils/snapmgr.h"
+#include "utils/timeout.h"
+
+/* max sleep time between cycles (3min) */
+#define DEFAULT_NAPTIME_PER_CYCLE 180000L
+
+/* number of entries in LogicalRepCtx->workers[] (bounds every scan of it) */
+int max_logical_replication_workers = 4;
+/* sync worker limit per subscription, see logicalrep_worker_launch() */
+int max_sync_workers_per_subscription = 2;
+
+/* this process's worker slot; set by logicalrep_worker_attach() */
+LogicalRepWorker *MyLogicalRepWorker = NULL;
+
+/*
+ * Launcher state plus the array of worker slots.  The space needed for it is
+ * computed by ApplyLauncherShmemSize().
+ */
+typedef struct LogicalRepCtxStruct
+{
+	/* Supervisor process. */
+	pid_t launcher_pid;
+
+	/* Background workers. */
+	LogicalRepWorker workers[FLEXIBLE_ARRAY_MEMBER];
+} LogicalRepCtxStruct;
+
+LogicalRepCtxStruct *LogicalRepCtx;
+
+static void ApplyLauncherWakeup(void);
+static void logicalrep_launcher_onexit(int code, Datum arg);
+static void logicalrep_worker_onexit(int code, Datum arg);
+static void logicalrep_worker_detach(void);
+static void logicalrep_worker_cleanup(LogicalRepWorker *worker);
+
+/* NOTE(review): presumably requests a launcher wakeup at commit -- confirm */
+static bool on_commit_launcher_wakeup = false;
+
+Datum pg_stat_get_subscription(PG_FUNCTION_ARGS);
+
+
+/*
+ * Load the list of subscriptions.
+ *
+ * Only the fields interesting for worker start/stop functions are filled for
+ * each subscription.
+ *
+ * Returns a List of palloc0'd Subscription structs (oid, dbid, owner,
+ * enabled and name are set), allocated in the memory context that was
+ * current on entry.  The catalog scan runs in its own transaction, which is
+ * committed before returning.
+ */
+static List *
+get_subscription_list(void)
+{
+	List *res = NIL;
+	Relation rel;
+	TableScanDesc scan;
+	HeapTuple tup;
+	MemoryContext resultcxt;
+
+	/* This is the context that we will allocate our output data in */
+	resultcxt = CurrentMemoryContext;
+
+	/*
+	 * Start a transaction so we can access pg_subscription, and get a
+	 * snapshot.  We don't have a use for the snapshot itself, but we're
+	 * interested in the secondary effect that it sets RecentGlobalXmin.
+	 * (This is critical for anything that reads heap pages, because HOT may
+	 * decide to prune them even if the process doesn't attempt to modify any
+	 * tuples.)
+	 *
+	 * FIXME: This comment is inaccurate / the code buggy. A snapshot that is
+	 * not pushed/active does not reliably prevent HOT pruning (->xmin could
+	 * e.g. be cleared when cache invalidations are processed).
+	 */
+	StartTransactionCommand();
+	(void) GetTransactionSnapshot();
+
+	rel = table_open(SubscriptionRelationId, AccessShareLock);
+	scan = table_beginscan_catalog(rel, 0, NULL);
+
+	while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
+	{
+		Form_pg_subscription subform = (Form_pg_subscription) GETSTRUCT(tup);
+		Subscription *sub;
+		MemoryContext oldcxt;
+
+		/*
+		 * Allocate our results in the caller's context, not the
+		 * transaction's. We do this inside the loop, and restore the original
+		 * context at the end, so that leaky things like heap_getnext() are
+		 * not called in a potentially long-lived context.
+		 */
+		oldcxt = MemoryContextSwitchTo(resultcxt);
+
+		sub = (Subscription *) palloc0(sizeof(Subscription));
+		sub->oid = subform->oid;
+		sub->dbid = subform->subdbid;
+		sub->owner = subform->subowner;
+		sub->enabled = subform->subenabled;
+		sub->name = pstrdup(NameStr(subform->subname));
+		/* We don't fill fields we are not interested in. */
+
+		res = lappend(res, sub);
+		MemoryContextSwitchTo(oldcxt);
+	}
+
+	table_endscan(scan);
+	table_close(rel, AccessShareLock);
+
+	CommitTransactionCommand();
+
+	return res;
+}
+
+/*
+ * Wait for a background worker to start up and attach to the shmem context.
+ *
+ * This is only needed for cleaning up the shared memory in case the worker
+ * fails to attach.
+ *
+ * 'worker' is the slot that was prepared for the worker, 'generation' the
+ * slot's generation at that time and 'handle' the background worker handle.
+ * Returns once the slot is no longer in use, the worker has set its proc, or
+ * the background worker is reported as stopped (in which case the slot is
+ * cleaned up here, provided its generation still matches).
+ */
+static void
+WaitForReplicationWorkerAttach(LogicalRepWorker *worker,
+							   uint16 generation,
+							   BackgroundWorkerHandle *handle)
+{
+	BgwHandleStatus status;
+	int rc;
+
+	for (;;)
+	{
+		pid_t pid;
+
+		CHECK_FOR_INTERRUPTS();
+
+		LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
+
+		/* Worker either died or has started; no need to do anything. */
+		if (!worker->in_use || worker->proc)
+		{
+			LWLockRelease(LogicalRepWorkerLock);
+			return;
+		}
+
+		LWLockRelease(LogicalRepWorkerLock);
+
+		/* Check if worker has died before attaching, and clean up after it. */
+		status = GetBackgroundWorkerPid(handle, &pid);
+
+		if (status == BGWH_STOPPED)
+		{
+			/* cleanup modifies the slot, hence the exclusive lock */
+			LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE);
+			/* Ensure that this was indeed the worker we waited for. */
+			if (generation == worker->generation)
+				logicalrep_worker_cleanup(worker);
+			LWLockRelease(LogicalRepWorkerLock);
+			return;
+		}
+
+		/*
+		 * We need timeout because we generally don't get notified via latch
+		 * about the worker attach.  But we don't expect to have to wait long.
+		 */
+		rc = WaitLatch(MyLatch,
+					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+					   10L, WAIT_EVENT_BGWORKER_STARTUP);
+
+		if (rc & WL_LATCH_SET)
+		{
+			ResetLatch(MyLatch);
+			CHECK_FOR_INTERRUPTS();
+		}
+	}
+}
+
+/*
+ * Walks the workers array and searches for one that matches given
+ * subscription id and relid.
+ *
+ * With 'only_running', slots whose worker has no proc set are not
+ * considered.  Returns the first matching slot, or NULL if there is none.
+ * The caller must hold LogicalRepWorkerLock.
+ */
+LogicalRepWorker *
+logicalrep_worker_find(Oid subid, Oid relid, bool only_running)
+{
+	int i;
+	LogicalRepWorker *res = NULL;
+
+	Assert(LWLockHeldByMe(LogicalRepWorkerLock));
+
+	/* Search for attached worker for a given subscription id.
*/ + for (i = 0; i < max_logical_replication_workers; i++) + { + LogicalRepWorker *w = &LogicalRepCtx->workers[i]; + + if (w->in_use && w->subid == subid && w->relid == relid && + (!only_running || w->proc)) + { + res = w; + break; + } + } + + return res; +} + +/* + * Similar to logicalrep_worker_find(), but returns list of all workers for + * the subscription, instead just one. + */ +List * +logicalrep_workers_find(Oid subid, bool only_running) +{ + int i; + List *res = NIL; + + Assert(LWLockHeldByMe(LogicalRepWorkerLock)); + + /* Search for attached worker for a given subscription id. */ + for (i = 0; i < max_logical_replication_workers; i++) + { + LogicalRepWorker *w = &LogicalRepCtx->workers[i]; + + if (w->in_use && w->subid == subid && (!only_running || w->proc)) + res = lappend(res, w); + } + + return res; +} + +/* + * Start new apply background worker, if possible. + */ +void +logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid, + Oid relid) +{ + BackgroundWorker bgw; + BackgroundWorkerHandle *bgw_handle; + uint16 generation; + int i; + int slot = 0; + LogicalRepWorker *worker = NULL; + int nsyncworkers; + TimestampTz now; + + ereport(DEBUG1, + (errmsg_internal("starting logical replication worker for subscription \"%s\"", + subname))); + + /* Report this after the initial starting message for consistency. */ + if (max_replication_slots == 0) + ereport(ERROR, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("cannot start logical replication workers when max_replication_slots = 0"))); + + /* + * We need to do the modification of the shared memory under lock so that + * we have consistent view. + */ + LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE); + +retry: + /* Find unused worker slot. 
*/
+	for (i = 0; i < max_logical_replication_workers; i++)
+	{
+		LogicalRepWorker *w = &LogicalRepCtx->workers[i];
+
+		if (!w->in_use)
+		{
+			worker = w;
+			slot = i;
+			break;
+		}
+	}
+
+	/* recomputed on every pass through 'retry' */
+	nsyncworkers = logicalrep_sync_worker_count(subid);
+
+	now = GetCurrentTimestamp();
+
+	/*
+	 * If we didn't find a free slot, try to do garbage collection.  The
+	 * reason we do this is because if some worker failed to start up and its
+	 * parent has crashed while waiting, the in_use state was never cleared.
+	 * The same is tried when the sync worker limit for this subscription has
+	 * been reached.
+	 */
+	if (worker == NULL || nsyncworkers >= max_sync_workers_per_subscription)
+	{
+		bool did_cleanup = false;
+
+		for (i = 0; i < max_logical_replication_workers; i++)
+		{
+			LogicalRepWorker *w = &LogicalRepCtx->workers[i];
+
+			/*
+			 * If the worker was marked in use but didn't manage to attach in
+			 * time, clean it up.
+			 */
+			if (w->in_use && !w->proc &&
+				TimestampDifferenceExceeds(w->launch_time, now,
+										   wal_receiver_timeout))
+			{
+				elog(WARNING,
+					 "logical replication worker for subscription %u took too long to start; canceled",
+					 w->subid);
+
+				logicalrep_worker_cleanup(w);
+				did_cleanup = true;
+			}
+		}
+
+		/* something was freed: redo the slot search and the sync count */
+		if (did_cleanup)
+			goto retry;
+	}
+
+	/*
+	 * We don't allow to invoke more sync workers once we have reached the sync
+	 * worker limit per subscription. So, just return silently as we might get
+	 * here because of an otherwise harmless race condition.
+	 */
+	if (OidIsValid(relid) && nsyncworkers >= max_sync_workers_per_subscription)
+	{
+		LWLockRelease(LogicalRepWorkerLock);
+		return;
+	}
+
+	/*
+	 * However if there are no more free worker slots, inform user about it
+	 * before exiting.
+	 */
+	if (worker == NULL)
+	{
+		LWLockRelease(LogicalRepWorkerLock);
+		ereport(WARNING,
+				(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
+				 errmsg("out of logical replication worker slots"),
+				 errhint("You might need to increase max_logical_replication_workers.")));
+		return;
+	}
+
+	/* Prepare the worker slot. */
+	worker->launch_time = now;
+	worker->in_use = true;
+	worker->generation++;
+	worker->proc = NULL;
+	worker->dbid = dbid;
+	worker->userid = userid;
+	worker->subid = subid;
+	worker->relid = relid;
+	worker->relstate = SUBREL_STATE_UNKNOWN;
+	worker->relstate_lsn = InvalidXLogRecPtr;
+	worker->last_lsn = InvalidXLogRecPtr;
+	TIMESTAMP_NOBEGIN(worker->last_send_time);
+	TIMESTAMP_NOBEGIN(worker->last_recv_time);
+	worker->reply_lsn = InvalidXLogRecPtr;
+	TIMESTAMP_NOBEGIN(worker->reply_time);
+
+	/* Before releasing lock, remember generation for future identification. */
+	generation = worker->generation;
+
+	LWLockRelease(LogicalRepWorkerLock);
+
+	/* Register the new dynamic worker. */
+	memset(&bgw, 0, sizeof(bgw));
+	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS |
+		BGWORKER_BACKEND_DATABASE_CONNECTION;
+	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres");
+	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "ApplyWorkerMain");
+	/* a valid relid is reflected in the worker's name */
+	if (OidIsValid(relid))
+		snprintf(bgw.bgw_name, BGW_MAXLEN,
+				 "logical replication worker for subscription %u sync %u", subid, relid);
+	else
+		snprintf(bgw.bgw_name, BGW_MAXLEN,
+				 "logical replication worker for subscription %u", subid);
+	snprintf(bgw.bgw_type, BGW_MAXLEN, "logical replication worker");
+
+	bgw.bgw_restart_time = BGW_NEVER_RESTART;
+	bgw.bgw_notify_pid = MyProcPid;
+	/* the index of the prepared slot is handed over as the main argument */
+	bgw.bgw_main_arg = Int32GetDatum(slot);
+
+	if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle))
+	{
+		/* Failed to start worker, so clean up the worker slot. */
+		LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE);
+		Assert(generation == worker->generation);
+		logicalrep_worker_cleanup(worker);
+		LWLockRelease(LogicalRepWorkerLock);
+
+		ereport(WARNING,
+				(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
+				 errmsg("out of background worker slots"),
+				 errhint("You might need to increase max_worker_processes.")));
+		return;
+	}
+
+	/* Now wait until it attaches. */
+	WaitForReplicationWorkerAttach(worker, generation, bgw_handle);
+}
+
+/*
+ * Stop the logical replication worker for subid/relid, if any, and wait until
+ * it detaches from the slot.
+ *
+ * LogicalRepWorkerLock is acquired here in shared mode.  It is released
+ * around each wait and has been released again when the function returns.
+ */
+void
+logicalrep_worker_stop(Oid subid, Oid relid)
+{
+	LogicalRepWorker *worker;
+	uint16 generation;
+
+	LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
+
+	worker = logicalrep_worker_find(subid, relid, false);
+
+	/* No worker, nothing to do. */
+	if (!worker)
+	{
+		LWLockRelease(LogicalRepWorkerLock);
+		return;
+	}
+
+	/*
+	 * Remember which generation was our worker so we can check if what we see
+	 * is still the same one.
+	 */
+	generation = worker->generation;
+
+	/*
+	 * If we found a worker but it does not have proc set then it is still
+	 * starting up; wait for it to finish starting and then kill it.
+	 */
+	while (worker->in_use && !worker->proc)
+	{
+		int rc;
+
+		LWLockRelease(LogicalRepWorkerLock);
+
+		/* Wait a bit --- we don't expect to have to wait long. */
+		rc = WaitLatch(MyLatch,
+					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+					   10L, WAIT_EVENT_BGWORKER_STARTUP);
+
+		if (rc & WL_LATCH_SET)
+		{
+			ResetLatch(MyLatch);
+			CHECK_FOR_INTERRUPTS();
+		}
+
+		/* Recheck worker status. */
+		LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
+
+		/*
+		 * Check whether the worker slot is no longer used, which would mean
+		 * that the worker has exited, or whether the worker generation is
+		 * different, meaning that a different worker has taken the slot.
+		 */
+		if (!worker->in_use || worker->generation != generation)
+		{
+			LWLockRelease(LogicalRepWorkerLock);
+			return;
+		}
+
+		/* Worker has assigned proc, so it has started. */
+		if (worker->proc)
+			break;
+	}
+
+	/* Now terminate the worker ... */
+	kill(worker->proc->pid, SIGTERM);
+
+	/* ... and wait for it to die. */
+	for (;;)
+	{
+		int rc;
+
+		/* is it gone?
*/
+		if (!worker->proc || worker->generation != generation)
+			break;
+
+		/* don't hold the lock while sleeping */
+		LWLockRelease(LogicalRepWorkerLock);
+
+		/* Wait a bit --- we don't expect to have to wait long. */
+		rc = WaitLatch(MyLatch,
+					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+					   10L, WAIT_EVENT_BGWORKER_SHUTDOWN);
+
+		if (rc & WL_LATCH_SET)
+		{
+			ResetLatch(MyLatch);
+			CHECK_FOR_INTERRUPTS();
+		}
+
+		/* re-take the lock before looking at the slot again */
+		LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
+	}
+
+	LWLockRelease(LogicalRepWorkerLock);
+}
+
+/*
+ * Wake up (using latch) any logical replication worker for specified sub/rel.
+ *
+ * Only a worker that has its proc set is looked up (only_running = true);
+ * nothing happens if there is none.  Takes LogicalRepWorkerLock in shared
+ * mode for the duration of the call.
+ */
+void
+logicalrep_worker_wakeup(Oid subid, Oid relid)
+{
+	LogicalRepWorker *worker;
+
+	LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
+
+	worker = logicalrep_worker_find(subid, relid, true);
+
+	if (worker)
+		logicalrep_worker_wakeup_ptr(worker);
+
+	LWLockRelease(LogicalRepWorkerLock);
+}
+
+/*
+ * Wake up (using latch) the specified logical replication worker.
+ *
+ * Caller must hold lock, else worker->proc could change under us.
+ *
+ * worker->proc is dereferenced without a NULL check, so the worker must have
+ * its proc set.
+ */
+void
+logicalrep_worker_wakeup_ptr(LogicalRepWorker *worker)
+{
+	Assert(LWLockHeldByMe(LogicalRepWorkerLock));
+
+	SetLatch(&worker->proc->procLatch);
+}
+
+/*
+ * Attach to a slot.
+ */
+void
+logicalrep_worker_attach(int slot)
+{
+	/* Block concurrent access.
*/ + LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE); + + Assert(slot >= 0 && slot < max_logical_replication_workers); + MyLogicalRepWorker = &LogicalRepCtx->workers[slot]; + + if (!MyLogicalRepWorker->in_use) + { + LWLockRelease(LogicalRepWorkerLock); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication worker slot %d is empty, cannot attach", + slot))); + } + + if (MyLogicalRepWorker->proc) + { + LWLockRelease(LogicalRepWorkerLock); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication worker slot %d is already used by " + "another worker, cannot attach", slot))); + } + + MyLogicalRepWorker->proc = MyProc; + before_shmem_exit(logicalrep_worker_onexit, (Datum) 0); + + LWLockRelease(LogicalRepWorkerLock); +} + +/* + * Detach the worker (cleans up the worker info). + */ +static void +logicalrep_worker_detach(void) +{ + /* Block concurrent access. */ + LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE); + + logicalrep_worker_cleanup(MyLogicalRepWorker); + + LWLockRelease(LogicalRepWorkerLock); +} + +/* + * Clean up worker info. + */ +static void +logicalrep_worker_cleanup(LogicalRepWorker *worker) +{ + Assert(LWLockHeldByMeInMode(LogicalRepWorkerLock, LW_EXCLUSIVE)); + + worker->in_use = false; + worker->proc = NULL; + worker->dbid = InvalidOid; + worker->userid = InvalidOid; + worker->subid = InvalidOid; + worker->relid = InvalidOid; +} + +/* + * Cleanup function for logical replication launcher. + * + * Called on logical replication launcher exit. + */ +static void +logicalrep_launcher_onexit(int code, Datum arg) +{ + LogicalRepCtx->launcher_pid = 0; +} + +/* + * Cleanup function. + * + * Called on logical replication worker exit. + */ +static void +logicalrep_worker_onexit(int code, Datum arg) +{ + /* Disconnect gracefully from the remote side. 
*/ + if (LogRepWorkerWalRcvConn) + walrcv_disconnect(LogRepWorkerWalRcvConn); + + logicalrep_worker_detach(); + + ApplyLauncherWakeup(); +} + +/* + * Count the number of registered (not necessarily running) sync workers + * for a subscription. + */ +int +logicalrep_sync_worker_count(Oid subid) +{ + int i; + int res = 0; + + Assert(LWLockHeldByMe(LogicalRepWorkerLock)); + + /* Search for attached worker for a given subscription id. */ + for (i = 0; i < max_logical_replication_workers; i++) + { + LogicalRepWorker *w = &LogicalRepCtx->workers[i]; + + if (w->subid == subid && OidIsValid(w->relid)) + res++; + } + + return res; +} + +/* + * ApplyLauncherShmemSize + * Compute space needed for replication launcher shared memory + */ +Size +ApplyLauncherShmemSize(void) +{ + Size size; + + /* + * Need the fixed struct and the array of LogicalRepWorker. + */ + size = sizeof(LogicalRepCtxStruct); + size = MAXALIGN(size); + size = add_size(size, mul_size(max_logical_replication_workers, + sizeof(LogicalRepWorker))); + return size; +} + +/* + * ApplyLauncherRegister + * Register a background worker running the logical replication launcher. 
+ */
+void
+ApplyLauncherRegister(void)
+{
+	BackgroundWorker bgw;
+
+	if (max_logical_replication_workers == 0)
+		return;
+
+	memset(&bgw, 0, sizeof(bgw));
+	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS |
+		BGWORKER_BACKEND_DATABASE_CONNECTION;
+	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres");
+	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "ApplyLauncherMain");
+	snprintf(bgw.bgw_name, BGW_MAXLEN,
+			 "logical replication launcher");
+	snprintf(bgw.bgw_type, BGW_MAXLEN,
+			 "logical replication launcher");
+	bgw.bgw_restart_time = 5;
+	bgw.bgw_notify_pid = 0;
+	bgw.bgw_main_arg = (Datum) 0;
+
+	RegisterBackgroundWorker(&bgw);
+}
+
+/*
+ * ApplyLauncherShmemInit
+ *		Allocate and initialize replication launcher shared memory
+ *
+ * The area is zeroed and the per-slot spinlocks are initialized only when
+ * ShmemInitStruct reports that the struct did not exist yet.
+ */
+void
+ApplyLauncherShmemInit(void)
+{
+	bool found;
+
+	LogicalRepCtx = (LogicalRepCtxStruct *)
+		ShmemInitStruct("Logical Replication Launcher Data",
+						ApplyLauncherShmemSize(),
+						&found);
+
+	if (!found)
+	{
+		int slot;
+
+		memset(LogicalRepCtx, 0, ApplyLauncherShmemSize());
+
+		/* Zero each worker slot and initialize its relmutex spinlock. */
+		for (slot = 0; slot < max_logical_replication_workers; slot++)
+		{
+			LogicalRepWorker *worker = &LogicalRepCtx->workers[slot];
+
+			memset(worker, 0, sizeof(LogicalRepWorker));
+			SpinLockInit(&worker->relmutex);
+		}
+	}
+}
+
+/*
+ * Wakeup the launcher on commit if requested.
+ *
+ * The pending request is cleared at every transaction end, whether it
+ * committed or not.
+ */
+void
+AtEOXact_ApplyLauncher(bool isCommit)
+{
+	if (isCommit)
+	{
+		if (on_commit_launcher_wakeup)
+			ApplyLauncherWakeup();
+	}
+
+	on_commit_launcher_wakeup = false;
+}
+
+/*
+ * Request wakeup of the launcher on commit of the transaction.
+ *
+ * This is used to send launcher signal to stop sleeping and process the
+ * subscriptions when current transaction commits. Should be used when new
+ * tuple was added to the pg_subscription catalog.
+*/
+void
+ApplyLauncherWakeupAtCommit(void)
+{
+	if (!on_commit_launcher_wakeup)
+		on_commit_launcher_wakeup = true;
+}
+
+static void
+ApplyLauncherWakeup(void)
+{
+	if (LogicalRepCtx->launcher_pid != 0)
+		kill(LogicalRepCtx->launcher_pid, SIGUSR1);
+}
+
+/*
+ * Main loop for the apply launcher process.
+ *
+ * Publishes MyProcPid as launcher_pid in shared memory, then loops forever:
+ * at most once per wal_retrieve_retry_interval it reads the subscription
+ * list and launches a worker for every enabled subscription that has none,
+ * after which it sleeps on its latch until woken up or timed out.
+ */
+void
+ApplyLauncherMain(Datum main_arg)
+{
+	TimestampTz last_start_time = 0;
+
+	ereport(DEBUG1,
+			(errmsg_internal("logical replication launcher started")));
+
+	before_shmem_exit(logicalrep_launcher_onexit, (Datum) 0);
+
+	Assert(LogicalRepCtx->launcher_pid == 0);
+	LogicalRepCtx->launcher_pid = MyProcPid;
+
+	/* Establish signal handlers. */
+	pqsignal(SIGHUP, SignalHandlerForConfigReload);
+	pqsignal(SIGTERM, die);
+	BackgroundWorkerUnblockSignals();
+
+	/*
+	 * Establish connection to nailed catalogs (we only ever access
+	 * pg_subscription).
+	 */
+	BackgroundWorkerInitializeConnection(NULL, NULL, 0);
+
+	/* Enter main loop */
+	for (;;)
+	{
+		int rc;
+		List *sublist;
+		ListCell *lc;
+		MemoryContext subctx;
+		MemoryContext oldctx;
+		TimestampTz now;
+		long wait_time = DEFAULT_NAPTIME_PER_CYCLE;
+
+		CHECK_FOR_INTERRUPTS();
+
+		now = GetCurrentTimestamp();
+
+		/* Limit worker start attempts to once per wal_retrieve_retry_interval */
+		if (TimestampDifferenceExceeds(last_start_time, now,
+									   wal_retrieve_retry_interval))
+		{
+			/* Use a temporary context for the subscription list. */
+			subctx = AllocSetContextCreate(TopMemoryContext,
+										   "Logical Replication Launcher sublist",
+										   ALLOCSET_DEFAULT_SIZES);
+			oldctx = MemoryContextSwitchTo(subctx);
+
+			/* Fetch the list of subscriptions. */
+			sublist = get_subscription_list();
+
+			/* Start the missing workers for enabled subscriptions. */
+			foreach(lc, sublist)
+			{
+				Subscription *sub = (Subscription *) lfirst(lc);
+				LogicalRepWorker *w;
+
+				if (!sub->enabled)
+					continue;
+
+				LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
+				w = logicalrep_worker_find(sub->oid, InvalidOid, false);
+				LWLockRelease(LogicalRepWorkerLock);
+
+				if (w == NULL)
+				{
+					last_start_time = now;
+					wait_time = wal_retrieve_retry_interval;
+
+					logicalrep_worker_launch(sub->dbid, sub->oid, sub->name,
+											 sub->owner, InvalidOid);
+				}
+			}
+
+			/* Switch back to original memory context. */
+			MemoryContextSwitchTo(oldctx);
+			/* Clean the temporary memory. */
+			MemoryContextDelete(subctx);
+		}
+		else
+		{
+			/*
+			 * The wait in previous cycle was interrupted in less than
+			 * wal_retrieve_retry_interval since last worker was started, this
+			 * usually means crash of the worker, so we should retry in
+			 * wal_retrieve_retry_interval again.
+			 */
+			wait_time = wal_retrieve_retry_interval;
+		}
+
+		/* Wait for more work. */
+		rc = WaitLatch(MyLatch,
+					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+					   wait_time,
+					   WAIT_EVENT_LOGICAL_LAUNCHER_MAIN);
+
+		if (rc & WL_LATCH_SET)
+		{
+			ResetLatch(MyLatch);
+			CHECK_FOR_INTERRUPTS();
+		}
+
+		if (ConfigReloadPending)
+		{
+			ConfigReloadPending = false;
+			ProcessConfigFile(PGC_SIGHUP);
+		}
+	}
+
+	/* Not reachable */
+}
+
+/*
+ * Is current process the logical replication launcher?
+ *
+ * Decided by comparing MyProcPid with the launcher pid in shared memory.
+ */
+bool
+IsLogicalLauncher(void)
+{
+	return LogicalRepCtx->launcher_pid == MyProcPid;
+}
+
+/*
+ * Returns state of the subscriptions.
+ *
+ * Set-returning function using materialize mode.  Argument 0 is a
+ * subscription OID, or NULL to report on all subscriptions.  A row is
+ * produced from a worker slot only if the slot has a proc whose pid is a
+ * live backend.
+ */
+Datum
+pg_stat_get_subscription(PG_FUNCTION_ARGS)
+{
+#define PG_STAT_GET_SUBSCRIPTION_COLS 8
+	Oid subid = PG_ARGISNULL(0) ? InvalidOid : PG_GETARG_OID(0);
+	int i;
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	TupleDesc tupdesc;
+	Tuplestorestate *tupstore;
+	MemoryContext per_query_ctx;
+	MemoryContext oldcontext;
+
+	/* check to see if caller supports us returning a tuplestore */
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+	if (!(rsinfo->allowedModes & SFRM_Materialize))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not allowed in this context")));
+
+	/* Build a tuple descriptor for our result type */
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+	oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+	tupstore = tuplestore_begin_heap(true, false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = tupstore;
+	rsinfo->setDesc = tupdesc;
+
+	MemoryContextSwitchTo(oldcontext);
+
+	/* Make sure we get consistent view of the workers. */
+	LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
+
+	for (i = 0; i < max_logical_replication_workers; i++)
+	{
+		/* for each row */
+		Datum values[PG_STAT_GET_SUBSCRIPTION_COLS];
+		bool nulls[PG_STAT_GET_SUBSCRIPTION_COLS];
+		int worker_pid;
+		LogicalRepWorker worker;
+
+		memcpy(&worker, &LogicalRepCtx->workers[i],
+			   sizeof(LogicalRepWorker));
+		if (!worker.proc || !IsBackendPid(worker.proc->pid))
+			continue;
+
+		if (OidIsValid(subid) && worker.subid != subid)
+			continue;
+
+		worker_pid = worker.proc->pid;
+
+		MemSet(values, 0, sizeof(values));
+		MemSet(nulls, 0, sizeof(nulls));
+
+		values[0] = ObjectIdGetDatum(worker.subid);
+		if (OidIsValid(worker.relid))
+			values[1] = ObjectIdGetDatum(worker.relid);
+		else
+			nulls[1] = true;
+		values[2] = Int32GetDatum(worker_pid);
+		if (XLogRecPtrIsInvalid(worker.last_lsn))
+			nulls[3] = true;
+		else
+			values[3] = LSNGetDatum(worker.last_lsn);
+		if (worker.last_send_time == 0)
+			nulls[4] = true;
+		else
+			values[4] = TimestampTzGetDatum(worker.last_send_time);
+		if (worker.last_recv_time == 0)
+			nulls[5] = true;
+		else
+			values[5] = TimestampTzGetDatum(worker.last_recv_time);
+		if (XLogRecPtrIsInvalid(worker.reply_lsn))
+			nulls[6] = true;
+		else
+			values[6] = LSNGetDatum(worker.reply_lsn);
+		if (worker.reply_time == 0)
+			nulls[7] = true;
+		else
+			values[7] = TimestampTzGetDatum(worker.reply_time);
+
+		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+
+		/*
+		 * If only a single subscription was requested, and we found it,
+		 * break.
+		 *
+		 * NOTE(review): this stops after the first row emitted, so when a
+		 * subscription OID is given at most one worker row is returned even
+		 * if several slots carry that subid -- confirm this is intended.
+		 */
+		if (OidIsValid(subid))
+			break;
+	}
+
+	LWLockRelease(LogicalRepWorkerLock);
+
+	/* clean up and return the tuplestore */
+	tuplestore_donestoring(tupstore);
+
+	return (Datum) 0;
+}
diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c
new file mode 100644
index 0000000..f7d1491
--- /dev/null
+++ b/src/backend/replication/logical/logical.c
@@ -0,0 +1,1840 @@
+/*-------------------------------------------------------------------------
+ * logical.c
+ *	   PostgreSQL logical decoding coordination
+ *
+ * Copyright (c) 2012-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/replication/logical/logical.c
+ *
+ * NOTES
+ *	  This file coordinates interaction between the various modules that
+ *	  together provide logical decoding, primarily by providing so
+ *	  called LogicalDecodingContexts. The goal is to encapsulate most of the
+ *	  internal complexity for consumers of logical decoding, so they can
+ *	  create and consume a changestream with a low amount of code. Builtin
+ *	  consumers are the walsender and SQL SRF interface, but it's possible to
+ *	  add further ones without changing core code, e.g. to consume changes in
+ *	  a bgworker.
+ *
+ *	  The idea is that a consumer provides three callbacks, one to read WAL,
+ *	  one to prepare a data write, and a final one for actually writing since
+ *	  their implementation depends on the type of consumer.  Check
+ *	  logicalfuncs.c for an example implementation of a fairly simple consumer
+ *	  and an implementation of a WAL reading callback that's suitable for
+ *	  simple consumers.
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/xact.h"
+#include "access/xlog_internal.h"
+#include "fmgr.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "replication/decode.h"
+#include "replication/logical.h"
+#include "replication/origin.h"
+#include "replication/reorderbuffer.h"
+#include "replication/snapbuild.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+
+/* data for errcontext callback */
+typedef struct LogicalErrorCallbackState
+{
+	LogicalDecodingContext *ctx;
+	const char *callback_name;
+	XLogRecPtr report_location;
+} LogicalErrorCallbackState;
+
+/* wrappers around output plugin callbacks */
+static void output_plugin_error_callback(void *arg);
+static void startup_cb_wrapper(LogicalDecodingContext *ctx, OutputPluginOptions *opt,
+							   bool is_init);
+static void shutdown_cb_wrapper(LogicalDecodingContext *ctx);
+static void begin_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn);
+static void commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+							  XLogRecPtr commit_lsn);
+static void begin_prepare_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn);
+static void prepare_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+							   XLogRecPtr prepare_lsn);
+static void commit_prepared_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									   XLogRecPtr commit_lsn);
+static void rollback_prepared_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+										 XLogRecPtr prepare_end_lsn, TimestampTz prepare_time);
+static void change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+							  Relation relation, ReorderBufferChange *change);
+static void truncate_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+								int nrelations, Relation relations[], ReorderBufferChange *change);
+static void message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+							   XLogRecPtr message_lsn, bool transactional,
+							   const char *prefix, Size message_size, const char *message);
+
+/* streaming callbacks */
+static void stream_start_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									XLogRecPtr first_lsn);
+static void stream_stop_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+								   XLogRecPtr last_lsn);
+static void stream_abort_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									XLogRecPtr abort_lsn);
+static void stream_prepare_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									  XLogRecPtr prepare_lsn);
+static void stream_commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									 XLogRecPtr commit_lsn);
+static void stream_change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									 Relation relation, ReorderBufferChange *change);
+static void stream_message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									  XLogRecPtr message_lsn, bool transactional,
+									  const char *prefix, Size message_size, const char *message);
+static void stream_truncate_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
+									   int nrelations, Relation relations[], ReorderBufferChange *change);
+
+static void LoadOutputPlugin(OutputPluginCallbacks *callbacks, const char *plugin);
+
+/*
+ * Make sure the current settings & environment are capable of doing logical
+ * decoding.
+ *
+ * Raises an ERROR if wal_level is below logical, if there is no database
+ * connection, or if recovery is in progress.
+ */
+void
+CheckLogicalDecodingRequirements(void)
+{
+	CheckSlotRequirements();
+
+	/*
+	 * NB: Adding a new requirement likely means that RestoreSlotFromDisk()
+	 * needs the same check.
+	 */
+
+	if (wal_level < WAL_LEVEL_LOGICAL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("logical decoding requires wal_level >= logical")));
+
+	if (MyDatabaseId == InvalidOid)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("logical decoding requires a database connection")));
+
+	/* ----
+	 * TODO: We've got to change that someday soon...
+	 *
+	 * There are basically three things missing to allow this:
+	 * 1) We need to be able to correctly and quickly identify the timeline a
+	 *	  LSN belongs to
+	 * 2) We need to force hot_standby_feedback to be enabled at all times so
+	 *	  the primary cannot remove rows we need.
+	 * 3) support dropping replication slots referring to a database, in
+	 *	  dbase_redo. There can't be any active ones due to HS recovery
+	 *	  conflicts, so that should be relatively easy.
+	 * ----
+	 */
+	if (RecoveryInProgress())
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("logical decoding cannot be used while in recovery")));
+}
+
+/*
+ * Helper function for CreateInitDecodingContext() and
+ * CreateDecodingContext() performing common tasks.
+ *
+ * Allocates the context in a new memory context created below
+ * CurrentMemoryContext, loads the slot's output plugin unless fast_forward
+ * is set, creates the WAL reader, reorder buffer and snapshot builder, and
+ * installs the callback wrappers.  The caller's memory context is restored
+ * before returning.
+ */
+static LogicalDecodingContext *
+StartupDecodingContext(List *output_plugin_options,
+					   XLogRecPtr start_lsn,
+					   TransactionId xmin_horizon,
+					   bool need_full_snapshot,
+					   bool fast_forward,
+					   XLogReaderRoutine *xl_routine,
+					   LogicalOutputPluginWriterPrepareWrite prepare_write,
+					   LogicalOutputPluginWriterWrite do_write,
+					   LogicalOutputPluginWriterUpdateProgress update_progress)
+{
+	ReplicationSlot *slot;
+	MemoryContext context,
+				old_context;
+	LogicalDecodingContext *ctx;
+
+	/* local alias for MyReplicationSlot, to keep lines short */
+	slot = MyReplicationSlot;
+
+	context = AllocSetContextCreate(CurrentMemoryContext,
+									"Logical decoding context",
+									ALLOCSET_DEFAULT_SIZES);
+	old_context = MemoryContextSwitchTo(context);
+	ctx = palloc0(sizeof(LogicalDecodingContext));
+
+	ctx->context = context;
+
+	/*
+	 * (re-)load output plugins, so we detect a bad (removed) output plugin
+	 * now.
+	 */
+	if (!fast_forward)
+		LoadOutputPlugin(&ctx->callbacks, NameStr(slot->data.plugin));
+
+	/*
+	 * Now that the slot's xmin has been set, we can announce ourselves as a
+	 * logical decoding backend which doesn't need to be checked individually
+	 * when computing the xmin horizon because the xmin is enforced via
+	 * replication slots.
+	 *
+	 * We can only do so if we're outside of a transaction (i.e. the case when
+	 * streaming changes via walsender), otherwise an already setup
+	 * snapshot/xid would end up being ignored. That's not a particularly
+	 * bothersome restriction since the SQL interface can't be used for
+	 * streaming anyway.
+	 */
+	if (!IsTransactionOrTransactionBlock())
+	{
+		LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+		MyProc->statusFlags |= PROC_IN_LOGICAL_DECODING;
+		ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags;
+		LWLockRelease(ProcArrayLock);
+	}
+
+	ctx->slot = slot;
+
+	ctx->reader = XLogReaderAllocate(wal_segment_size, NULL, xl_routine, ctx);
+	if (!ctx->reader)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+
+	ctx->reorder = ReorderBufferAllocate();
+	ctx->snapshot_builder =
+		AllocateSnapshotBuilder(ctx->reorder, xmin_horizon, start_lsn,
+								need_full_snapshot, slot->data.initial_consistent_point);
+
+	ctx->reorder->private_data = ctx;
+
+	/* wrap output plugin callbacks, so we can add error context information */
+	ctx->reorder->begin = begin_cb_wrapper;
+	ctx->reorder->apply_change = change_cb_wrapper;
+	ctx->reorder->apply_truncate = truncate_cb_wrapper;
+	ctx->reorder->commit = commit_cb_wrapper;
+	ctx->reorder->message = message_cb_wrapper;
+
+	/*
+	 * To support streaming, we require start/stop/abort/commit/change
+	 * callbacks. The message and truncate callbacks are optional, similar to
+	 * regular output plugins. We however enable streaming when at least one
+	 * of the methods is enabled so that we can easily identify missing
+	 * methods.
+	 *
+	 * We decide it here, but only check it later in the wrappers.
+	 */
+	ctx->streaming = (ctx->callbacks.stream_start_cb != NULL) ||
+		(ctx->callbacks.stream_stop_cb != NULL) ||
+		(ctx->callbacks.stream_abort_cb != NULL) ||
+		(ctx->callbacks.stream_commit_cb != NULL) ||
+		(ctx->callbacks.stream_change_cb != NULL) ||
+		(ctx->callbacks.stream_message_cb != NULL) ||
+		(ctx->callbacks.stream_truncate_cb != NULL);
+
+	/*
+	 * streaming callbacks
+	 *
+	 * stream_message and stream_truncate callbacks are optional, so we do not
+	 * fail with ERROR when missing, but the wrappers simply do nothing. We
+	 * must set the ReorderBuffer callbacks to something, otherwise the calls
+	 * from there will crash (we don't want to move the checks there).
+	 */
+	ctx->reorder->stream_start = stream_start_cb_wrapper;
+	ctx->reorder->stream_stop = stream_stop_cb_wrapper;
+	ctx->reorder->stream_abort = stream_abort_cb_wrapper;
+	ctx->reorder->stream_prepare = stream_prepare_cb_wrapper;
+	ctx->reorder->stream_commit = stream_commit_cb_wrapper;
+	ctx->reorder->stream_change = stream_change_cb_wrapper;
+	ctx->reorder->stream_message = stream_message_cb_wrapper;
+	ctx->reorder->stream_truncate = stream_truncate_cb_wrapper;
+
+
+	/*
+	 * To support two-phase logical decoding, we require
+	 * begin_prepare/prepare/commit_prepared/rollback_prepared callbacks. The
+	 * filter_prepare callback is optional. We however enable two-phase
+	 * logical decoding when at least one of the methods is enabled so that we
+	 * can easily identify missing methods.
+	 *
+	 * We decide it here, but only check it later in the wrappers.
+	 */
+	ctx->twophase = (ctx->callbacks.begin_prepare_cb != NULL) ||
+		(ctx->callbacks.prepare_cb != NULL) ||
+		(ctx->callbacks.commit_prepared_cb != NULL) ||
+		(ctx->callbacks.rollback_prepared_cb != NULL) ||
+		(ctx->callbacks.stream_prepare_cb != NULL) ||
+		(ctx->callbacks.filter_prepare_cb != NULL);
+
+	/*
+	 * Callbacks to support decoding at prepare time.
+	 */
+	ctx->reorder->begin_prepare = begin_prepare_cb_wrapper;
+	ctx->reorder->prepare = prepare_cb_wrapper;
+	ctx->reorder->commit_prepared = commit_prepared_cb_wrapper;
+	ctx->reorder->rollback_prepared = rollback_prepared_cb_wrapper;
+
+	ctx->out = makeStringInfo();
+	ctx->prepare_write = prepare_write;
+	ctx->write = do_write;
+	ctx->update_progress = update_progress;
+
+	ctx->output_plugin_options = output_plugin_options;
+
+	ctx->fast_forward = fast_forward;
+
+	MemoryContextSwitchTo(old_context);
+
+	return ctx;
+}
+
+/*
+ * Create a new decoding context, for a new logical slot.
+ *
+ * plugin -- contains the name of the output plugin
+ * output_plugin_options -- contains options passed to the output plugin
+ * need_full_snapshot -- if true, must obtain a snapshot able to read all
+ *		tables; if false, one that can read only catalogs is acceptable.
+ * restart_lsn -- if given as invalid, it's this routine's responsibility to
+ *		mark WAL as reserved by setting a convenient restart_lsn for the slot.
+ *		Otherwise, we set for decoding to start from the given LSN without
+ *		marking WAL reserved beforehand.  In that scenario, it's up to the
+ *		caller to guarantee that WAL remains available.
+ * xl_routine -- XLogReaderRoutine for underlying XLogReader
+ * prepare_write, do_write, update_progress --
+ *		callbacks that perform the use-case dependent, actual, work.
+ *
+ * Needs to be called while in a memory context that's at least as long lived
+ * as the decoding context because further memory contexts will be created
+ * inside it.
+ *
+ * Returns an initialized decoding context after calling the output plugin's
+ * startup function.
+ */
+LogicalDecodingContext *
+CreateInitDecodingContext(const char *plugin,
+						  List *output_plugin_options,
+						  bool need_full_snapshot,
+						  XLogRecPtr restart_lsn,
+						  XLogReaderRoutine *xl_routine,
+						  LogicalOutputPluginWriterPrepareWrite prepare_write,
+						  LogicalOutputPluginWriterWrite do_write,
+						  LogicalOutputPluginWriterUpdateProgress update_progress)
+{
+	TransactionId xmin_horizon = InvalidTransactionId;
+	ReplicationSlot *slot;
+	NameData plugin_name;
+	LogicalDecodingContext *ctx;
+	MemoryContext old_context;
+
+	/* local alias for MyReplicationSlot, to keep lines short */
+	slot = MyReplicationSlot;
+
+	/* first some sanity checks that are unlikely to be violated */
+	if (slot == NULL)
+		elog(ERROR, "cannot perform logical decoding without an acquired slot");
+
+	if (plugin == NULL)
+		elog(ERROR, "cannot initialize logical decoding without a specified plugin");
+
+	/* Make sure the passed slot is suitable. These are user facing errors. */
+	if (SlotIsPhysical(slot))
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("cannot use physical replication slot for logical decoding")));
+
+	if (slot->data.database != MyDatabaseId)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("replication slot \"%s\" was not created in this database",
+						NameStr(slot->data.name))));
+
+	if (IsTransactionState() &&
+		GetTopTransactionIdIfAny() != InvalidTransactionId)
+		ereport(ERROR,
+				(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
+				 errmsg("cannot create logical replication slot in transaction that has performed writes")));
+
+	/*
+	 * Register output plugin name with slot.  We need the mutex to avoid
+	 * concurrent reading of a partially copied string.  But we don't want any
+	 * complicated code while holding a spinlock, so do namestrcpy() outside.
+	 */
+	namestrcpy(&plugin_name, plugin);
+	SpinLockAcquire(&slot->mutex);
+	slot->data.plugin = plugin_name;
+	SpinLockRelease(&slot->mutex);
+
+	if (XLogRecPtrIsInvalid(restart_lsn))
+		ReplicationSlotReserveWal();
+	else
+	{
+		SpinLockAcquire(&slot->mutex);
+		slot->data.restart_lsn = restart_lsn;
+		SpinLockRelease(&slot->mutex);
+	}
+
+	/* ----
+	 * This is a bit tricky: We need to determine a safe xmin horizon to start
+	 * decoding from, to avoid starting from a running xacts record referring
+	 * to xids whose rows have been vacuumed or pruned
+	 * already. GetOldestSafeDecodingTransactionId() returns such a value, but
+	 * without further interlock its return value might immediately be out of
+	 * date.
+	 *
+	 * So we have to acquire the ProcArrayLock to prevent computation of new
+	 * xmin horizons by other backends, get the safe decoding xid, and inform
+	 * the slot machinery about the new limit. Once that's done the
+	 * ProcArrayLock can be released as the slot machinery now is
+	 * protecting against vacuum.
+	 *
+	 * Note that, temporarily, the data, not just the catalog, xmin has to be
+	 * reserved if a data snapshot is to be exported.  Otherwise the initial
+	 * data snapshot created here is not guaranteed to be valid. After that
+	 * the data xmin doesn't need to be managed anymore and the global xmin
+	 * should be recomputed. As we are fine with losing the pegged data xmin
+	 * after crash - no chance a snapshot would get exported anymore - we can
+	 * get away with just setting the slot's
+	 * effective_xmin. ReplicationSlotRelease will reset it again.
+	 *
+	 * ----
+	 */
+	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+	xmin_horizon = GetOldestSafeDecodingTransactionId(!need_full_snapshot);
+
+	SpinLockAcquire(&slot->mutex);
+	slot->effective_catalog_xmin = xmin_horizon;
+	slot->data.catalog_xmin = xmin_horizon;
+	if (need_full_snapshot)
+		slot->effective_xmin = xmin_horizon;
+	SpinLockRelease(&slot->mutex);
+
+	ReplicationSlotsComputeRequiredXmin(true);
+
+	LWLockRelease(ProcArrayLock);
+
+	ReplicationSlotMarkDirty();
+	ReplicationSlotSave();
+
+	ctx = StartupDecodingContext(NIL, restart_lsn, xmin_horizon,
+								 need_full_snapshot, false,
+								 xl_routine, prepare_write, do_write,
+								 update_progress);
+
+	/* call output plugin initialization callback, inside the context's memory context */
+	old_context = MemoryContextSwitchTo(ctx->context);
+	if (ctx->callbacks.startup_cb != NULL)
+		startup_cb_wrapper(ctx, &ctx->options, true);
+	MemoryContextSwitchTo(old_context);
+
+	/*
+	 * We allow decoding of prepared transactions iff the two_phase option is
+	 * enabled at the time of slot creation.  (The flag computed from the
+	 * plugin's callbacks is ANDed with the slot's two_phase setting.)
+	 */
+	ctx->twophase &= MyReplicationSlot->data.two_phase;
+
+	ctx->reorder->output_rewrites = ctx->options.receive_rewrites;
+
+	return ctx;
+}
+
+/*
+ * Create a new decoding context, for a logical slot that has previously been
+ * used already.
+ *
+ * start_lsn
+ *		The LSN at which to start decoding.  If InvalidXLogRecPtr, restart
+ *		from the slot's confirmed_flush; otherwise, start from the specified
+ *		location (but move it forwards to confirmed_flush if it's older than
+ *		that, see below).
+ *
+ * output_plugin_options
+ *		options passed to the output plugin.
+ *
+ * fast_forward
+ *		bypass the generation of logical changes.
+ *
+ * xl_routine
+ *		XLogReaderRoutine used by underlying xlogreader
+ *
+ * prepare_write, do_write, update_progress
+ *		callbacks that have to be filled to perform the use-case dependent,
+ *		actual work.
+ *
+ * Needs to be called while in a memory context that's at least as long lived
+ * as the decoding context because further memory contexts will be created
+ * inside it.
+ *
+ * Returns an initialized decoding context after calling the output plugin's
+ * startup function.
+ */
+LogicalDecodingContext *
+CreateDecodingContext(XLogRecPtr start_lsn,
+					  List *output_plugin_options,
+					  bool fast_forward,
+					  XLogReaderRoutine *xl_routine,
+					  LogicalOutputPluginWriterPrepareWrite prepare_write,
+					  LogicalOutputPluginWriterWrite do_write,
+					  LogicalOutputPluginWriterUpdateProgress update_progress)
+{
+	LogicalDecodingContext *ctx;
+	ReplicationSlot *slot;
+	MemoryContext old_context;
+
+	/* local alias for MyReplicationSlot, to keep lines short */
+	slot = MyReplicationSlot;
+
+	/* first some sanity checks that are unlikely to be violated */
+	if (slot == NULL)
+		elog(ERROR, "cannot perform logical decoding without an acquired slot");
+
+	/* make sure the passed slot is suitable, these are user facing errors */
+	if (SlotIsPhysical(slot))
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("cannot use physical replication slot for logical decoding")));
+
+	if (slot->data.database != MyDatabaseId)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("replication slot \"%s\" was not created in this database",
+						NameStr(slot->data.name))));
+
+	if (start_lsn == InvalidXLogRecPtr)
+	{
+		/* continue from last position */
+		start_lsn = slot->data.confirmed_flush;
+	}
+	else if (start_lsn < slot->data.confirmed_flush)
+	{
+		/*
+		 * It might seem like we should error out in this case, but it's
+		 * pretty common for a client to acknowledge a LSN it doesn't have to
+		 * do anything for, and thus didn't store persistently, because the
+		 * xlog records didn't result in anything relevant for logical
+		 * decoding. Clients have to be able to do that to support synchronous
+		 * replication.
+		 */
+		elog(DEBUG1, "cannot stream from %X/%X, minimum is %X/%X, forwarding",
+			 LSN_FORMAT_ARGS(start_lsn),
+			 LSN_FORMAT_ARGS(slot->data.confirmed_flush));
+
+		start_lsn = slot->data.confirmed_flush;
+	}
+
+	ctx = StartupDecodingContext(output_plugin_options,
+								 start_lsn, InvalidTransactionId, false,
+								 fast_forward, xl_routine, prepare_write,
+								 do_write, update_progress);
+
+	/* call output plugin initialization callback, inside the context's memory context */
+	old_context = MemoryContextSwitchTo(ctx->context);
+	if (ctx->callbacks.startup_cb != NULL)
+		startup_cb_wrapper(ctx, &ctx->options, false);
+	MemoryContextSwitchTo(old_context);
+
+	/*
+	 * We allow decoding of prepared transactions iff the two_phase option is
+	 * enabled at the time of slot creation.  (The flag computed from the
+	 * plugin's callbacks is ANDed with the slot's two_phase setting.)
+	 */
+	ctx->twophase &= MyReplicationSlot->data.two_phase;
+
+	ctx->reorder->output_rewrites = ctx->options.receive_rewrites;
+
+	ereport(LOG,
+			(errmsg("starting logical decoding for slot \"%s\"",
+					NameStr(slot->data.name)),
+			 errdetail("Streaming transactions committing after %X/%X, reading WAL from %X/%X.",
+					   LSN_FORMAT_ARGS(slot->data.confirmed_flush),
+					   LSN_FORMAT_ARGS(slot->data.restart_lsn))));
+
+	return ctx;
+}
+
+/*
+ * Returns true if a consistent initial decoding snapshot has been built,
+ * i.e. the snapshot builder has reached SNAPBUILD_CONSISTENT.
+ */
+bool
+DecodingContextReady(LogicalDecodingContext *ctx)
+{
+	return SnapBuildCurrentState(ctx->snapshot_builder) == SNAPBUILD_CONSISTENT;
+}
+
+/*
+ * Read from the decoding slot, until it is ready to start extracting changes.
+ *
+ * Reading starts at the slot's restart_lsn.  Once the context is ready, the
+ * slot's confirmed_flush and initial_consistent_point are both set to the
+ * end of the last record read.
+ */
+void
+DecodingContextFindStartpoint(LogicalDecodingContext *ctx)
+{
+	ReplicationSlot *slot = ctx->slot;
+
+	/* Initialize from where to start reading WAL. */
+	XLogBeginRead(ctx->reader, slot->data.restart_lsn);
+
+	elog(DEBUG1, "searching for logical decoding starting point, starting at %X/%X",
+		 LSN_FORMAT_ARGS(slot->data.restart_lsn));
+
+	/* Wait for a consistent starting point */
+	for (;;)
+	{
+		XLogRecord *record;
+		char *err = NULL;
+
+		/* the read_page callback waits for new WAL */
+		record = XLogReadRecord(ctx->reader, &err);
+		if (err)
+			elog(ERROR, "%s", err);
+		if (!record)
+			elog(ERROR, "no record found"); /* shouldn't happen */
+
+		LogicalDecodingProcessRecord(ctx, ctx->reader);
+
+		/* only continue till we found a consistent spot */
+		if (DecodingContextReady(ctx))
+			break;
+
+		CHECK_FOR_INTERRUPTS();
+	}
+
+	SpinLockAcquire(&slot->mutex);
+	slot->data.confirmed_flush = ctx->reader->EndRecPtr;
+	slot->data.initial_consistent_point = ctx->reader->EndRecPtr;
+	SpinLockRelease(&slot->mutex);
+}
+
+/*
+ * Free a previously allocated decoding context, invoking the shutdown
+ * callback if necessary.
+ *
+ * Releases the reorder buffer, the snapshot builder and the WAL reader, and
+ * finally deletes the context's memory context.
+ */
+void
+FreeDecodingContext(LogicalDecodingContext *ctx)
+{
+	if (ctx->callbacks.shutdown_cb != NULL)
+		shutdown_cb_wrapper(ctx);
+
+	ReorderBufferFree(ctx->reorder);
+	FreeSnapshotBuilder(ctx->snapshot_builder);
+	XLogReaderFree(ctx->reader);
+	MemoryContextDelete(ctx->context);
+}
+
+/*
+ * Prepare a write using the context's output routine.
+ *
+ * Raises an ERROR unless ctx->accept_writes is set; on success marks the
+ * context as having a prepared write.
+ */
+void
+OutputPluginPrepareWrite(struct LogicalDecodingContext *ctx, bool last_write)
+{
+	if (!ctx->accept_writes)
+		elog(ERROR, "writes are only accepted in commit, begin and change callbacks");
+
+	ctx->prepare_write(ctx, ctx->write_location, ctx->write_xid, last_write);
+	ctx->prepared_write = true;
+}
+
+/*
+ * Perform a write using the context's output routine.
+ */ +void +OutputPluginWrite(struct LogicalDecodingContext *ctx, bool last_write) +{ + if (!ctx->prepared_write) + elog(ERROR, "OutputPluginPrepareWrite needs to be called before OutputPluginWrite"); + + ctx->write(ctx, ctx->write_location, ctx->write_xid, last_write); + ctx->prepared_write = false; +} + +/* + * Update progress tracking (if supported). + */ +void +OutputPluginUpdateProgress(struct LogicalDecodingContext *ctx) +{ + if (!ctx->update_progress) + return; + + ctx->update_progress(ctx, ctx->write_location, ctx->write_xid); +} + +/* + * Load the output plugin, lookup its output plugin init function, and check + * that it provides the required callbacks. + */ +static void +LoadOutputPlugin(OutputPluginCallbacks *callbacks, const char *plugin) +{ + LogicalOutputPluginInit plugin_init; + + plugin_init = (LogicalOutputPluginInit) + load_external_function(plugin, "_PG_output_plugin_init", false, NULL); + + if (plugin_init == NULL) + elog(ERROR, "output plugins have to declare the _PG_output_plugin_init symbol"); + + /* ask the output plugin to fill the callback struct */ + plugin_init(callbacks); + + if (callbacks->begin_cb == NULL) + elog(ERROR, "output plugins have to register a begin callback"); + if (callbacks->change_cb == NULL) + elog(ERROR, "output plugins have to register a change callback"); + if (callbacks->commit_cb == NULL) + elog(ERROR, "output plugins have to register a commit callback"); +} + +static void +output_plugin_error_callback(void *arg) +{ + LogicalErrorCallbackState *state = (LogicalErrorCallbackState *) arg; + + /* not all callbacks have an associated LSN */ + if (state->report_location != InvalidXLogRecPtr) + errcontext("slot \"%s\", output plugin \"%s\", in the %s callback, associated LSN %X/%X", + NameStr(state->ctx->slot->data.name), + NameStr(state->ctx->slot->data.plugin), + state->callback_name, + LSN_FORMAT_ARGS(state->report_location)); + else + errcontext("slot \"%s\", output plugin \"%s\", in the %s callback", + 
NameStr(state->ctx->slot->data.name), + NameStr(state->ctx->slot->data.plugin), + state->callback_name); +} + +static void +startup_cb_wrapper(LogicalDecodingContext *ctx, OutputPluginOptions *opt, bool is_init) +{ + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "startup"; + state.report_location = InvalidXLogRecPtr; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = false; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ctx->callbacks.startup_cb(ctx, opt, is_init); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +shutdown_cb_wrapper(LogicalDecodingContext *ctx) +{ + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "shutdown"; + state.report_location = InvalidXLogRecPtr; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = false; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ctx->callbacks.shutdown_cb(ctx); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + + +/* + * Callbacks for ReorderBuffer which add in some more information and then call + * output_plugin.h plugins. 
+ */ +static void +begin_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "begin"; + state.report_location = txn->first_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->first_lsn; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ctx->callbacks.begin_cb(ctx, txn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "commit"; + state.report_location = txn->final_lsn; /* beginning of commit record */ + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->end_lsn; /* points to the end of the record */ + ctx->end_xact = true; + + /* do the actual work: call callback */ + ctx->callbacks.commit_cb(ctx, txn, commit_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +/* + * The functionality of begin_prepare is quite similar to begin with the + * exception that this will have gid 
(global transaction id) information which + * can be used by plugin. Now, we thought about extending the existing begin + * but that would break the replication protocol and additionally this looks + * cleaner. + */ +static void +begin_prepare_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when two-phase commits are supported */ + Assert(ctx->twophase); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "begin_prepare"; + state.report_location = txn->first_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->first_lsn; + ctx->end_xact = false; + + /* + * If the plugin supports two-phase commits then begin prepare callback is + * mandatory + */ + if (ctx->callbacks.begin_prepare_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication at prepare time requires a %s callback", + "begin_prepare_cb"))); + + /* do the actual work: call callback */ + ctx->callbacks.begin_prepare_cb(ctx, txn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +prepare_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr prepare_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when two-phase commits are supported */ + Assert(ctx->twophase); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + 
state.callback_name = "prepare"; + state.report_location = txn->final_lsn; /* beginning of prepare record */ + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->end_lsn; /* points to the end of the record */ + ctx->end_xact = true; + + /* + * If the plugin supports two-phase commits then prepare callback is + * mandatory + */ + if (ctx->callbacks.prepare_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication at prepare time requires a %s callback", + "prepare_cb"))); + + /* do the actual work: call callback */ + ctx->callbacks.prepare_cb(ctx, txn, prepare_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +commit_prepared_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when two-phase commits are supported */ + Assert(ctx->twophase); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "commit_prepared"; + state.report_location = txn->final_lsn; /* beginning of commit record */ + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->end_lsn; /* points to the end of the record */ + ctx->end_xact = true; + + /* + * If the plugin support two-phase commits then commit prepared callback + * is mandatory + */ + if 
(ctx->callbacks.commit_prepared_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication at prepare time requires a %s callback", + "commit_prepared_cb"))); + + /* do the actual work: call callback */ + ctx->callbacks.commit_prepared_cb(ctx, txn, commit_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +rollback_prepared_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr prepare_end_lsn, + TimestampTz prepare_time) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when two-phase commits are supported */ + Assert(ctx->twophase); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "rollback_prepared"; + state.report_location = txn->final_lsn; /* beginning of commit record */ + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->end_lsn; /* points to the end of the record */ + ctx->end_xact = true; + + /* + * If the plugin support two-phase commits then rollback prepared callback + * is mandatory + */ + if (ctx->callbacks.rollback_prepared_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication at prepare time requires a %s callback", + "rollback_prepared_cb"))); + + /* do the actual work: call callback */ + ctx->callbacks.rollback_prepared_cb(ctx, txn, prepare_end_lsn, + prepare_time); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + 
Relation relation, ReorderBufferChange *change) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "change"; + state.report_location = change->lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * report this change's lsn so replies from clients can give an up2date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = change->lsn; + + ctx->end_xact = false; + + ctx->callbacks.change_cb(ctx, txn, relation, change); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +truncate_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + int nrelations, Relation relations[], ReorderBufferChange *change) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + if (!ctx->callbacks.truncate_cb) + return; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "truncate"; + state.report_location = change->lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * report this change's lsn so replies from clients can give an up2date + * answer. 
This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = change->lsn; + + ctx->end_xact = false; + + ctx->callbacks.truncate_cb(ctx, txn, nrelations, relations, change); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +bool +filter_prepare_cb_wrapper(LogicalDecodingContext *ctx, TransactionId xid, + const char *gid) +{ + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + bool ret; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "filter_prepare"; + state.report_location = InvalidXLogRecPtr; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = false; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ret = ctx->callbacks.filter_prepare_cb(ctx, xid, gid); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + + return ret; +} + +bool +filter_by_origin_cb_wrapper(LogicalDecodingContext *ctx, RepOriginId origin_id) +{ + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + bool ret; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "filter_by_origin"; + state.report_location = InvalidXLogRecPtr; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = false; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ret = ctx->callbacks.filter_by_origin_cb(ctx, origin_id); + + /* Pop 
the error context stack */ + error_context_stack = errcallback.previous; + + return ret; +} + +static void +message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr message_lsn, bool transactional, + const char *prefix, Size message_size, const char *message) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + if (ctx->callbacks.message_cb == NULL) + return; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "message"; + state.report_location = message_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn != NULL ? txn->xid : InvalidTransactionId; + ctx->write_location = message_lsn; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ctx->callbacks.message_cb(ctx, txn, message_lsn, transactional, prefix, + message_size, message); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_start_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr first_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. 
*/ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_start"; + state.report_location = first_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * report this message's lsn so replies from clients can give an up2date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = first_lsn; + + ctx->end_xact = false; + + /* in streaming mode, stream_start_cb is required */ + if (ctx->callbacks.stream_start_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical streaming requires a %s callback", + "stream_start_cb"))); + + ctx->callbacks.stream_start_cb(ctx, txn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_stop_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr last_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. 
*/ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_stop"; + state.report_location = last_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * report this message's lsn so replies from clients can give an up2date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = last_lsn; + + ctx->end_xact = false; + + /* in streaming mode, stream_stop_cb is required */ + if (ctx->callbacks.stream_stop_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical streaming requires a %s callback", + "stream_stop_cb"))); + + ctx->callbacks.stream_stop_cb(ctx, txn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_abort_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr abort_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. 
*/ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_abort"; + state.report_location = abort_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = abort_lsn; + ctx->end_xact = true; + + /* in streaming mode, stream_abort_cb is required */ + if (ctx->callbacks.stream_abort_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical streaming requires a %s callback", + "stream_abort_cb"))); + + ctx->callbacks.stream_abort_cb(ctx, txn, abort_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_prepare_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr prepare_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* + * We're only supposed to call this when streaming and two-phase commits + * are supported. 
+ */ + Assert(ctx->streaming); + Assert(ctx->twophase); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_prepare"; + state.report_location = txn->final_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->end_lsn; + ctx->end_xact = true; + + /* in streaming mode with two-phase commits, stream_prepare_cb is required */ + if (ctx->callbacks.stream_prepare_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical streaming at prepare time requires a %s callback", + "stream_prepare_cb"))); + + ctx->callbacks.stream_prepare_cb(ctx, txn, prepare_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. 
*/ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_commit"; + state.report_location = txn->final_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->end_lsn; + ctx->end_xact = true; + + /* in streaming mode, stream_commit_cb is required */ + if (ctx->callbacks.stream_commit_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical streaming requires a %s callback", + "stream_commit_cb"))); + + ctx->callbacks.stream_commit_cb(ctx, txn, commit_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_change"; + state.report_location = change->lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * report this change's lsn so replies from clients can give an up2date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. 
+ */ + ctx->write_location = change->lsn; + + ctx->end_xact = false; + + /* in streaming mode, stream_change_cb is required */ + if (ctx->callbacks.stream_change_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical streaming requires a %s callback", + "stream_change_cb"))); + + ctx->callbacks.stream_change_cb(ctx, txn, relation, change); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr message_lsn, bool transactional, + const char *prefix, Size message_size, const char *message) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* this callback is optional */ + if (ctx->callbacks.stream_message_cb == NULL) + return; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_message"; + state.report_location = message_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn != NULL ? 
txn->xid : InvalidTransactionId; + ctx->write_location = message_lsn; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ctx->callbacks.stream_message_cb(ctx, txn, message_lsn, transactional, prefix, + message_size, message); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_truncate_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + int nrelations, Relation relations[], + ReorderBufferChange *change) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* this callback is optional */ + if (!ctx->callbacks.stream_truncate_cb) + return; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_truncate"; + state.report_location = change->lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * report this change's lsn so replies from clients can give an up2date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = change->lsn; + + ctx->end_xact = false; + + ctx->callbacks.stream_truncate_cb(ctx, txn, nrelations, relations, change); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +/* + * Set the required catalog xmin horizon for historic snapshots in the current + * replication slot. 
+ * + * Note that in the most cases, we won't be able to immediately use the xmin + * to increase the xmin horizon: we need to wait till the client has confirmed + * receiving current_lsn with LogicalConfirmReceivedLocation(). + */ +void +LogicalIncreaseXminForSlot(XLogRecPtr current_lsn, TransactionId xmin) +{ + bool updated_xmin = false; + ReplicationSlot *slot; + + slot = MyReplicationSlot; + + Assert(slot != NULL); + + SpinLockAcquire(&slot->mutex); + + /* + * don't overwrite if we already have a newer xmin. This can happen if we + * restart decoding in a slot. + */ + if (TransactionIdPrecedesOrEquals(xmin, slot->data.catalog_xmin)) + { + } + + /* + * If the client has already confirmed up to this lsn, we directly can + * mark this as accepted. This can happen if we restart decoding in a + * slot. + */ + else if (current_lsn <= slot->data.confirmed_flush) + { + slot->candidate_catalog_xmin = xmin; + slot->candidate_xmin_lsn = current_lsn; + + /* our candidate can directly be used */ + updated_xmin = true; + } + + /* + * Only increase if the previous values have been applied, otherwise we + * might never end up updating if the receiver acks too slowly. + */ + else if (slot->candidate_xmin_lsn == InvalidXLogRecPtr) + { + slot->candidate_catalog_xmin = xmin; + slot->candidate_xmin_lsn = current_lsn; + } + SpinLockRelease(&slot->mutex); + + /* candidate already valid with the current flush position, apply */ + if (updated_xmin) + LogicalConfirmReceivedLocation(slot->data.confirmed_flush); +} + +/* + * Mark the minimal LSN (restart_lsn) we need to read to replay all + * transactions that have not yet committed at current_lsn. + * + * Just like LogicalIncreaseXminForSlot this only takes effect when the + * client has confirmed to have received current_lsn. 
+ */ +void +LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart_lsn) +{ + bool updated_lsn = false; + ReplicationSlot *slot; + + slot = MyReplicationSlot; + + Assert(slot != NULL); + Assert(restart_lsn != InvalidXLogRecPtr); + Assert(current_lsn != InvalidXLogRecPtr); + + SpinLockAcquire(&slot->mutex); + + /* don't overwrite if we already have a newer (or equal) restart lsn */ + if (restart_lsn <= slot->data.restart_lsn) + { + } + + /* + * We might have already flushed far enough to directly accept this lsn, + * in this case there is no need to check for existing candidate LSNs + */ + else if (current_lsn <= slot->data.confirmed_flush) + { + slot->candidate_restart_valid = current_lsn; + slot->candidate_restart_lsn = restart_lsn; + + /* our candidate can directly be used */ + updated_lsn = true; + } + + /* + * Only increase if the previous values have been applied, otherwise we + * might never end up updating if the receiver acks too slowly. A missed + * value here will just cause some extra effort after reconnecting. + * + * NOTE(review): unlike the two branches above, the test below is a plain + * if rather than an else-if. It therefore also runs after the newer + * restart lsn no-op branch (where it can record the proposed, older + * values as the candidate if none is pending) and after the direct-accept + * branch (where the candidate was just set, so the failed-to-increase + * message is logged). Confirm whether that is intended.
+ */ + if (slot->candidate_restart_valid == InvalidXLogRecPtr) + { + slot->candidate_restart_valid = current_lsn; + slot->candidate_restart_lsn = restart_lsn; + SpinLockRelease(&slot->mutex); + + elog(DEBUG1, "got new restart lsn %X/%X at %X/%X", + LSN_FORMAT_ARGS(restart_lsn), + LSN_FORMAT_ARGS(current_lsn)); + } + else + { + XLogRecPtr candidate_restart_lsn; + XLogRecPtr candidate_restart_valid; + XLogRecPtr confirmed_flush; + + candidate_restart_lsn = slot->candidate_restart_lsn; + candidate_restart_valid = slot->candidate_restart_valid; + confirmed_flush = slot->data.confirmed_flush; + SpinLockRelease(&slot->mutex); + + elog(DEBUG1, "failed to increase restart lsn: proposed %X/%X, after %X/%X, current candidate %X/%X, current after %X/%X, flushed up to %X/%X", + LSN_FORMAT_ARGS(restart_lsn), + LSN_FORMAT_ARGS(current_lsn), + LSN_FORMAT_ARGS(candidate_restart_lsn), + LSN_FORMAT_ARGS(candidate_restart_valid), + LSN_FORMAT_ARGS(confirmed_flush)); + } + + /* candidates are already valid with the current flush position, apply */ + if (updated_lsn) + LogicalConfirmReceivedLocation(slot->data.confirmed_flush); +} + +/* + * Handle a consumer's confirmation of having received all changes up to lsn. + * + * Always records lsn as the slot's confirmed_flush. If a pending candidate + * catalog xmin or restart LSN has become valid at lsn, it is applied to the + * slot and the slot is marked dirty and saved. + */ +void +LogicalConfirmReceivedLocation(XLogRecPtr lsn) +{ + Assert(lsn != InvalidXLogRecPtr); + + /* Do an unlocked check for candidate_lsn first. 
*/ + if (MyReplicationSlot->candidate_xmin_lsn != InvalidXLogRecPtr || + MyReplicationSlot->candidate_restart_valid != InvalidXLogRecPtr) + { + bool updated_xmin = false; + bool updated_restart = false; + + SpinLockAcquire(&MyReplicationSlot->mutex); + + MyReplicationSlot->data.confirmed_flush = lsn; + + /* if we're past the location required for bumping xmin, do so */ + if (MyReplicationSlot->candidate_xmin_lsn != InvalidXLogRecPtr && + MyReplicationSlot->candidate_xmin_lsn <= lsn) + { + /* + * We have to write the changed xmin to disk *before* we change + * the in-memory value, otherwise after a crash we wouldn't know + * that some catalog tuples might have been removed already. + * + * Ensure that by first writing to ->xmin and only update + * ->effective_xmin once the new state is synced to disk. After a + * crash ->effective_xmin is set to ->xmin. + * + * (In this function the fields actually involved are + * ->data.catalog_xmin, set here, and ->effective_catalog_xmin, + * set further down once the slot has been saved.) + */ + if (TransactionIdIsValid(MyReplicationSlot->candidate_catalog_xmin) && + MyReplicationSlot->data.catalog_xmin != MyReplicationSlot->candidate_catalog_xmin) + { + MyReplicationSlot->data.catalog_xmin = MyReplicationSlot->candidate_catalog_xmin; + MyReplicationSlot->candidate_catalog_xmin = InvalidTransactionId; + MyReplicationSlot->candidate_xmin_lsn = InvalidXLogRecPtr; + updated_xmin = true; + } + } + + if (MyReplicationSlot->candidate_restart_valid != InvalidXLogRecPtr && + MyReplicationSlot->candidate_restart_valid <= lsn) + { + Assert(MyReplicationSlot->candidate_restart_lsn != InvalidXLogRecPtr); + + MyReplicationSlot->data.restart_lsn = MyReplicationSlot->candidate_restart_lsn; + MyReplicationSlot->candidate_restart_lsn = InvalidXLogRecPtr; + MyReplicationSlot->candidate_restart_valid = InvalidXLogRecPtr; + updated_restart = true; + } + + SpinLockRelease(&MyReplicationSlot->mutex); + + /* first write new xmin to disk, so we know what's up after a crash */ + if (updated_xmin || updated_restart) + { + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + elog(DEBUG1, "updated xmin: %u restart: %u", 
updated_xmin, updated_restart); + } + + /* + * Now the new xmin is safely on disk, we can let the global value + * advance. We do not take ProcArrayLock or similar since we only + * advance xmin here and there's not much harm done by a concurrent + * computation missing that. + */ + if (updated_xmin) + { + SpinLockAcquire(&MyReplicationSlot->mutex); + MyReplicationSlot->effective_catalog_xmin = MyReplicationSlot->data.catalog_xmin; + SpinLockRelease(&MyReplicationSlot->mutex); + + ReplicationSlotsComputeRequiredXmin(false); + ReplicationSlotsComputeRequiredLSN(); + } + } + else + { + SpinLockAcquire(&MyReplicationSlot->mutex); + MyReplicationSlot->data.confirmed_flush = lsn; + SpinLockRelease(&MyReplicationSlot->mutex); + } +} + +/* + * Clear logical streaming state during (sub)transaction abort. + */ +void +ResetLogicalStreamingState(void) +{ + CheckXidAlive = InvalidTransactionId; + bsysscan = false; +} + +/* + * Report the reorder buffer's accumulated spill/stream/total statistics for + * the context's slot, then reset those counters to zero. + */ +void +UpdateDecodingStats(LogicalDecodingContext *ctx) +{ + ReorderBuffer *rb = ctx->reorder; + PgStat_StatReplSlotEntry repSlotStat; + + /* Nothing to do if we don't have any replication stats to be sent. 
*/ + if (rb->spillBytes <= 0 && rb->streamBytes <= 0 && rb->totalBytes <= 0) + return; + + elog(DEBUG2, "UpdateDecodingStats: updating stats %p %lld %lld %lld %lld %lld %lld %lld %lld", + rb, + (long long) rb->spillTxns, + (long long) rb->spillCount, + (long long) rb->spillBytes, + (long long) rb->streamTxns, + (long long) rb->streamCount, + (long long) rb->streamBytes, + (long long) rb->totalTxns, + (long long) rb->totalBytes); + + namestrcpy(&repSlotStat.slotname, NameStr(ctx->slot->data.name)); + repSlotStat.spill_txns = rb->spillTxns; + repSlotStat.spill_count = rb->spillCount; + repSlotStat.spill_bytes = rb->spillBytes; + repSlotStat.stream_txns = rb->streamTxns; + repSlotStat.stream_count = rb->streamCount; + repSlotStat.stream_bytes = rb->streamBytes; + repSlotStat.total_txns = rb->totalTxns; + repSlotStat.total_bytes = rb->totalBytes; + + pgstat_report_replslot(&repSlotStat); + + rb->spillTxns = 0; + rb->spillCount = 0; + rb->spillBytes = 0; + rb->streamTxns = 0; + rb->streamCount = 0; + rb->streamBytes = 0; + rb->totalTxns = 0; + rb->totalBytes = 0; +} diff --git a/src/backend/replication/logical/logicalfuncs.c b/src/backend/replication/logical/logicalfuncs.c new file mode 100644 index 0000000..1f38c5b --- /dev/null +++ b/src/backend/replication/logical/logicalfuncs.c @@ -0,0 +1,417 @@ +/*------------------------------------------------------------------------- + * + * logicalfuncs.c + * + * Support functions for using logical decoding and management of + * logical replication slots via SQL. 
+ * + * + * Copyright (c) 2012-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logicalfuncs.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "access/xlogutils.h" +#include "catalog/pg_type.h" +#include "fmgr.h" +#include "funcapi.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/message.h" +#include "storage/fd.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/inval.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/pg_lsn.h" +#include "utils/regproc.h" +#include "utils/resowner.h" + +/* private state for writing decoded output into the result tuplestore */ +typedef struct DecodingOutputState +{ + Tuplestorestate *tupstore; + TupleDesc tupdesc; + bool binary_output; + int64 returned_rows; +} DecodingOutputState; + +/* + * Prepare for an output plugin write by resetting the output buffer + * (ctx->out). The lsn, xid and last_write arguments are not used here. + */ +static void +LogicalOutputPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, + bool last_write) +{ + resetStringInfo(ctx->out); +} + +/* + * Perform output plugin write into tuplestore: append one (lsn, xid, data) + * row built from ctx->out and count it in returned_rows. + */ +static void +LogicalOutputWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, + bool last_write) +{ + Datum values[3]; + bool nulls[3]; + DecodingOutputState *p; + + /* SQL Datums can only be of a limited length... */ + if (ctx->out->len > MaxAllocSize - VARHDRSZ) + elog(ERROR, "too much output for sql interface"); + + p = (DecodingOutputState *) ctx->output_writer_private; + + memset(nulls, 0, sizeof(nulls)); + values[0] = LSNGetDatum(lsn); + values[1] = TransactionIdGetDatum(xid); + + /* + * Assert ctx->out is in database encoding when we're writing textual + * output. 
+ */ + if (!p->binary_output) + Assert(pg_verify_mbstr(GetDatabaseEncoding(), + ctx->out->data, ctx->out->len, + false)); + + /* ick, but cstring_to_text_with_len works for bytea perfectly fine */ + values[2] = PointerGetDatum(cstring_to_text_with_len(ctx->out->data, ctx->out->len)); + + tuplestore_putvalues(p->tupstore, p->tupdesc, values, nulls); + p->returned_rows++; +} + +static void +check_permissions(void) +{ + if (!superuser() && !has_rolreplication(GetUserId())) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser or replication role to use replication slots"))); +} + +/* + * Helper function for the various SQL callable logical decoding functions. + * + * Decodes WAL for the slot named by argument 0 and returns the output rows + * through a materialized tuplestore. Arguments 1 and 2 are the optional + * upto_lsn and upto_nchanges limits (NULL means no limit); argument 3 is an + * array of text option name/value pairs (at most one-dimensional, no nulls, + * even number of elements). + * + * If confirm is true, the slot's confirmed position is advanced to the end + * of the last record read and the slot is marked dirty. If binary is false, + * an output plugin that does not produce textual output is rejected. + * + * NOTE(review): a NULL upto_nchanges is stored as InvalidXLogRecPtr although + * the variable is an int32; presumably that constant is simply 0, which the + * limit check in the decode loop treats as no limit -- confirm. + */ +static Datum +pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool binary) +{ + Name name; + XLogRecPtr upto_lsn; + int32 upto_nchanges; + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + XLogRecPtr end_of_wal; + LogicalDecodingContext *ctx; + ResourceOwner old_resowner = CurrentResourceOwner; + ArrayType *arr; + Size ndim; + List *options = NIL; + DecodingOutputState *p; + + check_permissions(); + + CheckLogicalDecodingRequirements(); + + if (PG_ARGISNULL(0)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("slot name must not be null"))); + name = PG_GETARG_NAME(0); + + if (PG_ARGISNULL(1)) + upto_lsn = InvalidXLogRecPtr; + else + upto_lsn = PG_GETARG_LSN(1); + + if (PG_ARGISNULL(2)) + upto_nchanges = InvalidXLogRecPtr; + else + upto_nchanges = PG_GETARG_INT32(2); + + if (PG_ARGISNULL(3)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("options array must not be null"))); + arr = PG_GETARG_ARRAYTYPE_P(3); + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context 
that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not allowed in this context"))); + + /* state to write output to */ + p = palloc0(sizeof(DecodingOutputState)); + + p->binary_output = binary; + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &p->tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + /* Deconstruct options array into a list of name/value DefElems */ + ndim = ARR_NDIM(arr); + if (ndim > 1) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("array must be one-dimensional"))); + } + else if (array_contains_nulls(arr)) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("array must not contain nulls"))); + } + else if (ndim == 1) + { + int nelems; + Datum *datum_opts; + int i; + + Assert(ARR_ELEMTYPE(arr) == TEXTOID); + + deconstruct_array(arr, TEXTOID, -1, false, TYPALIGN_INT, + &datum_opts, NULL, &nelems); + + if (nelems % 2 != 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("array must have even number of elements"))); + + for (i = 0; i < nelems; i += 2) + { + char *name = TextDatumGetCString(datum_opts[i]); + char *opt = TextDatumGetCString(datum_opts[i + 1]); + + options = lappend(options, makeDefElem(name, (Node *) makeString(opt), -1)); + } + } + + p->tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = p->tupstore; + rsinfo->setDesc = p->tupdesc; + + /* + * Compute the current end-of-wal and maintain ThisTimeLineID. + * RecoveryInProgress() will update ThisTimeLineID on promotion. 
+ */ + if (!RecoveryInProgress()) + end_of_wal = GetFlushRecPtr(); + else + end_of_wal = GetXLogReplayRecPtr(&ThisTimeLineID); + + ReplicationSlotAcquire(NameStr(*name), true); + + PG_TRY(); + { + /* restart at slot's confirmed_flush */ + ctx = CreateDecodingContext(InvalidXLogRecPtr, + options, + false, + XL_ROUTINE(.page_read = read_local_xlog_page, + .segment_open = wal_segment_open, + .segment_close = wal_segment_close), + LogicalOutputPrepareWrite, + LogicalOutputWrite, NULL); + + /* + * After the sanity checks in CreateDecodingContext, make sure the + * restart_lsn is valid. Avoid "cannot get changes" wording in this + * errmsg because that'd be confusingly ambiguous about no changes + * being available. + */ + if (XLogRecPtrIsInvalid(MyReplicationSlot->data.restart_lsn)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("can no longer get changes from replication slot \"%s\"", + NameStr(*name)), + errdetail("This slot has never previously reserved WAL, or it has been invalidated."))); + + MemoryContextSwitchTo(oldcontext); + + /* + * Check whether the output plugin writes textual output if that's + * what we need. + */ + if (!binary && + ctx->options.output_type !=OUTPUT_PLUGIN_TEXTUAL_OUTPUT) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("logical decoding output plugin \"%s\" produces binary output, but function \"%s\" expects textual data", + NameStr(MyReplicationSlot->data.plugin), + format_procedure(fcinfo->flinfo->fn_oid)))); + + ctx->output_writer_private = p; + + /* + * Decoding of WAL must start at restart_lsn so that the entirety of + * xacts that committed after the slot's confirmed_flush can be + * accumulated into reorder buffers. 
+ */ + XLogBeginRead(ctx->reader, MyReplicationSlot->data.restart_lsn); + + /* invalidate non-timetravel entries */ + InvalidateSystemCaches(); + + /* Decode until we reach the end-of-WAL position computed above */ + while (ctx->reader->EndRecPtr < end_of_wal) + { + XLogRecord *record; + char *errm = NULL; + + record = XLogReadRecord(ctx->reader, &errm); + if (errm) + elog(ERROR, "%s", errm); + + /* + * The {begin_txn,change,commit_txn}_wrapper callbacks above will + * store the description into our tuplestore. + */ + if (record != NULL) + LogicalDecodingProcessRecord(ctx, ctx->reader); + + /* check limits; an unset (invalid/zero) limit never stops the loop */ + if (upto_lsn != InvalidXLogRecPtr && + upto_lsn <= ctx->reader->EndRecPtr) + break; + if (upto_nchanges != 0 && + upto_nchanges <= p->returned_rows) + break; + CHECK_FOR_INTERRUPTS(); + } + + tuplestore_donestoring(tupstore); + + /* + * Logical decoding could have clobbered CurrentResourceOwner during + * transaction management, so restore the executor's value. (This is + * a kluge, but it's not worth cleaning up right now.) + * + * NOTE(review): the tuplestore_donestoring() call just above names + * tupstore, but no variable of that name is declared in this + * function (the store lives in p->tupstore). Presumably the call is + * a macro that discards its argument -- confirm. + */ + CurrentResourceOwner = old_resowner; + + /* + * Next time, start where we left off. (Hunting things, the family + * business..) + */ + if (ctx->reader->EndRecPtr != InvalidXLogRecPtr && confirm) + { + LogicalConfirmReceivedLocation(ctx->reader->EndRecPtr); + + /* + * If only the confirmed_flush_lsn has changed the slot won't get + * marked as dirty by the above. Callers on the walsender + * interface are expected to keep track of their own progress and + * don't need it written out. But SQL-interface users cannot + * specify their own start positions and it's harder for them to + * keep track of their progress, so we should make more of an + * effort to save it for them. + * + * Dirty the slot so it's written out at the next checkpoint. + * We'll still lose its position on crash, as documented, but it's + * better than always losing the position even on clean restart. 
+ */ + ReplicationSlotMarkDirty(); + } + + /* free context, call shutdown callback */ + FreeDecodingContext(ctx); + + ReplicationSlotRelease(); + InvalidateSystemCaches(); + } + PG_CATCH(); + { + /* clear all timetravel entries */ + InvalidateSystemCaches(); + + PG_RE_THROW(); + } + PG_END_TRY(); + + return (Datum) 0; +} + +/* + * SQL function returning the changestream as text, consuming the data + * (confirm = true, binary = false). + */ +Datum +pg_logical_slot_get_changes(PG_FUNCTION_ARGS) +{ + return pg_logical_slot_get_changes_guts(fcinfo, true, false); +} + +/* + * SQL function returning the changestream as text, only peeking ahead + * (confirm = false, binary = false). + */ +Datum +pg_logical_slot_peek_changes(PG_FUNCTION_ARGS) +{ + return pg_logical_slot_get_changes_guts(fcinfo, false, false); +} + +/* + * SQL function returning the changestream in binary, consuming the data + * (confirm = true, binary = true). + */ +Datum +pg_logical_slot_get_binary_changes(PG_FUNCTION_ARGS) +{ + return pg_logical_slot_get_changes_guts(fcinfo, true, true); +} + +/* + * SQL function returning the changestream in binary, only peeking ahead + * (confirm = false, binary = true). + */ +Datum +pg_logical_slot_peek_binary_changes(PG_FUNCTION_ARGS) +{ + return pg_logical_slot_get_changes_guts(fcinfo, false, true); +} + + +/* + * SQL function for writing logical decoding message into WAL. 
+ */ +Datum +pg_logical_emit_message_bytea(PG_FUNCTION_ARGS) +{ + bool transactional = PG_GETARG_BOOL(0); + char *prefix = text_to_cstring(PG_GETARG_TEXT_PP(1)); + bytea *data = PG_GETARG_BYTEA_PP(2); + XLogRecPtr lsn; + + lsn = LogLogicalMessage(prefix, VARDATA_ANY(data), VARSIZE_ANY_EXHDR(data), + transactional); + PG_RETURN_LSN(lsn); +} + +Datum +pg_logical_emit_message_text(PG_FUNCTION_ARGS) +{ + /* bytea and text are compatible, so simply reuse the bytea variant */ + return pg_logical_emit_message_bytea(fcinfo); +} diff --git a/src/backend/replication/logical/message.c b/src/backend/replication/logical/message.c new file mode 100644 index 0000000..93bd372 --- /dev/null +++ b/src/backend/replication/logical/message.c @@ -0,0 +1,88 @@ +/*------------------------------------------------------------------------- + * + * message.c + * Generic logical messages. + * + * Copyright (c) 2013-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/message.c + * + * NOTES + * + * Generic logical messages allow XLOG logging of arbitrary binary blobs that + * get passed to the logical decoding plugin. In normal XLOG processing they + * are the same as NOOP. + * + * These messages can be either transactional or non-transactional. + * Transactional messages are part of the current transaction and will be + * sent to the decoding plugin in the same way as DML operations. + * Non-transactional messages are sent to the plugin at the time when the + * logical decoding reads them from XLOG. This also means that transactional + * messages won't be delivered if the transaction was rolled back but the + * non-transactional one will always be delivered. + * + * Every message carries a prefix to avoid conflicts between different + * decoding plugins. The plugin authors must take extra care to use a unique + * prefix; a good option seems to be, for example, the name of the extension. 
+ * + * --------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/xact.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "replication/logical.h" +#include "replication/message.h" +#include "utils/memutils.h" + +/* + * Write a logical decoding message into XLog and return the value returned + * by XLogInsert(). + * + * The record carries the current database id, the transactional flag, the + * prefix including its terminating zero byte, and size bytes of message. + */ +XLogRecPtr +LogLogicalMessage(const char *prefix, const char *message, size_t size, + bool transactional) +{ + xl_logical_message xlrec; + + /* + * Force xid to be allocated if we're emitting a transactional message. + */ + if (transactional) + { + Assert(IsTransactionState()); + GetCurrentTransactionId(); + } + + xlrec.dbId = MyDatabaseId; + xlrec.transactional = transactional; + /* trailing zero is critical; see logicalmsg_desc */ + xlrec.prefix_size = strlen(prefix) + 1; + xlrec.message_size = size; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfLogicalMessage); + XLogRegisterData(unconstify(char *, prefix), xlrec.prefix_size); + XLogRegisterData(unconstify(char *, message), size); + + /* allow origin filtering */ + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + return XLogInsert(RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE); +} + +/* + * Redo is basically just a no-op for logical decoding messages; it only + * checks that the record has the expected op code. + */ +void +logicalmsg_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info != XLOG_LOGICAL_MESSAGE) + elog(PANIC, "logicalmsg_redo: unknown op code %u", info); + + /* This is only interesting for logical decoding, see decode.c. */ +} diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c new file mode 100644 index 0000000..6988840 --- /dev/null +++ b/src/backend/replication/logical/origin.c @@ -0,0 +1,1574 @@ +/*------------------------------------------------------------------------- + * + * origin.c + * Logical replication progress tracking support. 
+ * + * Copyright (c) 2013-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/origin.c + * + * NOTES + * + * This file provides the following: + * * An infrastructure to name nodes in a replication setup + * * A facility to store and persist replication progress in an + * efficient and durable manner. + * + * A replication origin consists of a descriptive, user-defined, external + * name and a short, thus space-efficient, internal 2-byte one. This split + * exists because replication origins have to be stored in WAL and shared + * memory and long descriptors would be inefficient. For now we only use 2 + * bytes for the internal id of a replication origin as it seems unlikely that + * there soon will be more than 65k nodes in one replication setup; and using + * only two bytes allows us to be more space efficient. + * + * Replication progress is tracked in a shared memory table + * (ReplicationState) that's dumped to disk every checkpoint. Entries + * ('slots') in this table are identified by the internal id. That's the case + * because it allows replication progress to be advanced during crash + * recovery. To allow doing so we store the original LSN (from the originating + * system) of a transaction in the commit record. That allows us to recover + * the precise replayed state after crash recovery, without requiring + * synchronous commits. Allowing logical replication to use asynchronous + * commit is generally good for performance, but especially important as it + * allows a single-threaded replay process to keep up with a source that has + * multiple backends generating changes concurrently. For efficiency and + * simplicity reasons a backend can set up one replication origin that is from + * then on used as the source of changes produced by the backend, until reset + * again. + * + * This infrastructure is intended to be used in cooperation with logical + * decoding. 
When replaying from a remote system the configured origin is + * provided to output plugins, allowing prevention of replication loops and + * other filtering. + * + * There are several levels of locking at work: + * + * * To create and drop replication origins an exclusive lock on + * pg_replication_origin is required for the duration. That allows us to + * assign new origins safely and conflict-free using a dirty snapshot. + * + * * When creating an in-memory replication progress slot the ReplicationOrigin + * LWLock has to be held exclusively; when iterating over the replication + * progress a shared lock has to be held, the same when advancing the + * replication progress of an individual backend that is not set up as the + * session's replication origin. + * + * * When manipulating or looking at the remote_lsn and local_lsn fields of a + * replication progress slot that slot's lwlock has to be held. That's + * primarily because we do not assume 8-byte writes (the LSN) are atomic on + * all our platforms, but it also simplifies memory ordering concerns + * between the remote and local lsn. We use a lwlock instead of a spinlock + * so it's less harmful to hold the lock over a WAL write + * (cf. AdvanceReplicationProgress). 
+ * + * --------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include +#include + +#include "access/genam.h" +#include "access/htup_details.h" +#include "access/table.h" +#include "access/xact.h" +#include "catalog/catalog.h" +#include "catalog/indexing.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "pgstat.h" +#include "replication/logical.h" +#include "replication/origin.h" +#include "storage/condition_variable.h" +#include "storage/copydir.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lmgr.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" +#include "utils/pg_lsn.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" + +/* + * Replay progress of a single remote node; the element type of the + * replication progress array. + */ +typedef struct ReplicationState +{ + /* + * Local identifier for the remote node. + */ + RepOriginId roident; + + /* + * Location of the latest commit from the remote side. + */ + XLogRecPtr remote_lsn; + + /* + * Remember the local lsn of the commit record so we can XLogFlush() to it + * during a checkpoint so we know the commit record actually is safe on + * disk. + */ + XLogRecPtr local_lsn; + + /* + * PID of the backend that has acquired this slot, or 0 if none. + */ + int acquired_by; + + /* + * Condition variable that's signaled when acquired_by changes. + */ + ConditionVariable origin_cv; + + /* + * Lock protecting remote_lsn and local_lsn. + */ + LWLock lock; +} ReplicationState; + +/* + * On disk version of ReplicationState. 
+ */ +typedef struct ReplicationStateOnDisk +{ + RepOriginId roident; + XLogRecPtr remote_lsn; +} ReplicationStateOnDisk; + + +typedef struct ReplicationStateCtl +{ + /* Tranche to use for per-origin LWLocks */ + int tranche_id; + /* Array of length max_replication_slots */ + ReplicationState states[FLEXIBLE_ARRAY_MEMBER]; +} ReplicationStateCtl; + +/* external variables */ +RepOriginId replorigin_session_origin = InvalidRepOriginId; /* assumed identity */ +XLogRecPtr replorigin_session_origin_lsn = InvalidXLogRecPtr; +TimestampTz replorigin_session_origin_timestamp = 0; + +/* + * Base address into a shared memory array of replication states of size + * max_replication_slots. + * + * XXX: Should we use a separate variable to size this rather than + * max_replication_slots? + */ +static ReplicationState *replication_states; + +/* + * Actual shared memory block (replication_states[] is now part of this). + */ +static ReplicationStateCtl *replication_states_ctl; + +/* + * Backend-local, cached element from ReplicationState for use in a backend + * replaying remote commits, so we don't have to search ReplicationState for + * the backend's current RepOriginId. + */ +static ReplicationState *session_replication_state = NULL; + +/* Magic number for on-disk files. */ +#define REPLICATION_STATE_MAGIC ((uint32) 0x1257DADE) + +static void +replorigin_check_prerequisites(bool check_slots, bool recoveryOK) +{ + if (check_slots && max_replication_slots == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot query or manipulate replication origin when max_replication_slots = 0"))); + + if (!recoveryOK && RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION), + errmsg("cannot manipulate replication origins during recovery"))); + +} + + +/* --------------------------------------------------------------------------- + * Functions for working with replication origins themselves. 
+ * --------------------------------------------------------------------------- + */ + +/* + * Check for a persistent replication origin identified by name. + * + * Returns InvalidOid if the node isn't known yet and missing_ok is true; + * raises an error if it isn't known and missing_ok is false. + */ +RepOriginId +replorigin_by_name(const char *roname, bool missing_ok) +{ + Form_pg_replication_origin ident; + Oid roident = InvalidOid; + HeapTuple tuple; + Datum roname_d; + + roname_d = CStringGetTextDatum(roname); + + tuple = SearchSysCache1(REPLORIGNAME, roname_d); + if (HeapTupleIsValid(tuple)) + { + ident = (Form_pg_replication_origin) GETSTRUCT(tuple); + roident = ident->roident; + ReleaseSysCache(tuple); + } + else if (!missing_ok) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("replication origin \"%s\" does not exist", + roname))); + + return roident; +} + +/* + * Create a replication origin and return its newly assigned id. + * + * Needs to be called in a transaction. Raises an error if no unused id + * could be found. + */ +RepOriginId +replorigin_create(const char *roname) +{ + Oid roident; + HeapTuple tuple = NULL; + Relation rel; + Datum roname_d; + SnapshotData SnapshotDirty; + SysScanDesc scan; + ScanKeyData key; + + roname_d = CStringGetTextDatum(roname); + + Assert(IsTransactionState()); + + /* + * We need the numeric replication origin to be 16bit wide, so we cannot + * rely on the normal oid allocation. Instead we simply scan + * pg_replication_origin for the first unused id. That's not particularly + * efficient, but this should be a fairly infrequent operation - we can + * easily spend a bit more code on this when it turns out it needs to be + * faster. + * + * We handle concurrency by taking an exclusive lock (allowing reads!) + * over the table for the duration of the search. Because we use a "dirty + * snapshot" we can read rows that other in-progress sessions have + * written, even though they would be invisible with normal snapshots. Due + * to the exclusive lock there's no danger that new rows can appear while + * we're checking. 
+ */ + InitDirtySnapshot(SnapshotDirty); + + rel = table_open(ReplicationOriginRelationId, ExclusiveLock); + + for (roident = InvalidOid + 1; roident < PG_UINT16_MAX; roident++) + { + bool nulls[Natts_pg_replication_origin]; + Datum values[Natts_pg_replication_origin]; + bool collides; + + CHECK_FOR_INTERRUPTS(); + + ScanKeyInit(&key, + Anum_pg_replication_origin_roident, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(roident)); + + scan = systable_beginscan(rel, ReplicationOriginIdentIndex, + true /* indexOK */ , + &SnapshotDirty, + 1, &key); + + collides = HeapTupleIsValid(systable_getnext(scan)); + + systable_endscan(scan); + + if (!collides) + { + /* + * Ok, found an unused roident, insert the new row and do a CCI, + * so our callers can look it up if they want to. + */ + memset(&nulls, 0, sizeof(nulls)); + + values[Anum_pg_replication_origin_roident - 1] = ObjectIdGetDatum(roident); + values[Anum_pg_replication_origin_roname - 1] = roname_d; + + tuple = heap_form_tuple(RelationGetDescr(rel), values, nulls); + CatalogTupleInsert(rel, tuple); + CommandCounterIncrement(); + break; + } + } + + /* now release the lock again */ + table_close(rel, ExclusiveLock); + + if (tuple == NULL) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("could not find free replication origin OID"))); + + heap_freetuple(tuple); + return roident; +} + +/* + * Helper function to drop a replication origin. + * + * Clears the matching in-memory progress slot, if any, after writing a WAL + * record for the drop, and then deletes the catalog row from rel. If the + * slot is currently acquired by a backend, this either raises an error + * (nowait) or sleeps on the slot's condition variable and retries. + */ +static void +replorigin_drop_guts(Relation rel, RepOriginId roident, bool nowait) +{ + HeapTuple tuple; + int i; + + /* + * First, clean up the slot state info, if there is any matching slot. + */ +restart: + tuple = NULL; + LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE); + + for (i = 0; i < max_replication_slots; i++) + { + ReplicationState *state = &replication_states[i]; + + if (state->roident == roident) + { + /* found our slot, is it busy? 
*/ + if (state->acquired_by != 0) + { + ConditionVariable *cv; + + if (nowait) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("could not drop replication origin with OID %d, in use by PID %d", + state->roident, + state->acquired_by))); + + /* + * We must wait and then retry. Since we don't know which CV + * to wait on until here, we can't readily use + * ConditionVariablePrepareToSleep (calling it here would be + * wrong, since we could miss the signal if we did so); just + * use ConditionVariableSleep directly. + */ + cv = &state->origin_cv; + + LWLockRelease(ReplicationOriginLock); + + ConditionVariableSleep(cv, WAIT_EVENT_REPLICATION_ORIGIN_DROP); + goto restart; + } + + /* first write a WAL record for the drop */ + { + xl_replorigin_drop xlrec; + + xlrec.node_id = roident; + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), sizeof(xlrec)); + XLogInsert(RM_REPLORIGIN_ID, XLOG_REPLORIGIN_DROP); + } + + /* then clear the in-memory slot */ + state->roident = InvalidRepOriginId; + state->remote_lsn = InvalidXLogRecPtr; + state->local_lsn = InvalidXLogRecPtr; + break; + } + } + LWLockRelease(ReplicationOriginLock); + ConditionVariableCancelSleep(); + + /* + * Now, we can delete the catalog entry. + */ + tuple = SearchSysCache1(REPLORIGIDENT, ObjectIdGetDatum(roident)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for replication origin with oid %u", + roident); + + CatalogTupleDelete(rel, &tuple->t_self); + ReleaseSysCache(tuple); + + CommandCounterIncrement(); +} + +/* + * Drop replication origin (by name). + * + * Needs to be called in a transaction. If the name is unknown, nothing is + * dropped when missing_ok is true. + */ +void +replorigin_drop_by_name(const char *name, bool missing_ok, bool nowait) +{ + RepOriginId roident; + Relation rel; + + Assert(IsTransactionState()); + + /* + * To interlock against concurrent drops, we hold ExclusiveLock on + * pg_replication_origin till xact commit. 
+ * + * XXX We can optimize this by acquiring the lock on a specific origin by + * using LockSharedObject if required. However, for that, we first need to + * acquire a lock on ReplicationOriginRelationId, get the origin_id, lock + * the specific origin and then re-check if the origin still exists. + */ + rel = table_open(ReplicationOriginRelationId, ExclusiveLock); + + roident = replorigin_by_name(name, missing_ok); + + if (OidIsValid(roident)) + replorigin_drop_guts(rel, roident, nowait); + + /* We keep the lock on pg_replication_origin until commit */ + table_close(rel, NoLock); +} + +/* + * Lookup replication origin via its oid and return the name. + * + * The external name is palloc'd in the calling context. + * + * Returns true if the origin is known. Otherwise *roname is set to NULL and + * the function returns false if missing_ok is true, or raises an error if + * missing_ok is false. + */ +bool +replorigin_by_oid(RepOriginId roident, bool missing_ok, char **roname) +{ + HeapTuple tuple; + Form_pg_replication_origin ric; + + Assert(OidIsValid((Oid) roident)); + Assert(roident != InvalidRepOriginId); + Assert(roident != DoNotReplicateId); + + tuple = SearchSysCache1(REPLORIGIDENT, + ObjectIdGetDatum((Oid) roident)); + + if (HeapTupleIsValid(tuple)) + { + ric = (Form_pg_replication_origin) GETSTRUCT(tuple); + *roname = text_to_cstring(&ric->roname); + ReleaseSysCache(tuple); + + return true; + } + else + { + *roname = NULL; + + if (!missing_ok) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("replication origin with OID %u does not exist", + roident))); + + return false; + } +} + + +/* --------------------------------------------------------------------------- + * Functions for handling replication progress. + * --------------------------------------------------------------------------- + */ + +Size +ReplicationOriginShmemSize(void) +{ + Size size = 0; + + /* + * XXX: max_replication_slots is arguably the wrong thing to use, as here + * we keep the replay state of *remote* transactions. 
But for now it seems + * sufficient to reuse it, rather than introduce a separate GUC. + */ + if (max_replication_slots == 0) + return size; + + size = add_size(size, offsetof(ReplicationStateCtl, states)); + + size = add_size(size, + mul_size(max_replication_slots, sizeof(ReplicationState))); + return size; +} + +void +ReplicationOriginShmemInit(void) +{ + bool found; + + if (max_replication_slots == 0) + return; + + replication_states_ctl = (ReplicationStateCtl *) + ShmemInitStruct("ReplicationOriginState", + ReplicationOriginShmemSize(), + &found); + replication_states = replication_states_ctl->states; + + if (!found) + { + int i; + + MemSet(replication_states_ctl, 0, ReplicationOriginShmemSize()); + + replication_states_ctl->tranche_id = LWTRANCHE_REPLICATION_ORIGIN_STATE; + + for (i = 0; i < max_replication_slots; i++) + { + LWLockInitialize(&replication_states[i].lock, + replication_states_ctl->tranche_id); + ConditionVariableInit(&replication_states[i].origin_cv); + } + } +} + +/* --------------------------------------------------------------------------- + * Perform a checkpoint of each replication origin's progress with respect to + * the replayed remote_lsn. Make sure that all transactions we refer to in the + * checkpoint (local_lsn) are actually on-disk. This might not yet be the case + * if the transactions were originally committed asynchronously. + * + * We store checkpoints in the following format: + * +-------+------------------------+------------------+-----+--------+ + * | MAGIC | ReplicationStateOnDisk | struct Replic... | ... | CRC32C | EOF + * +-------+------------------------+------------------+-----+--------+ + * + * So its just the magic, followed by the statically sized + * ReplicationStateOnDisk structs. Note that the maximum number of + * ReplicationState is determined by max_replication_slots. 
+ * ---------------------------------------------------------------------------
+ */
+void
+CheckPointReplicationOrigin(void)
+{
+	const char *tmppath = "pg_logical/replorigin_checkpoint.tmp";
+	const char *path = "pg_logical/replorigin_checkpoint";
+	int			tmpfd;
+	int			i;
+	uint32		magic = REPLICATION_STATE_MAGIC;
+	pg_crc32c	crc;
+
+	/* nothing is written at all in this configuration, not even the magic */
+	if (max_replication_slots == 0)
+		return;
+
+	INIT_CRC32C(crc);
+
+	/* make sure no old temp file is remaining */
+	if (unlink(tmppath) < 0 && errno != ENOENT)
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not remove file \"%s\": %m",
+						tmppath)));
+
+	/*
+	 * no other backend can perform this at the same time; only one checkpoint
+	 * can happen at a time.
+	 */
+	tmpfd = OpenTransientFile(tmppath,
+							  O_CREAT | O_EXCL | O_WRONLY | PG_BINARY);
+	if (tmpfd < 0)
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not create file \"%s\": %m",
+						tmppath)));
+
+	/* write magic */
+	errno = 0;
+	if ((write(tmpfd, &magic, sizeof(magic))) != sizeof(magic))
+	{
+		/* if write didn't set errno, assume problem is no disk space */
+		if (errno == 0)
+			errno = ENOSPC;
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not write to file \"%s\": %m",
+						tmppath)));
+	}
+	/* the CRC covers the magic as well as every state written below */
+	COMP_CRC32C(crc, &magic, sizeof(magic));
+
+	/* prevent concurrent creations/drops */
+	LWLockAcquire(ReplicationOriginLock, LW_SHARED);
+
+	/* write actual data */
+	for (i = 0; i < max_replication_slots; i++)
+	{
+		ReplicationStateOnDisk disk_state;
+		ReplicationState *curstate = &replication_states[i];
+		XLogRecPtr	local_lsn;
+
+		/* unused entries are skipped, so the file holds only used ones */
+		if (curstate->roident == InvalidRepOriginId)
+			continue;
+
+		/* zero, to avoid uninitialized padding bytes */
+		memset(&disk_state, 0, sizeof(disk_state));
+
+		/* copy roident and both LSNs while holding the entry's own lock */
+		LWLockAcquire(&curstate->lock, LW_SHARED);
+
+		disk_state.roident = curstate->roident;
+
+		disk_state.remote_lsn = curstate->remote_lsn;
+		local_lsn = curstate->local_lsn;
+
+		LWLockRelease(&curstate->lock);
+
+		/* make sure we only write out a commit that's persistent */
+		XLogFlush(local_lsn);
+
+		errno = 0;
+		if ((write(tmpfd, &disk_state, sizeof(disk_state))) !=
+			sizeof(disk_state))
+		{
+			/* if write didn't set errno, assume problem is no disk space */
+			if (errno == 0)
+				errno = ENOSPC;
+			ereport(PANIC,
+					(errcode_for_file_access(),
+					 errmsg("could not write to file \"%s\": %m",
+							tmppath)));
+		}
+
+		COMP_CRC32C(crc, &disk_state, sizeof(disk_state));
+	}
+
+	LWLockRelease(ReplicationOriginLock);
+
+	/* write out the CRC */
+	FIN_CRC32C(crc);
+	errno = 0;
+	if ((write(tmpfd, &crc, sizeof(crc))) != sizeof(crc))
+	{
+		/* if write didn't set errno, assume problem is no disk space */
+		if (errno == 0)
+			errno = ENOSPC;
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not write to file \"%s\": %m",
+						tmppath)));
+	}
+
+	if (CloseTransientFile(tmpfd) != 0)
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not close file \"%s\": %m",
+						tmppath)));
+
+	/* fsync, rename to permanent file, fsync file and directory */
+	durable_rename(tmppath, path, PANIC);
+}
+
+/*
+ * Recover replication replay status from checkpoint data saved earlier by
+ * CheckPointReplicationOrigin.
+ *
+ * This only needs to be called at startup and *not* during every checkpoint
+ * read during recovery (e.g. in HS or PITR from a base backup) afterwards. All
+ * state thereafter can be recovered by looking at commit records.
+ */ +void +StartupReplicationOrigin(void) +{ + const char *path = "pg_logical/replorigin_checkpoint"; + int fd; + int readBytes; + uint32 magic = REPLICATION_STATE_MAGIC; + int last_state = 0; + pg_crc32c file_crc; + pg_crc32c crc; + + /* don't want to overwrite already existing state */ +#ifdef USE_ASSERT_CHECKING + static bool already_started = false; + + Assert(!already_started); + already_started = true; +#endif + + if (max_replication_slots == 0) + return; + + INIT_CRC32C(crc); + + elog(DEBUG2, "starting up replication origin progress state"); + + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + + /* + * might have had max_replication_slots == 0 last run, or we just brought + * up a standby. + */ + if (fd < 0 && errno == ENOENT) + return; + else if (fd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); + + /* verify magic, that is written even if nothing was active */ + readBytes = read(fd, &magic, sizeof(magic)); + if (readBytes != sizeof(magic)) + { + if (readBytes < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + path))); + else + ereport(PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %zu", + path, readBytes, sizeof(magic)))); + } + COMP_CRC32C(crc, &magic, sizeof(magic)); + + if (magic != REPLICATION_STATE_MAGIC) + ereport(PANIC, + (errmsg("replication checkpoint has wrong magic %u instead of %u", + magic, REPLICATION_STATE_MAGIC))); + + /* we can skip locking here, no other access is possible */ + + /* recover individual states, until there are no more to be found */ + while (true) + { + ReplicationStateOnDisk disk_state; + + readBytes = read(fd, &disk_state, sizeof(disk_state)); + + /* no further data */ + if (readBytes == sizeof(crc)) + { + /* not pretty, but simple ... 
*/ + file_crc = *(pg_crc32c *) &disk_state; + break; + } + + if (readBytes < 0) + { + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + path))); + } + + if (readBytes != sizeof(disk_state)) + { + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": read %d of %zu", + path, readBytes, sizeof(disk_state)))); + } + + COMP_CRC32C(crc, &disk_state, sizeof(disk_state)); + + if (last_state == max_replication_slots) + ereport(PANIC, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("could not find free replication state, increase max_replication_slots"))); + + /* copy data to shared memory */ + replication_states[last_state].roident = disk_state.roident; + replication_states[last_state].remote_lsn = disk_state.remote_lsn; + last_state++; + + ereport(LOG, + (errmsg("recovered replication state of node %u to %X/%X", + disk_state.roident, + LSN_FORMAT_ARGS(disk_state.remote_lsn)))); + } + + /* now check checksum */ + FIN_CRC32C(crc); + if (file_crc != crc) + ereport(PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("replication slot checkpoint has wrong checksum %u, expected %u", + crc, file_crc))); + + if (CloseTransientFile(fd) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", + path))); +} + +void +replorigin_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_REPLORIGIN_SET: + { + xl_replorigin_set *xlrec = + (xl_replorigin_set *) XLogRecGetData(record); + + replorigin_advance(xlrec->node_id, + xlrec->remote_lsn, record->EndRecPtr, + xlrec->force /* backward */ , + false /* WAL log */ ); + break; + } + case XLOG_REPLORIGIN_DROP: + { + xl_replorigin_drop *xlrec; + int i; + + xlrec = (xl_replorigin_drop *) XLogRecGetData(record); + + for (i = 0; i < max_replication_slots; i++) + { + ReplicationState *state = &replication_states[i]; + + /* found our slot */ + if (state->roident 
== xlrec->node_id) + { + /* reset entry */ + state->roident = InvalidRepOriginId; + state->remote_lsn = InvalidXLogRecPtr; + state->local_lsn = InvalidXLogRecPtr; + break; + } + } + break; + } + default: + elog(PANIC, "replorigin_redo: unknown op code %u", info); + } +} + + +/* + * Tell the replication origin progress machinery that a commit from 'node' + * that originated at the LSN remote_commit on the remote node was replayed + * successfully and that we don't need to do so again. In combination with + * setting up replorigin_session_origin_lsn and replorigin_session_origin + * that ensures we won't lose knowledge about that after a crash if the + * transaction had a persistent effect (think of asynchronous commits). + * + * local_commit needs to be a local LSN of the commit so that we can make sure + * upon a checkpoint that enough WAL has been persisted to disk. + * + * Needs to be called with a RowExclusiveLock on pg_replication_origin, + * unless running in recovery. + */ +void +replorigin_advance(RepOriginId node, + XLogRecPtr remote_commit, XLogRecPtr local_commit, + bool go_backward, bool wal_log) +{ + int i; + ReplicationState *replication_state = NULL; + ReplicationState *free_state = NULL; + + Assert(node != InvalidRepOriginId); + + /* we don't track DoNotReplicateId */ + if (node == DoNotReplicateId) + return; + + /* + * XXX: For the case where this is called by WAL replay, it'd be more + * efficient to restore into a backend local hashtable and only dump into + * shmem after recovery is finished. Let's wait with implementing that + * till it's shown to be a measurable expense + */ + + /* Lock exclusively, as we may have to create a new table entry. */ + LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE); + + /* + * Search for either an existing slot for the origin, or a free one we can + * use. 
+ */ + for (i = 0; i < max_replication_slots; i++) + { + ReplicationState *curstate = &replication_states[i]; + + /* remember where to insert if necessary */ + if (curstate->roident == InvalidRepOriginId && + free_state == NULL) + { + free_state = curstate; + continue; + } + + /* not our slot */ + if (curstate->roident != node) + { + continue; + } + + /* ok, found slot */ + replication_state = curstate; + + LWLockAcquire(&replication_state->lock, LW_EXCLUSIVE); + + /* Make sure it's not used by somebody else */ + if (replication_state->acquired_by != 0) + { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("replication origin with OID %d is already active for PID %d", + replication_state->roident, + replication_state->acquired_by))); + } + + break; + } + + if (replication_state == NULL && free_state == NULL) + ereport(ERROR, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("could not find free replication state slot for replication origin with OID %u", + node), + errhint("Increase max_replication_slots and try again."))); + + if (replication_state == NULL) + { + /* initialize new slot */ + LWLockAcquire(&free_state->lock, LW_EXCLUSIVE); + replication_state = free_state; + Assert(replication_state->remote_lsn == InvalidXLogRecPtr); + Assert(replication_state->local_lsn == InvalidXLogRecPtr); + replication_state->roident = node; + } + + Assert(replication_state->roident != InvalidRepOriginId); + + /* + * If somebody "forcefully" sets this slot, WAL log it, so it's durable + * and the standby gets the message. Primarily this will be called during + * WAL replay (of commit records) where no WAL logging is necessary. 
+ */ + if (wal_log) + { + xl_replorigin_set xlrec; + + xlrec.remote_lsn = remote_commit; + xlrec.node_id = node; + xlrec.force = go_backward; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), sizeof(xlrec)); + + XLogInsert(RM_REPLORIGIN_ID, XLOG_REPLORIGIN_SET); + } + + /* + * Due to - harmless - race conditions during a checkpoint we could see + * values here that are older than the ones we already have in memory. + * Don't overwrite those. + */ + if (go_backward || replication_state->remote_lsn < remote_commit) + replication_state->remote_lsn = remote_commit; + if (local_commit != InvalidXLogRecPtr && + (go_backward || replication_state->local_lsn < local_commit)) + replication_state->local_lsn = local_commit; + LWLockRelease(&replication_state->lock); + + /* + * Release *after* changing the LSNs, slot isn't acquired and thus could + * otherwise be dropped anytime. + */ + LWLockRelease(ReplicationOriginLock); +} + + +XLogRecPtr +replorigin_get_progress(RepOriginId node, bool flush) +{ + int i; + XLogRecPtr local_lsn = InvalidXLogRecPtr; + XLogRecPtr remote_lsn = InvalidXLogRecPtr; + + /* prevent slots from being concurrently dropped */ + LWLockAcquire(ReplicationOriginLock, LW_SHARED); + + for (i = 0; i < max_replication_slots; i++) + { + ReplicationState *state; + + state = &replication_states[i]; + + if (state->roident == node) + { + LWLockAcquire(&state->lock, LW_SHARED); + + remote_lsn = state->remote_lsn; + local_lsn = state->local_lsn; + + LWLockRelease(&state->lock); + + break; + } + } + + LWLockRelease(ReplicationOriginLock); + + if (flush && local_lsn != InvalidXLogRecPtr) + XLogFlush(local_lsn); + + return remote_lsn; +} + +/* + * Tear down a (possibly) configured session replication origin during process + * exit. 
+ */
+static void
+ReplicationOriginExitCleanup(int code, Datum arg)
+{
+	ConditionVariable *cv = NULL;
+
+	LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE);
+
+	/* only release the entry if this process is the one that acquired it */
+	if (session_replication_state != NULL &&
+		session_replication_state->acquired_by == MyProcPid)
+	{
+		cv = &session_replication_state->origin_cv;
+
+		session_replication_state->acquired_by = 0;
+		session_replication_state = NULL;
+	}
+
+	LWLockRelease(ReplicationOriginLock);
+
+	/* wake up waiters only after the lock has been released */
+	if (cv)
+		ConditionVariableBroadcast(cv);
+}
+
+/*
+ * Setup a replication origin in the shared memory struct if it doesn't
+ * already exist and cache access to the specific ReplicationState so the
+ * array doesn't have to be searched when calling
+ * replorigin_session_advance().
+ *
+ * Obviously only one such cached origin can exist per process and the current
+ * cached value can only be set again after the previous value is torn down
+ * with replorigin_session_reset().
+ *
+ * On the first call in a process this also registers
+ * ReplicationOriginExitCleanup() as an on_shmem_exit callback.
+ *
+ * Errors out if an origin is already set up in this session, if the origin's
+ * entry is acquired by some process, or if neither an entry for 'node' nor a
+ * free entry exists.
+ */
+void
+replorigin_session_setup(RepOriginId node)
+{
+	static bool registered_cleanup;
+	int			i;
+	int			free_slot = -1;
+
+	if (!registered_cleanup)
+	{
+		on_shmem_exit(ReplicationOriginExitCleanup, 0);
+		registered_cleanup = true;
+	}
+
+	Assert(max_replication_slots > 0);
+
+	if (session_replication_state != NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("cannot setup replication origin when one is already setup")));
+
+	/* Lock exclusively, as we may have to create a new table entry. */
+	LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE);
+
+	/*
+	 * Search for either an existing slot for the origin, or a free one we can
+	 * use.
+	 */
+	for (i = 0; i < max_replication_slots; i++)
+	{
+		ReplicationState *curstate = &replication_states[i];
+
+		/* remember where to insert if necessary */
+		if (curstate->roident == InvalidRepOriginId &&
+			free_slot == -1)
+		{
+			free_slot = i;
+			continue;
+		}
+
+		/* not our slot */
+		if (curstate->roident != node)
+			continue;
+
+		else if (curstate->acquired_by != 0)
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_OBJECT_IN_USE),
+					 errmsg("replication origin with OID %d is already active for PID %d",
+							curstate->roident, curstate->acquired_by)));
+		}
+
+		/* ok, found slot */
+		session_replication_state = curstate;
+	}
+
+
+	if (session_replication_state == NULL && free_slot == -1)
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
+				 errmsg("could not find free replication state slot for replication origin with OID %u",
+						node),
+				 errhint("Increase max_replication_slots and try again.")));
+	else if (session_replication_state == NULL)
+	{
+		/* initialize new slot */
+		session_replication_state = &replication_states[free_slot];
+		Assert(session_replication_state->remote_lsn == InvalidXLogRecPtr);
+		Assert(session_replication_state->local_lsn == InvalidXLogRecPtr);
+		session_replication_state->roident = node;
+	}
+
+
+	Assert(session_replication_state->roident != InvalidRepOriginId);
+
+	/* mark the entry as owned by this process while still holding the lock */
+	session_replication_state->acquired_by = MyProcPid;
+
+	LWLockRelease(ReplicationOriginLock);
+
+	/* probably this one is pointless */
+	ConditionVariableBroadcast(&session_replication_state->origin_cv);
+}
+
+/*
+ * Reset replay state previously setup in this session.
+ *
+ * This function may only be called if an origin was setup with
+ * replorigin_session_setup().
+ */
+void
+replorigin_session_reset(void)
+{
+	ReplicationState *released;
+
+	Assert(max_replication_slots != 0);
+
+	if (session_replication_state == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("no replication origin is configured")));
+
+	LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE);
+
+	/* forget the session's entry and mark it as no longer acquired */
+	released = session_replication_state;
+	session_replication_state = NULL;
+	released->acquired_by = 0;
+
+	LWLockRelease(ReplicationOriginLock);
+
+	/* notify anybody sleeping on this entry, now that the lock is released */
+	ConditionVariableBroadcast(&released->origin_cv);
+}
+
+/*
+ * Advance the progress of the origin configured for this session, i.e. what
+ * replorigin_advance() does, but without having to search for the entry.
+ *
+ * Each of the two stored LSNs is only replaced by a larger value.
+ *
+ * This is noticeably cheaper than using replorigin_advance().
+ */
+void
+replorigin_session_advance(XLogRecPtr remote_commit, XLogRecPtr local_commit)
+{
+	ReplicationState *state;
+
+	Assert(session_replication_state != NULL);
+	Assert(session_replication_state->roident != InvalidRepOriginId);
+
+	state = session_replication_state;
+
+	LWLockAcquire(&state->lock, LW_EXCLUSIVE);
+
+	if (remote_commit > state->remote_lsn)
+		state->remote_lsn = remote_commit;
+
+	if (local_commit > state->local_lsn)
+		state->local_lsn = local_commit;
+
+	LWLockRelease(&state->lock);
+}
+
+/*
+ * Ask the machinery about the point up to which we successfully replayed
+ * changes from an already setup replication origin.
+ */ +XLogRecPtr +replorigin_session_get_progress(bool flush) +{ + XLogRecPtr remote_lsn; + XLogRecPtr local_lsn; + + Assert(session_replication_state != NULL); + + LWLockAcquire(&session_replication_state->lock, LW_SHARED); + remote_lsn = session_replication_state->remote_lsn; + local_lsn = session_replication_state->local_lsn; + LWLockRelease(&session_replication_state->lock); + + if (flush && local_lsn != InvalidXLogRecPtr) + XLogFlush(local_lsn); + + return remote_lsn; +} + + + +/* --------------------------------------------------------------------------- + * SQL functions for working with replication origin. + * + * These mostly should be fairly short wrappers around more generic functions. + * --------------------------------------------------------------------------- + */ + +/* + * Create replication origin for the passed in name, and return the assigned + * oid. + */ +Datum +pg_replication_origin_create(PG_FUNCTION_ARGS) +{ + char *name; + RepOriginId roident; + + replorigin_check_prerequisites(false, false); + + name = text_to_cstring((text *) DatumGetPointer(PG_GETARG_DATUM(0))); + + /* Replication origins "pg_xxx" are reserved for internal use */ + if (IsReservedName(name)) + ereport(ERROR, + (errcode(ERRCODE_RESERVED_NAME), + errmsg("replication origin name \"%s\" is reserved", + name), + errdetail("Origin names starting with \"pg_\" are reserved."))); + + /* + * If built with appropriate switch, whine when regression-testing + * conventions for replication origin names are violated. + */ +#ifdef ENFORCE_REGRESSION_TEST_NAME_RESTRICTIONS + if (strncmp(name, "regress_", 8) != 0) + elog(WARNING, "replication origins created by regression test cases should have names starting with \"regress_\""); +#endif + + roident = replorigin_create(name); + + pfree(name); + + PG_RETURN_OID(roident); +} + +/* + * Drop replication origin. 
+ */ +Datum +pg_replication_origin_drop(PG_FUNCTION_ARGS) +{ + char *name; + + replorigin_check_prerequisites(false, false); + + name = text_to_cstring((text *) DatumGetPointer(PG_GETARG_DATUM(0))); + + replorigin_drop_by_name(name, false, true); + + pfree(name); + + PG_RETURN_VOID(); +} + +/* + * Return oid of a replication origin. + */ +Datum +pg_replication_origin_oid(PG_FUNCTION_ARGS) +{ + char *name; + RepOriginId roident; + + replorigin_check_prerequisites(false, false); + + name = text_to_cstring((text *) DatumGetPointer(PG_GETARG_DATUM(0))); + roident = replorigin_by_name(name, true); + + pfree(name); + + if (OidIsValid(roident)) + PG_RETURN_OID(roident); + PG_RETURN_NULL(); +} + +/* + * Setup a replication origin for this session. + */ +Datum +pg_replication_origin_session_setup(PG_FUNCTION_ARGS) +{ + char *name; + RepOriginId origin; + + replorigin_check_prerequisites(true, false); + + name = text_to_cstring((text *) DatumGetPointer(PG_GETARG_DATUM(0))); + origin = replorigin_by_name(name, false); + replorigin_session_setup(origin); + + replorigin_session_origin = origin; + + pfree(name); + + PG_RETURN_VOID(); +} + +/* + * Reset previously setup origin in this session + */ +Datum +pg_replication_origin_session_reset(PG_FUNCTION_ARGS) +{ + replorigin_check_prerequisites(true, false); + + replorigin_session_reset(); + + replorigin_session_origin = InvalidRepOriginId; + replorigin_session_origin_lsn = InvalidXLogRecPtr; + replorigin_session_origin_timestamp = 0; + + PG_RETURN_VOID(); +} + +/* + * Has a replication origin been setup for this session. + */ +Datum +pg_replication_origin_session_is_setup(PG_FUNCTION_ARGS) +{ + replorigin_check_prerequisites(false, false); + + PG_RETURN_BOOL(replorigin_session_origin != InvalidRepOriginId); +} + + +/* + * Return the replication progress for origin setup in the current session. + * + * If 'flush' is set to true it is ensured that the returned value corresponds + * to a local transaction that has been flushed. 
This is useful if asynchronous
+ * commits are used when replaying replicated transactions.
+ *
+ * Returns NULL if the recorded remote LSN is invalid, and errors out if no
+ * origin is configured in this session.
+ */
+Datum
+pg_replication_origin_session_progress(PG_FUNCTION_ARGS)
+{
+	XLogRecPtr	remote_lsn = InvalidXLogRecPtr;
+	bool		flush = PG_GETARG_BOOL(0);
+
+	replorigin_check_prerequisites(true, false);
+
+	if (session_replication_state == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("no replication origin is configured")));
+
+	remote_lsn = replorigin_session_get_progress(flush);
+
+	if (remote_lsn == InvalidXLogRecPtr)
+		PG_RETURN_NULL();
+
+	PG_RETURN_LSN(remote_lsn);
+}
+
+/*
+ * SQL-callable: store the origin LSN (argument 0) and origin timestamp
+ * (argument 1) in replorigin_session_origin_lsn and
+ * replorigin_session_origin_timestamp.  Errors out if no origin is
+ * configured in this session.
+ */
+Datum
+pg_replication_origin_xact_setup(PG_FUNCTION_ARGS)
+{
+	XLogRecPtr	location = PG_GETARG_LSN(0);
+
+	replorigin_check_prerequisites(true, false);
+
+	if (session_replication_state == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("no replication origin is configured")));
+
+	replorigin_session_origin_lsn = location;
+	replorigin_session_origin_timestamp = PG_GETARG_TIMESTAMPTZ(1);
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * SQL-callable: clear the values stored by pg_replication_origin_xact_setup().
+ */
+Datum
+pg_replication_origin_xact_reset(PG_FUNCTION_ARGS)
+{
+	replorigin_check_prerequisites(true, false);
+
+	replorigin_session_origin_lsn = InvalidXLogRecPtr;
+	replorigin_session_origin_timestamp = 0;
+
+	PG_RETURN_VOID();
+}
+
+
+/*
+ * SQL-callable: set the remote LSN of the named origin (argument 0) to the
+ * given value (argument 1).  The change is WAL-logged and is allowed to move
+ * the progress backwards; no local LSN is recorded.
+ */
+Datum
+pg_replication_origin_advance(PG_FUNCTION_ARGS)
+{
+	text	   *name = PG_GETARG_TEXT_PP(0);
+	XLogRecPtr	remote_commit = PG_GETARG_LSN(1);
+	RepOriginId node;
+
+	replorigin_check_prerequisites(true, false);
+
+	/* lock to prevent the replication origin from vanishing */
+	LockRelationOid(ReplicationOriginRelationId, RowExclusiveLock);
+
+	node = replorigin_by_name(text_to_cstring(name), false);
+
+	/*
+	 * Can't sensibly pass a local commit to be flushed at checkpoint - this
+	 * xact hasn't committed yet. This is why this function should be used to
+	 * set up the initial replication state, but not for replay.
+	 */
+	replorigin_advance(node, remote_commit, InvalidXLogRecPtr,
+					   true /* go backward */ , true /* WAL log */ );
+
+	UnlockRelationOid(ReplicationOriginRelationId, RowExclusiveLock);
+
+	PG_RETURN_VOID();
+}
+
+
+/*
+ * Return the replication progress for an individual replication origin.
+ *
+ * If 'flush' is set to true it is ensured that the returned value corresponds
+ * to a local transaction that has been flushed. This is useful if asynchronous
+ * commits are used when replaying replicated transactions.
+ *
+ * Returns NULL if the recorded remote LSN is invalid.
+ */
+Datum
+pg_replication_origin_progress(PG_FUNCTION_ARGS)
+{
+	char	   *name;
+	bool		flush;
+	RepOriginId roident;
+	XLogRecPtr	remote_lsn = InvalidXLogRecPtr;
+
+	replorigin_check_prerequisites(true, true);
+
+	name = text_to_cstring((text *) DatumGetPointer(PG_GETARG_DATUM(0)));
+	flush = PG_GETARG_BOOL(1);
+
+	roident = replorigin_by_name(name, false);
+	Assert(OidIsValid(roident));
+
+	remote_lsn = replorigin_get_progress(roident, flush);
+
+	if (remote_lsn == InvalidXLogRecPtr)
+		PG_RETURN_NULL();
+
+	PG_RETURN_LSN(remote_lsn);
+}
+
+
+/*
+ * Set-returning SQL function that emits one row per used shared-memory
+ * progress entry: origin id, origin name (NULL if the name lookup fails),
+ * remote LSN and local LSN.  The result is materialized into a tuplestore.
+ */
+Datum
+pg_show_replication_origin_status(PG_FUNCTION_ARGS)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	TupleDesc	tupdesc;
+	Tuplestorestate *tupstore;
+	MemoryContext per_query_ctx;
+	MemoryContext oldcontext;
+	int			i;
+#define REPLICATION_ORIGIN_PROGRESS_COLS 4
+
+	/* we want to return 0 rows if slot is set to zero */
+	replorigin_check_prerequisites(false, true);
+
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+	if (!(rsinfo->allowedModes & SFRM_Materialize))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not allowed in this context")));
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	if (tupdesc->natts != REPLICATION_ORIGIN_PROGRESS_COLS)
+		elog(ERROR, "wrong function definition");
+
+	/* the tuplestore is created in the per-query memory context */
+	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+	oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+	tupstore = tuplestore_begin_heap(true, false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = tupstore;
+	rsinfo->setDesc = tupdesc;
+
+	MemoryContextSwitchTo(oldcontext);
+
+
+	/* prevent slots from being concurrently dropped */
+	LWLockAcquire(ReplicationOriginLock, LW_SHARED);
+
+	/*
+	 * Iterate through all possible replication_states, display if they are
+	 * filled. ReplicationOriginLock is held in shared mode for the whole
+	 * scan and each entry's LSNs are read under that entry's own lock; only
+	 * roident is read without the per-entry lock.
+	 */
+	for (i = 0; i < max_replication_slots; i++)
+	{
+		ReplicationState *state;
+		Datum		values[REPLICATION_ORIGIN_PROGRESS_COLS];
+		bool		nulls[REPLICATION_ORIGIN_PROGRESS_COLS];
+		char	   *roname;
+
+		state = &replication_states[i];
+
+		/* unused slot, nothing to display */
+		if (state->roident == InvalidRepOriginId)
+			continue;
+
+		/* start with every column NULL; columns are filled in one by one */
+		memset(values, 0, sizeof(values));
+		memset(nulls, 1, sizeof(nulls));
+
+		values[0] = ObjectIdGetDatum(state->roident);
+		nulls[0] = false;
+
+		/*
+		 * We're not preventing the origin to be dropped concurrently, so
+		 * silently accept that it might be gone.
+		 */
+		if (replorigin_by_oid(state->roident, true,
+							  &roname))
+		{
+			values[1] = CStringGetTextDatum(roname);
+			nulls[1] = false;
+		}
+
+		LWLockAcquire(&state->lock, LW_SHARED);
+
+		values[2] = LSNGetDatum(state->remote_lsn);
+		nulls[2] = false;
+
+		values[3] = LSNGetDatum(state->local_lsn);
+		nulls[3] = false;
+
+		LWLockRelease(&state->lock);
+
+		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+	}
+
+	tuplestore_donestoring(tupstore);
+
+	LWLockRelease(ReplicationOriginLock);
+
+#undef REPLICATION_ORIGIN_PROGRESS_COLS
+
+	return (Datum) 0;
+}
diff --git a/src/backend/replication/logical/proto.c b/src/backend/replication/logical/proto.c
new file mode 100644
index 0000000..1cf59e0
--- /dev/null
+++ b/src/backend/replication/logical/proto.c
@@ -0,0 +1,900 @@
+/*-------------------------------------------------------------------------
+ *
+ * proto.c
+ *		logical replication protocol functions
+ *
+ * Copyright (c) 2015-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		src/backend/replication/logical/proto.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/sysattr.h"
+#include "catalog/pg_namespace.h"
+#include "catalog/pg_type.h"
+#include "libpq/pqformat.h"
+#include "replication/logicalproto.h"
+#include "utils/lsyscache.h"
+#include "utils/syscache.h"
+
+/*
+ * Protocol message flags.
+ */
+#define LOGICALREP_IS_REPLICA_IDENTITY 1
+
+/*
+ * NOTE(review): MESSAGE_TRANSACTIONAL and TRUNCATE_CASCADE share the same bit
+ * value, so they presumably belong to the flag fields of different message
+ * types -- confirm at their use sites.
+ */
+#define MESSAGE_TRANSACTIONAL (1<<0)
+#define TRUNCATE_CASCADE (1<<0)
+#define TRUNCATE_RESTART_SEQS (1<<1)
+
+static void logicalrep_write_attrs(StringInfo out, Relation rel);
+static void logicalrep_write_tuple(StringInfo out, Relation rel,
+								   HeapTuple tuple, bool binary);
+
+static void logicalrep_read_attrs(StringInfo in, LogicalRepRelation *rel);
+static void logicalrep_read_tuple(StringInfo in, LogicalRepTupleData *tuple);
+
+static void logicalrep_write_namespace(StringInfo out, Oid nspid);
+static const char *logicalrep_read_namespace(StringInfo in);
+
+/*
+ * Write BEGIN to the output stream.
+ *
+ * Layout: message type byte LOGICAL_REP_MSG_BEGIN, then 64-bit final_lsn,
+ * 64-bit commit_time and 32-bit xid of the transaction.
+ */
+void
+logicalrep_write_begin(StringInfo out, ReorderBufferTXN *txn)
+{
+	pq_sendbyte(out, LOGICAL_REP_MSG_BEGIN);
+
+	/* fixed fields */
+	pq_sendint64(out, txn->final_lsn);
+	pq_sendint64(out, txn->commit_time);
+	pq_sendint32(out, txn->xid);
+}
+
+/*
+ * Read transaction BEGIN from the stream.
+ *
+ * Reads the three fixed fields in the order logicalrep_write_begin() sends
+ * them; the message type byte is not read here.  Errors out if final_lsn is
+ * invalid.
+ */
+void
+logicalrep_read_begin(StringInfo in, LogicalRepBeginData *begin_data)
+{
+	/* read fields */
+	begin_data->final_lsn = pq_getmsgint64(in);
+	if (begin_data->final_lsn == InvalidXLogRecPtr)
+		elog(ERROR, "final_lsn not set in begin message");
+	begin_data->committime = pq_getmsgint64(in);
+	begin_data->xid = pq_getmsgint(in, 4);
+}
+
+
+/*
+ * Write COMMIT to the output stream.
+ *
+ * Layout: message type byte LOGICAL_REP_MSG_COMMIT, a flags byte (always 0),
+ * then 64-bit commit_lsn, 64-bit end_lsn and 64-bit commit_time.
+ */
+void
+logicalrep_write_commit(StringInfo out, ReorderBufferTXN *txn,
+						XLogRecPtr commit_lsn)
+{
+	uint8		flags = 0;
+
+	pq_sendbyte(out, LOGICAL_REP_MSG_COMMIT);
+
+	/* send the flags field (unused for now) */
+	pq_sendbyte(out, flags);
+
+	/* send fields */
+	pq_sendint64(out, commit_lsn);
+	pq_sendint64(out, txn->end_lsn);
+	pq_sendint64(out, txn->commit_time);
+}
+
+/*
+ * Read transaction COMMIT from the stream.
+ */ +void +logicalrep_read_commit(StringInfo in, LogicalRepCommitData *commit_data) +{ + /* read flags (unused for now) */ + uint8 flags = pq_getmsgbyte(in); + + if (flags != 0) + elog(ERROR, "unrecognized flags %u in commit message", flags); + + /* read fields */ + commit_data->commit_lsn = pq_getmsgint64(in); + commit_data->end_lsn = pq_getmsgint64(in); + commit_data->committime = pq_getmsgint64(in); +} + +/* + * Write ORIGIN to the output stream. + */ +void +logicalrep_write_origin(StringInfo out, const char *origin, + XLogRecPtr origin_lsn) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_ORIGIN); + + /* fixed fields */ + pq_sendint64(out, origin_lsn); + + /* origin string */ + pq_sendstring(out, origin); +} + +/* + * Read ORIGIN from the output stream. + */ +char * +logicalrep_read_origin(StringInfo in, XLogRecPtr *origin_lsn) +{ + /* fixed fields */ + *origin_lsn = pq_getmsgint64(in); + + /* return origin */ + return pstrdup(pq_getmsgstring(in)); +} + +/* + * Write INSERT to the output stream. + */ +void +logicalrep_write_insert(StringInfo out, TransactionId xid, Relation rel, + HeapTuple newtuple, bool binary) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_INSERT); + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + /* use Oid as relation identifier */ + pq_sendint32(out, RelationGetRelid(rel)); + + pq_sendbyte(out, 'N'); /* new tuple follows */ + logicalrep_write_tuple(out, rel, newtuple, binary); +} + +/* + * Read INSERT from stream. + * + * Fills the new tuple. + */ +LogicalRepRelId +logicalrep_read_insert(StringInfo in, LogicalRepTupleData *newtup) +{ + char action; + LogicalRepRelId relid; + + /* read the relation id */ + relid = pq_getmsgint(in, 4); + + action = pq_getmsgbyte(in); + if (action != 'N') + elog(ERROR, "expected new tuple but got %d", + action); + + logicalrep_read_tuple(in, newtup); + + return relid; +} + +/* + * Write UPDATE to the output stream. 
+ */ +void +logicalrep_write_update(StringInfo out, TransactionId xid, Relation rel, + HeapTuple oldtuple, HeapTuple newtuple, bool binary) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_UPDATE); + + Assert(rel->rd_rel->relreplident == REPLICA_IDENTITY_DEFAULT || + rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL || + rel->rd_rel->relreplident == REPLICA_IDENTITY_INDEX); + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + /* use Oid as relation identifier */ + pq_sendint32(out, RelationGetRelid(rel)); + + if (oldtuple != NULL) + { + if (rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL) + pq_sendbyte(out, 'O'); /* old tuple follows */ + else + pq_sendbyte(out, 'K'); /* old key follows */ + logicalrep_write_tuple(out, rel, oldtuple, binary); + } + + pq_sendbyte(out, 'N'); /* new tuple follows */ + logicalrep_write_tuple(out, rel, newtuple, binary); +} + +/* + * Read UPDATE from stream. + */ +LogicalRepRelId +logicalrep_read_update(StringInfo in, bool *has_oldtuple, + LogicalRepTupleData *oldtup, + LogicalRepTupleData *newtup) +{ + char action; + LogicalRepRelId relid; + + /* read the relation id */ + relid = pq_getmsgint(in, 4); + + /* read and verify action */ + action = pq_getmsgbyte(in); + if (action != 'K' && action != 'O' && action != 'N') + elog(ERROR, "expected action 'N', 'O' or 'K', got %c", + action); + + /* check for old tuple */ + if (action == 'K' || action == 'O') + { + logicalrep_read_tuple(in, oldtup); + *has_oldtuple = true; + + action = pq_getmsgbyte(in); + } + else + *has_oldtuple = false; + + /* check for new tuple */ + if (action != 'N') + elog(ERROR, "expected action 'N', got %c", + action); + + logicalrep_read_tuple(in, newtup); + + return relid; +} + +/* + * Write DELETE to the output stream. 
+ */ +void +logicalrep_write_delete(StringInfo out, TransactionId xid, Relation rel, + HeapTuple oldtuple, bool binary) +{ + Assert(rel->rd_rel->relreplident == REPLICA_IDENTITY_DEFAULT || + rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL || + rel->rd_rel->relreplident == REPLICA_IDENTITY_INDEX); + + pq_sendbyte(out, LOGICAL_REP_MSG_DELETE); + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + /* use Oid as relation identifier */ + pq_sendint32(out, RelationGetRelid(rel)); + + if (rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL) + pq_sendbyte(out, 'O'); /* old tuple follows */ + else + pq_sendbyte(out, 'K'); /* old key follows */ + + logicalrep_write_tuple(out, rel, oldtuple, binary); +} + +/* + * Read DELETE from stream. + * + * Fills the old tuple. + */ +LogicalRepRelId +logicalrep_read_delete(StringInfo in, LogicalRepTupleData *oldtup) +{ + char action; + LogicalRepRelId relid; + + /* read the relation id */ + relid = pq_getmsgint(in, 4); + + /* read and verify action */ + action = pq_getmsgbyte(in); + if (action != 'K' && action != 'O') + elog(ERROR, "expected action 'O' or 'K', got %c", action); + + logicalrep_read_tuple(in, oldtup); + + return relid; +} + +/* + * Write TRUNCATE to the output stream. + */ +void +logicalrep_write_truncate(StringInfo out, + TransactionId xid, + int nrelids, + Oid relids[], + bool cascade, bool restart_seqs) +{ + int i; + uint8 flags = 0; + + pq_sendbyte(out, LOGICAL_REP_MSG_TRUNCATE); + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + pq_sendint32(out, nrelids); + + /* encode and send truncate flags */ + if (cascade) + flags |= TRUNCATE_CASCADE; + if (restart_seqs) + flags |= TRUNCATE_RESTART_SEQS; + pq_sendint8(out, flags); + + for (i = 0; i < nrelids; i++) + pq_sendint32(out, relids[i]); +} + +/* + * Read TRUNCATE from stream. 
+ */ +List * +logicalrep_read_truncate(StringInfo in, + bool *cascade, bool *restart_seqs) +{ + int i; + int nrelids; + List *relids = NIL; + uint8 flags; + + nrelids = pq_getmsgint(in, 4); + + /* read and decode truncate flags */ + flags = pq_getmsgint(in, 1); + *cascade = (flags & TRUNCATE_CASCADE) > 0; + *restart_seqs = (flags & TRUNCATE_RESTART_SEQS) > 0; + + for (i = 0; i < nrelids; i++) + relids = lappend_oid(relids, pq_getmsgint(in, 4)); + + return relids; +} + +/* + * Write MESSAGE to stream + */ +void +logicalrep_write_message(StringInfo out, TransactionId xid, XLogRecPtr lsn, + bool transactional, const char *prefix, Size sz, + const char *message) +{ + uint8 flags = 0; + + pq_sendbyte(out, LOGICAL_REP_MSG_MESSAGE); + + /* encode and send message flags */ + if (transactional) + flags |= MESSAGE_TRANSACTIONAL; + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + pq_sendint8(out, flags); + pq_sendint64(out, lsn); + pq_sendstring(out, prefix); + pq_sendint32(out, sz); + pq_sendbytes(out, message, sz); +} + +/* + * Write relation description to the output stream. + */ +void +logicalrep_write_rel(StringInfo out, TransactionId xid, Relation rel) +{ + char *relname; + + pq_sendbyte(out, LOGICAL_REP_MSG_RELATION); + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + /* use Oid as relation identifier */ + pq_sendint32(out, RelationGetRelid(rel)); + + /* send qualified relation name */ + logicalrep_write_namespace(out, RelationGetNamespace(rel)); + relname = RelationGetRelationName(rel); + pq_sendstring(out, relname); + + /* send replica identity */ + pq_sendbyte(out, rel->rd_rel->relreplident); + + /* send the attribute info */ + logicalrep_write_attrs(out, rel); +} + +/* + * Read the relation info from stream and return as LogicalRepRelation. 
+ */ +LogicalRepRelation * +logicalrep_read_rel(StringInfo in) +{ + LogicalRepRelation *rel = palloc(sizeof(LogicalRepRelation)); + + rel->remoteid = pq_getmsgint(in, 4); + + /* Read relation name from stream */ + rel->nspname = pstrdup(logicalrep_read_namespace(in)); + rel->relname = pstrdup(pq_getmsgstring(in)); + + /* Read the replica identity. */ + rel->replident = pq_getmsgbyte(in); + + /* Get attribute description */ + logicalrep_read_attrs(in, rel); + + return rel; +} + +/* + * Write type info to the output stream. + * + * This function will always write base type info. + */ +void +logicalrep_write_typ(StringInfo out, TransactionId xid, Oid typoid) +{ + Oid basetypoid = getBaseType(typoid); + HeapTuple tup; + Form_pg_type typtup; + + pq_sendbyte(out, LOGICAL_REP_MSG_TYPE); + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + tup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(basetypoid)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for type %u", basetypoid); + typtup = (Form_pg_type) GETSTRUCT(tup); + + /* use Oid as relation identifier */ + pq_sendint32(out, typoid); + + /* send qualified type name */ + logicalrep_write_namespace(out, typtup->typnamespace); + pq_sendstring(out, NameStr(typtup->typname)); + + ReleaseSysCache(tup); +} + +/* + * Read type info from the output stream. + */ +void +logicalrep_read_typ(StringInfo in, LogicalRepTyp *ltyp) +{ + ltyp->remoteid = pq_getmsgint(in, 4); + + /* Read type name from stream */ + ltyp->nspname = pstrdup(logicalrep_read_namespace(in)); + ltyp->typname = pstrdup(pq_getmsgstring(in)); +} + +/* + * Write a tuple to the outputstream, in the most efficient format possible. 
+ */ +static void +logicalrep_write_tuple(StringInfo out, Relation rel, HeapTuple tuple, bool binary) +{ + TupleDesc desc; + Datum values[MaxTupleAttributeNumber]; + bool isnull[MaxTupleAttributeNumber]; + int i; + uint16 nliveatts = 0; + + desc = RelationGetDescr(rel); + + for (i = 0; i < desc->natts; i++) + { + if (TupleDescAttr(desc, i)->attisdropped || TupleDescAttr(desc, i)->attgenerated) + continue; + nliveatts++; + } + pq_sendint16(out, nliveatts); + + /* try to allocate enough memory from the get-go */ + enlargeStringInfo(out, tuple->t_len + + nliveatts * (1 + 4)); + + heap_deform_tuple(tuple, desc, values, isnull); + + /* Write the values */ + for (i = 0; i < desc->natts; i++) + { + HeapTuple typtup; + Form_pg_type typclass; + Form_pg_attribute att = TupleDescAttr(desc, i); + + if (att->attisdropped || att->attgenerated) + continue; + + if (isnull[i]) + { + pq_sendbyte(out, LOGICALREP_COLUMN_NULL); + continue; + } + + if (att->attlen == -1 && VARATT_IS_EXTERNAL_ONDISK(values[i])) + { + /* + * Unchanged toasted datum. (Note that we don't promise to detect + * unchanged data in general; this is just a cheap check to avoid + * sending large values unnecessarily.) + */ + pq_sendbyte(out, LOGICALREP_COLUMN_UNCHANGED); + continue; + } + + typtup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(att->atttypid)); + if (!HeapTupleIsValid(typtup)) + elog(ERROR, "cache lookup failed for type %u", att->atttypid); + typclass = (Form_pg_type) GETSTRUCT(typtup); + + /* + * Send in binary if requested and type has suitable send function. 
+ */ + if (binary && OidIsValid(typclass->typsend)) + { + bytea *outputbytes; + int len; + + pq_sendbyte(out, LOGICALREP_COLUMN_BINARY); + outputbytes = OidSendFunctionCall(typclass->typsend, values[i]); + len = VARSIZE(outputbytes) - VARHDRSZ; + pq_sendint(out, len, 4); /* length */ + pq_sendbytes(out, VARDATA(outputbytes), len); /* data */ + pfree(outputbytes); + } + else + { + char *outputstr; + + pq_sendbyte(out, LOGICALREP_COLUMN_TEXT); + outputstr = OidOutputFunctionCall(typclass->typoutput, values[i]); + pq_sendcountedtext(out, outputstr, strlen(outputstr), false); + pfree(outputstr); + } + + ReleaseSysCache(typtup); + } +} + +/* + * Read tuple in logical replication format from stream. + */ +static void +logicalrep_read_tuple(StringInfo in, LogicalRepTupleData *tuple) +{ + int i; + int natts; + + /* Get number of attributes */ + natts = pq_getmsgint(in, 2); + + /* Allocate space for per-column values; zero out unused StringInfoDatas */ + tuple->colvalues = (StringInfoData *) palloc0(natts * sizeof(StringInfoData)); + tuple->colstatus = (char *) palloc(natts * sizeof(char)); + tuple->ncols = natts; + + /* Read the data */ + for (i = 0; i < natts; i++) + { + char kind; + int len; + StringInfo value = &tuple->colvalues[i]; + + kind = pq_getmsgbyte(in); + tuple->colstatus[i] = kind; + + switch (kind) + { + case LOGICALREP_COLUMN_NULL: + /* nothing more to do */ + break; + case LOGICALREP_COLUMN_UNCHANGED: + /* we don't receive the value of an unchanged column */ + break; + case LOGICALREP_COLUMN_TEXT: + len = pq_getmsgint(in, 4); /* read length */ + + /* and data */ + value->data = palloc(len + 1); + pq_copymsgbytes(in, value->data, len); + value->data[len] = '\0'; + /* make StringInfo fully valid */ + value->len = len; + value->cursor = 0; + value->maxlen = len; + break; + case LOGICALREP_COLUMN_BINARY: + len = pq_getmsgint(in, 4); /* read length */ + + /* and data */ + value->data = palloc(len + 1); + pq_copymsgbytes(in, value->data, len); + /* not strictly 
necessary but per StringInfo practice */ + value->data[len] = '\0'; + /* make StringInfo fully valid */ + value->len = len; + value->cursor = 0; + value->maxlen = len; + break; + default: + elog(ERROR, "unrecognized data representation type '%c'", kind); + } + } +} + +/* + * Write relation attribute metadata to the stream. + */ +static void +logicalrep_write_attrs(StringInfo out, Relation rel) +{ + TupleDesc desc; + int i; + uint16 nliveatts = 0; + Bitmapset *idattrs = NULL; + bool replidentfull; + + desc = RelationGetDescr(rel); + + /* send number of live attributes */ + for (i = 0; i < desc->natts; i++) + { + if (TupleDescAttr(desc, i)->attisdropped || TupleDescAttr(desc, i)->attgenerated) + continue; + nliveatts++; + } + pq_sendint16(out, nliveatts); + + /* fetch bitmap of REPLICATION IDENTITY attributes */ + replidentfull = (rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL); + if (!replidentfull) + idattrs = RelationGetIdentityKeyBitmap(rel); + + /* send the attributes */ + for (i = 0; i < desc->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(desc, i); + uint8 flags = 0; + + if (att->attisdropped || att->attgenerated) + continue; + + /* REPLICA IDENTITY FULL means all columns are sent as part of key. */ + if (replidentfull || + bms_is_member(att->attnum - FirstLowInvalidHeapAttributeNumber, + idattrs)) + flags |= LOGICALREP_IS_REPLICA_IDENTITY; + + pq_sendbyte(out, flags); + + /* attribute name */ + pq_sendstring(out, NameStr(att->attname)); + + /* attribute type id */ + pq_sendint32(out, (int) att->atttypid); + + /* attribute mode */ + pq_sendint32(out, att->atttypmod); + } + + bms_free(idattrs); +} + +/* + * Read relation attribute metadata from the stream. 
+ */ +static void +logicalrep_read_attrs(StringInfo in, LogicalRepRelation *rel) +{ + int i; + int natts; + char **attnames; + Oid *atttyps; + Bitmapset *attkeys = NULL; + + natts = pq_getmsgint(in, 2); + attnames = palloc(natts * sizeof(char *)); + atttyps = palloc(natts * sizeof(Oid)); + + /* read the attributes */ + for (i = 0; i < natts; i++) + { + uint8 flags; + + /* Check for replica identity column */ + flags = pq_getmsgbyte(in); + if (flags & LOGICALREP_IS_REPLICA_IDENTITY) + attkeys = bms_add_member(attkeys, i); + + /* attribute name */ + attnames[i] = pstrdup(pq_getmsgstring(in)); + + /* attribute type id */ + atttyps[i] = (Oid) pq_getmsgint(in, 4); + + /* we ignore attribute mode for now */ + (void) pq_getmsgint(in, 4); + } + + rel->attnames = attnames; + rel->atttyps = atttyps; + rel->attkeys = attkeys; + rel->natts = natts; +} + +/* + * Write the namespace name or empty string for pg_catalog (to save space). + */ +static void +logicalrep_write_namespace(StringInfo out, Oid nspid) +{ + if (nspid == PG_CATALOG_NAMESPACE) + pq_sendbyte(out, '\0'); + else + { + char *nspname = get_namespace_name(nspid); + + if (nspname == NULL) + elog(ERROR, "cache lookup failed for namespace %u", + nspid); + + pq_sendstring(out, nspname); + } +} + +/* + * Read the namespace name while treating empty string as pg_catalog. + */ +static const char * +logicalrep_read_namespace(StringInfo in) +{ + const char *nspname = pq_getmsgstring(in); + + if (nspname[0] == '\0') + nspname = "pg_catalog"; + + return nspname; +} + +/* + * Write the information for the start stream message to the output stream. + */ +void +logicalrep_write_stream_start(StringInfo out, + TransactionId xid, bool first_segment) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_STREAM_START); + + Assert(TransactionIdIsValid(xid)); + + /* transaction ID (we're starting to stream, so must be valid) */ + pq_sendint32(out, xid); + + /* 1 if this is the first streaming segment for this xid */ + pq_sendbyte(out, first_segment ? 
1 : 0); +} + +/* + * Read the information about the start stream message from output stream. + */ +TransactionId +logicalrep_read_stream_start(StringInfo in, bool *first_segment) +{ + TransactionId xid; + + Assert(first_segment); + + xid = pq_getmsgint(in, 4); + *first_segment = (pq_getmsgbyte(in) == 1); + + return xid; +} + +/* + * Write the stop stream message to the output stream. + */ +void +logicalrep_write_stream_stop(StringInfo out) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_STREAM_END); +} + +/* + * Write STREAM COMMIT to the output stream. + */ +void +logicalrep_write_stream_commit(StringInfo out, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + uint8 flags = 0; + + pq_sendbyte(out, LOGICAL_REP_MSG_STREAM_COMMIT); + + Assert(TransactionIdIsValid(txn->xid)); + + /* transaction ID */ + pq_sendint32(out, txn->xid); + + /* send the flags field (unused for now) */ + pq_sendbyte(out, flags); + + /* send fields */ + pq_sendint64(out, commit_lsn); + pq_sendint64(out, txn->end_lsn); + pq_sendint64(out, txn->commit_time); +} + +/* + * Read STREAM COMMIT from the output stream. + */ +TransactionId +logicalrep_read_stream_commit(StringInfo in, LogicalRepCommitData *commit_data) +{ + TransactionId xid; + uint8 flags; + + xid = pq_getmsgint(in, 4); + + /* read flags (unused for now) */ + flags = pq_getmsgbyte(in); + + if (flags != 0) + elog(ERROR, "unrecognized flags %u in commit message", flags); + + /* read fields */ + commit_data->commit_lsn = pq_getmsgint64(in); + commit_data->end_lsn = pq_getmsgint64(in); + commit_data->committime = pq_getmsgint64(in); + + return xid; +} + +/* + * Write STREAM ABORT to the output stream. Note that xid and subxid will be + * same for the top-level transaction abort. 
+ */ +void +logicalrep_write_stream_abort(StringInfo out, TransactionId xid, + TransactionId subxid) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_STREAM_ABORT); + + Assert(TransactionIdIsValid(xid) && TransactionIdIsValid(subxid)); + + /* transaction ID */ + pq_sendint32(out, xid); + pq_sendint32(out, subxid); +} + +/* + * Read STREAM ABORT from the output stream. + */ +void +logicalrep_read_stream_abort(StringInfo in, TransactionId *xid, + TransactionId *subxid) +{ + Assert(xid && subxid); + + *xid = pq_getmsgint(in, 4); + *subxid = pq_getmsgint(in, 4); +} diff --git a/src/backend/replication/logical/relation.c b/src/backend/replication/logical/relation.c new file mode 100644 index 0000000..a5e5bf9 --- /dev/null +++ b/src/backend/replication/logical/relation.c @@ -0,0 +1,705 @@ +/*------------------------------------------------------------------------- + * relation.c + * PostgreSQL logical replication relation mapping cache + * + * Copyright (c) 2016-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/relation.c + * + * NOTES + * Routines in this file mainly have to do with mapping the properties + * of local replication target relations to the properties of their + * remote counterpart. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/table.h" +#include "catalog/namespace.h" +#include "catalog/pg_subscription_rel.h" +#include "executor/executor.h" +#include "nodes/makefuncs.h" +#include "replication/logicalrelation.h" +#include "replication/worker_internal.h" +#include "utils/inval.h" + + +static MemoryContext LogicalRepRelMapContext = NULL; + +static HTAB *LogicalRepRelMap = NULL; + +/* + * Partition map (LogicalRepPartMap) + * + * When a partitioned table is used as replication target, replicated + * operations are actually performed on its leaf partitions, which requires + * the partitions to also be mapped to the remote relation. 
Parent's entry + * (LogicalRepRelMapEntry) cannot be used as-is for all partitions, because + * individual partitions may have different attribute numbers, which means + * attribute mappings to remote relation's attributes must be maintained + * separately for each partition. + */ +static MemoryContext LogicalRepPartMapContext = NULL; +static HTAB *LogicalRepPartMap = NULL; +typedef struct LogicalRepPartMapEntry +{ + Oid partoid; /* LogicalRepPartMap's key */ + LogicalRepRelMapEntry relmapentry; +} LogicalRepPartMapEntry; + +/* + * Relcache invalidation callback for our relation map cache. + */ +static void +logicalrep_relmap_invalidate_cb(Datum arg, Oid reloid) +{ + LogicalRepRelMapEntry *entry; + + /* Just to be sure. */ + if (LogicalRepRelMap == NULL) + return; + + if (reloid != InvalidOid) + { + HASH_SEQ_STATUS status; + + hash_seq_init(&status, LogicalRepRelMap); + + /* TODO, use inverse lookup hashtable? */ + while ((entry = (LogicalRepRelMapEntry *) hash_seq_search(&status)) != NULL) + { + if (entry->localreloid == reloid) + { + entry->localrelvalid = false; + hash_seq_term(&status); + break; + } + } + } + else + { + /* invalidate all cache entries */ + HASH_SEQ_STATUS status; + + hash_seq_init(&status, LogicalRepRelMap); + + while ((entry = (LogicalRepRelMapEntry *) hash_seq_search(&status)) != NULL) + entry->localrelvalid = false; + } +} + +/* + * Initialize the relation map cache. + */ +static void +logicalrep_relmap_init(void) +{ + HASHCTL ctl; + + if (!LogicalRepRelMapContext) + LogicalRepRelMapContext = + AllocSetContextCreate(CacheMemoryContext, + "LogicalRepRelMapContext", + ALLOCSET_DEFAULT_SIZES); + + /* Initialize the relation hash table. */ + ctl.keysize = sizeof(LogicalRepRelId); + ctl.entrysize = sizeof(LogicalRepRelMapEntry); + ctl.hcxt = LogicalRepRelMapContext; + + LogicalRepRelMap = hash_create("logicalrep relation map cache", 128, &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + /* Watch for invalidation events. 
*/ + CacheRegisterRelcacheCallback(logicalrep_relmap_invalidate_cb, + (Datum) 0); +} + +/* + * Free the entry of a relation map cache. + */ +static void +logicalrep_relmap_free_entry(LogicalRepRelMapEntry *entry) +{ + LogicalRepRelation *remoterel; + + remoterel = &entry->remoterel; + + pfree(remoterel->nspname); + pfree(remoterel->relname); + + if (remoterel->natts > 0) + { + int i; + + for (i = 0; i < remoterel->natts; i++) + pfree(remoterel->attnames[i]); + + pfree(remoterel->attnames); + pfree(remoterel->atttyps); + } + bms_free(remoterel->attkeys); + + if (entry->attrmap) + free_attrmap(entry->attrmap); +} + +/* + * Add new entry or update existing entry in the relation map cache. + * + * Called when new relation mapping is sent by the publisher to update + * our expected view of incoming data from said publisher. + */ +void +logicalrep_relmap_update(LogicalRepRelation *remoterel) +{ + MemoryContext oldctx; + LogicalRepRelMapEntry *entry; + bool found; + int i; + + if (LogicalRepRelMap == NULL) + logicalrep_relmap_init(); + + /* + * HASH_ENTER returns the existing entry if present or creates a new one. 
+ */ + entry = hash_search(LogicalRepRelMap, (void *) &remoterel->remoteid, + HASH_ENTER, &found); + + if (found) + logicalrep_relmap_free_entry(entry); + + memset(entry, 0, sizeof(LogicalRepRelMapEntry)); + + /* Make cached copy of the data */ + oldctx = MemoryContextSwitchTo(LogicalRepRelMapContext); + entry->remoterel.remoteid = remoterel->remoteid; + entry->remoterel.nspname = pstrdup(remoterel->nspname); + entry->remoterel.relname = pstrdup(remoterel->relname); + entry->remoterel.natts = remoterel->natts; + entry->remoterel.attnames = palloc(remoterel->natts * sizeof(char *)); + entry->remoterel.atttyps = palloc(remoterel->natts * sizeof(Oid)); + for (i = 0; i < remoterel->natts; i++) + { + entry->remoterel.attnames[i] = pstrdup(remoterel->attnames[i]); + entry->remoterel.atttyps[i] = remoterel->atttyps[i]; + } + entry->remoterel.replident = remoterel->replident; + entry->remoterel.attkeys = bms_copy(remoterel->attkeys); + MemoryContextSwitchTo(oldctx); +} + +/* + * Find attribute index in TupleDesc struct by attribute name. + * + * Returns -1 if not found. + */ +static int +logicalrep_rel_att_by_name(LogicalRepRelation *remoterel, const char *attname) +{ + int i; + + for (i = 0; i < remoterel->natts; i++) + { + if (strcmp(remoterel->attnames[i], attname) == 0) + return i; + } + + return -1; +} + +/* + * Report error with names of the missing local relation column(s), if any. 
+ */ +static void +logicalrep_report_missing_attrs(LogicalRepRelation *remoterel, + Bitmapset *missingatts) +{ + if (!bms_is_empty(missingatts)) + { + StringInfoData missingattsbuf; + int missingattcnt = 0; + int i; + + initStringInfo(&missingattsbuf); + + while ((i = bms_first_member(missingatts)) >= 0) + { + missingattcnt++; + if (missingattcnt == 1) + appendStringInfo(&missingattsbuf, _("\"%s\""), + remoterel->attnames[i]); + else + appendStringInfo(&missingattsbuf, _(", \"%s\""), + remoterel->attnames[i]); + } + + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg_plural("logical replication target relation \"%s.%s\" is missing replicated column: %s", + "logical replication target relation \"%s.%s\" is missing replicated columns: %s", + missingattcnt, + remoterel->nspname, + remoterel->relname, + missingattsbuf.data))); + } +} + +/* + * Check if replica identity matches and mark the updatable flag. + * + * We allow for stricter replica identity (fewer columns) on subscriber as + * that will not stop us from finding unique tuple. IE, if publisher has + * identity (id,timestamp) and subscriber just (id) this will not be a + * problem, but in the opposite scenario it will. + * + * We just mark the relation entry as not updatable here if the local + * replica identity is found to be insufficient for applying + * updates/deletes (inserts don't care!) and leave it to + * check_relation_updatable() to throw the actual error if needed. 
+ */ +static void +logicalrep_rel_mark_updatable(LogicalRepRelMapEntry *entry) +{ + Bitmapset *idkey; + LogicalRepRelation *remoterel = &entry->remoterel; + int i; + + entry->updatable = true; + + idkey = RelationGetIndexAttrBitmap(entry->localrel, + INDEX_ATTR_BITMAP_IDENTITY_KEY); + /* fallback to PK if no replica identity */ + if (idkey == NULL) + { + idkey = RelationGetIndexAttrBitmap(entry->localrel, + INDEX_ATTR_BITMAP_PRIMARY_KEY); + + /* + * If no replica identity index and no PK, the published table must + * have replica identity FULL. + */ + if (idkey == NULL && remoterel->replident != REPLICA_IDENTITY_FULL) + entry->updatable = false; + } + + i = -1; + while ((i = bms_next_member(idkey, i)) >= 0) + { + int attnum = i + FirstLowInvalidHeapAttributeNumber; + + if (!AttrNumberIsForUserDefinedAttr(attnum)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication target relation \"%s.%s\" uses " + "system columns in REPLICA IDENTITY index", + remoterel->nspname, remoterel->relname))); + + attnum = AttrNumberGetAttrOffset(attnum); + + if (entry->attrmap->attnums[attnum] < 0 || + !bms_is_member(entry->attrmap->attnums[attnum], remoterel->attkeys)) + { + entry->updatable = false; + break; + } + } +} + +/* + * Open the local relation associated with the remote one. + * + * Rebuilds the Relcache mapping if it was invalidated by local DDL. + */ +LogicalRepRelMapEntry * +logicalrep_rel_open(LogicalRepRelId remoteid, LOCKMODE lockmode) +{ + LogicalRepRelMapEntry *entry; + bool found; + LogicalRepRelation *remoterel; + + if (LogicalRepRelMap == NULL) + logicalrep_relmap_init(); + + /* Search for existing entry. */ + entry = hash_search(LogicalRepRelMap, (void *) &remoteid, + HASH_FIND, &found); + + if (!found) + elog(ERROR, "no relation map entry for remote relation ID %u", + remoteid); + + remoterel = &entry->remoterel; + + /* Ensure we don't leak a relcache refcount. 
*/ + if (entry->localrel) + elog(ERROR, "remote relation ID %u is already open", remoteid); + + /* + * When opening and locking a relation, pending invalidation messages are + * processed which can invalidate the relation. Hence, if the entry is + * currently considered valid, try to open the local relation by OID and + * see if invalidation ensues. + */ + if (entry->localrelvalid) + { + entry->localrel = try_table_open(entry->localreloid, lockmode); + if (!entry->localrel) + { + /* Table was renamed or dropped. */ + entry->localrelvalid = false; + } + else if (!entry->localrelvalid) + { + /* Note we release the no-longer-useful lock here. */ + table_close(entry->localrel, lockmode); + entry->localrel = NULL; + } + } + + /* + * If the entry has been marked invalid since we last had lock on it, + * re-open the local relation by name and rebuild all derived data. + */ + if (!entry->localrelvalid) + { + Oid relid; + TupleDesc desc; + MemoryContext oldctx; + int i; + Bitmapset *missingatts; + + /* Release the no-longer-useful attrmap, if any. */ + if (entry->attrmap) + { + free_attrmap(entry->attrmap); + entry->attrmap = NULL; + } + + /* Try to find and lock the relation by name. */ + relid = RangeVarGetRelid(makeRangeVar(remoterel->nspname, + remoterel->relname, -1), + lockmode, true); + if (!OidIsValid(relid)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication target relation \"%s.%s\" does not exist", + remoterel->nspname, remoterel->relname))); + entry->localrel = table_open(relid, NoLock); + entry->localreloid = relid; + + /* Check for supported relkind. */ + CheckSubscriptionRelkind(entry->localrel->rd_rel->relkind, + remoterel->nspname, remoterel->relname); + + /* + * Build the mapping of local attribute numbers to remote attribute + * numbers and validate that we don't miss any replicated columns as + * that would result in potentially unwanted data loss. 
+ */ + desc = RelationGetDescr(entry->localrel); + oldctx = MemoryContextSwitchTo(LogicalRepRelMapContext); + entry->attrmap = make_attrmap(desc->natts); + MemoryContextSwitchTo(oldctx); + + /* check and report missing attrs, if any */ + missingatts = bms_add_range(NULL, 0, remoterel->natts - 1); + for (i = 0; i < desc->natts; i++) + { + int attnum; + Form_pg_attribute attr = TupleDescAttr(desc, i); + + if (attr->attisdropped || attr->attgenerated) + { + entry->attrmap->attnums[i] = -1; + continue; + } + + attnum = logicalrep_rel_att_by_name(remoterel, + NameStr(attr->attname)); + + entry->attrmap->attnums[i] = attnum; + if (attnum >= 0) + missingatts = bms_del_member(missingatts, attnum); + } + + logicalrep_report_missing_attrs(remoterel, missingatts); + + /* be tidy */ + bms_free(missingatts); + + /* + * Set if the table's replica identity is enough to apply + * update/delete. + */ + logicalrep_rel_mark_updatable(entry); + + entry->localrelvalid = true; + } + + if (entry->state != SUBREL_STATE_READY) + entry->state = GetSubscriptionRelState(MySubscription->oid, + entry->localreloid, + &entry->statelsn); + + return entry; +} + +/* + * Close the previously opened logical relation. + * + * The local relation is closed with the given lockmode and the entry's + * localrel pointer is reset to NULL; the rest of the entry is left untouched. + */ +void +logicalrep_rel_close(LogicalRepRelMapEntry *rel, LOCKMODE lockmode) +{ + table_close(rel->localrel, lockmode); + rel->localrel = NULL; +} + +/* + * Partition cache: look up partition LogicalRepRelMapEntry's + * + * Unlike relation map cache, this is keyed by partition OID, not remote + * relation OID, because we only have to use this cache in the case where + * partitions are not directly mapped to any remote relation, such as when + * replication is occurring with one of their ancestors as target. + */ + +/* + * Relcache invalidation callback + * + * A valid reloid marks the cached entry whose local relation OID matches it + * as invalid; InvalidOid marks every cached entry as invalid. Entries are + * only flagged here, they are rebuilt by logicalrep_partition_open. + */ +static void +logicalrep_partmap_invalidate_cb(Datum arg, Oid reloid) +{ + LogicalRepPartMapEntry *entry; + + /* Just to be sure.
*/ + if (LogicalRepPartMap == NULL) + return; + + if (reloid != InvalidOid) + { + HASH_SEQ_STATUS status; + + hash_seq_init(&status, LogicalRepPartMap); + + /* TODO, use inverse lookup hashtable? */ + while ((entry = (LogicalRepPartMapEntry *) hash_seq_search(&status)) != NULL) + { + if (entry->relmapentry.localreloid == reloid) + { + entry->relmapentry.localrelvalid = false; + hash_seq_term(&status); /* end the scan explicitly, as we leave the loop early */ + break; + } + } + } + else + { + /* invalidate all cache entries */ + HASH_SEQ_STATUS status; + + hash_seq_init(&status, LogicalRepPartMap); + + while ((entry = (LogicalRepPartMapEntry *) hash_seq_search(&status)) != NULL) + entry->relmapentry.localrelvalid = false; + } +} + +/* + * Reset the entries in the partition map that refer to remoterel. + * + * Called when new relation mapping is sent by the publisher to update our + * expected view of incoming data from said publisher. + * + * Note that we don't update the remoterel information in the entry here, + * we will update the information in logicalrep_partition_open to avoid + * unnecessary work. + * + * Each matching entry is freed and then zeroed, which leaves both + * remoterel.remoteid and localrelvalid unset; logicalrep_partition_open + * checks these two fields to decide that the entry must be rebuilt. + */ +void +logicalrep_partmap_reset_relmap(LogicalRepRelation *remoterel) +{ + HASH_SEQ_STATUS status; + LogicalRepPartMapEntry *part_entry; + LogicalRepRelMapEntry *entry; + + if (LogicalRepPartMap == NULL) + return; + + hash_seq_init(&status, LogicalRepPartMap); + while ((part_entry = (LogicalRepPartMapEntry *) hash_seq_search(&status)) != NULL) + { + entry = &part_entry->relmapentry; + + if (entry->remoterel.remoteid != remoterel->remoteid) + continue; + + logicalrep_relmap_free_entry(entry); + + memset(entry, 0, sizeof(LogicalRepRelMapEntry)); + } +} + +/* + * Initialize the partition map cache. + * + * Creates LogicalRepPartMapContext (if it does not exist yet) and the hash + * table inside it, and registers the relcache invalidation callback. + */ +static void +logicalrep_partmap_init(void) +{ + HASHCTL ctl; + + if (!LogicalRepPartMapContext) + LogicalRepPartMapContext = + AllocSetContextCreate(CacheMemoryContext, + "LogicalRepPartMapContext", + ALLOCSET_DEFAULT_SIZES); + + /* Initialize the partition hash table.
*/ + ctl.keysize = sizeof(Oid); /* partition OID */ + ctl.entrysize = sizeof(LogicalRepPartMapEntry); + ctl.hcxt = LogicalRepPartMapContext; + + LogicalRepPartMap = hash_create("logicalrep partition map cache", 64, &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + /* Watch for invalidation events. */ + CacheRegisterRelcacheCallback(logicalrep_partmap_invalidate_cb, + (Datum) 0); +} + +/* + * logicalrep_partition_open + * + * Returned entry reuses most of the values of the root table's entry, save + * the attribute map, which can be different for the partition. However, + * we must physically copy all the data, in case the root table's entry + * gets freed/rebuilt. + * + * If 'map' is NULL the root entry's attribute map is copied unchanged; + * otherwise it is translated through 'map' into the partition's attribute + * numbers. + * + * Note there's no logicalrep_partition_close, because the caller closes the + * component relation. + */ +LogicalRepRelMapEntry * +logicalrep_partition_open(LogicalRepRelMapEntry *root, + Relation partrel, AttrMap *map) +{ + LogicalRepRelMapEntry *entry; + LogicalRepPartMapEntry *part_entry; + LogicalRepRelation *remoterel = &root->remoterel; + Oid partOid = RelationGetRelid(partrel); + AttrMap *attrmap = root->attrmap; + bool found; + MemoryContext oldctx; + + if (LogicalRepPartMap == NULL) + logicalrep_partmap_init(); + + /* Look up the entry for this partition, creating it if absent. */ + part_entry = (LogicalRepPartMapEntry *) hash_search(LogicalRepPartMap, + (void *) &partOid, + HASH_ENTER, &found); + + entry = &part_entry->relmapentry; + + /* + * We must always overwrite entry->localrel with the latest partition + * Relation pointer, because the Relation pointed to by the old value may + * have been cleared after the caller would have closed the partition + * relation after the last use of this entry. Note that localrelvalid is + * only updated by the relcache invalidation callback, so it may still be + * true irrespective of whether the Relation pointed to by localrel has + * been cleared or not.
+ */ + if (found && entry->localrelvalid) + { + entry->localrel = partrel; + return entry; + } + + /* Switch to longer-lived context. */ + oldctx = MemoryContextSwitchTo(LogicalRepPartMapContext); + + if (!found) + { + memset(part_entry, 0, sizeof(LogicalRepPartMapEntry)); + part_entry->partoid = partOid; + } + + /* Release the no-longer-useful attrmap, if any. */ + if (entry->attrmap) + { + free_attrmap(entry->attrmap); + entry->attrmap = NULL; + } + + if (!entry->remoterel.remoteid) + { + int i; + + /* + * Remote relation is copied as-is from the root entry. This is only + * done while remoterel is unset, i.e. for a new entry or one zeroed by + * logicalrep_partmap_reset_relmap. + */ + entry = &part_entry->relmapentry; + entry->remoterel.remoteid = remoterel->remoteid; + entry->remoterel.nspname = pstrdup(remoterel->nspname); + entry->remoterel.relname = pstrdup(remoterel->relname); + entry->remoterel.natts = remoterel->natts; + entry->remoterel.attnames = palloc(remoterel->natts * sizeof(char *)); + entry->remoterel.atttyps = palloc(remoterel->natts * sizeof(Oid)); + for (i = 0; i < remoterel->natts; i++) + { + entry->remoterel.attnames[i] = pstrdup(remoterel->attnames[i]); + entry->remoterel.atttyps[i] = remoterel->atttyps[i]; + } + entry->remoterel.replident = remoterel->replident; + entry->remoterel.attkeys = bms_copy(remoterel->attkeys); + } + + entry->localrel = partrel; + entry->localreloid = partOid; + + /* + * If the partition's attributes don't match the root relation's, we'll + * need to make a new attrmap which maps partition attribute numbers to + * remoterel's, instead of the original which maps root relation's + * attribute numbers to remoterel's. + * + * Note that 'map' which comes from the tuple routing data structure + * contains 1-based attribute numbers (of the parent relation). However, + * the map in 'entry', a logical replication data structure, contains + * 0-based attribute numbers (of the remote relation).
+ */ + if (map) + { + AttrNumber attno; + + entry->attrmap = make_attrmap(map->maplen); + for (attno = 0; attno < entry->attrmap->maplen; attno++) + { + AttrNumber root_attno = map->attnums[attno]; + + /* 0 means it's a dropped attribute. See comments atop AttrMap. */ + if (root_attno == 0) + entry->attrmap->attnums[attno] = -1; + else + entry->attrmap->attnums[attno] = attrmap->attnums[root_attno - 1]; + } + } + else + { + /* Lacking copy_attmap, do this the hard way. */ + entry->attrmap = make_attrmap(attrmap->maplen); + memcpy(entry->attrmap->attnums, attrmap->attnums, + attrmap->maplen * sizeof(AttrNumber)); + } + + /* Set if the table's replica identity is enough to apply update/delete. */ + logicalrep_rel_mark_updatable(entry); + + entry->localrelvalid = true; + + /* state and statelsn are left set to 0. */ + MemoryContextSwitchTo(oldctx); + + return entry; +} diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c new file mode 100644 index 0000000..e59d139 --- /dev/null +++ b/src/backend/replication/logical/reorderbuffer.c @@ -0,0 +1,5156 @@ +/*------------------------------------------------------------------------- + * + * reorderbuffer.c + * PostgreSQL logical replay/reorder buffer management + * + * + * Copyright (c) 2012-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/replication/logical/reorderbuffer.c + * + * NOTES + * This module gets handed individual pieces of transactions in the order + * they are written to the WAL and is responsible for reassembling them into + * toplevel transaction sized pieces. When a transaction is completely + * reassembled - signaled by reading the transaction commit record - it + * will then call the output plugin (cf. ReorderBufferCommit()) with the + * individual changes. The output plugins rely on snapshots built by + * snapbuild.c which hands them to us.
+ * + * Transactions and subtransactions/savepoints in postgres are not + * immediately linked to each other from outside the performing + * backend. Only at commit/abort (or special xact_assignment records) they + * are linked together. Which means that we will have to splice together a + * toplevel transaction from its subtransactions. To do that efficiently we + * build a binary heap indexed by the smallest current lsn of the individual + * subtransactions' changestreams. As the individual streams are inherently + * ordered by LSN - since that is where we build them from - the transaction + * can easily be reassembled by always using the subtransaction with the + * smallest current LSN from the heap. + * + * In order to cope with large transactions - which can be several times as + * big as the available memory - this module supports spooling the contents + * of a large transaction to disk. When the transaction is replayed the + * contents of individual (sub-)transactions will be read from disk in + * chunks. + * + * This module also has to deal with reassembling toast records from the + * individual chunks stored in WAL. When a new (or initial) version of a + * tuple is stored in WAL it will always be preceded by the toast chunks + * emitted for the columns stored out of line. Within a single toplevel + * transaction there will be no other data carrying records between a row's + * toast chunks and the row data itself. See ReorderBufferToast* for + * details. + * + * ReorderBuffer uses two special memory context types - SlabContext for + * allocations of fixed-length structures (changes and transactions), and + * GenerationContext for the variable-length transaction data (allocated + * and freed in groups with similar lifespans). + * + * To limit the amount of memory used by decoded changes, we track memory + * used at the reorder buffer level (i.e. total amount of memory), and for + * each transaction.
When the total amount of used memory exceeds the + * limit, the transaction consuming the most memory is then serialized to + * disk. + * + * Only decoded changes are evicted from memory (spilled to disk), not the + * transaction records. The number of toplevel transactions is limited, + * but a transaction with many subtransactions may still consume significant + * amounts of memory. However, the transaction records are fairly small and + * are not included in the memory limit. + * + * The current eviction algorithm is very simple - the transaction is + * picked merely by size, while it might be useful to also consider age + * (LSN) of the changes for example. With the new Generational memory + * allocator, evicting the oldest changes would make it more likely the + * memory gets actually freed. + * + * We still rely on max_changes_in_memory when loading serialized changes + * back into memory. At that point we can't use the memory limit directly + * as we load the subxacts independently. One option to deal with this + * would be to count the subxacts, and allow each to allocate 1/N of the + * memory limit. That however does not seem very appealing, because with + * many subtransactions it may easily cause thrashing (short cycles of + * deserializing and applying very few changes). We probably should give + * a bit more memory to the oldest subtransactions, because it's likely + * they are the source for the next sequence of changes. 
+ * + * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <unistd.h> +#include <sys/stat.h> + +#include "access/detoast.h" +#include "access/heapam.h" +#include "access/rewriteheap.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "catalog/catalog.h" +#include "lib/binaryheap.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "replication/logical.h" +#include "replication/reorderbuffer.h" +#include "replication/slot.h" +#include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */ +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "storage/sinval.h" +#include "utils/builtins.h" +#include "utils/combocid.h" +#include "utils/memdebug.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/relfilenodemap.h" + + +/* entry for a hash table we use to map from xid to our transaction state */ +typedef struct ReorderBufferTXNByIdEnt +{ + TransactionId xid; + ReorderBufferTXN *txn; +} ReorderBufferTXNByIdEnt; + +/* data structures for (relfilenode, ctid) => (cmin, cmax) mapping */ +typedef struct ReorderBufferTupleCidKey +{ + RelFileNode relnode; + ItemPointerData tid; +} ReorderBufferTupleCidKey; + +typedef struct ReorderBufferTupleCidEnt +{ + ReorderBufferTupleCidKey key; + CommandId cmin; + CommandId cmax; + CommandId combocid; /* just for debugging */ +} ReorderBufferTupleCidEnt; + +/* Virtual file descriptor with file offset tracking */ +typedef struct TXNEntryFile +{ + File vfd; /* -1 when the file is closed */ + off_t curOffset; /* offset for next write or read. Reset to 0 + * when vfd is opened.
*/ +} TXNEntryFile; + +/* k-way in-order change iteration support structures */ +typedef struct ReorderBufferIterTXNEntry +{ + XLogRecPtr lsn; + ReorderBufferChange *change; + ReorderBufferTXN *txn; + TXNEntryFile file; + XLogSegNo segno; +} ReorderBufferIterTXNEntry; + +typedef struct ReorderBufferIterTXNState +{ + binaryheap *heap; + Size nr_txns; + dlist_head old_change; + ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER]; +} ReorderBufferIterTXNState; + +/* toast data structures */ +typedef struct ReorderBufferToastEnt +{ + Oid chunk_id; /* toast_table.chunk_id */ + int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we + * have seen */ + Size num_chunks; /* number of chunks we've already seen */ + Size size; /* combined size of chunks seen */ + dlist_head chunks; /* linked list of chunks */ + struct varlena *reconstructed; /* reconstructed varlena now pointed to in + * main tup */ +} ReorderBufferToastEnt; + +/* Disk serialization support data structures */ +typedef struct ReorderBufferDiskChange +{ + Size size; + ReorderBufferChange change; + /* data follows */ +} ReorderBufferDiskChange; + +#define IsSpecInsert(action) \ +( \ + ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT) \ +) +#define IsSpecConfirmOrAbort(action) \ +( \ + (((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM) || \ + ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT)) \ +) +#define IsInsertOrUpdate(action) \ +( \ + (((action) == REORDER_BUFFER_CHANGE_INSERT) || \ + ((action) == REORDER_BUFFER_CHANGE_UPDATE) || \ + ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)) \ +) + +/* + * Maximum number of changes kept in memory, per transaction. After that, + * changes are spooled to disk. + * + * The current value should be sufficient to decode the entire transaction + * without hitting disk in OLTP workloads, while starting to spool to disk in + * other workloads reasonably fast.
+ * + * At some point in the future it probably makes sense to have more elaborate + * resource management here, but it's not entirely clear what that would look + * like. + * + * (This comment describes max_changes_in_memory, declared below.) + */ +int logical_decoding_work_mem; +static const Size max_changes_in_memory = 4096; /* XXX for restore only */ + +/* --------------------------------------- + * primary reorderbuffer support routines + * --------------------------------------- + */ +static ReorderBufferTXN *ReorderBufferGetTXN(ReorderBuffer *rb); +static void ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); +static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb, + TransactionId xid, bool create, bool *is_new, + XLogRecPtr lsn, bool create_as_top); +static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn, + ReorderBufferTXN *subtxn); + +static void AssertTXNLsnOrder(ReorderBuffer *rb); + +/* --------------------------------------- + * support functions for lsn-order iterating over the ->changes of a + * transaction and its subtransactions + * + * used for iteration over the k-way heap merge of a transaction and its + * subtransactions + * --------------------------------------- + */ +static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn, + ReorderBufferIterTXNState *volatile *iter_state); +static ReorderBufferChange *ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state); +static void ReorderBufferIterTXNFinish(ReorderBuffer *rb, + ReorderBufferIterTXNState *state); +static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs); + +/* + * --------------------------------------- + * Disk serialization support functions + * --------------------------------------- + */ +static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb); +static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + int fd,
ReorderBufferChange *change); +static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, + TXNEntryFile *file, XLogSegNo *segno); +static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + char *change); +static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, + bool txn_prepared); +static void ReorderBufferCleanupSerializedTXNs(const char *slotname); +static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, + TransactionId xid, XLogSegNo segno); + +static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap); +static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap, + ReorderBufferTXN *txn, CommandId cid); + +/* + * --------------------------------------- + * Streaming support functions + * --------------------------------------- + */ +static inline bool ReorderBufferCanStream(ReorderBuffer *rb); +static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb); +static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn); + +/* --------------------------------------- + * toast reassembly support + * --------------------------------------- + */ +static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change); +static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change); + +/* + * --------------------------------------- + * memory accounting + * --------------------------------------- + */ +static Size ReorderBufferChangeSize(ReorderBufferChange *change); +static void
ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb, + ReorderBufferChange *change, + bool addition, Size sz); + +/* + * Allocate a new ReorderBuffer and clean out any old serialized state from + * prior ReorderBuffer instances for the same slot. + * + * The buffer itself, its slab/generation child contexts and the by-xid hash + * table all live in a dedicated memory context created under + * CurrentMemoryContext. MyReplicationSlot must be set by the caller. + */ +ReorderBuffer * +ReorderBufferAllocate(void) +{ + ReorderBuffer *buffer; + HASHCTL hash_ctl; + MemoryContext new_ctx; + + Assert(MyReplicationSlot != NULL); + + /* allocate memory in own context, to have better accountability */ + new_ctx = AllocSetContextCreate(CurrentMemoryContext, + "ReorderBuffer", + ALLOCSET_DEFAULT_SIZES); + + buffer = + (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer)); + + memset(&hash_ctl, 0, sizeof(hash_ctl)); + + buffer->context = new_ctx; + + buffer->change_context = SlabContextCreate(new_ctx, + "Change", + SLAB_DEFAULT_BLOCK_SIZE, + sizeof(ReorderBufferChange)); + + buffer->txn_context = SlabContextCreate(new_ctx, + "TXN", + SLAB_DEFAULT_BLOCK_SIZE, + sizeof(ReorderBufferTXN)); + + buffer->tup_context = GenerationContextCreate(new_ctx, + "Tuples", + SLAB_LARGE_BLOCK_SIZE); + + hash_ctl.keysize = sizeof(TransactionId); + hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt); + hash_ctl.hcxt = buffer->context; + + buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + buffer->by_txn_last_xid = InvalidTransactionId; + buffer->by_txn_last_txn = NULL; + + buffer->outbuf = NULL; + buffer->outbufsize = 0; + buffer->size = 0; + + buffer->spillTxns = 0; + buffer->spillCount = 0; + buffer->spillBytes = 0; + buffer->streamTxns = 0; + buffer->streamCount = 0; + buffer->streamBytes = 0; + buffer->totalTxns = 0; + buffer->totalBytes = 0; + + buffer->current_restart_decoding_lsn = InvalidXLogRecPtr; + + dlist_init(&buffer->toplevel_by_lsn); + dlist_init(&buffer->txns_by_base_snapshot_lsn); + + /* + * Ensure there's no stale data from prior uses of this slot, in case some + * prior exit avoided calling ReorderBufferFree.
Failure to do this can + * produce duplicated txns, and it's very cheap if there's nothing there. + */ + ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name)); + + return buffer; +} + +/* + * Free a ReorderBuffer + * + * The buffer's memory context is deleted first; the serialized files of the + * current replication slot are removed afterwards. + */ +void +ReorderBufferFree(ReorderBuffer *rb) +{ + MemoryContext context = rb->context; + + /* + * We free separately allocated data by entirely scrapping reorderbuffer's + * memory context. + */ + MemoryContextDelete(context); + + /* Free disk space used by unconsumed reorder buffers */ + ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name)); +} + +/* + * Get a fresh ReorderBufferTXN. + * + * The struct is allocated in rb->txn_context and zeroed; its change, + * tuplecid and subtxn lists are initialized and command_id is set to + * InvalidCommandId. + */ +static ReorderBufferTXN * +ReorderBufferGetTXN(ReorderBuffer *rb) +{ + ReorderBufferTXN *txn; + + txn = (ReorderBufferTXN *) + MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN)); + + memset(txn, 0, sizeof(ReorderBufferTXN)); + + dlist_init(&txn->changes); + dlist_init(&txn->tuplecids); + dlist_init(&txn->subtxns); + + /* InvalidCommandId is not zero, so set it explicitly */ + txn->command_id = InvalidCommandId; + txn->output_plugin_private = NULL; + + return txn; +} + +/* + * Free a ReorderBufferTXN. + * + * Also clears the one-entry lookup cache if it refers to this transaction, + * and releases the gid, the tuplecid hash, the invalidation messages and the + * toast state hanging off the transaction. + */ +static void +ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + /* clean the lookup cache if we were cached (quite likely) */ + if (rb->by_txn_last_xid == txn->xid) + { + rb->by_txn_last_xid = InvalidTransactionId; + rb->by_txn_last_txn = NULL; + } + + /* free data that's contained */ + + if (txn->gid != NULL) + { + pfree(txn->gid); + txn->gid = NULL; + } + + if (txn->tuplecid_hash != NULL) + { + hash_destroy(txn->tuplecid_hash); + txn->tuplecid_hash = NULL; + } + + if (txn->invalidations) + { + pfree(txn->invalidations); + txn->invalidations = NULL; + } + + /* Reset the toast hash */ + ReorderBufferToastReset(rb, txn); + + pfree(txn); +} + +/* + * Get a fresh ReorderBufferChange.
+ */ +ReorderBufferChange * +ReorderBufferGetChange(ReorderBuffer *rb) +{ + ReorderBufferChange *change; + + change = (ReorderBufferChange *) + MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange)); + + memset(change, 0, sizeof(ReorderBufferChange)); + return change; +} + +/* + * Free a ReorderBufferChange and update memory accounting, if requested. + * + * The accounting update (when upd_mem is true) is done first, before any of + * the contained data is released; then the per-action payload and finally + * the change struct itself are freed. + */ +void +ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change, + bool upd_mem) +{ + /* update memory accounting info */ + if (upd_mem) + ReorderBufferChangeMemoryUpdate(rb, change, false, + ReorderBufferChangeSize(change)); + + /* free contained data */ + switch (change->action) + { + case REORDER_BUFFER_CHANGE_INSERT: + case REORDER_BUFFER_CHANGE_UPDATE: + case REORDER_BUFFER_CHANGE_DELETE: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT: + if (change->data.tp.newtuple) + { + ReorderBufferReturnTupleBuf(rb, change->data.tp.newtuple); + change->data.tp.newtuple = NULL; + } + + if (change->data.tp.oldtuple) + { + ReorderBufferReturnTupleBuf(rb, change->data.tp.oldtuple); + change->data.tp.oldtuple = NULL; + } + break; + case REORDER_BUFFER_CHANGE_MESSAGE: + if (change->data.msg.prefix != NULL) + pfree(change->data.msg.prefix); + change->data.msg.prefix = NULL; + if (change->data.msg.message != NULL) + pfree(change->data.msg.message); + change->data.msg.message = NULL; + break; + case REORDER_BUFFER_CHANGE_INVALIDATION: + if (change->data.inval.invalidations) + pfree(change->data.inval.invalidations); + change->data.inval.invalidations = NULL; + break; + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + if (change->data.snapshot) + { + ReorderBufferFreeSnap(rb, change->data.snapshot); + change->data.snapshot = NULL; + } + break; + /* + * TRUNCATE carries a separately allocated relids array. The + * remaining actions listed after it have no data in addition to + * the struct itself. + */ + case REORDER_BUFFER_CHANGE_TRUNCATE: + if (change->data.truncate.relids != NULL) + { + ReorderBufferReturnRelids(rb, change->data.truncate.relids); + change->data.truncate.relids = NULL; + } + break; + case
REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT: + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + break; + } + + pfree(change); +} + +/* + * Get a fresh ReorderBufferTupleBuf fitting at least a tuple of size + * tuple_len (excluding header overhead). + * + * Only alloc_tuple_size and tuple.t_data are set up here; filling in the + * tuple contents is left to the caller. + */ +ReorderBufferTupleBuf * +ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len) +{ + ReorderBufferTupleBuf *tuple; + Size alloc_len; + + alloc_len = tuple_len + SizeofHeapTupleHeader; + + tuple = (ReorderBufferTupleBuf *) + MemoryContextAlloc(rb->tup_context, + sizeof(ReorderBufferTupleBuf) + + MAXIMUM_ALIGNOF + alloc_len); + tuple->alloc_tuple_size = alloc_len; + tuple->tuple.t_data = ReorderBufferTupleBufData(tuple); + + return tuple; +} + +/* + * Free a ReorderBufferTupleBuf. + */ +void +ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple) +{ + pfree(tuple); +} + +/* + * Get an array for relids of truncated relations. + * + * We use the global memory context (for the whole reorder buffer), because + * none of the existing ones seems like a good match (some are SLAB, so we + * can't use those, and tup_context is meant for tuple data, not relids). We + * could add yet another context, but it seems like overkill - TRUNCATE is + * not a particularly common operation, so it does not seem worth it. + */ +Oid * +ReorderBufferGetRelids(ReorderBuffer *rb, int nrelids) +{ + Oid *relids; + Size alloc_len; + + alloc_len = sizeof(Oid) * nrelids; + + relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len); + + return relids; +} + +/* + * Free an array of relids. + */ +void +ReorderBufferReturnRelids(ReorderBuffer *rb, Oid *relids) +{ + pfree(relids); +} + +/* + * Return the ReorderBufferTXN from the given buffer, specified by Xid.
+ * If create is true, and a transaction doesn't already exist, create it + * (with the given LSN, and as top transaction if that's specified); + * when this happens, is_new is set to true. + */ +static ReorderBufferTXN * +ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create, + bool *is_new, XLogRecPtr lsn, bool create_as_top) +{ + ReorderBufferTXN *txn; + ReorderBufferTXNByIdEnt *ent; + bool found; + + Assert(TransactionIdIsValid(xid)); + + /* + * Check the one-entry lookup cache first. The cache remembers the last + * xid looked up together with its transaction, which is NULL when that + * lookup found nothing. + */ + if (TransactionIdIsValid(rb->by_txn_last_xid) && + rb->by_txn_last_xid == xid) + { + txn = rb->by_txn_last_txn; + + if (txn != NULL) + { + /* found it, and it's valid; is_new is optional and may be NULL */ + if (is_new) + *is_new = false; + return txn; + } + + /* + * cached as non-existent, and asked not to create? Then nothing else + * to do. + */ + if (!create) + return NULL; + /* otherwise fall through to create it */ + } + + /* + * We get here if the cache wasn't hit, or if it yielded a + * "does-not-exist" and we want to create an entry. + */ + + /* search the lookup table, entering a new slot when creating */ + ent = (ReorderBufferTXNByIdEnt *) + hash_search(rb->by_txn, + (void *) &xid, + create ? HASH_ENTER : HASH_FIND, + &found); + if (found) + txn = ent->txn; + else if (create) + { + /* initialize the new entry, if creation was requested */ + Assert(ent != NULL); + Assert(lsn != InvalidXLogRecPtr); + + ent->txn = ReorderBufferGetTXN(rb); + ent->txn->xid = xid; + txn = ent->txn; + txn->first_lsn = lsn; + txn->restart_decoding_lsn = rb->current_restart_decoding_lsn; + + if (create_as_top) + { + dlist_push_tail(&rb->toplevel_by_lsn, &txn->node); + AssertTXNLsnOrder(rb); + } + } + else + txn = NULL; /* not found and not asked to create */ + + /* update cache; txn may be NULL, which caches the negative lookup */ + rb->by_txn_last_xid = xid; + rb->by_txn_last_txn = txn; + + if (is_new) + *is_new = !found; + + Assert(!create || txn != NULL); + return txn; +} + +/* + * Record the partial change for the streaming of in-progress transactions.
We + * can stream only complete changes so if we have a partial change like toast + * table insert or speculative insert then we mark such a 'txn' so that it + * can't be streamed. We also ensure that if the changes in such a 'txn' are + * above logical_decoding_work_mem threshold then we stream them as soon as we + * have a complete change. + */ +static void +ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + ReorderBufferChange *change, + bool toast_insert) +{ + ReorderBufferTXN *toptxn; + + /* + * The partial changes need to be processed only while streaming + * in-progress transactions. + */ + if (!ReorderBufferCanStream(rb)) + return; + + /* Get the top transaction. */ + if (txn->toptxn != NULL) + toptxn = txn->toptxn; + else + toptxn = txn; + + /* + * Indicate a partial change for toast inserts. The change will be + * considered as complete once we get the insert or update on the main + * table and we are sure that the pending toast chunks are not required + * anymore. + * + * If we allow streaming when there are pending toast chunks then such + * chunks won't be released till the insert (multi_insert) is complete and + * we expect the txn to have streamed all changes after streaming. This + * restriction is mainly to ensure the correctness of streamed + * transactions and it doesn't seem worth uplifting such a restriction + * just to allow this case because anyway we will stream the transaction + * once such an insert is complete. + */ + if (toast_insert) + toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE; + else if (rbtxn_has_partial_change(toptxn) && + IsInsertOrUpdate(change->action) && + change->data.tp.clear_toast_afterwards) + toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE; + + /* + * Indicate a partial change for speculative inserts. The change will be + * considered as complete once we get the speculative confirm or abort + * token.
+ */ + if (IsSpecInsert(change->action)) + toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE; + else if (rbtxn_has_partial_change(toptxn) && + IsSpecConfirmOrAbort(change->action)) + toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE; + + /* + * Stream the transaction if it is serialized before and the changes are + * now complete in the top-level transaction. + * + * The reason for doing the streaming of such a transaction as soon as we + * get the complete change for it is that previously it would have reached + * the memory threshold and wouldn't get streamed because of incomplete + * changes. Delaying such transactions would increase apply lag for them. + */ + if (ReorderBufferCanStartStreaming(rb) && + !(rbtxn_has_partial_change(toptxn)) && + rbtxn_is_serialized(txn)) + ReorderBufferStreamTXN(rb, toptxn); +} + +/* + * Queue a change into a transaction so that it can be replayed upon commit, + * or streamed once we reach the logical_decoding_work_mem threshold. + * + * The transaction for xid is created (as a toplevel transaction) if it is + * not known yet. If the transaction has been flagged as concurrently + * aborted, the change is freed instead of being queued. + */ +void +ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, + ReorderBufferChange *change, bool toast_insert) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + /* + * While streaming the previous changes we have detected that the + * transaction is aborted. So there is no point in collecting further + * changes for it. + */ + if (txn->concurrent_abort) + { + /* + * We don't need to update memory accounting for this change as we + * have not added it to the queue yet.
+ */ + ReorderBufferReturnChange(rb, change, false); + return; + } + + change->lsn = lsn; + change->txn = txn; + + Assert(InvalidXLogRecPtr != lsn); + dlist_push_tail(&txn->changes, &change->node); + txn->nentries++; + txn->nentries_mem++; + + /* update memory accounting information */ + ReorderBufferChangeMemoryUpdate(rb, change, true, + ReorderBufferChangeSize(change)); + + /* process partial change */ + ReorderBufferProcessPartialChange(rb, txn, change, toast_insert); + + /* check the memory limits and evict something if needed */ + ReorderBufferCheckMemoryLimit(rb); +} + +/* + * A transactional message is queued to be processed upon commit and a + * non-transactional message gets processed immediately. + * + * For a transactional message, prefix and message are copied into + * rb->context and queued as a REORDER_BUFFER_CHANGE_MESSAGE change; xid must + * be valid. A non-transactional message is handed to the rb->message + * callback right away under the given historic snapshot; xid may be invalid + * in that case, and then the callback receives a NULL transaction. + */ +void +ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid, + Snapshot snapshot, XLogRecPtr lsn, + bool transactional, const char *prefix, + Size message_size, const char *message) +{ + if (transactional) + { + MemoryContext oldcontext; + ReorderBufferChange *change; + + Assert(xid != InvalidTransactionId); + + oldcontext = MemoryContextSwitchTo(rb->context); + + change = ReorderBufferGetChange(rb); + change->action = REORDER_BUFFER_CHANGE_MESSAGE; + change->data.msg.prefix = pstrdup(prefix); + change->data.msg.message_size = message_size; + change->data.msg.message = palloc(message_size); + memcpy(change->data.msg.message, message, message_size); + + ReorderBufferQueueChange(rb, xid, lsn, change, false); + + MemoryContextSwitchTo(oldcontext); + } + else + { + ReorderBufferTXN *txn = NULL; + volatile Snapshot snapshot_now = snapshot; + + if (xid != InvalidTransactionId) + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + /* setup snapshot to allow catalog access; torn down on both success and error */ + SetupHistoricSnapshot(snapshot_now, NULL); + PG_TRY(); + { + rb->message(rb, txn, lsn, false, prefix, message_size, message); + + TeardownHistoricSnapshot(false); + } + PG_CATCH(); + { + TeardownHistoricSnapshot(true); + PG_RE_THROW(); + } + PG_END_TRY(); + } +} + +/* +
* AssertTXNLsnOrder + * Verify LSN ordering of transaction lists in the reorderbuffer + * + * Other LSN-related invariants are checked too. + * + * No-op if assertions are not in use. + */ +static void +AssertTXNLsnOrder(ReorderBuffer *rb) +{ +#ifdef USE_ASSERT_CHECKING + dlist_iter iter; + XLogRecPtr prev_first_lsn = InvalidXLogRecPtr; + XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr; + + dlist_foreach(iter, &rb->toplevel_by_lsn) + { + ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, node, + iter.cur); + + /* start LSN must be set */ + Assert(cur_txn->first_lsn != InvalidXLogRecPtr); + + /* If there is an end LSN, it must be higher than start LSN */ + if (cur_txn->end_lsn != InvalidXLogRecPtr) + Assert(cur_txn->first_lsn <= cur_txn->end_lsn); + + /* Current initial LSN must be strictly higher than previous */ + if (prev_first_lsn != InvalidXLogRecPtr) + Assert(prev_first_lsn < cur_txn->first_lsn); + + /* known-as-subtxn txns must not be listed */ + Assert(!rbtxn_is_known_subxact(cur_txn)); + + prev_first_lsn = cur_txn->first_lsn; + } + + dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn) + { + ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, + base_snapshot_node, + iter.cur); + + /* base snapshot (and its LSN) must be set */ + Assert(cur_txn->base_snapshot != NULL); + Assert(cur_txn->base_snapshot_lsn != InvalidXLogRecPtr); + + /* current LSN must be strictly higher than previous */ + if (prev_base_snap_lsn != InvalidXLogRecPtr) + Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn); + + /* known-as-subtxn txns must not be listed */ + Assert(!rbtxn_is_known_subxact(cur_txn)); + + prev_base_snap_lsn = cur_txn->base_snapshot_lsn; + } +#endif +} + +/* + * AssertChangeLsnOrder + * + * Check ordering of changes in the (sub)transaction. 
+ */ +static void +AssertChangeLsnOrder(ReorderBufferTXN *txn) +{ +#ifdef USE_ASSERT_CHECKING + dlist_iter iter; + XLogRecPtr prev_lsn = txn->first_lsn; + + dlist_foreach(iter, &txn->changes) + { + ReorderBufferChange *cur_change; + + cur_change = dlist_container(ReorderBufferChange, node, iter.cur); + + Assert(txn->first_lsn != InvalidXLogRecPtr); + Assert(cur_change->lsn != InvalidXLogRecPtr); + Assert(txn->first_lsn <= cur_change->lsn); + + if (txn->end_lsn != InvalidXLogRecPtr) + Assert(cur_change->lsn <= txn->end_lsn); + + Assert(prev_lsn <= cur_change->lsn); + + prev_lsn = cur_change->lsn; + } +#endif +} + +/* + * ReorderBufferGetOldestTXN + * Return oldest transaction in reorderbuffer + */ +ReorderBufferTXN * +ReorderBufferGetOldestTXN(ReorderBuffer *rb) +{ + ReorderBufferTXN *txn; + + AssertTXNLsnOrder(rb); + + if (dlist_is_empty(&rb->toplevel_by_lsn)) + return NULL; + + txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn); + + Assert(!rbtxn_is_known_subxact(txn)); + Assert(txn->first_lsn != InvalidXLogRecPtr); + return txn; +} + +/* + * ReorderBufferGetOldestXmin + * Return oldest Xmin in reorderbuffer + * + * Returns oldest possibly running Xid from the point of view of snapshots + * used in the transactions kept by reorderbuffer, or InvalidTransactionId if + * there are none. + * + * Since snapshots are assigned monotonically, this equals the Xmin of the + * base snapshot with minimal base_snapshot_lsn. 
+ */ +TransactionId +ReorderBufferGetOldestXmin(ReorderBuffer *rb) +{ + ReorderBufferTXN *txn; + + AssertTXNLsnOrder(rb); + + if (dlist_is_empty(&rb->txns_by_base_snapshot_lsn)) + return InvalidTransactionId; + + txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node, + &rb->txns_by_base_snapshot_lsn); + return txn->base_snapshot->xmin; +} + +void +ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr) +{ + rb->current_restart_decoding_lsn = ptr; +} + +/* + * ReorderBufferAssignChild + * + * Make note that we know that subxid is a subtransaction of xid, seen as of + * the given lsn. + */ +void +ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid, + TransactionId subxid, XLogRecPtr lsn) +{ + ReorderBufferTXN *txn; + ReorderBufferTXN *subtxn; + bool new_top; + bool new_sub; + + txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true); + subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false); + + if (!new_sub) + { + if (rbtxn_is_known_subxact(subtxn)) + { + /* already associated, nothing to do */ + return; + } + else + { + /* + * We already saw this transaction, but initially added it to the + * list of top-level txns. Now that we know it's not top-level, + * remove it from there. + */ + dlist_delete(&subtxn->node); + } + } + + subtxn->txn_flags |= RBTXN_IS_SUBXACT; + subtxn->toplevel_xid = xid; + Assert(subtxn->nsubtxns == 0); + + /* set the reference to top-level transaction */ + subtxn->toptxn = txn; + + /* add to subtransaction list */ + dlist_push_tail(&txn->subtxns, &subtxn->node); + txn->nsubtxns++; + + /* Possibly transfer the subtxn's snapshot to its top-level txn. 
*/ + ReorderBufferTransferSnapToParent(txn, subtxn); + + /* Verify LSN-ordering invariant */ + AssertTXNLsnOrder(rb); +} + +/* + * ReorderBufferTransferSnapToParent + * Transfer base snapshot from subtxn to top-level txn, if needed + * + * This is done if the top-level txn doesn't have a base snapshot, or if the + * subtxn's base snapshot has an earlier LSN than the top-level txn's base + * snapshot's LSN. This can happen if there are no changes in the toplevel + * txn but there are some in the subtxn, or the first change in subtxn has + * earlier LSN than first change in the top-level txn and we learned about + * their kinship only now. + * + * The subtransaction's snapshot is cleared regardless of the transfer + * happening, since it's not needed anymore in either case. + * + * We do this as soon as we become aware of their kinship, to avoid queueing + * extra snapshots to txns known-as-subtxns -- only top-level txns will + * receive further snapshots. + */ +static void +ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn, + ReorderBufferTXN *subtxn) +{ + Assert(subtxn->toplevel_xid == txn->xid); + + if (subtxn->base_snapshot != NULL) + { + if (txn->base_snapshot == NULL || + subtxn->base_snapshot_lsn < txn->base_snapshot_lsn) + { + /* + * If the toplevel transaction already has a base snapshot but + * it's newer than the subxact's, purge it. + */ + if (txn->base_snapshot != NULL) + { + SnapBuildSnapDecRefcount(txn->base_snapshot); + dlist_delete(&txn->base_snapshot_node); + } + + /* + * The snapshot is now the top transaction's; transfer it, and + * adjust the list position of the top transaction in the list by + * moving it to where the subtransaction is. + */ + txn->base_snapshot = subtxn->base_snapshot; + txn->base_snapshot_lsn = subtxn->base_snapshot_lsn; + dlist_insert_before(&subtxn->base_snapshot_node, + &txn->base_snapshot_node); + + /* + * The subtransaction doesn't have a snapshot anymore (so it + * mustn't be in the list.) 
+ */ + subtxn->base_snapshot = NULL; + subtxn->base_snapshot_lsn = InvalidXLogRecPtr; + dlist_delete(&subtxn->base_snapshot_node); + } + else + { + /* Base snap of toplevel is fine, so subxact's is not needed */ + SnapBuildSnapDecRefcount(subtxn->base_snapshot); + dlist_delete(&subtxn->base_snapshot_node); + subtxn->base_snapshot = NULL; + subtxn->base_snapshot_lsn = InvalidXLogRecPtr; + } + } +} + +/* + * Associate a subtransaction with its toplevel transaction at commit + * time. There may be no further changes added after this. + */ +void +ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid, + TransactionId subxid, XLogRecPtr commit_lsn, + XLogRecPtr end_lsn) +{ + ReorderBufferTXN *subtxn; + + subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL, + InvalidXLogRecPtr, false); + + /* + * No need to do anything if that subtxn didn't contain any changes + */ + if (!subtxn) + return; + + subtxn->final_lsn = commit_lsn; + subtxn->end_lsn = end_lsn; + + /* + * Assign this subxact as a child of the toplevel xact (no-op if already + * done.) + */ + ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr); +} + + +/* + * Support for efficiently iterating over a transaction's and its + * subtransactions' changes. + * + * We do by doing a k-way merge between transactions/subtransactions. For that + * we model the current heads of the different transactions as a binary heap + * so we easily know which (sub-)transaction has the change with the smallest + * lsn next. + * + * We assume the changes in individual transactions are already sorted by LSN. + */ + +/* + * Binary heap comparison function. 
+ */ +static int +ReorderBufferIterCompare(Datum a, Datum b, void *arg) +{ + ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg; + XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn; + XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn; + + if (pos_a < pos_b) + return 1; + else if (pos_a == pos_b) + return 0; + return -1; +} + +/* + * Allocate & initialize an iterator which iterates in lsn order over a + * transaction and all its subtransactions. + * + * Note: The iterator state is returned through iter_state parameter rather + * than the function's return value. This is because the state gets cleaned up + * in a PG_CATCH block in the caller, so we want to make sure the caller gets + * back the state even if this function throws an exception. + */ +static void +ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn, + ReorderBufferIterTXNState *volatile *iter_state) +{ + Size nr_txns = 0; + ReorderBufferIterTXNState *state; + dlist_iter cur_txn_i; + int32 off; + + *iter_state = NULL; + + /* Check ordering of changes in the toplevel transaction. */ + AssertChangeLsnOrder(txn); + + /* + * Calculate the size of our heap: one element for every transaction that + * contains changes. (Besides the transactions already in the reorder + * buffer, we count the one we were directly passed.) + */ + if (txn->nentries > 0) + nr_txns++; + + dlist_foreach(cur_txn_i, &txn->subtxns) + { + ReorderBufferTXN *cur_txn; + + cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur); + + /* Check ordering of changes in this subtransaction. 
*/ + AssertChangeLsnOrder(cur_txn); + + if (cur_txn->nentries > 0) + nr_txns++; + } + + /* allocate iteration state */ + state = (ReorderBufferIterTXNState *) + MemoryContextAllocZero(rb->context, + sizeof(ReorderBufferIterTXNState) + + sizeof(ReorderBufferIterTXNEntry) * nr_txns); + + state->nr_txns = nr_txns; + dlist_init(&state->old_change); + + for (off = 0; off < state->nr_txns; off++) + { + state->entries[off].file.vfd = -1; + state->entries[off].segno = 0; + } + + /* allocate heap */ + state->heap = binaryheap_allocate(state->nr_txns, + ReorderBufferIterCompare, + state); + + /* Now that the state fields are initialized, it is safe to return it. */ + *iter_state = state; + + /* + * Now insert items into the binary heap, in an unordered fashion. (We + * will run a heap assembly step at the end; this is more efficient.) + */ + + off = 0; + + /* add toplevel transaction if it contains changes */ + if (txn->nentries > 0) + { + ReorderBufferChange *cur_change; + + if (rbtxn_is_serialized(txn)) + { + /* serialize remaining changes */ + ReorderBufferSerializeTXN(rb, txn); + ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file, + &state->entries[off].segno); + } + + cur_change = dlist_head_element(ReorderBufferChange, node, + &txn->changes); + + state->entries[off].lsn = cur_change->lsn; + state->entries[off].change = cur_change; + state->entries[off].txn = txn; + + binaryheap_add_unordered(state->heap, Int32GetDatum(off++)); + } + + /* add subtransactions if they contain changes */ + dlist_foreach(cur_txn_i, &txn->subtxns) + { + ReorderBufferTXN *cur_txn; + + cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur); + + if (cur_txn->nentries > 0) + { + ReorderBufferChange *cur_change; + + if (rbtxn_is_serialized(cur_txn)) + { + /* serialize remaining changes */ + ReorderBufferSerializeTXN(rb, cur_txn); + ReorderBufferRestoreChanges(rb, cur_txn, + &state->entries[off].file, + &state->entries[off].segno); + } + cur_change = 
dlist_head_element(ReorderBufferChange, node, + &cur_txn->changes); + + state->entries[off].lsn = cur_change->lsn; + state->entries[off].change = cur_change; + state->entries[off].txn = cur_txn; + + binaryheap_add_unordered(state->heap, Int32GetDatum(off++)); + } + } + + /* assemble a valid binary heap */ + binaryheap_build(state->heap); +} + +/* + * Return the next change when iterating over a transaction and its + * subtransactions. + * + * Returns NULL when no further changes exist. + */ +static ReorderBufferChange * +ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state) +{ + ReorderBufferChange *change; + ReorderBufferIterTXNEntry *entry; + int32 off; + + /* nothing there anymore */ + if (state->heap->bh_size == 0) + return NULL; + + off = DatumGetInt32(binaryheap_first(state->heap)); + entry = &state->entries[off]; + + /* free memory we might have "leaked" in the previous *Next call */ + if (!dlist_is_empty(&state->old_change)) + { + change = dlist_container(ReorderBufferChange, node, + dlist_pop_head_node(&state->old_change)); + ReorderBufferReturnChange(rb, change, true); + Assert(dlist_is_empty(&state->old_change)); + } + + change = entry->change; + + /* + * update heap with information about which transaction has the next + * relevant change in LSN order + */ + + /* there are in-memory changes */ + if (dlist_has_next(&entry->txn->changes, &entry->change->node)) + { + dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node); + ReorderBufferChange *next_change = + dlist_container(ReorderBufferChange, node, next); + + /* txn stays the same */ + state->entries[off].lsn = next_change->lsn; + state->entries[off].change = next_change; + + binaryheap_replace_first(state->heap, Int32GetDatum(off)); + return change; + } + + /* try to load changes from disk */ + if (entry->txn->nentries != entry->txn->nentries_mem) + { + /* + * Ugly: restoring changes will reuse *Change records, thus delete the + * current one from the per-tx 
list and only free in the next call. + */ + dlist_delete(&change->node); + dlist_push_tail(&state->old_change, &change->node); + + /* + * Update the total bytes processed by the txn for which we are + * releasing the current set of changes and restoring the new set of + * changes. + */ + rb->totalBytes += entry->txn->size; + if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file, + &state->entries[off].segno)) + { + /* successfully restored changes from disk */ + ReorderBufferChange *next_change = + dlist_head_element(ReorderBufferChange, node, + &entry->txn->changes); + + elog(DEBUG2, "restored %u/%u changes from disk", + (uint32) entry->txn->nentries_mem, + (uint32) entry->txn->nentries); + + Assert(entry->txn->nentries_mem); + /* txn stays the same */ + state->entries[off].lsn = next_change->lsn; + state->entries[off].change = next_change; + binaryheap_replace_first(state->heap, Int32GetDatum(off)); + + return change; + } + } + + /* ok, no changes there anymore, remove */ + binaryheap_remove_first(state->heap); + + return change; +} + +/* + * Deallocate the iterator + */ +static void +ReorderBufferIterTXNFinish(ReorderBuffer *rb, + ReorderBufferIterTXNState *state) +{ + int32 off; + + for (off = 0; off < state->nr_txns; off++) + { + if (state->entries[off].file.vfd != -1) + FileClose(state->entries[off].file.vfd); + } + + /* free memory we might have "leaked" in the last *Next call */ + if (!dlist_is_empty(&state->old_change)) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, + dlist_pop_head_node(&state->old_change)); + ReorderBufferReturnChange(rb, change, true); + Assert(dlist_is_empty(&state->old_change)); + } + + binaryheap_free(state->heap); + pfree(state); +} + +/* + * Cleanup the contents of a transaction, usually after the transaction + * committed or aborted. 
+ */ +static void +ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + bool found; + dlist_mutable_iter iter; + + /* cleanup subtransactions & their changes */ + dlist_foreach_modify(iter, &txn->subtxns) + { + ReorderBufferTXN *subtxn; + + subtxn = dlist_container(ReorderBufferTXN, node, iter.cur); + + /* + * Subtransactions are always associated to the toplevel TXN, even if + * they originally were happening inside another subtxn, so we won't + * ever recurse more than one level deep here. + */ + Assert(rbtxn_is_known_subxact(subtxn)); + Assert(subtxn->nsubtxns == 0); + + ReorderBufferCleanupTXN(rb, subtxn); + } + + /* cleanup changes in the txn */ + dlist_foreach_modify(iter, &txn->changes) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + /* Check we're not mixing changes from different transactions. */ + Assert(change->txn == txn); + + ReorderBufferReturnChange(rb, change, true); + } + + /* + * Cleanup the tuplecids we stored for decoding catalog snapshot access. + * They are always stored in the toplevel transaction. + */ + dlist_foreach_modify(iter, &txn->tuplecids) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + /* Check we're not mixing changes from different transactions. */ + Assert(change->txn == txn); + Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID); + + ReorderBufferReturnChange(rb, change, true); + } + + /* + * Cleanup the base snapshot, if set. + */ + if (txn->base_snapshot != NULL) + { + SnapBuildSnapDecRefcount(txn->base_snapshot); + dlist_delete(&txn->base_snapshot_node); + } + + /* + * Cleanup the snapshot for the last streamed run. + */ + if (txn->snapshot_now != NULL) + { + Assert(rbtxn_is_streamed(txn)); + ReorderBufferFreeSnap(rb, txn->snapshot_now); + } + + /* + * Remove TXN from its containing list. 
+ * + * Note: if txn is known as subxact, we are deleting the TXN from its + * parent's list of known subxacts; this leaves the parent's nsubxacts + * count too high, but we don't care. Otherwise, we are deleting the TXN + * from the LSN-ordered list of toplevel TXNs. + */ + dlist_delete(&txn->node); + + /* now remove reference from buffer */ + hash_search(rb->by_txn, + (void *) &txn->xid, + HASH_REMOVE, + &found); + Assert(found); + + /* remove entries spilled to disk */ + if (rbtxn_is_serialized(txn)) + ReorderBufferRestoreCleanup(rb, txn); + + /* deallocate */ + ReorderBufferReturnTXN(rb, txn); +} + +/* + * Discard changes from a transaction (and subtransactions), either after + * streaming or decoding them at PREPARE. Keep the remaining info - + * transactions, tuplecids, invalidations and snapshots. + * + * We additionaly remove tuplecids after decoding the transaction at prepare + * time as we only need to perform invalidation at rollback or commit prepared. + * + * 'txn_prepared' indicates that we have decoded the transaction at prepare + * time. + */ +static void +ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared) +{ + dlist_mutable_iter iter; + + /* cleanup subtransactions & their changes */ + dlist_foreach_modify(iter, &txn->subtxns) + { + ReorderBufferTXN *subtxn; + + subtxn = dlist_container(ReorderBufferTXN, node, iter.cur); + + /* + * Subtransactions are always associated to the toplevel TXN, even if + * they originally were happening inside another subtxn, so we won't + * ever recurse more than one level deep here. + */ + Assert(rbtxn_is_known_subxact(subtxn)); + Assert(subtxn->nsubtxns == 0); + + ReorderBufferTruncateTXN(rb, subtxn, txn_prepared); + } + + /* cleanup changes in the txn */ + dlist_foreach_modify(iter, &txn->changes) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + /* Check we're not mixing changes from different transactions. 
*/ + Assert(change->txn == txn); + + /* remove the change from it's containing list */ + dlist_delete(&change->node); + + ReorderBufferReturnChange(rb, change, true); + } + + /* + * Mark the transaction as streamed. + * + * The toplevel transaction, identified by (toptxn==NULL), is marked as + * streamed always, even if it does not contain any changes (that is, when + * all the changes are in subtransactions). + * + * For subtransactions, we only mark them as streamed when there are + * changes in them. + * + * We do it this way because of aborts - we don't want to send aborts for + * XIDs the downstream is not aware of. And of course, it always knows + * about the toplevel xact (we send the XID in all messages), but we never + * stream XIDs of empty subxacts. + */ + if ((!txn_prepared) && ((!txn->toptxn) || (txn->nentries_mem != 0))) + txn->txn_flags |= RBTXN_IS_STREAMED; + + if (txn_prepared) + { + /* + * If this is a prepared txn, cleanup the tuplecids we stored for + * decoding catalog snapshot access. They are always stored in the + * toplevel transaction. + */ + dlist_foreach_modify(iter, &txn->tuplecids) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + /* Check we're not mixing changes from different transactions. */ + Assert(change->txn == txn); + Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID); + + /* Remove the change from its containing list. */ + dlist_delete(&change->node); + + ReorderBufferReturnChange(rb, change, true); + } + } + + /* + * Destroy the (relfilenode, ctid) hashtable, so that we don't leak any + * memory. We could also keep the hash table and update it with new ctid + * values, but this seems simpler and good enough for now. + */ + if (txn->tuplecid_hash != NULL) + { + hash_destroy(txn->tuplecid_hash); + txn->tuplecid_hash = NULL; + } + + /* If this txn is serialized then clean the disk space. 
*/ + if (rbtxn_is_serialized(txn)) + { + ReorderBufferRestoreCleanup(rb, txn); + txn->txn_flags &= ~RBTXN_IS_SERIALIZED; + + /* + * We set this flag to indicate if the transaction is ever serialized. + * We need this to accurately update the stats as otherwise the same + * transaction can be counted as serialized multiple times. + */ + txn->txn_flags |= RBTXN_IS_SERIALIZED_CLEAR; + } + + /* also reset the number of entries in the transaction */ + txn->nentries_mem = 0; + txn->nentries = 0; +} + +/* + * Build a hash with a (relfilenode, ctid) -> (cmin, cmax) mapping for use by + * HeapTupleSatisfiesHistoricMVCC. + */ +static void +ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + dlist_iter iter; + HASHCTL hash_ctl; + + if (!rbtxn_has_catalog_changes(txn) || dlist_is_empty(&txn->tuplecids)) + return; + + hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey); + hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt); + hash_ctl.hcxt = rb->context; + + /* + * create the hash with the exact number of to-be-stored tuplecids from + * the start + */ + txn->tuplecid_hash = + hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + dlist_foreach(iter, &txn->tuplecids) + { + ReorderBufferTupleCidKey key; + ReorderBufferTupleCidEnt *ent; + bool found; + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID); + + /* be careful about padding */ + memset(&key, 0, sizeof(ReorderBufferTupleCidKey)); + + key.relnode = change->data.tuplecid.node; + + ItemPointerCopy(&change->data.tuplecid.tid, + &key.tid); + + ent = (ReorderBufferTupleCidEnt *) + hash_search(txn->tuplecid_hash, + (void *) &key, + HASH_ENTER, + &found); + if (!found) + { + ent->cmin = change->data.tuplecid.cmin; + ent->cmax = change->data.tuplecid.cmax; + ent->combocid = change->data.tuplecid.combocid; + } + else + { + /* + * Maybe 
we already saw this tuple before in this transaction, but + * if so it must have the same cmin. + */ + Assert(ent->cmin == change->data.tuplecid.cmin); + + /* + * cmax may be initially invalid, but once set it can only grow, + * and never become invalid again. + */ + Assert((ent->cmax == InvalidCommandId) || + ((change->data.tuplecid.cmax != InvalidCommandId) && + (change->data.tuplecid.cmax > ent->cmax))); + ent->cmax = change->data.tuplecid.cmax; + } + } +} + +/* + * Copy a provided snapshot so we can modify it privately. This is needed so + * that catalog modifying transactions can look into intermediate catalog + * states. + */ +static Snapshot +ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap, + ReorderBufferTXN *txn, CommandId cid) +{ + Snapshot snap; + dlist_iter iter; + int i = 0; + Size size; + + size = sizeof(SnapshotData) + + sizeof(TransactionId) * orig_snap->xcnt + + sizeof(TransactionId) * (txn->nsubtxns + 1); + + snap = MemoryContextAllocZero(rb->context, size); + memcpy(snap, orig_snap, sizeof(SnapshotData)); + + snap->copied = true; + snap->active_count = 1; /* mark as active so nobody frees it */ + snap->regd_count = 0; + snap->xip = (TransactionId *) (snap + 1); + + memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt); + + /* + * snap->subxip contains all txids that belong to our transaction which we + * need to check via cmin/cmax. That's why we store the toplevel + * transaction in there as well. + */ + snap->subxip = snap->xip + snap->xcnt; + snap->subxip[i++] = txn->xid; + + /* + * subxcnt isn't decreased when subtransactions abort, so count manually. + * Since it's an upper boundary it is safe to use it for the allocation + * above. 
+ */ + snap->subxcnt = 1; + + dlist_foreach(iter, &txn->subtxns) + { + ReorderBufferTXN *sub_txn; + + sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur); + snap->subxip[i++] = sub_txn->xid; + snap->subxcnt++; + } + + /* sort so we can bsearch() later */ + qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator); + + /* store the specified current CommandId */ + snap->curcid = cid; + + return snap; +} + +/* + * Free a previously ReorderBufferCopySnap'ed snapshot + */ +static void +ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap) +{ + if (snap->copied) + pfree(snap); + else + SnapBuildSnapDecRefcount(snap); +} + +/* + * If the transaction was (partially) streamed, we need to prepare or commit + * it in a 'streamed' way. That is, we first stream the remaining part of the + * transaction, and then invoke stream_prepare or stream_commit message as per + * the case. + */ +static void +ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + /* we should only call this for previously streamed transactions */ + Assert(rbtxn_is_streamed(txn)); + + ReorderBufferStreamTXN(rb, txn); + + if (rbtxn_prepared(txn)) + { + /* + * Note, we send stream prepare even if a concurrent abort is + * detected. See DecodePrepare for more information. + */ + rb->stream_prepare(rb, txn, txn->final_lsn); + + /* + * This is a PREPARED transaction, part of a two-phase commit. The + * full cleanup will happen as part of the COMMIT PREPAREDs, so now + * just truncate txn by removing changes and tuple_cids. + */ + ReorderBufferTruncateTXN(rb, txn, true); + /* Reset the CheckXidAlive */ + CheckXidAlive = InvalidTransactionId; + } + else + { + rb->stream_commit(rb, txn, txn->final_lsn); + ReorderBufferCleanupTXN(rb, txn); + } +} + +/* + * Set xid to detect concurrent aborts. + * + * While streaming an in-progress transaction or decoding a prepared + * transaction there is a possibility that the (sub)transaction might get + * aborted concurrently. 
In such case if the (sub)transaction has catalog + * update then we might decode the tuple using wrong catalog version. For + * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0). Now, + * the transaction 501 updates the catalog tuple and after that we will have + * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0). Now, if 501 is + * aborted and some other transaction say 502 updates the same catalog tuple + * then the first tuple will be changed to (xmin: 500, xmax: 502). So, the + * problem is that when we try to decode the tuple inserted/updated in 501 + * after the catalog update, we will see the catalog tuple with (xmin: 500, + * xmax: 502) as visible because it will consider that the tuple is deleted by + * xid 502 which is not visible to our snapshot. And when we will try to + * decode with that catalog tuple, it can lead to a wrong result or a crash. + * So, it is necessary to detect concurrent aborts to allow streaming of + * in-progress transactions or decoding of prepared transactions. + * + * For detecting the concurrent abort we set CheckXidAlive to the current + * (sub)transaction's xid for which this change belongs to. And, during + * catalog scan we can check the status of the xid and if it is aborted we will + * report a specific error so that we can stop streaming current transaction + * and discard the already streamed changes on such an error. We might have + * already streamed some of the changes for the aborted (sub)transaction, but + * that is fine because when we decode the abort we will stream abort message + * to truncate the changes in the subscriber. Similarly, for prepared + * transactions, we stop decoding if concurrent abort is detected and then + * rollback the changes when rollback prepared is encountered. See + * DecodePrepare. + */ +static inline void +SetupCheckXidLive(TransactionId xid) +{ + /* + * If the input transaction id is already set as a CheckXidAlive then + * nothing to do. 
+ */ + if (TransactionIdEquals(CheckXidAlive, xid)) + return; + + /* + * setup CheckXidAlive if it's not committed yet. We don't check if the + * xid is aborted. That will happen during catalog access. + */ + if (!TransactionIdDidCommit(xid)) + CheckXidAlive = xid; + else + CheckXidAlive = InvalidTransactionId; +} + +/* + * Helper function for ReorderBufferProcessTXN for applying change. + */ +static inline void +ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change, + bool streaming) +{ + if (streaming) + rb->stream_change(rb, txn, relation, change); + else + rb->apply_change(rb, txn, relation, change); +} + +/* + * Helper function for ReorderBufferProcessTXN for applying the truncate. + */ +static inline void +ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn, + int nrelations, Relation *relations, + ReorderBufferChange *change, bool streaming) +{ + if (streaming) + rb->stream_truncate(rb, txn, nrelations, relations, change); + else + rb->apply_truncate(rb, txn, nrelations, relations, change); +} + +/* + * Helper function for ReorderBufferProcessTXN for applying the message. + */ +static inline void +ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn, + ReorderBufferChange *change, bool streaming) +{ + if (streaming) + rb->stream_message(rb, txn, change->lsn, true, + change->data.msg.prefix, + change->data.msg.message_size, + change->data.msg.message); + else + rb->message(rb, txn, change->lsn, true, + change->data.msg.prefix, + change->data.msg.message_size, + change->data.msg.message); +} + +/* + * Function to store the command id and snapshot at the end of the current + * stream so that we can reuse the same while sending the next stream. + */ +static inline void +ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn, + Snapshot snapshot_now, CommandId command_id) +{ + txn->command_id = command_id; + + /* Avoid copying if it's already copied. 
*/ + if (snapshot_now->copied) + txn->snapshot_now = snapshot_now; + else + txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now, + txn, command_id); +} + +/* + * Helper function for ReorderBufferProcessTXN to handle the concurrent + * abort of the streaming transaction. This resets the TXN such that it + * can be used to stream the remaining data of transaction being processed. + * This can happen when the subtransaction is aborted and we still want to + * continue processing the main or other subtransactions data. + */ +static void +ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, + Snapshot snapshot_now, + CommandId command_id, + XLogRecPtr last_lsn, + ReorderBufferChange *specinsert) +{ + /* Discard the changes that we just streamed */ + ReorderBufferTruncateTXN(rb, txn, rbtxn_prepared(txn)); + + /* Free all resources allocated for toast reconstruction */ + ReorderBufferToastReset(rb, txn); + + /* Return the spec insert change if it is not NULL */ + if (specinsert != NULL) + { + ReorderBufferReturnChange(rb, specinsert, true); + specinsert = NULL; + } + + /* + * For the streaming case, stop the stream and remember the command ID and + * snapshot for the streaming run. + */ + if (rbtxn_is_streamed(txn)) + { + rb->stream_stop(rb, txn, last_lsn); + ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id); + } +} + +/* + * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN. + * + * Send data of a transaction (and its subtransactions) to the + * output plugin. We iterate over the top and subtransactions (using a k-way + * merge) and replay the changes in lsn order. + * + * If streaming is true then data will be sent using stream API. + * + * Note: "volatile" markers on some parameters are to avoid trouble with + * PG_TRY inside the function. 
+ */ +static void +ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn, + volatile Snapshot snapshot_now, + volatile CommandId command_id, + bool streaming) +{ + bool using_subtxn; + MemoryContext ccxt = CurrentMemoryContext; + ReorderBufferIterTXNState *volatile iterstate = NULL; + volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr; + ReorderBufferChange *volatile specinsert = NULL; + volatile bool stream_started = false; + ReorderBufferTXN *volatile curtxn = NULL; + + /* build data to be able to lookup the CommandIds of catalog tuples */ + ReorderBufferBuildTupleCidHash(rb, txn); + + /* setup the initial snapshot */ + SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash); + + /* + * Decoding needs access to syscaches et al., which in turn use + * heavyweight locks and such. Thus we need to have enough state around to + * keep track of those. The easiest way is to simply use a transaction + * internally. That also allows us to easily enforce that nothing writes + * to the database by checking for xid assignments. + * + * When we're called via the SQL SRF there's already a transaction + * started, so start an explicit subtransaction there. + */ + using_subtxn = IsTransactionOrTransactionBlock(); + + PG_TRY(); + { + ReorderBufferChange *change; + + if (using_subtxn) + BeginInternalSubTransaction(streaming ? "stream" : "replay"); + else + StartTransactionCommand(); + + /* + * We only need to send begin/begin-prepare for non-streamed + * transactions. + */ + if (!streaming) + { + if (rbtxn_prepared(txn)) + rb->begin_prepare(rb, txn); + else + rb->begin(rb, txn); + } + + ReorderBufferIterTXNInit(rb, txn, &iterstate); + while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL) + { + Relation relation = NULL; + Oid reloid; + + /* + * We can't call start stream callback before processing first + * change. 
+ */ + if (prev_lsn == InvalidXLogRecPtr) + { + if (streaming) + { + txn->origin_id = change->origin_id; + rb->stream_start(rb, txn, change->lsn); + stream_started = true; + } + } + + /* + * Enforce correct ordering of changes, merged from multiple + * subtransactions. The changes may have the same LSN due to + * MULTI_INSERT xlog records. + */ + Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn); + + prev_lsn = change->lsn; + + /* + * Set the current xid to detect concurrent aborts. This is + * required for the cases when we decode the changes before the + * COMMIT record is processed. + */ + if (streaming || rbtxn_prepared(change->txn)) + { + curtxn = change->txn; + SetupCheckXidLive(curtxn->xid); + } + + switch (change->action) + { + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM: + + /* + * Confirmation for speculative insertion arrived. Simply + * use as a normal record. It'll be cleaned up at the end + * of INSERT processing. + */ + if (specinsert == NULL) + elog(ERROR, "invalid ordering of speculative insertion changes"); + Assert(specinsert->data.tp.oldtuple == NULL); + change = specinsert; + change->action = REORDER_BUFFER_CHANGE_INSERT; + + /* intentionally fall through */ + case REORDER_BUFFER_CHANGE_INSERT: + case REORDER_BUFFER_CHANGE_UPDATE: + case REORDER_BUFFER_CHANGE_DELETE: + Assert(snapshot_now); + + reloid = RelidByRelfilenode(change->data.tp.relnode.spcNode, + change->data.tp.relnode.relNode); + + /* + * Mapped catalog tuple without data, emitted while + * catalog table was in the process of being rewritten. We + * can fail to look up the relfilenode, because the + * relmapper has no "historic" view, in contrast to the + * normal catalog during decoding. Thus repeated rewrites + * can cause a lookup failure. That's OK because we do not + * decode catalog changes anyway. 
Normally such tuples + * would be skipped over below, but we can't identify + * whether the table should be logically logged without + * mapping the relfilenode to the oid. + */ + if (reloid == InvalidOid && + change->data.tp.newtuple == NULL && + change->data.tp.oldtuple == NULL) + goto change_done; + else if (reloid == InvalidOid) + elog(ERROR, "could not map filenode \"%s\" to relation OID", + relpathperm(change->data.tp.relnode, + MAIN_FORKNUM)); + + relation = RelationIdGetRelation(reloid); + + if (!RelationIsValid(relation)) + elog(ERROR, "could not open relation with OID %u (for filenode \"%s\")", + reloid, + relpathperm(change->data.tp.relnode, + MAIN_FORKNUM)); + + if (!RelationIsLogicallyLogged(relation)) + goto change_done; + + /* + * Ignore temporary heaps created during DDL unless the + * plugin has asked for them. + */ + if (relation->rd_rel->relrewrite && !rb->output_rewrites) + goto change_done; + + /* + * For now ignore sequence changes entirely. Most of the + * time they don't log changes using records we + * understand, so it doesn't make sense to handle the few + * cases we do. + */ + if (relation->rd_rel->relkind == RELKIND_SEQUENCE) + goto change_done; + + /* user-triggered change */ + if (!IsToastRelation(relation)) + { + ReorderBufferToastReplace(rb, txn, relation, change); + ReorderBufferApplyChange(rb, txn, relation, change, + streaming); + + /* + * Only clear reassembled toast chunks if we're sure + * they're not required anymore. The creator of the + * tuple tells us. + */ + if (change->data.tp.clear_toast_afterwards) + ReorderBufferToastReset(rb, txn); + } + /* we're not interested in toast deletions */ + else if (change->action == REORDER_BUFFER_CHANGE_INSERT) + { + /* + * Need to reassemble the full toasted Datum in + * memory, to ensure the chunks don't get reused till + * we're done remove it from the list of this + * transaction's changes. Otherwise it will get + * freed/reused while restoring spooled data from + * disk. 
+ */ + Assert(change->data.tp.newtuple != NULL); + + dlist_delete(&change->node); + ReorderBufferToastAppendChunk(rb, txn, relation, + change); + } + + change_done: + + /* + * If speculative insertion was confirmed, the record + * isn't needed anymore. + */ + if (specinsert != NULL) + { + ReorderBufferReturnChange(rb, specinsert, true); + specinsert = NULL; + } + + if (RelationIsValid(relation)) + { + RelationClose(relation); + relation = NULL; + } + break; + + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT: + + /* + * Speculative insertions are dealt with by delaying the + * processing of the insert until the confirmation record + * arrives. For that we simply unlink the record from the + * chain, so it does not get freed/reused while restoring + * spooled data from disk. + * + * This is safe in the face of concurrent catalog changes + * because the relevant relation can't be changed between + * speculative insertion and confirmation due to + * CheckTableNotInUse() and locking. + */ + + /* clear out a pending (and thus failed) speculation */ + if (specinsert != NULL) + { + ReorderBufferReturnChange(rb, specinsert, true); + specinsert = NULL; + } + + /* and memorize the pending insertion */ + dlist_delete(&change->node); + specinsert = change; + break; + + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT: + + /* + * Abort for speculative insertion arrived. So cleanup the + * specinsert tuple and toast hash. + * + * Note that we get the spec abort change for each toast + * entry but we need to perform the cleanup only the first + * time we get it for the main table. + */ + if (specinsert != NULL) + { + /* + * We must clean the toast hash before processing a + * completely new tuple to avoid confusion about the + * previous tuple's toast chunks. + */ + Assert(change->data.tp.clear_toast_afterwards); + ReorderBufferToastReset(rb, txn); + + /* We don't need this record anymore. 
*/ + ReorderBufferReturnChange(rb, specinsert, true); + specinsert = NULL; + } + break; + + case REORDER_BUFFER_CHANGE_TRUNCATE: + { + int i; + int nrelids = change->data.truncate.nrelids; + int nrelations = 0; + Relation *relations; + + relations = palloc0(nrelids * sizeof(Relation)); + for (i = 0; i < nrelids; i++) + { + Oid relid = change->data.truncate.relids[i]; + Relation relation; + + relation = RelationIdGetRelation(relid); + + if (!RelationIsValid(relation)) + elog(ERROR, "could not open relation with OID %u", relid); + + if (!RelationIsLogicallyLogged(relation)) + continue; + + relations[nrelations++] = relation; + } + + /* Apply the truncate. */ + ReorderBufferApplyTruncate(rb, txn, nrelations, + relations, change, + streaming); + + for (i = 0; i < nrelations; i++) + RelationClose(relations[i]); + + break; + } + + case REORDER_BUFFER_CHANGE_MESSAGE: + ReorderBufferApplyMessage(rb, txn, change, streaming); + break; + + case REORDER_BUFFER_CHANGE_INVALIDATION: + /* Execute the invalidation messages locally */ + ReorderBufferExecuteInvalidations( + change->data.inval.ninvalidations, + change->data.inval.invalidations); + break; + + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + /* get rid of the old */ + TeardownHistoricSnapshot(false); + + if (snapshot_now->copied) + { + ReorderBufferFreeSnap(rb, snapshot_now); + snapshot_now = + ReorderBufferCopySnap(rb, change->data.snapshot, + txn, command_id); + } + + /* + * Restored from disk, need to be careful not to double + * free. We could introduce refcounting for that, but for + * now this seems infrequent enough not to care. 
+ */ + else if (change->data.snapshot->copied) + { + snapshot_now = + ReorderBufferCopySnap(rb, change->data.snapshot, + txn, command_id); + } + else + { + snapshot_now = change->data.snapshot; + } + + /* and continue with the new one */ + SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash); + break; + + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + Assert(change->data.command_id != InvalidCommandId); + + if (command_id < change->data.command_id) + { + command_id = change->data.command_id; + + if (!snapshot_now->copied) + { + /* we don't use the global one anymore */ + snapshot_now = ReorderBufferCopySnap(rb, snapshot_now, + txn, command_id); + } + + snapshot_now->curcid = command_id; + + TeardownHistoricSnapshot(false); + SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash); + } + + break; + + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + elog(ERROR, "tuplecid value in changequeue"); + break; + } + } + + /* speculative insertion record must be freed by now */ + Assert(!specinsert); + + /* clean up the iterator */ + ReorderBufferIterTXNFinish(rb, iterstate); + iterstate = NULL; + + /* + * Update total transaction count and total bytes processed by the + * transaction and its subtransactions. Ensure to not count the + * streamed transaction multiple times. + * + * Note that the statistics computation has to be done after + * ReorderBufferIterTXNFinish as it releases the serialized change + * which we have already accounted in ReorderBufferIterTXNNext. + */ + if (!rbtxn_is_streamed(txn)) + rb->totalTxns++; + + rb->totalBytes += txn->total_size; + + /* + * Done with current changes, send the last message for this set of + * changes depending upon streaming mode. + */ + if (streaming) + { + if (stream_started) + { + rb->stream_stop(rb, txn, prev_lsn); + stream_started = false; + } + } + else + { + /* + * Call either PREPARE (for two-phase transactions) or COMMIT (for + * regular ones). 
+ */ + if (rbtxn_prepared(txn)) + rb->prepare(rb, txn, commit_lsn); + else + rb->commit(rb, txn, commit_lsn); + } + + /* this is just a sanity check against bad output plugin behaviour */ + if (GetCurrentTransactionIdIfAny() != InvalidTransactionId) + elog(ERROR, "output plugin used XID %u", + GetCurrentTransactionId()); + + /* + * Remember the command ID and snapshot for the next set of changes in + * streaming mode. + */ + if (streaming) + ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id); + else if (snapshot_now->copied) + ReorderBufferFreeSnap(rb, snapshot_now); + + /* cleanup */ + TeardownHistoricSnapshot(false); + + /* + * Aborting the current (sub-)transaction as a whole has the right + * semantics. We want all locks acquired in here to be released, not + * reassigned to the parent and we do not want any database access + * have persistent effects. + */ + AbortCurrentTransaction(); + + /* make sure there's no cache pollution */ + ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations); + + if (using_subtxn) + RollbackAndReleaseCurrentSubTransaction(); + + /* + * We are here due to one of the four reasons: 1. Decoding an + * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a + * prepared txn that was (partially) streamed. 4. Decoding a committed + * txn. + * + * For 1, we allow truncation of txn data by removing the changes + * already streamed but still keeping other things like invalidations, + * snapshot, and tuplecids. For 2 and 3, we indicate + * ReorderBufferTruncateTXN to do more elaborate truncation of txn + * data as the entire transaction has been decoded except for commit. + * For 4, as the entire txn has been decoded, we can fully clean up + * the TXN reorder buffer. 
+ */ + if (streaming || rbtxn_prepared(txn)) + { + ReorderBufferTruncateTXN(rb, txn, rbtxn_prepared(txn)); + /* Reset the CheckXidAlive */ + CheckXidAlive = InvalidTransactionId; + } + else + ReorderBufferCleanupTXN(rb, txn); + } + PG_CATCH(); + { + MemoryContext ecxt = MemoryContextSwitchTo(ccxt); + ErrorData *errdata = CopyErrorData(); + + /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */ + if (iterstate) + ReorderBufferIterTXNFinish(rb, iterstate); + + TeardownHistoricSnapshot(true); + + /* + * Force cache invalidation to happen outside of a valid transaction + * to prevent catalog access as we just caught an error. + */ + AbortCurrentTransaction(); + + /* make sure there's no cache pollution */ + ReorderBufferExecuteInvalidations(txn->ninvalidations, + txn->invalidations); + + if (using_subtxn) + RollbackAndReleaseCurrentSubTransaction(); + + /* + * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent + * abort of the (sub)transaction we are streaming or preparing. We + * need to do the cleanup and return gracefully on this error, see + * SetupCheckXidLive. + * + * This error code can be thrown by one of the callbacks we call + * during decoding so we need to ensure that we return gracefully only + * when we are sending the data in streaming mode and the streaming is + * not finished yet or when we are sending the data out on a PREPARE + * during a two-phase commit. + */ + if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK && + (stream_started || rbtxn_prepared(txn))) + { + /* curtxn must be set for streaming or prepared transactions */ + Assert(curtxn); + + /* Cleanup the temporary error state. */ + FlushErrorState(); + FreeErrorData(errdata); + errdata = NULL; + curtxn->concurrent_abort = true; + + /* Reset the TXN so that it is allowed to stream remaining data. 
*/ + ReorderBufferResetTXN(rb, txn, snapshot_now, + command_id, prev_lsn, + specinsert); + } + else + { + ReorderBufferCleanupTXN(rb, txn); + MemoryContextSwitchTo(ecxt); + PG_RE_THROW(); + } + } + PG_END_TRY(); +} + +/* + * Perform the replay of a transaction and its non-aborted subtransactions. + * + * Subtransactions previously have to be processed by + * ReorderBufferCommitChild(), even if previously assigned to the toplevel + * transaction with ReorderBufferAssignChild. + * + * This interface is called once a prepare or toplevel commit is read for both + * streamed as well as non-streamed transactions. + */ +static void +ReorderBufferReplay(ReorderBufferTXN *txn, + ReorderBuffer *rb, TransactionId xid, + XLogRecPtr commit_lsn, XLogRecPtr end_lsn, + TimestampTz commit_time, + RepOriginId origin_id, XLogRecPtr origin_lsn) +{ + Snapshot snapshot_now; + CommandId command_id = FirstCommandId; + + txn->final_lsn = commit_lsn; + txn->end_lsn = end_lsn; + txn->commit_time = commit_time; + txn->origin_id = origin_id; + txn->origin_lsn = origin_lsn; + + /* + * If the transaction was (partially) streamed, we need to commit it in a + * 'streamed' way. That is, we first stream the remaining part of the + * transaction, and then invoke stream_commit message. + * + * Called after everything (origin ID, LSN, ...) is stored in the + * transaction to avoid passing that information directly. + */ + if (rbtxn_is_streamed(txn)) + { + ReorderBufferStreamCommit(rb, txn); + return; + } + + /* + * If this transaction has no snapshot, it didn't make any changes to the + * database, so there's nothing to decode. Note that + * ReorderBufferCommitChild will have transferred any snapshots from + * subtransactions if there were any. + */ + if (txn->base_snapshot == NULL) + { + Assert(txn->ninvalidations == 0); + + /* + * Removing this txn before a commit might result in the computation + * of an incorrect restart_lsn. See SnapBuildProcessRunningXacts. 
+ */ + if (!rbtxn_prepared(txn)) + ReorderBufferCleanupTXN(rb, txn); + return; + } + + snapshot_now = txn->base_snapshot; + + /* Process and send the changes to output plugin. */ + ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now, + command_id, false); +} + +/* + * Commit a transaction. + * + * See comments for ReorderBufferReplay(). + */ +void +ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr commit_lsn, XLogRecPtr end_lsn, + TimestampTz commit_time, + RepOriginId origin_id, XLogRecPtr origin_lsn) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + + /* unknown transaction, nothing to replay */ + if (txn == NULL) + return; + + ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time, + origin_id, origin_lsn); +} + +/* + * Record the prepare information for a transaction. + */ +bool +ReorderBufferRememberPrepareInfo(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr prepare_lsn, XLogRecPtr end_lsn, + TimestampTz prepare_time, + RepOriginId origin_id, XLogRecPtr origin_lsn) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false); + + /* unknown transaction, nothing to do */ + if (txn == NULL) + return false; + + /* + * Remember the prepare information to be later used by commit prepared in + * case we skip doing prepare. + */ + txn->final_lsn = prepare_lsn; + txn->end_lsn = end_lsn; + txn->commit_time = prepare_time; + txn->origin_id = origin_id; + txn->origin_lsn = origin_lsn; + + return true; +} + +/* Remember that we have skipped prepare */ +void +ReorderBufferSkipPrepare(ReorderBuffer *rb, TransactionId xid) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false); + + /* unknown transaction, nothing to do */ + if (txn == NULL) + return; + + txn->txn_flags |= RBTXN_SKIPPED_PREPARE; +} + +/* + * Prepare a two-phase transaction. 
+ * + * See comments for ReorderBufferReplay(). + */ +void +ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid, + char *gid) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + + /* unknown transaction, nothing to replay */ + if (txn == NULL) + return; + + txn->txn_flags |= RBTXN_PREPARE; + txn->gid = pstrdup(gid); + + /* The prepare info must have been updated in txn by now. */ + Assert(txn->final_lsn != InvalidXLogRecPtr); + + ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn, + txn->commit_time, txn->origin_id, txn->origin_lsn); + + /* + * We send the prepare for the concurrently aborted xacts so that later + * when rollback prepared is decoded and sent, the downstream should be + * able to rollback such a xact. See comments atop DecodePrepare. + * + * Note, for the concurrent_abort + streaming case a stream_prepare was + * already sent within the ReorderBufferReplay call above. + */ + if (txn->concurrent_abort && !rbtxn_is_streamed(txn)) + rb->prepare(rb, txn, txn->final_lsn); +} + +/* + * This is used to handle COMMIT/ROLLBACK PREPARED. + */ +void +ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr commit_lsn, XLogRecPtr end_lsn, + XLogRecPtr initial_consistent_point, + TimestampTz commit_time, RepOriginId origin_id, + XLogRecPtr origin_lsn, char *gid, bool is_commit) +{ + ReorderBufferTXN *txn; + XLogRecPtr prepare_end_lsn; + TimestampTz prepare_time; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false); + + /* unknown transaction, nothing to do */ + if (txn == NULL) + return; + + /* + * By this time the txn has the prepare record information, remember it to + * be later used for rollback. 
+ */ + prepare_end_lsn = txn->end_lsn; + prepare_time = txn->commit_time; + + /* add the gid in the txn */ + txn->gid = pstrdup(gid); + + /* + * It is possible that this transaction is not decoded at prepare time + * either because by that time we didn't have a consistent snapshot or it + * was decoded earlier but we have restarted. We only need to send the + * prepare if it was not decoded earlier. We don't need to decode the xact + * for aborts if it is not done already. + */ + if ((txn->final_lsn < initial_consistent_point) && is_commit) + { + txn->txn_flags |= RBTXN_PREPARE; + + /* + * The prepare info must have been updated in txn even if we skip + * prepare. + */ + Assert(txn->final_lsn != InvalidXLogRecPtr); + + /* + * By this time the txn has the prepare record information and it is + * important to use that so that downstream gets the accurate + * information. If instead, we have passed commit information here + * then downstream can behave as it has already replayed commit + * prepared after the restart. + */ + ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn, + txn->commit_time, txn->origin_id, txn->origin_lsn); + } + + txn->final_lsn = commit_lsn; + txn->end_lsn = end_lsn; + txn->commit_time = commit_time; + txn->origin_id = origin_id; + txn->origin_lsn = origin_lsn; + + if (is_commit) + rb->commit_prepared(rb, txn, commit_lsn); + else + rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time); + + /* cleanup: make sure there's no cache pollution */ + ReorderBufferExecuteInvalidations(txn->ninvalidations, + txn->invalidations); + ReorderBufferCleanupTXN(rb, txn); +} + +/* + * Abort a transaction that possibly has previous changes. Needs to be first + * called for subtransactions and then for the toplevel xid. + * + * NB: Transactions handled here have to have actively aborted (i.e. have + * produced an abort record). 
Implicitly aborted transactions are handled via + * ReorderBufferAbortOld(); transactions we're just not interested in, but + * which have committed are handled in ReorderBufferForget(). + * + * This function purges this transaction and its contents from memory and + * disk. + */ +void +ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + + /* unknown, nothing to remove */ + if (txn == NULL) + return; + + /* For streamed transactions notify the remote node about the abort. */ + if (rbtxn_is_streamed(txn)) + { + rb->stream_abort(rb, txn, lsn); + + /* + * We might have decoded changes for this transaction that could load + * the cache as per the current transaction's view (consider DDL's + * happened in this transaction). We don't want the decoding of future + * transactions to use those cache entries so execute invalidations. + */ + if (txn->ninvalidations > 0) + ReorderBufferImmediateInvalidation(rb, txn->ninvalidations, + txn->invalidations); + } + + /* cosmetic... */ + txn->final_lsn = lsn; + + /* remove potential on-disk data, and deallocate */ + ReorderBufferCleanupTXN(rb, txn); +} + +/* + * Abort all transactions that aren't actually running anymore because the + * server restarted. + * + * NB: These really have to be transactions that have aborted due to a server + * crash/immediate restart, as we don't deal with invalidations here. + */ +void +ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid) +{ + dlist_mutable_iter it; + + /* + * Iterate through all (potential) toplevel TXNs and abort all that are + * older than what possibly can be running. Once we've found the first + * that is alive we stop, there might be some that acquired an xid earlier + * but started writing later, but it's unlikely and they will be cleaned + * up in a later call to this function. 
+ */ + dlist_foreach_modify(it, &rb->toplevel_by_lsn) + { + ReorderBufferTXN *txn; + + txn = dlist_container(ReorderBufferTXN, node, it.cur); + + if (TransactionIdPrecedes(txn->xid, oldestRunningXid)) + { + elog(DEBUG2, "aborting old transaction %u", txn->xid); + + /* remove potential on-disk data, and deallocate this tx */ + ReorderBufferCleanupTXN(rb, txn); + } + else + return; + } +} + +/* + * Forget the contents of a transaction if we aren't interested in its + * contents. Needs to be first called for subtransactions and then for the + * toplevel xid. + * + * This is significantly different to ReorderBufferAbort() because + * transactions that have committed need to be treated differently from aborted + * ones since they may have modified the catalog. + * + * Note that this is only allowed to be called in the moment a transaction + * commit has just been read, not earlier; otherwise later records referring + * to this xid might re-create the transaction incompletely. + */ +void +ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + + /* unknown, nothing to forget */ + if (txn == NULL) + return; + + /* For streamed transactions notify the remote node about the abort. */ + if (rbtxn_is_streamed(txn)) + rb->stream_abort(rb, txn, lsn); + + /* cosmetic... */ + txn->final_lsn = lsn; + + /* + * Process cache invalidation messages if there are any. Even if we're not + * interested in the transaction's contents, it could have manipulated the + * catalog and we need to update the caches according to that. 
+ */ + if (txn->base_snapshot != NULL && txn->ninvalidations > 0) + ReorderBufferImmediateInvalidation(rb, txn->ninvalidations, + txn->invalidations); + else + Assert(txn->ninvalidations == 0); + + /* remove potential on-disk data, and deallocate */ + ReorderBufferCleanupTXN(rb, txn); +} + +/* + * Invalidate cache for those transactions that need to be skipped just in case + * catalogs were manipulated as part of the transaction. + * + * Note that this is a special-purpose function for prepared transactions where + * we don't want to clean up the TXN even when we decide to skip it. See + * DecodePrepare. + */ +void +ReorderBufferInvalidate(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + + /* unknown, nothing to do */ + if (txn == NULL) + return; + + /* + * Process cache invalidation messages if there are any. Even if we're not + * interested in the transaction's contents, it could have manipulated the + * catalog and we need to update the caches according to that. + */ + if (txn->base_snapshot != NULL && txn->ninvalidations > 0) + ReorderBufferImmediateInvalidation(rb, txn->ninvalidations, + txn->invalidations); + else + Assert(txn->ninvalidations == 0); +} + + +/* + * Execute invalidations happening outside the context of a decoded + * transaction. That currently happens either for xid-less commits + * (cf. RecordTransactionCommit()) or for invalidations in uninteresting + * transactions (via ReorderBufferForget()). + */ +void +ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations, + SharedInvalidationMessage *invalidations) +{ + bool use_subtxn = IsTransactionOrTransactionBlock(); + int i; + + if (use_subtxn) + BeginInternalSubTransaction("replay"); + + /* + * Force invalidations to happen outside of a valid transaction - that way + * entries will just be marked as invalid without accessing the catalog. 
+ * That's advantageous because we don't need to setup the full state + * necessary for catalog access. + */ + if (use_subtxn) + AbortCurrentTransaction(); + + for (i = 0; i < ninvalidations; i++) + LocalExecuteInvalidationMessage(&invalidations[i]); + + if (use_subtxn) + RollbackAndReleaseCurrentSubTransaction(); +} + +/* + * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at + * least once for every xid in XLogRecord->xl_xid (other places in records + * may, but do not have to be passed through here). + * + * Reorderbuffer keeps some datastructures about transactions in LSN order, + * for efficiency. To do that it has to know about when transactions are seen + * first in the WAL. As many types of records are not actually interesting for + * logical decoding, they do not necessarily pass though here. + */ +void +ReorderBufferProcessXid(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn) +{ + /* many records won't have an xid assigned, centralize check here */ + if (xid != InvalidTransactionId) + ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); +} + +/* + * Add a new snapshot to this transaction that may only used after lsn 'lsn' + * because the previous snapshot doesn't describe the catalog correctly for + * following rows. + */ +void +ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Snapshot snap) +{ + ReorderBufferChange *change = ReorderBufferGetChange(rb); + + change->data.snapshot = snap; + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT; + + ReorderBufferQueueChange(rb, xid, lsn, change, false); +} + +/* + * Set up the transaction's base snapshot. + * + * If we know that xid is a subtransaction, set the base snapshot on the + * top-level transaction instead. + */ +void +ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Snapshot snap) +{ + ReorderBufferTXN *txn; + bool is_new; + + AssertArg(snap != NULL); + + /* + * Fetch the transaction to operate on. 
If we know it's a subtransaction, + * operate on its top-level transaction instead. + */ + txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true); + if (rbtxn_is_known_subxact(txn)) + txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false, + NULL, InvalidXLogRecPtr, false); + Assert(txn->base_snapshot == NULL); + + txn->base_snapshot = snap; + txn->base_snapshot_lsn = lsn; + dlist_push_tail(&rb->txns_by_base_snapshot_lsn, &txn->base_snapshot_node); + + AssertTXNLsnOrder(rb); +} + +/* + * Access the catalog with this CommandId at this point in the changestream. + * + * May only be called for command ids > 1 + */ +void +ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, CommandId cid) +{ + ReorderBufferChange *change = ReorderBufferGetChange(rb); + + change->data.command_id = cid; + change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID; + + ReorderBufferQueueChange(rb, xid, lsn, change, false); +} + +/* + * Update memory counters to account for the new or removed change. + * + * We update two counters - in the reorder buffer, and in the transaction + * containing the change. The reorder buffer counter allows us to quickly + * decide if we reached the memory limit, the transaction counter allows + * us to quickly pick the largest transaction for eviction. + * + * When streaming is enabled, we need to update the toplevel transaction + * counters instead - we don't really care about subtransactions as we + * can't stream them individually anyway, and we only pick toplevel + * transactions for eviction. So only toplevel transactions matter. + */ +static void +ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb, + ReorderBufferChange *change, + bool addition, Size sz) +{ + ReorderBufferTXN *txn; + ReorderBufferTXN *toptxn; + + Assert(change->txn); + + /* + * Ignore tuple CID changes, because those are not evicted when reaching + * memory limit. 
So we just don't count them, because it might easily + * trigger a pointless attempt to spill. + */ + if (change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID) + return; + + txn = change->txn; + + /* + * Update the total size in top level as well. This is later used to + * compute the decoding stats. + */ + if (txn->toptxn != NULL) + toptxn = txn->toptxn; + else + toptxn = txn; + + if (addition) + { + txn->size += sz; + rb->size += sz; + + /* Update the total size in the top transaction. */ + toptxn->total_size += sz; + } + else + { + Assert((rb->size >= sz) && (txn->size >= sz)); + txn->size -= sz; + rb->size -= sz; + + /* Update the total size in the top transaction. */ + toptxn->total_size -= sz; + } + + Assert(txn->size <= rb->size); +} + +/* + * Add new (relfilenode, tid) -> (cmin, cmax) mappings. + * + * We do not include this change type in memory accounting, because we + * keep CIDs in a separate list and do not evict them when reaching + * the memory limit. + */ +void +ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, RelFileNode node, + ItemPointerData tid, CommandId cmin, + CommandId cmax, CommandId combocid) +{ + ReorderBufferChange *change = ReorderBufferGetChange(rb); + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + change->data.tuplecid.node = node; + change->data.tuplecid.tid = tid; + change->data.tuplecid.cmin = cmin; + change->data.tuplecid.cmax = cmax; + change->data.tuplecid.combocid = combocid; + change->lsn = lsn; + change->txn = txn; + change->action = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID; + + dlist_push_tail(&txn->tuplecids, &change->node); + txn->ntuplecids++; +} + +/* + * Setup the invalidation of the toplevel transaction. + * + * This needs to be called for each XLOG_XACT_INVALIDATIONS message and + * accumulates all the invalidation messages in the toplevel transaction as + * well as in the form of change in reorder buffer. 
We require to record it in + * form of the change so that we can execute only the required invalidations + * instead of executing all the invalidations on each CommandId increment. We + * also need to accumulate these in the toplevel transaction because in some + * cases we skip processing the transaction (see ReorderBufferForget), we need + * to execute all the invalidations together. + */ +void +ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Size nmsgs, + SharedInvalidationMessage *msgs) +{ + ReorderBufferTXN *txn; + MemoryContext oldcontext; + ReorderBufferChange *change; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + oldcontext = MemoryContextSwitchTo(rb->context); + + /* + * Collect all the invalidations under the top transaction so that we can + * execute them all together. See comment atop this function + */ + if (txn->toptxn) + txn = txn->toptxn; + + Assert(nmsgs > 0); + + /* Accumulate invalidations. */ + if (txn->ninvalidations == 0) + { + txn->ninvalidations = nmsgs; + txn->invalidations = (SharedInvalidationMessage *) + palloc(sizeof(SharedInvalidationMessage) * nmsgs); + memcpy(txn->invalidations, msgs, + sizeof(SharedInvalidationMessage) * nmsgs); + } + else + { + txn->invalidations = (SharedInvalidationMessage *) + repalloc(txn->invalidations, sizeof(SharedInvalidationMessage) * + (txn->ninvalidations + nmsgs)); + + memcpy(txn->invalidations + txn->ninvalidations, msgs, + nmsgs * sizeof(SharedInvalidationMessage)); + txn->ninvalidations += nmsgs; + } + + change = ReorderBufferGetChange(rb); + change->action = REORDER_BUFFER_CHANGE_INVALIDATION; + change->data.inval.ninvalidations = nmsgs; + change->data.inval.invalidations = (SharedInvalidationMessage *) + palloc(sizeof(SharedInvalidationMessage) * nmsgs); + memcpy(change->data.inval.invalidations, msgs, + sizeof(SharedInvalidationMessage) * nmsgs); + + ReorderBufferQueueChange(rb, xid, lsn, change, false); + + 
MemoryContextSwitchTo(oldcontext); +} + +/* + * Apply all invalidations we know. Possibly we only need parts at this point + * in the changestream but we don't know which those are. + */ +static void +ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs) +{ + int i; + + for (i = 0; i < nmsgs; i++) + LocalExecuteInvalidationMessage(&msgs[i]); +} + +/* + * Mark a transaction as containing catalog changes + */ +void +ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + txn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES; + + /* + * Mark top-level transaction as having catalog changes too if one of its + * children has so that the ReorderBufferBuildTupleCidHash can + * conveniently check just top-level transaction and decide whether to + * build the hash table or not. + */ + if (txn->toptxn != NULL) + txn->toptxn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES; +} + +/* + * Query whether a transaction is already *known* to contain catalog + * changes. This can be wrong until directly before the commit! + */ +bool +ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + if (txn == NULL) + return false; + + return rbtxn_has_catalog_changes(txn); +} + +/* + * ReorderBufferXidHasBaseSnapshot + * Have we already set the base snapshot for the given txn/subtxn? + */ +bool +ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, + NULL, InvalidXLogRecPtr, false); + + /* transaction isn't known yet, ergo no snapshot */ + if (txn == NULL) + return false; + + /* a known subtxn? 
operate on top-level txn instead */ + if (rbtxn_is_known_subxact(txn)) + txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false, + NULL, InvalidXLogRecPtr, false); + + return txn->base_snapshot != NULL; +} + + +/* + * --------------------------------------- + * Disk serialization support + * --------------------------------------- + */ + +/* + * Ensure the IO buffer is >= sz. + */ +static void +ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz) +{ + if (!rb->outbufsize) + { + rb->outbuf = MemoryContextAlloc(rb->context, sz); + rb->outbufsize = sz; + } + else if (rb->outbufsize < sz) + { + rb->outbuf = repalloc(rb->outbuf, sz); + rb->outbufsize = sz; + } +} + +/* + * Find the largest transaction (toplevel or subxact) to evict (spill to disk). + * + * XXX With many subtransactions this might be quite slow, because we'll have + * to walk through all of them. There are some options how we could improve + * that: (a) maintain some secondary structure with transactions sorted by + * amount of changes, (b) not looking for the entirely largest transaction, + * but e.g. for transaction using at least some fraction of the memory limit, + * and (c) evicting multiple transactions at once, e.g. to free a given portion + * of the memory limit (e.g. 50%). + */ +static ReorderBufferTXN * +ReorderBufferLargestTXN(ReorderBuffer *rb) +{ + HASH_SEQ_STATUS hash_seq; + ReorderBufferTXNByIdEnt *ent; + ReorderBufferTXN *largest = NULL; + + hash_seq_init(&hash_seq, rb->by_txn); + while ((ent = hash_seq_search(&hash_seq)) != NULL) + { + ReorderBufferTXN *txn = ent->txn; + + /* if the current transaction is larger, remember it */ + if ((!largest) || (txn->size > largest->size)) + largest = txn; + } + + Assert(largest); + Assert(largest->size > 0); + Assert(largest->size <= rb->size); + + return largest; +} + +/* + * Find the largest toplevel transaction to evict (by streaming). 
+ * + * This can be seen as an optimized version of ReorderBufferLargestTXN, which + * should give us the same transaction (because we don't update memory account + * for subtransaction with streaming, so it's always 0). But we can simply + * iterate over the limited number of toplevel transactions that have a base + * snapshot. There is no use of selecting a transaction that doesn't have base + * snapshot because we don't decode such transactions. + * + * Note that, we skip transactions that contains incomplete changes. There + * is a scope of optimization here such that we can select the largest + * transaction which has incomplete changes. But that will make the code and + * design quite complex and that might not be worth the benefit. If we plan to + * stream the transactions that contains incomplete changes then we need to + * find a way to partially stream/truncate the transaction changes in-memory + * and build a mechanism to partially truncate the spilled files. + * Additionally, whenever we partially stream the transaction we need to + * maintain the last streamed lsn and next time we need to restore from that + * segment and the offset in WAL. As we stream the changes from the top + * transaction and restore them subtransaction wise, we need to even remember + * the subxact from where we streamed the last change. + */ +static ReorderBufferTXN * +ReorderBufferLargestTopTXN(ReorderBuffer *rb) +{ + dlist_iter iter; + Size largest_size = 0; + ReorderBufferTXN *largest = NULL; + + /* Find the largest top-level transaction having a base snapshot. 
*/ + dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn) + { + ReorderBufferTXN *txn; + + txn = dlist_container(ReorderBufferTXN, base_snapshot_node, iter.cur); + + /* must not be a subtxn */ + Assert(!rbtxn_is_known_subxact(txn)); + /* base_snapshot must be set */ + Assert(txn->base_snapshot != NULL); + + if ((largest == NULL || txn->total_size > largest_size) && + (txn->total_size > 0) && !(rbtxn_has_partial_change(txn))) + { + largest = txn; + largest_size = txn->total_size; + } + } + + return largest; +} + +/* + * Check whether the logical_decoding_work_mem limit was reached, and if yes + * pick the largest (sub)transaction at-a-time to evict and spill its changes to + * disk until we reach under the memory limit. + * + * XXX At this point we select the transactions until we reach under the memory + * limit, but we might also adapt a more elaborate eviction strategy - for example + * evicting enough transactions to free certain fraction (e.g. 50%) of the memory + * limit. + */ +static void +ReorderBufferCheckMemoryLimit(ReorderBuffer *rb) +{ + ReorderBufferTXN *txn; + + /* bail out if we haven't exceeded the memory limit */ + if (rb->size < logical_decoding_work_mem * 1024L) + return; + + /* + * Loop until we reach under the memory limit. One might think that just + * by evicting the largest (sub)transaction we will come under the memory + * limit based on assumption that the selected transaction is at least as + * large as the most recent change (which caused us to go over the memory + * limit). However, that is not true because a user can reduce the + * logical_decoding_work_mem to a smaller value before the most recent + * change. + */ + while (rb->size >= logical_decoding_work_mem * 1024L) + { + /* + * Pick the largest transaction (or subtransaction) and evict it from + * memory by streaming, if possible. Otherwise, spill to disk. 
+ */ + if (ReorderBufferCanStartStreaming(rb) && + (txn = ReorderBufferLargestTopTXN(rb)) != NULL) + { + /* we know there has to be one, because the size is not zero */ + Assert(txn && !txn->toptxn); + Assert(txn->total_size > 0); + Assert(rb->size >= txn->total_size); + + ReorderBufferStreamTXN(rb, txn); + } + else + { + /* + * Pick the largest transaction (or subtransaction) and evict it + * from memory by serializing it to disk. + */ + txn = ReorderBufferLargestTXN(rb); + + /* we know there has to be one, because the size is not zero */ + Assert(txn); + Assert(txn->size > 0); + Assert(rb->size >= txn->size); + + ReorderBufferSerializeTXN(rb, txn); + } + + /* + * After eviction, the transaction should have no entries in memory, + * and should use 0 bytes for changes. + */ + Assert(txn->size == 0); + Assert(txn->nentries_mem == 0); + } + + /* We must be under the memory limit now. */ + Assert(rb->size < logical_decoding_work_mem * 1024L); +} + +/* + * Spill data of a large transaction (and its subtransactions) to disk. 
*/
static void
ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
{
	dlist_iter	subtxn_i;
	dlist_mutable_iter change_i;
	int			fd = -1;
	XLogSegNo	curOpenSegNo = 0;
	Size		spilled = 0;
	/* remember the in-memory size before anything is released, for the stats */
	Size		size = txn->size;

	elog(DEBUG2, "spill %u changes in XID %u to disk",
		 (uint32) txn->nentries_mem, txn->xid);

	/* do the same to all child TXs */
	dlist_foreach(subtxn_i, &txn->subtxns)
	{
		ReorderBufferTXN *subtxn;

		subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
		ReorderBufferSerializeTXN(rb, subtxn);
	}

	/* serialize changestream */
	dlist_foreach_modify(change_i, &txn->changes)
	{
		ReorderBufferChange *change;

		change = dlist_container(ReorderBufferChange, node, change_i.cur);

		/*
		 * store in segment in which it belongs by start lsn, don't split over
		 * multiple segments tho
		 */
		if (fd == -1 ||
			!XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
		{
			char		path[MAXPGPATH];

			/* moving on to another segment: done with the previous file */
			if (fd != -1)
				CloseTransientFile(fd);

			XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);

			/*
			 * No need to care about TLIs here, only used during a single run,
			 * so each LSN only maps to a specific WAL record.
			 */
			ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
										curOpenSegNo);

			/* open segment, create it if necessary */
			fd = OpenTransientFile(path,
								   O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);

			if (fd < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not open file \"%s\": %m", path)));
		}

		/* write the change out, then release its in-memory representation */
		ReorderBufferSerializeChange(rb, txn, fd, change);
		dlist_delete(&change->node);
		ReorderBufferReturnChange(rb, change, true);

		spilled++;
	}

	/* update the statistics iff we have spilled anything */
	if (spilled)
	{
		rb->spillCount += 1;
		rb->spillBytes += size;

		/* don't consider already serialized transactions */
		rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;

		/* update the decoding stats */
		UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
	}

	/* every in-memory change must have been written out and unlinked */
	Assert(spilled == txn->nentries_mem);
	Assert(dlist_is_empty(&txn->changes));
	txn->nentries_mem = 0;
	txn->txn_flags |= RBTXN_IS_SERIALIZED;

	if (fd != -1)
		CloseTransientFile(fd);
}

/*
 * Serialize individual change to disk.
 *
 * The record is assembled in rb->outbuf: a ReorderBufferDiskChange header
 * holding a raw copy of the ReorderBufferChange, followed by the
 * variable-length payload for the change's action. The header's 'size' is
 * the length of the whole record, header included. The record is then
 * written to 'fd' with a single write(); a short or failed write raises an
 * ERROR after closing 'fd'.
 */
static void
ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
							 int fd, ReorderBufferChange *change)
{
	ReorderBufferDiskChange *ondisk;
	Size		sz = sizeof(ReorderBufferDiskChange);

	ReorderBufferSerializeReserve(rb, sz);

	ondisk = (ReorderBufferDiskChange *) rb->outbuf;
	memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));

	switch (change->action)
	{
			/* fall through these, they're all similar enough */
		case REORDER_BUFFER_CHANGE_INSERT:
		case REORDER_BUFFER_CHANGE_UPDATE:
		case REORDER_BUFFER_CHANGE_DELETE:
		case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
			{
				char	   *data;
				ReorderBufferTupleBuf *oldtup,
						   *newtup;
				Size		oldlen = 0;
				Size		newlen = 0;

				oldtup = change->data.tp.oldtuple;
				newtup = change->data.tp.newtuple;

				/* each tuple present costs a HeapTupleData plus its data */
				if (oldtup)
				{
					sz += sizeof(HeapTupleData);
					oldlen = oldtup->tuple.t_len;
					sz += oldlen;
				}

				if (newtup)
				{
					sz += sizeof(HeapTupleData);
					newlen = newtup->tuple.t_len;
					sz += newlen;
				}

				/* make sure we have enough space */
				ReorderBufferSerializeReserve(rb, sz);

				data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
				/* might have been reallocated above */
				ondisk = (ReorderBufferDiskChange *) rb->outbuf;

				/* payload order: old tuple (if any), then new tuple (if any) */
				if (oldlen)
				{
					memcpy(data, &oldtup->tuple, sizeof(HeapTupleData));
					data += sizeof(HeapTupleData);

					memcpy(data, oldtup->tuple.t_data, oldlen);
					data += oldlen;
				}

				if (newlen)
				{
					memcpy(data, &newtup->tuple, sizeof(HeapTupleData));
					data += sizeof(HeapTupleData);

					memcpy(data, newtup->tuple.t_data, newlen);
					data += newlen;
				}
				break;
			}
		case REORDER_BUFFER_CHANGE_MESSAGE:
			{
				char	   *data;
				/* the prefix is stored including its terminating NUL */
				Size		prefix_size = strlen(change->data.msg.prefix) + 1;

				sz += prefix_size + change->data.msg.message_size +
					sizeof(Size) + sizeof(Size);
				ReorderBufferSerializeReserve(rb, sz);

				data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);

				/* might have been reallocated above */
				ondisk = (ReorderBufferDiskChange *) rb->outbuf;

				/* write the prefix including the size */
				memcpy(data, &prefix_size, sizeof(Size));
				data += sizeof(Size);
				memcpy(data, change->data.msg.prefix,
					   prefix_size);
				data += prefix_size;

				/* write the message including the size */
				memcpy(data, &change->data.msg.message_size, sizeof(Size));
				data += sizeof(Size);
				memcpy(data, change->data.msg.message,
					   change->data.msg.message_size);
				data += change->data.msg.message_size;

				break;
			}
		case REORDER_BUFFER_CHANGE_INVALIDATION:
			{
				char	   *data;
				Size		inval_size = sizeof(SharedInvalidationMessage) *
				change->data.inval.ninvalidations;

				sz += inval_size;

				ReorderBufferSerializeReserve(rb, sz);
				data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);

				/* might have been reallocated above */
				ondisk = (ReorderBufferDiskChange *) rb->outbuf;
				/* the message count itself travels in the copied change struct */
				memcpy(data, change->data.inval.invalidations, inval_size);
				data += inval_size;

				break;
			}
		case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
			{
				Snapshot	snap;
				char	   *data;

				snap = change->data.snapshot;

				sz += sizeof(SnapshotData) +
					sizeof(TransactionId) * snap->xcnt +
					sizeof(TransactionId) * snap->subxcnt;

				/* make sure we have enough space */
				ReorderBufferSerializeReserve(rb, sz);
				data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
				/* might have been reallocated above */
				ondisk = (ReorderBufferDiskChange *) rb->outbuf;

				/* payload order: SnapshotData, then xip[], then subxip[] */
				memcpy(data, snap, sizeof(SnapshotData));
				data += sizeof(SnapshotData);

				if (snap->xcnt)
				{
					memcpy(data, snap->xip,
						   sizeof(TransactionId) * snap->xcnt);
					data += sizeof(TransactionId) * snap->xcnt;
				}

				if (snap->subxcnt)
				{
					memcpy(data, snap->subxip,
						   sizeof(TransactionId) * snap->subxcnt);
					data += sizeof(TransactionId) * snap->subxcnt;
				}
				break;
			}
		case REORDER_BUFFER_CHANGE_TRUNCATE:
			{
				Size		size;
				char	   *data;

				/* account for the OIDs of truncated relations */
				size = sizeof(Oid) * change->data.truncate.nrelids;
				sz += size;

				/* make sure we have enough space */
				ReorderBufferSerializeReserve(rb, sz);

				data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
				/* might have been reallocated above */
				ondisk = (ReorderBufferDiskChange *) rb->outbuf;

				memcpy(data, change->data.truncate.relids, size);
				data += size;

				break;
			}
		case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
		case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
		case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
		case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
			/* ReorderBufferChange contains everything important */
			break;
	}

	/* total record length, header included */
	ondisk->size = sz;

	/* clear errno so a short write that doesn't set it can be told apart */
	errno = 0;
	pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE);
	if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
	{
		/* CloseTransientFile() may clobber errno, so save it first */
		int			save_errno = errno;

		CloseTransientFile(fd);

		/* if write didn't set errno, assume problem is no disk space */
		errno = save_errno ? save_errno : ENOSPC;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to data file for XID %u: %m",
						txn->xid)));
	}
	pgstat_report_wait_end();

	/*
	 * Keep the transaction's final_lsn up to date with each change we send to
	 * disk, so that ReorderBufferRestoreCleanup works correctly.  (We used to
	 * only do this on commit and abort records, but that doesn't work if a
	 * system crash leaves a transaction without its abort record).
	 *
	 * Make sure not to move it backwards.
	 */
	if (txn->final_lsn < change->lsn)
		txn->final_lsn = change->lsn;

	Assert(ondisk->change.action == change->action);
}

/* Returns true, if the output plugin supports streaming, false, otherwise. */
static inline bool
ReorderBufferCanStream(ReorderBuffer *rb)
{
	LogicalDecodingContext *ctx = rb->private_data;

	return ctx->streaming;
}

/* Returns true, if the streaming can be started now, false, otherwise. */
static inline bool
ReorderBufferCanStartStreaming(ReorderBuffer *rb)
{
	LogicalDecodingContext *ctx = rb->private_data;
	SnapBuild  *builder = ctx->snapshot_builder;

	/* We can't start streaming unless a consistent state is reached. */
	if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
		return false;

	/*
	 * We can't start streaming immediately even if the streaming is enabled
	 * because we previously decoded this transaction and now just are
	 * restarting.
	 */
	if (ReorderBufferCanStream(rb) &&
		!SnapBuildXactNeedsSkip(builder, ctx->reader->EndRecPtr))
		return true;

	return false;
}

/*
 * Send data of a large transaction (and its subtransactions) to the
 * output plugin, but using the stream API.
 *
 * 'txn' must be a top-level transaction. Returns without streaming anything
 * if the transaction has no base snapshot yet.
 */
static void
ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
{
	Snapshot	snapshot_now;
	CommandId	command_id;
	Size		stream_bytes;
	bool		txn_is_streamed;

	/* We can never reach here for a subtransaction. */
	Assert(txn->toptxn == NULL);

	/*
	 * We can't make any assumptions about base snapshot here, similar to what
	 * ReorderBufferCommit() does. That relies on base_snapshot getting
	 * transferred from subxact in ReorderBufferCommitChild(), but that was
	 * not yet called as the transaction is in-progress.
	 *
	 * So just walk the subxacts and use the same logic here. But we only need
	 * to do that once, when the transaction is streamed for the first time.
	 * After that we need to reuse the snapshot from the previous run.
	 *
	 * Unlike DecodeCommit which adds xids of all the subtransactions in
	 * snapshot's xip array via SnapBuildCommittedTxn, we can't do that here
	 * but we do add them to subxip array instead via ReorderBufferCopySnap.
	 * This allows the catalog changes made in subtransactions decoded till
	 * now to be visible.
	 */
	if (txn->snapshot_now == NULL)
	{
		dlist_iter	subxact_i;

		/* make sure this transaction is streamed for the first time */
		Assert(!rbtxn_is_streamed(txn));

		/* at the beginning we should have invalid command ID */
		Assert(txn->command_id == InvalidCommandId);

		dlist_foreach(subxact_i, &txn->subtxns)
		{
			ReorderBufferTXN *subtxn;

			subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
			ReorderBufferTransferSnapToParent(txn, subtxn);
		}

		/*
		 * If this transaction has no snapshot, it didn't make any changes to
		 * the database till now, so there's nothing to decode.
		 */
		if (txn->base_snapshot == NULL)
		{
			Assert(txn->ninvalidations == 0);
			return;
		}

		command_id = FirstCommandId;
		snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
											 txn, command_id);
	}
	else
	{
		/* the transaction must have been already streamed */
		Assert(rbtxn_is_streamed(txn));

		/*
		 * Nah, we already have snapshot from the previous streaming run. We
		 * assume new subxacts can't move the LSN backwards, and so can't beat
		 * the LSN condition in the previous branch (so no need to walk
		 * through subxacts again). In fact, we must not do that as we may be
		 * using snapshot half-way through the subxact.
		 */
		command_id = txn->command_id;

		/*
		 * We can't use txn->snapshot_now directly because after the last
		 * streaming run, we might have got some new sub-transactions. So we
		 * need to add them to the snapshot.
		 */
		snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
											 txn, command_id);

		/* Free the previously copied snapshot. */
		Assert(txn->snapshot_now->copied);
		ReorderBufferFreeSnap(rb, txn->snapshot_now);
		txn->snapshot_now = NULL;
	}

	/*
	 * Remember this information to be used later to update stats. We can't
	 * update the stats here as an error while processing the changes would
	 * lead to the accumulation of stats even though we haven't streamed all
	 * the changes.
	 */
	txn_is_streamed = rbtxn_is_streamed(txn);
	stream_bytes = txn->total_size;

	/* Process and send the changes to output plugin. */
	ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
							command_id, true);

	rb->streamCount += 1;
	rb->streamBytes += stream_bytes;

	/* Don't consider already streamed transaction. */
	rb->streamTxns += (txn_is_streamed) ? 0 : 1;

	/* update the decoding stats */
	UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);

	/* after streaming, nothing of the transaction may remain queued */
	Assert(dlist_is_empty(&txn->changes));
	Assert(txn->nentries == 0);
	Assert(txn->nentries_mem == 0);
}

/*
 * Size of a change in memory.
+ */ +static Size +ReorderBufferChangeSize(ReorderBufferChange *change) +{ + Size sz = sizeof(ReorderBufferChange); + + switch (change->action) + { + /* fall through these, they're all similar enough */ + case REORDER_BUFFER_CHANGE_INSERT: + case REORDER_BUFFER_CHANGE_UPDATE: + case REORDER_BUFFER_CHANGE_DELETE: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT: + { + ReorderBufferTupleBuf *oldtup, + *newtup; + Size oldlen = 0; + Size newlen = 0; + + oldtup = change->data.tp.oldtuple; + newtup = change->data.tp.newtuple; + + if (oldtup) + { + sz += sizeof(HeapTupleData); + oldlen = oldtup->tuple.t_len; + sz += oldlen; + } + + if (newtup) + { + sz += sizeof(HeapTupleData); + newlen = newtup->tuple.t_len; + sz += newlen; + } + + break; + } + case REORDER_BUFFER_CHANGE_MESSAGE: + { + Size prefix_size = strlen(change->data.msg.prefix) + 1; + + sz += prefix_size + change->data.msg.message_size + + sizeof(Size) + sizeof(Size); + + break; + } + case REORDER_BUFFER_CHANGE_INVALIDATION: + { + sz += sizeof(SharedInvalidationMessage) * + change->data.inval.ninvalidations; + break; + } + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + { + Snapshot snap; + + snap = change->data.snapshot; + + sz += sizeof(SnapshotData) + + sizeof(TransactionId) * snap->xcnt + + sizeof(TransactionId) * snap->subxcnt; + + break; + } + case REORDER_BUFFER_CHANGE_TRUNCATE: + { + sz += sizeof(Oid) * change->data.truncate.nrelids; + + break; + } + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT: + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + /* ReorderBufferChange contains everything important */ + break; + } + + return sz; +} + + +/* + * Restore a number of changes spilled to disk back into memory. 
+ */ +static Size +ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, + TXNEntryFile *file, XLogSegNo *segno) +{ + Size restored = 0; + XLogSegNo last_segno; + dlist_mutable_iter cleanup_iter; + File *fd = &file->vfd; + + Assert(txn->first_lsn != InvalidXLogRecPtr); + Assert(txn->final_lsn != InvalidXLogRecPtr); + + /* free current entries, so we have memory for more */ + dlist_foreach_modify(cleanup_iter, &txn->changes) + { + ReorderBufferChange *cleanup = + dlist_container(ReorderBufferChange, node, cleanup_iter.cur); + + dlist_delete(&cleanup->node); + ReorderBufferReturnChange(rb, cleanup, true); + } + txn->nentries_mem = 0; + Assert(dlist_is_empty(&txn->changes)); + + XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size); + + while (restored < max_changes_in_memory && *segno <= last_segno) + { + int readBytes; + ReorderBufferDiskChange *ondisk; + + if (*fd == -1) + { + char path[MAXPGPATH]; + + /* first time in */ + if (*segno == 0) + XLByteToSeg(txn->first_lsn, *segno, wal_segment_size); + + Assert(*segno != 0 || dlist_is_empty(&txn->changes)); + + /* + * No need to care about TLIs here, only used during a single run, + * so each LSN only maps to a specific WAL record. + */ + ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, + *segno); + + *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY); + + /* No harm in resetting the offset even in case of failure */ + file->curOffset = 0; + + if (*fd < 0 && errno == ENOENT) + { + *fd = -1; + (*segno)++; + continue; + } + else if (*fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); + } + + /* + * Read the statically sized part of a change which has information + * about the total size. If we couldn't read a record, we're at the + * end of this file. 
+ */ + ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange)); + readBytes = FileRead(file->vfd, rb->outbuf, + sizeof(ReorderBufferDiskChange), + file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ); + + /* eof */ + if (readBytes == 0) + { + FileClose(*fd); + *fd = -1; + (*segno)++; + continue; + } + else if (readBytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from reorderbuffer spill file: %m"))); + else if (readBytes != sizeof(ReorderBufferDiskChange)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes", + readBytes, + (uint32) sizeof(ReorderBufferDiskChange)))); + + file->curOffset += readBytes; + + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + ReorderBufferSerializeReserve(rb, + sizeof(ReorderBufferDiskChange) + ondisk->size); + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + readBytes = FileRead(file->vfd, + rb->outbuf + sizeof(ReorderBufferDiskChange), + ondisk->size - sizeof(ReorderBufferDiskChange), + file->curOffset, + WAIT_EVENT_REORDER_BUFFER_READ); + + if (readBytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from reorderbuffer spill file: %m"))); + else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes", + readBytes, + (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange))))); + + file->curOffset += readBytes; + + /* + * ok, read a full change from disk, now restore it into proper + * in-memory format + */ + ReorderBufferRestoreChange(rb, txn, rb->outbuf); + restored++; + } + + return restored; +} + +/* + * Convert change from its on-disk format to in-memory format and queue it onto + * the TXN's ->changes list. 
*
 * Note: although "data" is declared char*, at entry it points to a
 * maxalign'd buffer, making it safe in most of this function to assume
 * that the pointed-to data is suitably aligned for direct access.
 *
 * The restored change is appended to txn->changes and added to the memory
 * accounting.
 */
static void
ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
						   char *data)
{
	ReorderBufferDiskChange *ondisk;
	ReorderBufferChange *change;

	ondisk = (ReorderBufferDiskChange *) data;

	change = ReorderBufferGetChange(rb);

	/* copy static part */
	memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));

	/* the variable-length payload starts right behind the fixed header */
	data += sizeof(ReorderBufferDiskChange);

	/* restore individual stuff */
	switch (change->action)
	{
			/* fall through these, they're all similar enough */
		case REORDER_BUFFER_CHANGE_INSERT:
		case REORDER_BUFFER_CHANGE_UPDATE:
		case REORDER_BUFFER_CHANGE_DELETE:
		case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:

			/*
			 * The tuple pointers just copied from disk are used only as
			 * "was present" flags here; each one is replaced with a freshly
			 * allocated tuple buffer before being dereferenced.
			 */
			if (change->data.tp.oldtuple)
			{
				uint32		tuplelen = ((HeapTuple) data)->t_len;

				change->data.tp.oldtuple =
					ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);

				/* restore ->tuple */
				memcpy(&change->data.tp.oldtuple->tuple, data,
					   sizeof(HeapTupleData));
				data += sizeof(HeapTupleData);

				/* reset t_data pointer into the new tuplebuf */
				change->data.tp.oldtuple->tuple.t_data =
					ReorderBufferTupleBufData(change->data.tp.oldtuple);

				/* restore tuple data itself */
				memcpy(change->data.tp.oldtuple->tuple.t_data, data, tuplelen);
				data += tuplelen;
			}

			if (change->data.tp.newtuple)
			{
				/* here, data might not be suitably aligned! */
				uint32		tuplelen;

				/* hence fetch t_len with memcpy instead of a direct access */
				memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
					   sizeof(uint32));

				change->data.tp.newtuple =
					ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);

				/* restore ->tuple */
				memcpy(&change->data.tp.newtuple->tuple, data,
					   sizeof(HeapTupleData));
				data += sizeof(HeapTupleData);

				/* reset t_data pointer into the new tuplebuf */
				change->data.tp.newtuple->tuple.t_data =
					ReorderBufferTupleBufData(change->data.tp.newtuple);

				/* restore tuple data itself */
				memcpy(change->data.tp.newtuple->tuple.t_data, data, tuplelen);
				data += tuplelen;
			}

			break;
		case REORDER_BUFFER_CHANGE_MESSAGE:
			{
				Size		prefix_size;

				/* read prefix */
				memcpy(&prefix_size, data, sizeof(Size));
				data += sizeof(Size);
				change->data.msg.prefix = MemoryContextAlloc(rb->context,
															 prefix_size);
				memcpy(change->data.msg.prefix, data, prefix_size);
				Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
				data += prefix_size;

				/* read the message */
				memcpy(&change->data.msg.message_size, data, sizeof(Size));
				data += sizeof(Size);
				change->data.msg.message = MemoryContextAlloc(rb->context,
															  change->data.msg.message_size);
				memcpy(change->data.msg.message, data,
					   change->data.msg.message_size);
				data += change->data.msg.message_size;

				break;
			}
		case REORDER_BUFFER_CHANGE_INVALIDATION:
			{
				/* the message count was restored with the static part */
				Size		inval_size = sizeof(SharedInvalidationMessage) *
				change->data.inval.ninvalidations;

				change->data.inval.invalidations =
					MemoryContextAlloc(rb->context, inval_size);

				/* read the message */
				memcpy(change->data.inval.invalidations, data, inval_size);

				break;
			}
		case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
			{
				Snapshot	oldsnap;
				Snapshot	newsnap;
				Size		size;

				oldsnap = (Snapshot) data;

				size = sizeof(SnapshotData) +
					sizeof(TransactionId) * oldsnap->xcnt +
					sizeof(TransactionId) * (oldsnap->subxcnt + 0);

				change->data.snapshot = MemoryContextAllocZero(rb->context, size);

				newsnap = change->data.snapshot;

				/*
				 * Copy the struct and both xid arrays in one go, then point
				 * xip/subxip at the arrays inside the new allocation.
				 */
				memcpy(newsnap, data, size);
				newsnap->xip = (TransactionId *)
					(((char *) newsnap) + sizeof(SnapshotData));
				newsnap->subxip = newsnap->xip + newsnap->xcnt;
				newsnap->copied = true;
				break;
			}
		case REORDER_BUFFER_CHANGE_TRUNCATE:
			{
				Oid		   *relids;

				/* the relation count was restored with the static part */
				relids = ReorderBufferGetRelids(rb,
												change->data.truncate.nrelids);
				memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
				change->data.truncate.relids = relids;

				break;
			}
			/* the base struct contains all the data, easy peasy */
		case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
		case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
		case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
		case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
			break;
	}

	dlist_push_tail(&txn->changes, &change->node);
	txn->nentries_mem++;

	/*
	 * Update memory accounting for the restored change.  We need to do this
	 * although we don't check the memory limit when restoring the changes in
	 * this branch (we only do that when initially queueing the changes after
	 * decoding), because we will release the changes later, and that will
	 * update the accounting too (subtracting the size from the counters). And
	 * we don't want to underflow there.
	 */
	ReorderBufferChangeMemoryUpdate(rb, change, true,
									ReorderBufferChangeSize(change));
}

/*
 * Remove all on-disk stored for the passed in transaction.
+ */ +static void +ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + XLogSegNo first; + XLogSegNo cur; + XLogSegNo last; + + Assert(txn->first_lsn != InvalidXLogRecPtr); + Assert(txn->final_lsn != InvalidXLogRecPtr); + + XLByteToSeg(txn->first_lsn, first, wal_segment_size); + XLByteToSeg(txn->final_lsn, last, wal_segment_size); + + /* iterate over all possible filenames, and delete them */ + for (cur = first; cur <= last; cur++) + { + char path[MAXPGPATH]; + + ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, cur); + if (unlink(path) != 0 && errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", path))); + } +} + +/* + * Remove any leftover serialized reorder buffers from a slot directory after a + * prior crash or decoding session exit. + */ +static void +ReorderBufferCleanupSerializedTXNs(const char *slotname) +{ + DIR *spill_dir; + struct dirent *spill_de; + struct stat statbuf; + char path[MAXPGPATH * 2 + 12]; + + sprintf(path, "pg_replslot/%s", slotname); + + /* we're only handling directories here, skip if it's not ours */ + if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode)) + return; + + spill_dir = AllocateDir(path); + while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL) + { + /* only look at names that can be ours */ + if (strncmp(spill_de->d_name, "xid", 3) == 0) + { + snprintf(path, sizeof(path), + "pg_replslot/%s/%s", slotname, + spill_de->d_name); + + if (unlink(path) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\" during removal of pg_replslot/%s/xid*: %m", + path, slotname))); + } + } + FreeDir(spill_dir); +} + +/* + * Given a replication slot, transaction ID and segment number, fill in the + * corresponding spill file into 'path', which is a caller-owned buffer of size + * at least MAXPGPATH. 
+ */ +static void +ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid, + XLogSegNo segno) +{ + XLogRecPtr recptr; + + XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr); + + snprintf(path, MAXPGPATH, "pg_replslot/%s/xid-%u-lsn-%X-%X.spill", + NameStr(MyReplicationSlot->data.name), + xid, LSN_FORMAT_ARGS(recptr)); +} + +/* + * Delete all data spilled to disk after we've restarted/crashed. It will be + * recreated when the respective slots are reused. + */ +void +StartupReorderBuffer(void) +{ + DIR *logical_dir; + struct dirent *logical_de; + + logical_dir = AllocateDir("pg_replslot"); + while ((logical_de = ReadDir(logical_dir, "pg_replslot")) != NULL) + { + if (strcmp(logical_de->d_name, ".") == 0 || + strcmp(logical_de->d_name, "..") == 0) + continue; + + /* if it cannot be a slot, skip the directory */ + if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2)) + continue; + + /* + * ok, has to be a surviving logical slot, iterate and delete + * everything starting with xid-* + */ + ReorderBufferCleanupSerializedTXNs(logical_de->d_name); + } + FreeDir(logical_dir); +} + +/* --------------------------------------- + * toast reassembly support + * --------------------------------------- + */ + +/* + * Initialize per tuple toast reconstruction support. + */ +static void +ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + HASHCTL hash_ctl; + + Assert(txn->toast_hash == NULL); + + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(ReorderBufferToastEnt); + hash_ctl.hcxt = rb->context; + txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); +} + +/* + * Per toast-chunk handling for toast reconstruction + * + * Appends a toast chunk so we can reconstruct it when the tuple "owning" the + * toasted Datum comes along. 
+ */ +static void +ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + ReorderBufferToastEnt *ent; + ReorderBufferTupleBuf *newtup; + bool found; + int32 chunksize; + bool isnull; + Pointer chunk; + TupleDesc desc = RelationGetDescr(relation); + Oid chunk_id; + int32 chunk_seq; + + if (txn->toast_hash == NULL) + ReorderBufferToastInitHash(rb, txn); + + Assert(IsToastRelation(relation)); + + newtup = change->data.tp.newtuple; + chunk_id = DatumGetObjectId(fastgetattr(&newtup->tuple, 1, desc, &isnull)); + Assert(!isnull); + chunk_seq = DatumGetInt32(fastgetattr(&newtup->tuple, 2, desc, &isnull)); + Assert(!isnull); + + ent = (ReorderBufferToastEnt *) + hash_search(txn->toast_hash, + (void *) &chunk_id, + HASH_ENTER, + &found); + + if (!found) + { + Assert(ent->chunk_id == chunk_id); + ent->num_chunks = 0; + ent->last_chunk_seq = 0; + ent->size = 0; + ent->reconstructed = NULL; + dlist_init(&ent->chunks); + + if (chunk_seq != 0) + elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0", + chunk_seq, chunk_id); + } + else if (found && chunk_seq != ent->last_chunk_seq + 1) + elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d", + chunk_seq, chunk_id, ent->last_chunk_seq + 1); + + chunk = DatumGetPointer(fastgetattr(&newtup->tuple, 3, desc, &isnull)); + Assert(!isnull); + + /* calculate size so we can allocate the right size at once later */ + if (!VARATT_IS_EXTENDED(chunk)) + chunksize = VARSIZE(chunk) - VARHDRSZ; + else if (VARATT_IS_SHORT(chunk)) + /* could happen due to heap_form_tuple doing its thing */ + chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT; + else + elog(ERROR, "unexpected type of toast chunk"); + + ent->size += chunksize; + ent->last_chunk_seq = chunk_seq; + ent->num_chunks++; + dlist_push_tail(&ent->chunks, &change->node); +} + +/* + * Rejigger change->newtuple to point to in-memory toast tuples instead to + * on-disk toast tuples that may 
not longer exist (think DROP TABLE or VACUUM). + * + * We cannot replace unchanged toast tuples though, so those will still point + * to on-disk toast data. + * + * While updating the existing change with detoasted tuple data, we need to + * update the memory accounting info, because the change size will differ. + * Otherwise the accounting may get out of sync, triggering serialization + * at unexpected times. + * + * We simply subtract size of the change before rejiggering the tuple, and + * then adding the new size. This makes it look like the change was removed + * and then added back, except it only tweaks the accounting info. + * + * In particular it can't trigger serialization, which would be pointless + * anyway as it happens during commit processing right before handing + * the change to the output plugin. + */ +static void +ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + TupleDesc desc; + int natt; + Datum *attrs; + bool *isnull; + bool *free; + HeapTuple tmphtup; + Relation toast_rel; + TupleDesc toast_desc; + MemoryContext oldcontext; + ReorderBufferTupleBuf *newtup; + Size old_size; + + /* no toast tuples changed */ + if (txn->toast_hash == NULL) + return; + + /* + * We're going to modify the size of the change. So, to make sure the + * accounting is correct we record the current change size and then after + * re-computing the change we'll subtract the recorded size and then + * re-add the new change size at the end. We don't immediately subtract + * the old size because if there is any error before we add the new size, + * we will release the changes and that will update the accounting info + * (subtracting the size from the counters). And we don't want to + * underflow there. 
+ */ + old_size = ReorderBufferChangeSize(change); + + oldcontext = MemoryContextSwitchTo(rb->context); + + /* we should only have toast tuples in an INSERT or UPDATE */ + Assert(change->data.tp.newtuple); + + desc = RelationGetDescr(relation); + + toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid); + if (!RelationIsValid(toast_rel)) + elog(ERROR, "could not open toast relation with OID %u (base relation \"%s\")", + relation->rd_rel->reltoastrelid, RelationGetRelationName(relation)); + + toast_desc = RelationGetDescr(toast_rel); + + /* should we allocate from stack instead? */ + attrs = palloc0(sizeof(Datum) * desc->natts); + isnull = palloc0(sizeof(bool) * desc->natts); + free = palloc0(sizeof(bool) * desc->natts); + + newtup = change->data.tp.newtuple; + + heap_deform_tuple(&newtup->tuple, desc, attrs, isnull); + + for (natt = 0; natt < desc->natts; natt++) + { + Form_pg_attribute attr = TupleDescAttr(desc, natt); + ReorderBufferToastEnt *ent; + struct varlena *varlena; + + /* va_rawsize is the size of the original datum -- including header */ + struct varatt_external toast_pointer; + struct varatt_indirect redirect_pointer; + struct varlena *new_datum = NULL; + struct varlena *reconstructed; + dlist_iter it; + Size data_done = 0; + + /* system columns aren't toasted */ + if (attr->attnum < 0) + continue; + + if (attr->attisdropped) + continue; + + /* not a varlena datatype */ + if (attr->attlen != -1) + continue; + + /* no data */ + if (isnull[natt]) + continue; + + /* ok, we know we have a toast datum */ + varlena = (struct varlena *) DatumGetPointer(attrs[natt]); + + /* no need to do anything if the tuple isn't external */ + if (!VARATT_IS_EXTERNAL(varlena)) + continue; + + VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena); + + /* + * Check whether the toast tuple changed, replace if so. 
+ */ + ent = (ReorderBufferToastEnt *) + hash_search(txn->toast_hash, + (void *) &toast_pointer.va_valueid, + HASH_FIND, + NULL); + if (ent == NULL) + continue; + + new_datum = + (struct varlena *) palloc0(INDIRECT_POINTER_SIZE); + + free[natt] = true; + + reconstructed = palloc0(toast_pointer.va_rawsize); + + ent->reconstructed = reconstructed; + + /* stitch toast tuple back together from its parts */ + dlist_foreach(it, &ent->chunks) + { + bool isnull; + ReorderBufferChange *cchange; + ReorderBufferTupleBuf *ctup; + Pointer chunk; + + cchange = dlist_container(ReorderBufferChange, node, it.cur); + ctup = cchange->data.tp.newtuple; + chunk = DatumGetPointer(fastgetattr(&ctup->tuple, 3, toast_desc, &isnull)); + + Assert(!isnull); + Assert(!VARATT_IS_EXTERNAL(chunk)); + Assert(!VARATT_IS_SHORT(chunk)); + + memcpy(VARDATA(reconstructed) + data_done, + VARDATA(chunk), + VARSIZE(chunk) - VARHDRSZ); + data_done += VARSIZE(chunk) - VARHDRSZ; + } + Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer)); + + /* make sure its marked as compressed or not */ + if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)) + SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ); + else + SET_VARSIZE(reconstructed, data_done + VARHDRSZ); + + memset(&redirect_pointer, 0, sizeof(redirect_pointer)); + redirect_pointer.pointer = reconstructed; + + SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT); + memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer, + sizeof(redirect_pointer)); + + attrs[natt] = PointerGetDatum(new_datum); + } + + /* + * Build tuple in separate memory & copy tuple back into the tuplebuf + * passed to the output plugin. We can't directly heap_fill_tuple() into + * the tuplebuf because attrs[] will point back into the current content. 
+ */ + tmphtup = heap_form_tuple(desc, attrs, isnull); + Assert(newtup->tuple.t_len <= MaxHeapTupleSize); + Assert(ReorderBufferTupleBufData(newtup) == newtup->tuple.t_data); + + memcpy(newtup->tuple.t_data, tmphtup->t_data, tmphtup->t_len); + newtup->tuple.t_len = tmphtup->t_len; + + /* + * free resources we won't further need, more persistent stuff will be + * free'd in ReorderBufferToastReset(). + */ + RelationClose(toast_rel); + pfree(tmphtup); + for (natt = 0; natt < desc->natts; natt++) + { + if (free[natt]) + pfree(DatumGetPointer(attrs[natt])); + } + pfree(attrs); + pfree(free); + pfree(isnull); + + MemoryContextSwitchTo(oldcontext); + + /* subtract the old change size */ + ReorderBufferChangeMemoryUpdate(rb, change, false, old_size); + /* now add the change back, with the correct size */ + ReorderBufferChangeMemoryUpdate(rb, change, true, + ReorderBufferChangeSize(change)); +} + +/* + * Free all resources allocated for toast reconstruction. + */ +static void +ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + HASH_SEQ_STATUS hstat; + ReorderBufferToastEnt *ent; + + if (txn->toast_hash == NULL) + return; + + /* sequentially walk over the hash and free everything */ + hash_seq_init(&hstat, txn->toast_hash); + while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL) + { + dlist_mutable_iter it; + + if (ent->reconstructed != NULL) + pfree(ent->reconstructed); + + dlist_foreach_modify(it, &ent->chunks) + { + ReorderBufferChange *change = + dlist_container(ReorderBufferChange, node, it.cur); + + dlist_delete(&change->node); + ReorderBufferReturnChange(rb, change, true); + } + } + + hash_destroy(txn->toast_hash); + txn->toast_hash = NULL; +} + + +/* --------------------------------------- + * Visibility support for logical decoding + * + * + * Lookup actual cmin/cmax values when using decoding snapshot. 
We can't + * always rely on stored cmin/cmax values because of two scenarios: + * + * * A tuple got changed multiple times during a single transaction and thus + * has got a combo CID. Combo CIDs are only valid for the duration of a + * single transaction. + * * A tuple with a cmin but no cmax (and thus no combo CID) got + * deleted/updated in another transaction than the one which created it + * which we are looking at right now. As only one of cmin, cmax or combo CID + * is actually stored in the heap we don't have access to the value we + * need anymore. + * + * To resolve those problems we have a per-transaction hash of (cmin, + * cmax) tuples keyed by (relfilenode, ctid) which contains the actual + * (cmin, cmax) values. That also takes care of combo CIDs by simply + * not caring about them at all. As we have the real cmin/cmax values + * combo CIDs aren't interesting. + * + * As we only care about catalog tuples here the overhead of this + * hashtable should be acceptable. + * + * Heap rewrites complicate this a bit, check rewriteheap.c for + * details. + * ------------------------------------------------------------------------- + */ + +/* struct for sorting mapping files by LSN efficiently */ +typedef struct RewriteMappingFile +{ + XLogRecPtr lsn; + char fname[MAXPGPATH]; +} RewriteMappingFile; + +#ifdef NOT_USED +static void +DisplayMapping(HTAB *tuplecid_data) +{ + HASH_SEQ_STATUS hstat; + ReorderBufferTupleCidEnt *ent; + + hash_seq_init(&hstat, tuplecid_data); + while ((ent = (ReorderBufferTupleCidEnt *) hash_seq_search(&hstat)) != NULL) + { + elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u", + ent->key.relnode.dbNode, + ent->key.relnode.spcNode, + ent->key.relnode.relNode, + ItemPointerGetBlockNumber(&ent->key.tid), + ItemPointerGetOffsetNumber(&ent->key.tid), + ent->cmin, + ent->cmax + ); + } +} +#endif + +/* + * Apply a single mapping file to tuplecid_data. 
+ * + * The mapping file has to have been verified to be a) committed b) for our + * transaction c) applied in LSN order. + */ +static void +ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname) +{ + char path[MAXPGPATH]; + int fd; + int readBytes; + LogicalRewriteMappingData map; + + sprintf(path, "pg_logical/mappings/%s", fname); + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + + while (true) + { + ReorderBufferTupleCidKey key; + ReorderBufferTupleCidEnt *ent; + ReorderBufferTupleCidEnt *new_ent; + bool found; + + /* be careful about padding */ + memset(&key, 0, sizeof(ReorderBufferTupleCidKey)); + + /* read all mappings till the end of the file */ + pgstat_report_wait_start(WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ); + readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData)); + pgstat_report_wait_end(); + + if (readBytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + path))); + else if (readBytes == 0) /* EOF */ + break; + else if (readBytes != sizeof(LogicalRewriteMappingData)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from file \"%s\": read %d instead of %d bytes", + path, readBytes, + (int32) sizeof(LogicalRewriteMappingData)))); + + key.relnode = map.old_node; + ItemPointerCopy(&map.old_tid, + &key.tid); + + + ent = (ReorderBufferTupleCidEnt *) + hash_search(tuplecid_data, + (void *) &key, + HASH_FIND, + NULL); + + /* no existing mapping, no need to update */ + if (!ent) + continue; + + key.relnode = map.new_node; + ItemPointerCopy(&map.new_tid, + &key.tid); + + new_ent = (ReorderBufferTupleCidEnt *) + hash_search(tuplecid_data, + (void *) &key, + HASH_ENTER, + &found); + + if (found) + { + /* + * Make sure the existing mapping makes sense. We sometime update + * old records that did not yet have a cmax (e.g. 
pg_class' own + * entry while rewriting it) during rewrites, so allow that. + */ + Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin); + Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax); + } + else + { + /* update mapping */ + new_ent->cmin = ent->cmin; + new_ent->cmax = ent->cmax; + new_ent->combocid = ent->combocid; + } + } + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); +} + + +/* + * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'. + */ +static bool +TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num) +{ + return bsearch(&xid, xip, num, + sizeof(TransactionId), xidComparator) != NULL; +} + +/* + * list_sort() comparator for sorting RewriteMappingFiles in LSN order. + */ +static int +file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p) +{ + RewriteMappingFile *a = (RewriteMappingFile *) lfirst(a_p); + RewriteMappingFile *b = (RewriteMappingFile *) lfirst(b_p); + + if (a->lsn < b->lsn) + return -1; + else if (a->lsn > b->lsn) + return 1; + return 0; +} + +/* + * Apply any existing logical remapping files if there are any targeted at our + * transaction for relid. + */ +static void +UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot) +{ + DIR *mapping_dir; + struct dirent *mapping_de; + List *files = NIL; + ListCell *file; + Oid dboid = IsSharedRelation(relid) ? 
InvalidOid : MyDatabaseId; + + mapping_dir = AllocateDir("pg_logical/mappings"); + while ((mapping_de = ReadDir(mapping_dir, "pg_logical/mappings")) != NULL) + { + Oid f_dboid; + Oid f_relid; + TransactionId f_mapped_xid; + TransactionId f_create_xid; + XLogRecPtr f_lsn; + uint32 f_hi, + f_lo; + RewriteMappingFile *f; + + if (strcmp(mapping_de->d_name, ".") == 0 || + strcmp(mapping_de->d_name, "..") == 0) + continue; + + /* Ignore files that aren't ours */ + if (strncmp(mapping_de->d_name, "map-", 4) != 0) + continue; + + if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT, + &f_dboid, &f_relid, &f_hi, &f_lo, + &f_mapped_xid, &f_create_xid) != 6) + elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name); + + f_lsn = ((uint64) f_hi) << 32 | f_lo; + + /* mapping for another database */ + if (f_dboid != dboid) + continue; + + /* mapping for another relation */ + if (f_relid != relid) + continue; + + /* did the creating transaction abort? */ + if (!TransactionIdDidCommit(f_create_xid)) + continue; + + /* not for our transaction */ + if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt)) + continue; + + /* ok, relevant, queue for apply */ + f = palloc(sizeof(RewriteMappingFile)); + f->lsn = f_lsn; + strcpy(f->fname, mapping_de->d_name); + files = lappend(files, f); + } + FreeDir(mapping_dir); + + /* sort files so we apply them in LSN order */ + list_sort(files, file_sort_by_lsn); + + foreach(file, files) + { + RewriteMappingFile *f = (RewriteMappingFile *) lfirst(file); + + elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname, + snapshot->subxip[0]); + ApplyLogicalMappingFile(tuplecid_data, relid, f->fname); + pfree(f); + } +} + +/* + * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on + * combo CIDs. 
+ */ +bool +ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data, + Snapshot snapshot, + HeapTuple htup, Buffer buffer, + CommandId *cmin, CommandId *cmax) +{ + ReorderBufferTupleCidKey key; + ReorderBufferTupleCidEnt *ent; + ForkNumber forkno; + BlockNumber blockno; + bool updated_mapping = false; + + /* + * Return unresolved if tuplecid_data is not valid. That's because when + * streaming in-progress transactions we may run into tuples with the CID + * before actually decoding them. Think e.g. about INSERT followed by + * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the + * INSERT. So in such cases, we assume the CID is from the future + * command. + */ + if (tuplecid_data == NULL) + return false; + + /* be careful about padding */ + memset(&key, 0, sizeof(key)); + + Assert(!BufferIsLocal(buffer)); + + /* + * get relfilenode from the buffer, no convenient way to access it other + * than that. + */ + BufferGetTag(buffer, &key.relnode, &forkno, &blockno); + + /* tuples can only be in the main fork */ + Assert(forkno == MAIN_FORKNUM); + Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self)); + + ItemPointerCopy(&htup->t_self, + &key.tid); + +restart: + ent = (ReorderBufferTupleCidEnt *) + hash_search(tuplecid_data, + (void *) &key, + HASH_FIND, + NULL); + + /* + * failed to find a mapping, check whether the table was rewritten and + * apply mapping if so, but only do that once - there can be no new + * mappings while we are in here since we have to hold a lock on the + * relation. 
+ */ + if (ent == NULL && !updated_mapping) + { + UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot); + /* now check but don't update for a mapping again */ + updated_mapping = true; + goto restart; + } + else if (ent == NULL) + return false; + + if (cmin) + *cmin = ent->cmin; + if (cmax) + *cmax = ent->cmax; + return true; +} diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c new file mode 100644 index 0000000..6df6024 --- /dev/null +++ b/src/backend/replication/logical/snapbuild.c @@ -0,0 +1,1995 @@ +/*------------------------------------------------------------------------- + * + * snapbuild.c + * + * Infrastructure for building historic catalog snapshots based on contents + * of the WAL, for the purpose of decoding heapam.c style values in the + * WAL. + * + * NOTES: + * + * We build snapshots which can *only* be used to read catalog contents and we + * do so by reading and interpreting the WAL stream. The aim is to build a + * snapshot that behaves the same as a freshly taken MVCC snapshot would have + * at the time the XLogRecord was generated. + * + * To build the snapshots we reuse the infrastructure built for Hot + * Standby. The in-memory snapshots we build look different than HS' because + * we have different needs. To successfully decode data from the WAL we only + * need to access catalog tables and (sys|rel|cat)cache, not the actual user + * tables since the data we decode is wholly contained in the WAL + * records. Also, our snapshots need to be different in comparison to normal + * MVCC ones because in contrast to those we cannot fully rely on the clog and + * pg_subtrans for information about committed transactions because they might + * commit in the future from the POV of the WAL entry we're currently + * decoding. This definition has the advantage that we only need to prevent + * removal of catalog rows, while normal table's rows can still be + * removed. 
 * This is achieved by using the replication slot mechanism.
 *
 * As the percentage of transactions modifying the catalog normally is fairly
 * small in comparison to ones only manipulating user data, we keep track of
 * the committed catalog modifying ones inside [xmin, xmax) instead of keeping
 * track of all running transactions like it's done in a normal snapshot. Note
 * that we're generally only looking at transactions that have acquired an
 * xid. That is we keep a list of transactions between snapshot->(xmin, xmax)
 * that we consider committed, everything else is considered aborted/in
 * progress. That also allows us not to care about subtransactions before they
 * have committed which means this module, in contrast to HS, doesn't have to
 * care about suboverflowed subtransactions and similar.
 *
 * One complexity of doing this is that to e.g. handle mixed DDL/DML
 * transactions we need Snapshots that see intermediate versions of the
 * catalog in a transaction. During normal operation this is achieved by using
 * CommandIds/cmin/cmax. The problem with that however is that for space
 * efficiency reasons only one value of that is stored
 * (cf. combocid.c). Since combo CIDs are only available in memory we log
 * additional information which allows us to get the original (cmin, cmax)
 * pair during visibility checks. Check reorderbuffer.c's comment above
 * ResolveCminCmaxDuringDecoding() for details.
 *
 * To facilitate all this we need our own visibility routine, as the normal
 * ones are optimized for different usecases.
 *
 * To replace the normal catalog snapshots with decoding ones use the
 * SetupHistoricSnapshot() and TeardownHistoricSnapshot() functions.
 *
 *
 *
 * The snapbuild machinery is starting up in several stages, as illustrated
 * by the following graph describing the SnapBuild->state transitions:
 *
 *         +-------------------------+
 *    +----|          START          |-------------+
 *    |    +-------------------------+             |
 *    |                 |                          |
 *    |                 |                          |
 *    |         running_xacts #1                   |
 *    |                 |                          |
 *    |                 |                          |
 *    |                 v                          |
 *    |    +-------------------------+             v
 *    |    |    BUILDING_SNAPSHOT    |------------>|
 *    |    +-------------------------+             |
 *    |                 |                          |
 *    |                 |                          |
 *    |  running_xacts #2, xacts from #1 finished  |
 *    |                 |                          |
 *    |                 |                          |
 *    |                 v                          |
 *    |    +-------------------------+             v
 *    |    |      FULL_SNAPSHOT      |------------>|
 *    |    +-------------------------+             |
 *    |                 |                          |
 * running_xacts        |                   saved snapshot
 * with zero xacts      |               at running_xacts's lsn
 *    |                 |                          |
 *    | running_xacts with xacts from #2 finished  |
 *    |                 |                          |
 *    |                 v                          |
 *    |    +-------------------------+             |
 *    +--->|  SNAPBUILD_CONSISTENT   |<------------+
 *         +-------------------------+
 *
 * Initially the machinery is in the START stage. When an xl_running_xacts
 * record is read that is sufficiently new (above the safe xmin horizon),
 * there's a state transition. If there were no running xacts when the
 * running_xacts record was generated, we'll directly go into CONSISTENT
 * state, otherwise we'll switch to the BUILDING_SNAPSHOT state. Having a full
 * snapshot means that all transactions that start henceforth can be decoded
 * in their entirety, but transactions that started previously can't. In
 * FULL_SNAPSHOT we'll switch into CONSISTENT once all those previously
 * running transactions have committed or aborted.
 *
 * Only transactions that commit after CONSISTENT state has been reached will
 * be replayed, even though they might have started while still in
 * FULL_SNAPSHOT. That ensures that we'll reach a point where no previous
 * changes have been exported, but all the following ones will be.
That point + * is a convenient point to initialize replication from, which is why we + * export a snapshot at that point, which *can* be used to read normal data. + * + * Copyright (c) 2012-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/snapbuild.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include +#include + +#include "access/heapam_xlog.h" +#include "access/transam.h" +#include "access/xact.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "replication/logical.h" +#include "replication/reorderbuffer.h" +#include "replication/snapbuild.h" +#include "storage/block.h" /* debugging output */ +#include "storage/fd.h" +#include "storage/lmgr.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/standby.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/snapmgr.h" +#include "utils/snapshot.h" + +/* + * This struct contains the current state of the snapshot building + * machinery. Besides a forward declaration in the header, it is not exposed + * to the public, so we can easily change its contents. + */ +struct SnapBuild +{ + /* how far are we along building our first full snapshot */ + SnapBuildState state; + + /* private memory context used to allocate memory for this module. */ + MemoryContext context; + + /* all transactions < than this have committed/aborted */ + TransactionId xmin; + + /* all transactions >= than this are uncommitted */ + TransactionId xmax; + + /* + * Don't replay commits from an LSN < this LSN. This can be set externally + * but it will also be advanced (never retreat) from within snapbuild.c. + */ + XLogRecPtr start_decoding_at; + + /* + * LSN at which we found a consistent point at the time of slot creation. + * This is also the point where we have exported a snapshot for the + * initial copy. 
+ * + * The prepared transactions that are not covered by initial snapshot + * needs to be sent later along with commit prepared and they must be + * before this point. + */ + XLogRecPtr initial_consistent_point; + + /* + * Don't start decoding WAL until the "xl_running_xacts" information + * indicates there are no running xids with an xid smaller than this. + */ + TransactionId initial_xmin_horizon; + + /* Indicates if we are building full snapshot or just catalog one. */ + bool building_full_snapshot; + + /* + * Snapshot that's valid to see the catalog state seen at this moment. + */ + Snapshot snapshot; + + /* + * LSN of the last location we are sure a snapshot has been serialized to. + */ + XLogRecPtr last_serialized_snapshot; + + /* + * The reorderbuffer we need to update with usable snapshots et al. + */ + ReorderBuffer *reorder; + + /* + * TransactionId at which the next phase of initial snapshot building will + * happen. InvalidTransactionId if not known (i.e. SNAPBUILD_START), or + * when no next phase necessary (SNAPBUILD_CONSISTENT). + */ + TransactionId next_phase_at; + + /* + * Array of transactions which could have catalog changes that committed + * between xmin and xmax. + */ + struct + { + /* number of committed transactions */ + size_t xcnt; + + /* available space for committed transactions */ + size_t xcnt_space; + + /* + * Until we reach a CONSISTENT state, we record commits of all + * transactions, not just the catalog changing ones. Record when that + * changes so we know we cannot export a snapshot safely anymore. + */ + bool includes_all_transactions; + + /* + * Array of committed transactions that have modified the catalog. + * + * As this array is frequently modified we do *not* keep it in + * xidComparator order. Instead we sort the array when building & + * distributing a snapshot. + * + * TODO: It's unclear whether that reasoning has much merit. 
Every + * time we add something here after becoming consistent will also + * require distributing a snapshot. Storing them sorted would + * potentially also make it easier to purge (but more complicated wrt + * wraparound?). Should be improved if sorting while building the + * snapshot shows up in profiles. + */ + TransactionId *xip; + } committed; +}; + +/* + * Starting a transaction -- which we need to do while exporting a snapshot -- + * removes knowledge about the previously used resowner, so we save it here. + */ +static ResourceOwner SavedResourceOwnerDuringExport = NULL; +static bool ExportInProgress = false; + +/* ->committed manipulation */ +static void SnapBuildPurgeCommittedTxn(SnapBuild *builder); + +/* snapshot building/manipulation/distribution functions */ +static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder); + +static void SnapBuildFreeSnapshot(Snapshot snap); + +static void SnapBuildSnapIncRefcount(Snapshot snap); + +static void SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn); + +/* xlog reading helper functions for SnapBuildProcessRunningXacts */ +static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running); +static void SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff); + +/* serialization functions */ +static void SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn); +static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn); + +/* + * Allocate a new snapshot builder. + * + * xmin_horizon is the xid >= which we can be sure no catalog rows have been + * removed, start_lsn is the LSN >= we want to replay commits. 
+ */ +SnapBuild * +AllocateSnapshotBuilder(ReorderBuffer *reorder, + TransactionId xmin_horizon, + XLogRecPtr start_lsn, + bool need_full_snapshot, + XLogRecPtr initial_consistent_point) +{ + MemoryContext context; + MemoryContext oldcontext; + SnapBuild *builder; + + /* allocate memory in own context, to have better accountability */ + context = AllocSetContextCreate(CurrentMemoryContext, + "snapshot builder context", + ALLOCSET_DEFAULT_SIZES); + oldcontext = MemoryContextSwitchTo(context); + + builder = palloc0(sizeof(SnapBuild)); + + builder->state = SNAPBUILD_START; + builder->context = context; + builder->reorder = reorder; + /* Other struct members initialized by zeroing via palloc0 above */ + + builder->committed.xcnt = 0; + builder->committed.xcnt_space = 128; /* arbitrary number */ + builder->committed.xip = + palloc0(builder->committed.xcnt_space * sizeof(TransactionId)); + builder->committed.includes_all_transactions = true; + + builder->initial_xmin_horizon = xmin_horizon; + builder->start_decoding_at = start_lsn; + builder->building_full_snapshot = need_full_snapshot; + builder->initial_consistent_point = initial_consistent_point; + + MemoryContextSwitchTo(oldcontext); + + return builder; +} + +/* + * Free a snapshot builder. + */ +void +FreeSnapshotBuilder(SnapBuild *builder) +{ + MemoryContext context = builder->context; + + /* free snapshot explicitly, that contains some error checking */ + if (builder->snapshot != NULL) + { + SnapBuildSnapDecRefcount(builder->snapshot); + builder->snapshot = NULL; + } + + /* other resources are deallocated via memory context reset */ + MemoryContextDelete(context); +} + +/* + * Free an unreferenced snapshot that has previously been built by us. 
+ */ +static void +SnapBuildFreeSnapshot(Snapshot snap) +{ + /* make sure we don't get passed an external snapshot */ + Assert(snap->snapshot_type == SNAPSHOT_HISTORIC_MVCC); + + /* make sure nobody modified our snapshot */ + Assert(snap->curcid == FirstCommandId); + Assert(!snap->suboverflowed); + Assert(!snap->takenDuringRecovery); + Assert(snap->regd_count == 0); + + /* slightly more likely, so it's checked even without c-asserts */ + if (snap->copied) + elog(ERROR, "cannot free a copied snapshot"); + + if (snap->active_count) + elog(ERROR, "cannot free an active snapshot"); + + pfree(snap); +} + +/* + * In which state of snapshot building are we? + */ +SnapBuildState +SnapBuildCurrentState(SnapBuild *builder) +{ + return builder->state; +} + +/* + * Return the LSN at which the snapshot was exported + */ +XLogRecPtr +SnapBuildInitialConsistentPoint(SnapBuild *builder) +{ + return builder->initial_consistent_point; +} + +/* + * Should the contents of transaction ending at 'ptr' be decoded? + */ +bool +SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr) +{ + return ptr < builder->start_decoding_at; +} + +/* + * Increase refcount of a snapshot. + * + * This is used when handing out a snapshot to some external resource or when + * adding a Snapshot as builder->snapshot. + */ +static void +SnapBuildSnapIncRefcount(Snapshot snap) +{ + snap->active_count++; +} + +/* + * Decrease refcount of a snapshot and free if the refcount reaches zero. + * + * Externally visible, so that external resources that have been handed an + * IncRef'ed Snapshot can adjust its refcount easily. 
+ */
+void
+SnapBuildSnapDecRefcount(Snapshot snap)
+{
+	/* make sure we don't get passed an external snapshot */
+	Assert(snap->snapshot_type == SNAPSHOT_HISTORIC_MVCC);
+
+	/* make sure nobody modified our snapshot */
+	Assert(snap->curcid == FirstCommandId);
+	Assert(!snap->suboverflowed);
+	Assert(!snap->takenDuringRecovery);
+
+	Assert(snap->regd_count == 0);
+
+	Assert(snap->active_count > 0);
+
+	/*
+	 * slightly more likely, so it's checked even in builds without
+	 * assertions
+	 */
+	if (snap->copied)
+		elog(ERROR, "cannot free a copied snapshot");
+
+	snap->active_count--;
+	if (snap->active_count == 0)
+		SnapBuildFreeSnapshot(snap);
+}
+
+/*
+ * Build a new snapshot, based on currently committed catalog-modifying
+ * transactions.
+ *
+ * In-progress transactions with catalog access are *not* allowed to modify
+ * these snapshots; they have to copy them and fill in appropriate ->curcid
+ * and ->subxip/subxcnt values.
+ *
+ * The snapshot is allocated in builder->context as one chunk, with the xip
+ * array placed directly behind the SnapshotData header. It is returned with
+ * active_count == 0, so the caller has to take a reference itself (see
+ * SnapBuildSnapIncRefcount()).
+ */
+static Snapshot
+SnapBuildBuildSnapshot(SnapBuild *builder)
+{
+	Snapshot	snapshot;
+	Size		ssize;
+
+	Assert(builder->state >= SNAPBUILD_FULL_SNAPSHOT);
+
+	ssize = sizeof(SnapshotData)
+		+ sizeof(TransactionId) * builder->committed.xcnt
+		+ sizeof(TransactionId) * 1 /* toplevel xid */ ;
+
+	snapshot = MemoryContextAllocZero(builder->context, ssize);
+
+	snapshot->snapshot_type = SNAPSHOT_HISTORIC_MVCC;
+
+	/*
+	 * We misuse the original meaning of SnapshotData's xip and subxip fields
+	 * to make them more fitting for our needs.
+	 *
+	 * In the 'xip' array we store transactions that have to be treated as
+	 * committed. Since we will only ever look at tuples from transactions
+	 * that have modified the catalog it's more efficient to store those few
+	 * that exist between xmin and xmax (frequently there are none).
+	 *
+	 * Snapshots that are used in transactions that have modified the catalog
+	 * also use the 'subxip' array to store their toplevel xid and all the
+	 * subtransaction xids so we can recognize when we need to treat rows as
+	 * visible that are not in xip but still need to be visible. Subxip only
+	 * gets filled when the snapshot is copied into the context of a
+	 * catalog modifying transaction since we otherwise share a snapshot
+	 * between transactions. As long as a txn hasn't modified the catalog it
+	 * doesn't need to treat any uncommitted rows as visible, so there is no
+	 * need for those xids.
+	 *
+	 * Both arrays are qsort'ed so that we can use bsearch() on them.
+	 */
+	Assert(TransactionIdIsNormal(builder->xmin));
+	Assert(TransactionIdIsNormal(builder->xmax));
+
+	snapshot->xmin = builder->xmin;
+	snapshot->xmax = builder->xmax;
+
+	/*
+	 * store all transactions to be treated as committed by this snapshot;
+	 * the array lives in the same allocation, right after the struct
+	 */
+	snapshot->xip =
+		(TransactionId *) ((char *) snapshot + sizeof(SnapshotData));
+	snapshot->xcnt = builder->committed.xcnt;
+	memcpy(snapshot->xip,
+		   builder->committed.xip,
+		   builder->committed.xcnt * sizeof(TransactionId));
+
+	/*
+	 * sort so we can bsearch(); builder->committed.xip itself is kept
+	 * unsorted, only this copy gets ordered
+	 */
+	qsort(snapshot->xip, snapshot->xcnt, sizeof(TransactionId), xidComparator);
+
+	/*
+	 * Initially, subxip is empty, i.e. it's a snapshot to be used by
+	 * transactions that don't modify the catalog. Will be filled by
+	 * ReorderBufferCopySnap() if necessary.
+	 */
+	snapshot->subxcnt = 0;
+	snapshot->subxip = NULL;
+
+	snapshot->suboverflowed = false;
+	snapshot->takenDuringRecovery = false;
+	snapshot->copied = false;
+	snapshot->curcid = FirstCommandId;
+	snapshot->active_count = 0;
+	snapshot->regd_count = 0;
+	snapshot->snapXactCompletionCount = 0;
+
+	return snapshot;
+}
+
+/*
+ * Build the initial slot snapshot and convert it to a normal snapshot that
+ * is understood by HeapTupleSatisfiesMVCC.
+ * + * The snapshot will be usable directly in current transaction or exported + * for loading in different transaction. + */ +Snapshot +SnapBuildInitialSnapshot(SnapBuild *builder) +{ + Snapshot snap; + TransactionId xid; + TransactionId *newxip; + int newxcnt = 0; + + Assert(!FirstSnapshotSet); + Assert(XactIsoLevel == XACT_REPEATABLE_READ); + + if (builder->state != SNAPBUILD_CONSISTENT) + elog(ERROR, "cannot build an initial slot snapshot before reaching a consistent state"); + + if (!builder->committed.includes_all_transactions) + elog(ERROR, "cannot build an initial slot snapshot, not all transactions are monitored anymore"); + + /* so we don't overwrite the existing value */ + if (TransactionIdIsValid(MyProc->xmin)) + elog(ERROR, "cannot build an initial slot snapshot when MyProc->xmin already is valid"); + + snap = SnapBuildBuildSnapshot(builder); + + /* + * We know that snap->xmin is alive, enforced by the logical xmin + * mechanism. Due to that we can do this without locks, we're only + * changing our own value. + */ +#ifdef USE_ASSERT_CHECKING + { + TransactionId safeXid; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + safeXid = GetOldestSafeDecodingTransactionId(false); + LWLockRelease(ProcArrayLock); + + Assert(TransactionIdPrecedesOrEquals(safeXid, snap->xmin)); + } +#endif + + MyProc->xmin = snap->xmin; + + /* allocate in transaction context */ + newxip = (TransactionId *) + palloc(sizeof(TransactionId) * GetMaxSnapshotXidCount()); + + /* + * snapbuild.c builds transactions in an "inverted" manner, which means it + * stores committed transactions in ->xip, not ones in progress. Build a + * classical snapshot by marking all non-committed transactions as + * in-progress. This can be expensive. + */ + for (xid = snap->xmin; NormalTransactionIdPrecedes(xid, snap->xmax);) + { + void *test; + + /* + * Check whether transaction committed using the decoding snapshot + * meaning of ->xip. 
+ */ + test = bsearch(&xid, snap->xip, snap->xcnt, + sizeof(TransactionId), xidComparator); + + if (test == NULL) + { + if (newxcnt >= GetMaxSnapshotXidCount()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("initial slot snapshot too large"))); + + newxip[newxcnt++] = xid; + } + + TransactionIdAdvance(xid); + } + + /* adjust remaining snapshot fields as needed */ + snap->snapshot_type = SNAPSHOT_MVCC; + snap->xcnt = newxcnt; + snap->xip = newxip; + + return snap; +} + +/* + * Export a snapshot so it can be set in another session with SET TRANSACTION + * SNAPSHOT. + * + * For that we need to start a transaction in the current backend as the + * importing side checks whether the source transaction is still open to make + * sure the xmin horizon hasn't advanced since then. + */ +const char * +SnapBuildExportSnapshot(SnapBuild *builder) +{ + Snapshot snap; + char *snapname; + + if (IsTransactionOrTransactionBlock()) + elog(ERROR, "cannot export a snapshot from within a transaction"); + + if (SavedResourceOwnerDuringExport) + elog(ERROR, "can only export one snapshot at a time"); + + SavedResourceOwnerDuringExport = CurrentResourceOwner; + ExportInProgress = true; + + StartTransactionCommand(); + + /* There doesn't seem to a nice API to set these */ + XactIsoLevel = XACT_REPEATABLE_READ; + XactReadOnly = true; + + snap = SnapBuildInitialSnapshot(builder); + + /* + * now that we've built a plain snapshot, make it active and use the + * normal mechanisms for exporting it + */ + snapname = ExportSnapshot(snap); + + ereport(LOG, + (errmsg_plural("exported logical decoding snapshot: \"%s\" with %u transaction ID", + "exported logical decoding snapshot: \"%s\" with %u transaction IDs", + snap->xcnt, + snapname, snap->xcnt))); + return snapname; +} + +/* + * Ensure there is a snapshot and if not build one for current transaction. 
+ */
+Snapshot
+SnapBuildGetOrBuildSnapshot(SnapBuild *builder, TransactionId xid)
+{
+	Assert(builder->state == SNAPBUILD_CONSISTENT);
+
+	/*
+	 * only build a new snapshot if we don't have a prebuilt one; note that
+	 * the 'xid' argument is not consulted here
+	 */
+	if (builder->snapshot == NULL)
+	{
+		builder->snapshot = SnapBuildBuildSnapshot(builder);
+		/* increase refcount for the snapshot builder */
+		SnapBuildSnapIncRefcount(builder->snapshot);
+	}
+
+	return builder->snapshot;
+}
+
+/*
+ * Reset a previously SnapBuildExportSnapshot()'ed snapshot if there is
+ * any. Aborts the previously started transaction and resets the resource
+ * owner back to its original value.
+ *
+ * Does nothing when no export is in progress. Raises an ERROR if an export
+ * is in progress but we are not in a transaction anymore.
+ */
+void
+SnapBuildClearExportedSnapshot(void)
+{
+	ResourceOwner tmpResOwner;
+
+	/* nothing exported, that is the usual case */
+	if (!ExportInProgress)
+		return;
+
+	if (!IsTransactionState())
+		elog(ERROR, "clearing exported snapshot in wrong transaction state");
+
+	/*
+	 * AbortCurrentTransaction() takes care of resetting the snapshot state
+	 * (presumably by way of SnapBuildResetExportedSnapshotState(), which
+	 * clears SavedResourceOwnerDuringExport), so remember the saved resource
+	 * owner in a local variable before aborting. It is reinstated as
+	 * CurrentResourceOwner once the abort is done.
+	 */
+	tmpResOwner = SavedResourceOwnerDuringExport;
+
+	/* make sure nothing could have ever happened */
+	AbortCurrentTransaction();
+
+	CurrentResourceOwner = tmpResOwner;
+}
+
+/*
+ * Clear snapshot export state during transaction abort.
+ *
+ * Only forgets the bookkeeping kept in this file's static variables; it
+ * neither ends a transaction nor touches CurrentResourceOwner.
+ */
+void
+SnapBuildResetExportedSnapshotState(void)
+{
+	SavedResourceOwnerDuringExport = NULL;
+	ExportInProgress = false;
+}
+
+/*
+ * Handle the effects of a single heap change, appropriate to the current state
+ * of the snapshot builder and returns whether changes made at (xid, lsn) can
+ * be decoded.
+ *
+ * As a side effect, a transaction that can be decoded but does not have a
+ * base snapshot yet is given the builder's current snapshot (built on demand)
+ * as its base snapshot.
+ */
+bool
+SnapBuildProcessChange(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn)
+{
+	/*
+	 * We can't handle data in transactions if we haven't built a snapshot
+	 * yet, so don't store them.
+	 */
+	if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
+		return false;
+
+	/*
+	 * No point in keeping track of changes in transactions that we don't have
+	 * enough information about to decode.  This means that they started before
+	 * we got into the SNAPBUILD_FULL_SNAPSHOT state.
+	 */
+	if (builder->state < SNAPBUILD_CONSISTENT &&
+		TransactionIdPrecedes(xid, builder->next_phase_at))
+		return false;
+
+	/*
+	 * If the reorderbuffer doesn't yet have a snapshot, add one now, it will
+	 * be needed to decode the change we're currently processing.
+	 */
+	if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid))
+	{
+		/* only build a new snapshot if we don't have a prebuilt one */
+		if (builder->snapshot == NULL)
+		{
+			builder->snapshot = SnapBuildBuildSnapshot(builder);
+			/* increase refcount for the snapshot builder */
+			SnapBuildSnapIncRefcount(builder->snapshot);
+		}
+
+		/*
+		 * Increase refcount for the transaction we're handing the snapshot
+		 * out to. This reference is separate from the builder's own one
+		 * taken above.
+		 */
+		SnapBuildSnapIncRefcount(builder->snapshot);
+		ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn,
+									 builder->snapshot);
+	}
+
+	return true;
+}
+
+/*
+ * Do CommandId/combo CID handling after reading an xl_heap_new_cid record.
+ * This implies that a transaction has done some form of write to system
+ * catalogs.
+ */
+void
+SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid,
+					   XLogRecPtr lsn, xl_heap_new_cid *xlrec)
+{
+	CommandId	cid;
+
+	/*
+	 * we only log new_cid's if a catalog tuple was modified, so mark the
+	 * transaction as containing catalog modifications
+	 */
+	ReorderBufferXidSetCatalogChanges(builder->reorder, xid, lsn);
+
+	ReorderBufferAddNewTupleCids(builder->reorder, xlrec->top_xid, lsn,
+								 xlrec->target_node, xlrec->target_tid,
+								 xlrec->cmin, xlrec->cmax,
+								 xlrec->combocid);
+
+	/*
+	 * figure out new command id: the larger of cmin and cmax if both are
+	 * valid, otherwise whichever of the two is valid; a record carrying
+	 * neither is rejected
+	 */
+	if (xlrec->cmin != InvalidCommandId &&
+		xlrec->cmax != InvalidCommandId)
+		cid = Max(xlrec->cmin, xlrec->cmax);
+	else if (xlrec->cmax != InvalidCommandId)
+		cid = xlrec->cmax;
+	else if (xlrec->cmin != InvalidCommandId)
+		cid = xlrec->cmin;
+	else
+	{
+		cid = InvalidCommandId; /* silence compiler */
+		elog(ERROR, "xl_heap_new_cid record without a valid CommandId");
+	}
+
+	ReorderBufferAddNewCommandId(builder->reorder, xid, lsn, cid + 1);
+}
+
+/*
+ * Add a new Snapshot to all transactions we're decoding that currently are
+ * in-progress so they can see new catalog contents made by the transaction
+ * that just committed. This is necessary because those in-progress
+ * transactions will use the new catalog's contents from here on (at the very
+ * least everything they do needs to be compatible with newer catalog
+ * contents).
+ *
+ * The snapshot handed out is builder->snapshot; one reference is taken on it
+ * for every transaction that receives it.
+ */
+static void
+SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn)
+{
+	dlist_iter	txn_i;
+	ReorderBufferTXN *txn;
+
+	/*
+	 * Iterate through all toplevel transactions. This can include
+	 * subtransactions which we just don't yet know to be that, but that's
+	 * fine, they will just get an unnecessary snapshot queued.
+	 */
+	dlist_foreach(txn_i, &builder->reorder->toplevel_by_lsn)
+	{
+		txn = dlist_container(ReorderBufferTXN, node, txn_i.cur);
+
+		Assert(TransactionIdIsValid(txn->xid));
+
+		/*
+		 * If we don't have a base snapshot yet, there are no changes in this
+		 * transaction which in turn implies we don't yet need a snapshot at
+		 * all. We'll add a snapshot when the first change gets queued.
+		 *
+		 * NB: This works correctly even for subtransactions because
+		 * ReorderBufferAssignChild() takes care to transfer the base snapshot
+		 * to the top-level transaction, and while iterating the changequeue
+		 * we'll get the change from the subtxn.
+		 */
+		if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, txn->xid))
+			continue;
+
+		/*
+		 * We don't need to add a snapshot to prepared transactions as they
+		 * should not see the new catalog contents.
+		 */
+		if (rbtxn_prepared(txn) || rbtxn_skip_prepared(txn))
+			continue;
+
+		elog(DEBUG2, "adding a new snapshot to %u at %X/%X",
+			 txn->xid, LSN_FORMAT_ARGS(lsn));
+
+		/*
+		 * increase the snapshot's refcount for the transaction we are handing
+		 * it out to
+		 */
+		SnapBuildSnapIncRefcount(builder->snapshot);
+		ReorderBufferAddSnapshot(builder->reorder, txn->xid, lsn,
+								 builder->snapshot);
+	}
+}
+
+/*
+ * Keep track of a new catalog changing transaction that has committed.
+ *
+ * Appends 'xid' to builder->committed.xip, growing the array to twice its
+ * size plus one when it is full.
+ */
+static void
+SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid)
+{
+	Assert(TransactionIdIsValid(xid));
+
+	if (builder->committed.xcnt == builder->committed.xcnt_space)
+	{
+		builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1;
+
+		elog(DEBUG1, "increasing space for committed transactions to %u",
+			 (uint32) builder->committed.xcnt_space);
+
+		builder->committed.xip = repalloc(builder->committed.xip,
+										  builder->committed.xcnt_space * sizeof(TransactionId));
+	}
+
+	/*
+	 * TODO: It might make sense to keep the array sorted here instead of
+	 * doing it every time we build a new snapshot.
On the other hand this + * gets called repeatedly when a transaction with subtransactions commits. + */ + builder->committed.xip[builder->committed.xcnt++] = xid; +} + +/* + * Remove knowledge about transactions we treat as committed that are smaller + * than ->xmin. Those won't ever get checked via the ->committed array but via + * the clog machinery, so we don't need to waste memory on them. + */ +static void +SnapBuildPurgeCommittedTxn(SnapBuild *builder) +{ + int off; + TransactionId *workspace; + int surviving_xids = 0; + + /* not ready yet */ + if (!TransactionIdIsNormal(builder->xmin)) + return; + + /* TODO: Neater algorithm than just copying and iterating? */ + workspace = + MemoryContextAlloc(builder->context, + builder->committed.xcnt * sizeof(TransactionId)); + + /* copy xids that still are interesting to workspace */ + for (off = 0; off < builder->committed.xcnt; off++) + { + if (NormalTransactionIdPrecedes(builder->committed.xip[off], + builder->xmin)) + ; /* remove */ + else + workspace[surviving_xids++] = builder->committed.xip[off]; + } + + /* copy workspace back to persistent state */ + memcpy(builder->committed.xip, workspace, + surviving_xids * sizeof(TransactionId)); + + elog(DEBUG3, "purged committed transactions from %u to %u, xmin: %u, xmax: %u", + (uint32) builder->committed.xcnt, (uint32) surviving_xids, + builder->xmin, builder->xmax); + builder->committed.xcnt = surviving_xids; + + pfree(workspace); +} + +/* + * Handle everything that needs to be done when a transaction commits + */ +void +SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid, + int nsubxacts, TransactionId *subxacts) +{ + int nxact; + + bool needs_snapshot = false; + bool needs_timetravel = false; + bool sub_needs_timetravel = false; + + TransactionId xmax = xid; + + /* + * Transactions preceding BUILDING_SNAPSHOT will neither be decoded, nor + * will they be part of a snapshot. So we don't need to record anything. 
+ */ + if (builder->state == SNAPBUILD_START || + (builder->state == SNAPBUILD_BUILDING_SNAPSHOT && + TransactionIdPrecedes(xid, builder->next_phase_at))) + { + /* ensure that only commits after this are getting replayed */ + if (builder->start_decoding_at <= lsn) + builder->start_decoding_at = lsn + 1; + return; + } + + if (builder->state < SNAPBUILD_CONSISTENT) + { + /* ensure that only commits after this are getting replayed */ + if (builder->start_decoding_at <= lsn) + builder->start_decoding_at = lsn + 1; + + /* + * If building an exportable snapshot, force xid to be tracked, even + * if the transaction didn't modify the catalog. + */ + if (builder->building_full_snapshot) + { + needs_timetravel = true; + } + } + + for (nxact = 0; nxact < nsubxacts; nxact++) + { + TransactionId subxid = subxacts[nxact]; + + /* + * Add subtransaction to base snapshot if catalog modifying, we don't + * distinguish to toplevel transactions there. + */ + if (ReorderBufferXidHasCatalogChanges(builder->reorder, subxid)) + { + sub_needs_timetravel = true; + needs_snapshot = true; + + elog(DEBUG1, "found subtransaction %u:%u with catalog changes", + xid, subxid); + + SnapBuildAddCommittedTxn(builder, subxid); + + if (NormalTransactionIdFollows(subxid, xmax)) + xmax = subxid; + } + + /* + * If we're forcing timetravel we also need visibility information + * about subtransaction, so keep track of subtransaction's state, even + * if not catalog modifying. Don't need to distribute a snapshot in + * that case. 
+ */ + else if (needs_timetravel) + { + SnapBuildAddCommittedTxn(builder, subxid); + if (NormalTransactionIdFollows(subxid, xmax)) + xmax = subxid; + } + } + + /* if top-level modified catalog, it'll need a snapshot */ + if (ReorderBufferXidHasCatalogChanges(builder->reorder, xid)) + { + elog(DEBUG2, "found top level transaction %u, with catalog changes", + xid); + needs_snapshot = true; + needs_timetravel = true; + SnapBuildAddCommittedTxn(builder, xid); + } + else if (sub_needs_timetravel) + { + /* track toplevel txn as well, subxact alone isn't meaningful */ + SnapBuildAddCommittedTxn(builder, xid); + } + else if (needs_timetravel) + { + elog(DEBUG2, "forced transaction %u to do timetravel", xid); + + SnapBuildAddCommittedTxn(builder, xid); + } + + if (!needs_timetravel) + { + /* record that we cannot export a general snapshot anymore */ + builder->committed.includes_all_transactions = false; + } + + Assert(!needs_snapshot || needs_timetravel); + + /* + * Adjust xmax of the snapshot builder, we only do that for committed, + * catalog modifying, transactions, everything else isn't interesting for + * us since we'll never look at the respective rows. + */ + if (needs_timetravel && + (!TransactionIdIsValid(builder->xmax) || + TransactionIdFollowsOrEquals(xmax, builder->xmax))) + { + builder->xmax = xmax; + TransactionIdAdvance(builder->xmax); + } + + /* if there's any reason to build a historic snapshot, do so now */ + if (needs_snapshot) + { + /* + * If we haven't built a complete snapshot yet there's no need to hand + * it out, it wouldn't (and couldn't) be used anyway. + */ + if (builder->state < SNAPBUILD_FULL_SNAPSHOT) + return; + + /* + * Decrease the snapshot builder's refcount of the old snapshot, note + * that it still will be used if it has been handed out to the + * reorderbuffer earlier. 
+ */ + if (builder->snapshot) + SnapBuildSnapDecRefcount(builder->snapshot); + + builder->snapshot = SnapBuildBuildSnapshot(builder); + + /* we might need to execute invalidations, add snapshot */ + if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid)) + { + SnapBuildSnapIncRefcount(builder->snapshot); + ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn, + builder->snapshot); + } + + /* refcount of the snapshot builder for the new snapshot */ + SnapBuildSnapIncRefcount(builder->snapshot); + + /* add a new catalog snapshot to all currently running transactions */ + SnapBuildDistributeNewCatalogSnapshot(builder, lsn); + } +} + + +/* ----------------------------------- + * Snapshot building functions dealing with xlog records + * ----------------------------------- + */ + +/* + * Process a running xacts record, and use its information to first build a + * historic snapshot and later to release resources that aren't needed + * anymore. + */ +void +SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running) +{ + ReorderBufferTXN *txn; + TransactionId xmin; + + /* + * If we're not consistent yet, inspect the record to see whether it + * allows to get closer to being consistent. If we are consistent, dump + * our snapshot so others or we, after a restart, can use it. + */ + if (builder->state < SNAPBUILD_CONSISTENT) + { + /* returns false if there's no point in performing cleanup just yet */ + if (!SnapBuildFindSnapshot(builder, lsn, running)) + return; + } + else + SnapBuildSerialize(builder, lsn); + + /* + * Update range of interesting xids based on the running xacts + * information. We don't increase ->xmax using it, because once we are in + * a consistent state we can do that ourselves and much more efficiently + * so, because we only need to do it for catalog transactions since we + * only ever look at those. + * + * NB: We only increase xmax when a catalog modifying transaction commits + * (see SnapBuildCommitTxn). 
Because of this, xmax can be lower than
+	 * xmin, which looks odd but is correct and actually more efficient, since
+	 * we hit fast paths in heapam_visibility.c.
+	 */
+	builder->xmin = running->oldestRunningXid;
+
+	/* Remove transactions we don't need to keep track of anymore */
+	SnapBuildPurgeCommittedTxn(builder);
+
+	/*
+	 * Advance the xmin limit for the current replication slot, to allow
+	 * vacuum to clean up the tuples this slot has been protecting.
+	 *
+	 * The reorderbuffer might have an xmin among the currently running
+	 * snapshots; use it if so. If not, we need only consider the snapshots
+	 * we'll produce later, which can't be less than the oldest running xid in
+	 * the record we're reading now.
+	 */
+	xmin = ReorderBufferGetOldestXmin(builder->reorder);
+	if (xmin == InvalidTransactionId)
+		xmin = running->oldestRunningXid;
+	elog(DEBUG3, "xmin: %u, xmax: %u, oldest running: %u, oldest xmin: %u",
+		 builder->xmin, builder->xmax, running->oldestRunningXid, xmin);
+	LogicalIncreaseXminForSlot(lsn, xmin);
+
+	/*
+	 * Also tell the slot where we can restart decoding from. We don't want to
+	 * do that after every commit because changing that implies an fsync of
+	 * the logical slot's state file, so we only do it every time we see a
+	 * running xacts record.
+	 *
+	 * Do so by looking for the oldest in progress transaction (determined by
+	 * the first LSN of any of its relevant records). Every transaction
+	 * remembers the last location we stored the snapshot to disk before its
+	 * beginning. That point is where we can restart from.
+	 */
+
+	/*
+	 * Can't know about a serialized snapshot's location if we're not
+	 * consistent.
+	 */
+	if (builder->state < SNAPBUILD_CONSISTENT)
+		return;
+
+	txn = ReorderBufferGetOldestTXN(builder->reorder);
+
+	/*
+	 * oldest ongoing txn might have started when we didn't yet serialize
+	 * anything because we hadn't reached a consistent state yet. In that
+	 * case its restart_decoding_lsn is invalid and the slot's restart point
+	 * is left alone.
+	 */
+	if (txn != NULL && txn->restart_decoding_lsn != InvalidXLogRecPtr)
+		LogicalIncreaseRestartDecodingForSlot(lsn, txn->restart_decoding_lsn);
+
+	/*
+	 * No in-progress transaction, can reuse the last serialized snapshot if
+	 * we have one.
+	 */
+	else if (txn == NULL &&
+			 builder->reorder->current_restart_decoding_lsn != InvalidXLogRecPtr &&
+			 builder->last_serialized_snapshot != InvalidXLogRecPtr)
+		LogicalIncreaseRestartDecodingForSlot(lsn,
+											  builder->last_serialized_snapshot);
+}
+
+
+/*
+ * Build the start of a snapshot that's capable of decoding the catalog.
+ *
+ * Helper function for SnapBuildProcessRunningXacts() while we're not yet
+ * consistent.
+ *
+ * Returns true if there is a point in performing internal maintenance/cleanup
+ * using the xl_running_xacts record.
+ */
+static bool
+SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
+{
+	/* ---
+	 * Build catalog decoding snapshot incrementally using information about
+	 * the currently running transactions. There are several ways to do that:
+	 *
+	 * a) There were no running transactions when the xl_running_xacts record
+	 *	  was inserted, jump to CONSISTENT immediately. We might find such a
+	 *	  state while waiting on c)'s sub-states.
+	 *
+	 * b) This (in a previous run) or another decoding slot serialized a
+	 *	  snapshot to disk that we can use.  Can't use this method for the
+	 *	  initial snapshot when slot is being created and needs full snapshot
+	 *	  for export or direct use, as that snapshot will only contain catalog
+	 *	  modifying transactions.
+	 *
+	 * c) First incrementally build a snapshot for catalog tuples
+	 *	  (BUILDING_SNAPSHOT), that requires all, already in-progress,
+	 *	  transactions to finish.  Every transaction starting after that
+	 *	  (FULL_SNAPSHOT state), has enough information to be decoded.  But
+	 *	  for older running transactions no viable snapshot exists yet, so
+	 *	  CONSISTENT will only be reached once all of those have finished.
+ * --- + */ + + /* + * xl_running_xact record is older than what we can use, we might not have + * all necessary catalog rows anymore. + */ + if (TransactionIdIsNormal(builder->initial_xmin_horizon) && + NormalTransactionIdPrecedes(running->oldestRunningXid, + builder->initial_xmin_horizon)) + { + ereport(DEBUG1, + (errmsg_internal("skipping snapshot at %X/%X while building logical decoding snapshot, xmin horizon too low", + LSN_FORMAT_ARGS(lsn)), + errdetail_internal("initial xmin horizon of %u vs the snapshot's %u", + builder->initial_xmin_horizon, running->oldestRunningXid))); + + + SnapBuildWaitSnapshot(running, builder->initial_xmin_horizon); + + return true; + } + + /* + * a) No transaction were running, we can jump to consistent. + * + * This is not affected by races around xl_running_xacts, because we can + * miss transaction commits, but currently not transactions starting. + * + * NB: We might have already started to incrementally assemble a snapshot, + * so we need to be careful to deal with that. + */ + if (running->oldestRunningXid == running->nextXid) + { + if (builder->start_decoding_at == InvalidXLogRecPtr || + builder->start_decoding_at <= lsn) + /* can decode everything after this */ + builder->start_decoding_at = lsn + 1; + + /* As no transactions were running xmin/xmax can be trivially set. 
*/ + builder->xmin = running->nextXid; /* < are finished */ + builder->xmax = running->nextXid; /* >= are running */ + + /* so we can safely use the faster comparisons */ + Assert(TransactionIdIsNormal(builder->xmin)); + Assert(TransactionIdIsNormal(builder->xmax)); + + builder->state = SNAPBUILD_CONSISTENT; + builder->next_phase_at = InvalidTransactionId; + + ereport(LOG, + (errmsg("logical decoding found consistent point at %X/%X", + LSN_FORMAT_ARGS(lsn)), + errdetail("There are no running transactions."))); + + return false; + } + /* b) valid on disk state and not building full snapshot */ + else if (!builder->building_full_snapshot && + SnapBuildRestore(builder, lsn)) + { + /* there won't be any state to cleanup */ + return false; + } + + /* + * c) transition from START to BUILDING_SNAPSHOT. + * + * In START state, and a xl_running_xacts record with running xacts is + * encountered. In that case, switch to BUILDING_SNAPSHOT state, and + * record xl_running_xacts->nextXid. Once all running xacts have finished + * (i.e. they're all >= nextXid), we have a complete catalog snapshot. It + * might look that we could use xl_running_xact's ->xids information to + * get there quicker, but that is problematic because transactions marked + * as running, might already have inserted their commit record - it's + * infeasible to change that with locking. + */ + else if (builder->state == SNAPBUILD_START) + { + builder->state = SNAPBUILD_BUILDING_SNAPSHOT; + builder->next_phase_at = running->nextXid; + + /* + * Start with an xmin/xmax that's correct for future, when all the + * currently running transactions have finished. We'll update both + * while waiting for the pending transactions to finish. 
+ */ + builder->xmin = running->nextXid; /* < are finished */ + builder->xmax = running->nextXid; /* >= are running */ + + /* so we can safely use the faster comparisons */ + Assert(TransactionIdIsNormal(builder->xmin)); + Assert(TransactionIdIsNormal(builder->xmax)); + + ereport(LOG, + (errmsg("logical decoding found initial starting point at %X/%X", + LSN_FORMAT_ARGS(lsn)), + errdetail("Waiting for transactions (approximately %d) older than %u to end.", + running->xcnt, running->nextXid))); + + SnapBuildWaitSnapshot(running, running->nextXid); + } + + /* + * c) transition from BUILDING_SNAPSHOT to FULL_SNAPSHOT. + * + * In BUILDING_SNAPSHOT state, and this xl_running_xacts' oldestRunningXid + * is >= than nextXid from when we switched to BUILDING_SNAPSHOT. This + * means all transactions starting afterwards have enough information to + * be decoded. Switch to FULL_SNAPSHOT. + */ + else if (builder->state == SNAPBUILD_BUILDING_SNAPSHOT && + TransactionIdPrecedesOrEquals(builder->next_phase_at, + running->oldestRunningXid)) + { + builder->state = SNAPBUILD_FULL_SNAPSHOT; + builder->next_phase_at = running->nextXid; + + ereport(LOG, + (errmsg("logical decoding found initial consistent point at %X/%X", + LSN_FORMAT_ARGS(lsn)), + errdetail("Waiting for transactions (approximately %d) older than %u to end.", + running->xcnt, running->nextXid))); + + SnapBuildWaitSnapshot(running, running->nextXid); + } + + /* + * c) transition from FULL_SNAPSHOT to CONSISTENT. + * + * In FULL_SNAPSHOT state (see d) ), and this xl_running_xacts' + * oldestRunningXid is >= than nextXid from when we switched to + * FULL_SNAPSHOT. This means all transactions that are currently in + * progress have a catalog snapshot, and all their changes have been + * collected. Switch to CONSISTENT. 
+ */ + else if (builder->state == SNAPBUILD_FULL_SNAPSHOT && + TransactionIdPrecedesOrEquals(builder->next_phase_at, + running->oldestRunningXid)) + { + builder->state = SNAPBUILD_CONSISTENT; + builder->next_phase_at = InvalidTransactionId; + + ereport(LOG, + (errmsg("logical decoding found consistent point at %X/%X", + LSN_FORMAT_ARGS(lsn)), + errdetail("There are no old transactions anymore."))); + } + + /* + * We already started to track running xacts and need to wait for all + * in-progress ones to finish. We fall through to the normal processing of + * records so incremental cleanup can be performed. + */ + return true; + +} + +/* --- + * Iterate through xids in record, wait for all older than the cutoff to + * finish. Then, if possible, log a new xl_running_xacts record. + * + * This isn't required for the correctness of decoding, but to: + * a) allow isolationtester to notice that we're currently waiting for + * something. + * b) log a new xl_running_xacts record where it'd be helpful, without having + * to wait for bgwriter or checkpointer. + * + * 'running' supplies the xids to examine (running->xids[0 .. xcnt - 1]); + * entries that follow 'cutoff' are skipped, every other one is waited for + * with XactLockTableWait(). Raises an ERROR if one of the xids is our own. + * --- + */ +static void +SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff) +{ + int off; + + for (off = 0; off < running->xcnt; off++) + { + TransactionId xid = running->xids[off]; + + /* + * Upper layers should prevent that we ever need to wait on ourselves. + * Check anyway, since failing to do so would either result in an + * endless wait or an Assert() failure. + */ + if (TransactionIdIsCurrentTransactionId(xid)) + elog(ERROR, "waiting for ourselves"); + + if (TransactionIdFollows(xid, cutoff)) + continue; /* xid follows the cutoff, nothing to wait for */ + + XactLockTableWait(xid, NULL, NULL, XLTW_None); + } + + /* + * All transactions we had to wait for have finished - try to ensure there + * is another xl_running_xacts record in a timely manner, without having to + * wait for bgwriter or checkpointer to log one. During recovery we can't + * enforce that, so we'll have to wait.
+ */ + if (!RecoveryInProgress()) + { + LogStandbySnapshot(); + } +} + +/* ----------------------------------- + * Snapshot serialization support + * ----------------------------------- + */ + +/* + * We store current state of struct SnapBuild on disk in the following manner: + * + * struct SnapBuildOnDisk; + * TransactionId * committed.xcnt; (*not xcnt_space*) + * + * No array of running xids is stored: SnapBuildSerialize() writes, and + * SnapBuildRestore() reads, only the struct followed by the committed xids. + */ +typedef struct SnapBuildOnDisk +{ + /* first part of this struct needs to be version independent */ + + /* data not covered by checksum */ + uint32 magic; + pg_crc32c checksum; + + /* data covered by checksum */ + + /* version, in case we want to support pg_upgrade */ + uint32 version; + /* + * how large is the on disk data, nominally excluding the constant sized + * part. NOTE(review): SnapBuildSerialize() actually stores the total + * size (sizeof(SnapBuildOnDisk) plus the xid array) here, and + * SnapBuildRestore() never reads this field. + */ + uint32 length; + + /* version dependent part */ + SnapBuild builder; + + /* variable amount of TransactionIds follows */ +} SnapBuildOnDisk; + +#define SnapBuildOnDiskConstantSize \ + offsetof(SnapBuildOnDisk, builder) /* size of the version independent header */ +#define SnapBuildOnDiskNotChecksummedSize \ + offsetof(SnapBuildOnDisk, version) /* magic + checksum, skipped by the CRC */ + +#define SNAPBUILD_MAGIC 0x51A1E001 +#define SNAPBUILD_VERSION 4 + +/* + * Store/Load a snapshot from disk, depending on the snapshot builder's state. + * + * Supposed to be used by external (i.e. not snapbuild.c) code that just read + * a record that's a potential location for a serialized snapshot. + * + * Until the builder has reached SNAPBUILD_CONSISTENT we try to restore a + * snapshot stored for 'lsn' (the result of that attempt is not reported to + * the caller); once it is consistent we serialize the current state instead. + */ +void +SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn) +{ + if (builder->state < SNAPBUILD_CONSISTENT) + SnapBuildRestore(builder, lsn); + else + SnapBuildSerialize(builder, lsn); +} + +/* + * Serialize the snapshot 'builder' at the location 'lsn' if it hasn't already + * been done by another decoding process.
+ */ +static void +SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) +{ + Size needed_length; + SnapBuildOnDisk *ondisk = NULL; /* stays NULL if we jump to "out" early */ + char *ondisk_c; + int fd; + char tmppath[MAXPGPATH]; + char path[MAXPGPATH]; + int ret; + struct stat stat_buf; + Size sz; + + Assert(lsn != InvalidXLogRecPtr); + Assert(builder->last_serialized_snapshot == InvalidXLogRecPtr || + builder->last_serialized_snapshot <= lsn); + + /* + * no point in serializing if we cannot continue to work immediately after + * restoring the snapshot + */ + if (builder->state < SNAPBUILD_CONSISTENT) + return; + + /* consistent snapshots have no next phase */ + Assert(builder->next_phase_at == InvalidTransactionId); + + /* + * We identify snapshots by the LSN they are valid for. We don't need to + * include timelines in the name as each LSN maps to exactly one timeline + * unless the user used pg_resetwal or similar. If a user did so, there's + * no hope of continuing to decode anyway. + */ + sprintf(path, "pg_logical/snapshots/%X-%X.snap", + LSN_FORMAT_ARGS(lsn)); + + /* + * first check whether some other backend already has written the snapshot + * for this LSN. It's perfectly fine if there's none, so we accept ENOENT + * as a valid state. Everything else is an unexpected error. + */ + ret = stat(path, &stat_buf); + + if (ret != 0 && errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", path))); + + else if (ret == 0) + { + /* + * somebody else has already serialized to this point, don't overwrite + * but remember location, so we don't need to read old data again. + * + * To be sure it has been synced to disk after the rename() from the + * tempfile filename to the real filename, we just repeat the fsync. + * That ought to be cheap because in most scenarios it should already + * be safely on disk.
+ */ + fsync_fname(path, false); + fsync_fname("pg_logical/snapshots", true); + + builder->last_serialized_snapshot = lsn; + goto out; + } + + /* + * there is an obvious race condition here between the time we stat(2) the + * file and us writing the file. But we rename the file into place + * atomically and all files created need to contain the same data anyway, + * so this is perfectly fine, although a bit of a resource waste. Locking + * seems like pointless complication. + */ + elog(DEBUG1, "serializing snapshot to %s", path); + + /* to make sure only we will write to this tempfile, include pid */ + sprintf(tmppath, "pg_logical/snapshots/%X-%X.snap.%d.tmp", + LSN_FORMAT_ARGS(lsn), MyProcPid); + + /* + * Unlink temporary file if it already exists; it must be left over from + * before a crash/error, since we won't enter this function twice from + * within a single decoding slot/backend and the temporary file contains + * the pid of the current process. + */ + if (unlink(tmppath) != 0 && errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", tmppath))); + + needed_length = sizeof(SnapBuildOnDisk) + + sizeof(TransactionId) * builder->committed.xcnt; + + ondisk_c = MemoryContextAllocZero(builder->context, needed_length); + ondisk = (SnapBuildOnDisk *) ondisk_c; + ondisk->magic = SNAPBUILD_MAGIC; + ondisk->version = SNAPBUILD_VERSION; + ondisk->length = needed_length; + INIT_CRC32C(ondisk->checksum); + COMP_CRC32C(ondisk->checksum, + ((char *) ondisk) + SnapBuildOnDiskNotChecksummedSize, + SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize); /* i.e. the version and length fields */ + ondisk_c += sizeof(SnapBuildOnDisk); /* now points at the trailing xid array */ + + memcpy(&ondisk->builder, builder, sizeof(SnapBuild)); + /* NULL-ify memory-only data */ + ondisk->builder.context = NULL; + ondisk->builder.snapshot = NULL; + ondisk->builder.reorder = NULL; + ondisk->builder.committed.xip = NULL; + + COMP_CRC32C(ondisk->checksum, + &ondisk->builder, + sizeof(SnapBuild)); + + /* copy committed
xacts */ + sz = sizeof(TransactionId) * builder->committed.xcnt; + memcpy(ondisk_c, builder->committed.xip, sz); + COMP_CRC32C(ondisk->checksum, ondisk_c, sz); + ondisk_c += sz; + + FIN_CRC32C(ondisk->checksum); + + /* we have valid data now, open tempfile and write it there */ + fd = OpenTransientFile(tmppath, + O_CREAT | O_EXCL | O_WRONLY | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", tmppath))); + + errno = 0; /* lets us recognize a short write that leaves errno unset */ + pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_WRITE); + if ((write(fd, ondisk, needed_length)) != needed_length) + { + int save_errno = errno; + + CloseTransientFile(fd); + + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? save_errno : ENOSPC; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + + /* + * fsync the file before renaming so that even if we crash after this we + * have either a fully valid file or nothing. + * + * It's safe to just ERROR on fsync() here because we'll retry the whole + * operation including the writes. + * + * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has + * some noticeable overhead since it's performed synchronously during + * decoding? + */ + pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_SYNC); + if (pg_fsync(fd) != 0) + { + int save_errno = errno; + + CloseTransientFile(fd); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", tmppath))); + + fsync_fname("pg_logical/snapshots", true); + + /* + * We may overwrite the work from some other backend, but that's ok, our + * snapshot is valid as well, we'll just have done some superfluous work.
+ */ + if (rename(tmppath, path) != 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + tmppath, path))); + } + + /* make sure we persist */ + fsync_fname(path, false); + fsync_fname("pg_logical/snapshots", true); + + /* + * Now there's no way we can lose the dumped state anymore, remember this + * as a serialization point. + */ + builder->last_serialized_snapshot = lsn; + +out: + ReorderBufferSetRestartPoint(builder->reorder, + builder->last_serialized_snapshot); + /* be tidy */ + if (ondisk) + pfree(ondisk); +} + +/* + * Restore a snapshot into 'builder' if previously one has been stored at the + * location indicated by 'lsn'. Returns true if successful, false otherwise. + * + * "false" covers the benign cases: the builder is already consistent, no + * file exists for 'lsn', or the stored snapshot is not usable (it is not + * consistent, or its xmin precedes builder->initial_xmin_horizon). Read + * failures, short reads, a wrong magic number or version, and a checksum + * mismatch are reported with ereport(ERROR). + */ +static bool +SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn) +{ + SnapBuildOnDisk ondisk; + int fd; + char path[MAXPGPATH]; + Size sz; + int readBytes; + pg_crc32c checksum; + + /* no point in loading a snapshot if we're already there */ + if (builder->state == SNAPBUILD_CONSISTENT) + return false; + + sprintf(path, "pg_logical/snapshots/%X-%X.snap", + LSN_FORMAT_ARGS(lsn)); + + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + + if (fd < 0 && errno == ENOENT) + return false; + else if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + + /* ---- + * Make sure the snapshot had been stored safely to disk, that's normally + * cheap. + * Note that we do not need PANIC here, nobody will be able to use the + * slot without fsyncing, and saving it won't succeed without an fsync() + * either...
+ * ---- + */ + fsync_fname(path, false); + fsync_fname("pg_logical/snapshots", true); + + + /* read statically sized portion of snapshot (magic, checksum, version, length) */ + pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ); + readBytes = read(fd, &ondisk, SnapBuildOnDiskConstantSize); + pgstat_report_wait_end(); + if (readBytes != SnapBuildOnDiskConstantSize) + { + int save_errno = errno; + + CloseTransientFile(fd); + + if (readBytes < 0) + { + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", path))); + } + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %zu", + path, readBytes, + (Size) SnapBuildOnDiskConstantSize))); + } + + if (ondisk.magic != SNAPBUILD_MAGIC) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("snapbuild state file \"%s\" has wrong magic number: %u instead of %u", + path, ondisk.magic, SNAPBUILD_MAGIC))); + + if (ondisk.version != SNAPBUILD_VERSION) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("snapbuild state file \"%s\" has unsupported version: %u instead of %u", + path, ondisk.version, SNAPBUILD_VERSION))); + + INIT_CRC32C(checksum); /* recomputed over the same ranges, in the same order, as SnapBuildSerialize() */ + COMP_CRC32C(checksum, + ((char *) &ondisk) + SnapBuildOnDiskNotChecksummedSize, + SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize); + + /* read SnapBuild */ + pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ); + readBytes = read(fd, &ondisk.builder, sizeof(SnapBuild)); + pgstat_report_wait_end(); + if (readBytes != sizeof(SnapBuild)) + { + int save_errno = errno; + + CloseTransientFile(fd); + + if (readBytes < 0) + { + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", path))); + } + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %zu", + path, readBytes, sizeof(SnapBuild)))); + } + COMP_CRC32C(checksum, &ondisk.builder, sizeof(SnapBuild)); + + /* restore
committed xacts information */ + sz = sizeof(TransactionId) * ondisk.builder.committed.xcnt; + ondisk.builder.committed.xip = MemoryContextAllocZero(builder->context, sz); /* adopted by 'builder' or pfree'd at snapshot_not_interesting below */ + pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ); + readBytes = read(fd, ondisk.builder.committed.xip, sz); + pgstat_report_wait_end(); + if (readBytes != sz) + { + int save_errno = errno; + + CloseTransientFile(fd); + + if (readBytes < 0) + { + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", path))); + } + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %zu", + path, readBytes, sz))); + } + COMP_CRC32C(checksum, ondisk.builder.committed.xip, sz); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + + FIN_CRC32C(checksum); + + /* verify checksum of what we've read */ + if (!EQ_CRC32C(checksum, ondisk.checksum)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("checksum mismatch for snapbuild state file \"%s\": is %u, should be %u", + path, checksum, ondisk.checksum))); + + /* + * ok, we now have a sensible snapshot here, figure out if it has more + * information than we have. + */ + + /* + * We are only interested in consistent snapshots for now, comparing + * whether one incomplete snapshot is more "advanced" seems to be + * unnecessarily complex. + */ + if (ondisk.builder.state < SNAPBUILD_CONSISTENT) + goto snapshot_not_interesting; + + /* + * Don't use a snapshot that requires an xmin that we cannot guarantee to + * be available.
+ */ + if (TransactionIdPrecedes(ondisk.builder.xmin, builder->initial_xmin_horizon)) + goto snapshot_not_interesting; + + /* consistent snapshots have no next phase */ + Assert(ondisk.builder.next_phase_at == InvalidTransactionId); + + /* ok, we think the snapshot is sensible, copy over everything important */ + builder->xmin = ondisk.builder.xmin; + builder->xmax = ondisk.builder.xmax; + builder->state = ondisk.builder.state; + + builder->committed.xcnt = ondisk.builder.committed.xcnt; + /* We only allocated/stored xcnt, not xcnt_space xids ! */ + /* don't overwrite preallocated xip, if we don't have anything here */ + if (builder->committed.xcnt > 0) + { + pfree(builder->committed.xip); + builder->committed.xcnt_space = ondisk.builder.committed.xcnt; + builder->committed.xip = ondisk.builder.committed.xip; + } + ondisk.builder.committed.xip = NULL; /* NOTE(review): with xcnt == 0 the zero-length array allocated above is neither adopted nor pfree'd here */ + + /* our snapshot is not interesting anymore, build a new one */ + if (builder->snapshot != NULL) + { + SnapBuildSnapDecRefcount(builder->snapshot); + } + builder->snapshot = SnapBuildBuildSnapshot(builder); + SnapBuildSnapIncRefcount(builder->snapshot); + + ReorderBufferSetRestartPoint(builder->reorder, lsn); + + Assert(builder->state == SNAPBUILD_CONSISTENT); + + ereport(LOG, + (errmsg("logical decoding found consistent point at %X/%X", + LSN_FORMAT_ARGS(lsn)), + errdetail("Logical decoding will begin using saved snapshot."))); + return true; + +snapshot_not_interesting: + if (ondisk.builder.committed.xip != NULL) + pfree(ondisk.builder.committed.xip); + return false; +} + +/* + * Remove all serialized snapshots that are not required anymore because no + * slot can need them. This doesn't actually have to run during a checkpoint, + * but it's a convenient point to schedule this. + * + * NB: We run this during checkpoints even if logical decoding is disabled so + * we cleanup old slots at some point after it got disabled.
+ */ +void +CheckPointSnapBuild(void) +{ + XLogRecPtr cutoff; + XLogRecPtr redo; + DIR *snap_dir; + struct dirent *snap_de; + char path[MAXPGPATH + 21]; /* 21 == strlen("pg_logical/snapshots/") */ + + /* + * We start off with a minimum of the last redo pointer. No new + * replication slot will start before that, so that's a safe upper bound + * for removal. + */ + redo = GetRedoRecPtr(); + + /* now check for the restart ptrs from existing slots */ + cutoff = ReplicationSlotsComputeLogicalRestartLSN(); + + /* use the older of the redo pointer and the slots' restart lsn as cutoff */ + if (redo < cutoff) + cutoff = redo; + + snap_dir = AllocateDir("pg_logical/snapshots"); + while ((snap_de = ReadDir(snap_dir, "pg_logical/snapshots")) != NULL) + { + uint32 hi; + uint32 lo; + XLogRecPtr lsn; + struct stat statbuf; + + if (strcmp(snap_de->d_name, ".") == 0 || + strcmp(snap_de->d_name, "..") == 0) + continue; + + snprintf(path, sizeof(path), "pg_logical/snapshots/%s", snap_de->d_name); + + if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode)) + { + elog(DEBUG1, "only regular files expected: %s", path); + continue; + } + + /* + * temporary filenames from SnapBuildSerialize() include the LSN and + * everything but are postfixed by .$pid.tmp. We can just remove them + * the same as other files because there can be none that are + * currently being written that are older than cutoff. + * + * We just log a message if a file doesn't fit the pattern, it's + * probably some editor's lock/state file or similar... + */ + if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2) + { + ereport(LOG, + (errmsg("could not parse file name \"%s\"", path))); + continue; + } + + lsn = ((uint64) hi) << 32 | lo; + + /* + * check whether we still need it; an invalid cutoff makes every + * snapshot file that parsed above eligible for removal + */ + if (lsn < cutoff || cutoff == InvalidXLogRecPtr) + { + elog(DEBUG1, "removing snapbuild snapshot %s", path); + + /* + * It's not particularly harmful, though strange, if we can't + * remove the file here. Don't prevent the checkpoint from + * completing, that'd be a cure worse than the disease.
+ */ + if (unlink(path) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + path))); + continue; + } + } + } + FreeDir(snap_dir); +} diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c new file mode 100644 index 0000000..682c107 --- /dev/null +++ b/src/backend/replication/logical/tablesync.c @@ -0,0 +1,1160 @@ +/*------------------------------------------------------------------------- + * tablesync.c + * PostgreSQL logical replication: initial table data synchronization + * + * Copyright (c) 2012-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/tablesync.c + * + * NOTES + * This file contains code for initial table data synchronization for + * logical replication. + * + * The initial data synchronization is done separately for each table, + * in a separate apply worker that only fetches the initial snapshot data + * from the publisher and then synchronizes the position in the stream with + * the main apply worker. + * + * There are several reasons for doing the synchronization this way: + * - It allows us to parallelize the initial data synchronization + * which lowers the time needed for it to happen. + * - The initial synchronization does not have to hold the xid and LSN + * for the time it takes to copy data of all tables, causing less + * bloat and lower disk consumption compared to doing the + * synchronization in a single process for the whole database. + * - It allows us to synchronize any tables added after the initial + * synchronization has finished. + * + * The stream position synchronization works in multiple steps: + * - Apply worker requests a tablesync worker to start, setting the new + * table state to INIT. + * - Tablesync worker starts; changes table state from INIT to DATASYNC while + * copying.
+ * - Tablesync worker does initial table copy; there is a FINISHEDCOPY (sync + * worker specific) state to indicate when the copy phase has completed, so + * if the worker crashes with this (non-memory) state then the copy will not + * be re-attempted. + * - Tablesync worker then sets table state to SYNCWAIT; waits for state change. + * - Apply worker periodically checks for tables in SYNCWAIT state. When + * any appear, it sets the table state to CATCHUP and starts loop-waiting + * until either the table state is set to SYNCDONE or the sync worker + * exits. + * - After the sync worker has seen the state change to CATCHUP, it will + * read the stream and apply changes (acting like an apply worker) until + * it catches up to the specified stream position. Then it sets the + * state to SYNCDONE. There might be zero changes applied between + * CATCHUP and SYNCDONE, because the sync worker might be ahead of the + * apply worker. + * - Once the state is set to SYNCDONE, the apply will continue tracking + * the table until it reaches the SYNCDONE stream position, at which + * point it sets state to READY and stops tracking. Again, there might + * be zero changes in between. + * + * So the state progression is always: INIT -> DATASYNC -> FINISHEDCOPY + * -> SYNCWAIT -> CATCHUP -> SYNCDONE -> READY. + * + * The catalog pg_subscription_rel is used to keep information about + * subscribed tables and their state. The catalog holds all states + * except SYNCWAIT and CATCHUP which are only in shared memory. 
+ * + * Example flows look like this: + * - Apply is in front: + * sync:8 + * -> set in catalog FINISHEDCOPY + * -> set in memory SYNCWAIT + * apply:10 + * -> set in memory CATCHUP + * -> enter wait-loop + * sync:10 + * -> set in catalog SYNCDONE + * -> exit + * apply:10 + * -> exit wait-loop + * -> continue rep + * apply:11 + * -> set in catalog READY + * + * - Sync is in front: + * sync:10 + * -> set in catalog FINISHEDCOPY + * -> set in memory SYNCWAIT + * apply:8 + * -> set in memory CATCHUP + * -> continue per-table filtering + * sync:10 + * -> set in catalog SYNCDONE + * -> exit + * apply:10 + * -> set in catalog READY + * -> stop per-table filtering + * -> continue rep + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/table.h" +#include "access/xact.h" +#include "catalog/pg_subscription_rel.h" +#include "catalog/pg_type.h" +#include "commands/copy.h" +#include "miscadmin.h" +#include "parser/parse_relation.h" +#include "pgstat.h" +#include "replication/logicallauncher.h" +#include "replication/logicalrelation.h" +#include "replication/walreceiver.h" +#include "replication/worker_internal.h" +#include "replication/slot.h" +#include "replication/origin.h" +#include "storage/ipc.h" +#include "storage/lmgr.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/snapmgr.h" + +static bool table_states_valid = false; /* cleared by invalidate_syncing_table_states(), set again once process_syncing_tables_for_apply() has reloaded its list */ + +StringInfo copybuf = NULL; + +/* + * Exit routine for synchronization worker. + * + * Commits any transaction still in progress, flushes WAL, logs that the + * worker has finished, wakes up the main apply worker and then calls + * proc_exit(0); it never returns to the caller. + */ +static void +pg_attribute_noreturn() +finish_sync_worker(void) +{ + /* + * Commit any outstanding transaction. This is the usual case, unless + * there was nothing to do for the table. + */ + if (IsTransactionState()) + { + CommitTransactionCommand(); + pgstat_report_stat(false); + } + + /* And flush all writes.
*/ + XLogFlush(GetXLogWriteRecPtr()); + + StartTransactionCommand(); + ereport(LOG, + (errmsg("logical replication table synchronization worker for subscription \"%s\", table \"%s\" has finished", + MySubscription->name, + get_rel_name(MyLogicalRepWorker->relid)))); + CommitTransactionCommand(); + + /* Find the main apply worker and signal it. */ + logicalrep_worker_wakeup(MyLogicalRepWorker->subid, InvalidOid); + + /* Stop gracefully */ + proc_exit(0); +} + +/* + * Wait until the relation sync state is set in the catalog to the expected + * one; return true when it happens. + * + * Returns false if the table sync worker or the table itself have + * disappeared, or the table state has been reset. + * + * The state is rechecked after every latch wakeup, and after 1000 ms at the + * latest. + * + * Currently, this is used in the apply worker when transitioning from + * CATCHUP state to SYNCDONE. + */ +static bool +wait_for_relation_state_change(Oid relid, char expected_state) +{ + char state; + + for (;;) + { + LogicalRepWorker *worker; + XLogRecPtr statelsn; + + CHECK_FOR_INTERRUPTS(); + + InvalidateCatalogSnapshot(); /* presumably so the lookup below sees the latest committed state */ + state = GetSubscriptionRelState(MyLogicalRepWorker->subid, + relid, &statelsn); + + if (state == SUBREL_STATE_UNKNOWN) + break; + + if (state == expected_state) + return true; + + /* Check if the sync worker is still running and bail if not. */ + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + worker = logicalrep_worker_find(MyLogicalRepWorker->subid, relid, + false); + LWLockRelease(LogicalRepWorkerLock); + if (!worker) + break; + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 1000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE); + + ResetLatch(MyLatch); + } + + return false; +} + +/* + * Wait until the apply worker changes the state of our synchronization + * worker to the expected one. + * + * Used when transitioning from SYNCWAIT state to CATCHUP. + * + * Returns false if the apply worker has disappeared.
+ */ +static bool +wait_for_worker_state_change(char expected_state) +{ + int rc; + + for (;;) + { + LogicalRepWorker *worker; + + CHECK_FOR_INTERRUPTS(); + + /* + * Done if already in correct state. (We assume this fetch is atomic + * enough to not give a misleading answer if we do it with no lock.) + */ + if (MyLogicalRepWorker->relstate == expected_state) + return true; + + /* + * Bail out if the apply worker has died, else signal it we're + * waiting. The apply worker is looked up as the worker of our + * subscription whose relid is InvalidOid. + */ + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + worker = logicalrep_worker_find(MyLogicalRepWorker->subid, + InvalidOid, false); + if (worker && worker->proc) + logicalrep_worker_wakeup_ptr(worker); + LWLockRelease(LogicalRepWorkerLock); + if (!worker) + break; + + /* + * Wait. We expect to get a latch signal back from the apply worker, + * but use a timeout (1000 ms) in case it dies without sending one. + */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 1000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE); + + if (rc & WL_LATCH_SET) + ResetLatch(MyLatch); + } + + return false; +} + +/* + * Callback from syscache invalidation. + * + * Marks the cached table state list as stale, so that + * process_syncing_tables_for_apply() reloads it from the catalog the next + * time it runs. None of the arguments are used. + */ +void +invalidate_syncing_table_states(Datum arg, int cacheid, uint32 hashvalue) +{ + table_states_valid = false; +} + +/* + * Handle table synchronization cooperation from the synchronization + * worker. + * + * If the sync worker is in CATCHUP state and reached (or passed) the + * predetermined synchronization point in the WAL stream, mark the table as + * SYNCDONE and finish.
+ */ +static void +process_syncing_tables_for_sync(XLogRecPtr current_lsn) +{ + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + + if (MyLogicalRepWorker->relstate == SUBREL_STATE_CATCHUP && + current_lsn >= MyLogicalRepWorker->relstate_lsn) + { + TimeLineID tli; /* filled in by walrcv_endstreaming(), not used otherwise */ + char syncslotname[NAMEDATALEN] = {0}; + + MyLogicalRepWorker->relstate = SUBREL_STATE_SYNCDONE; + MyLogicalRepWorker->relstate_lsn = current_lsn; + + SpinLockRelease(&MyLogicalRepWorker->relmutex); /* drop the spinlock before the catalog and connection work below */ + + /* + * UpdateSubscriptionRelState must be called within a transaction. + * That transaction will be ended within finish_sync_worker(). + */ + if (!IsTransactionState()) + StartTransactionCommand(); + + UpdateSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + MyLogicalRepWorker->relstate, + MyLogicalRepWorker->relstate_lsn); + + /* + * End streaming so that LogRepWorkerWalRcvConn can be used to drop + * the slot. + */ + walrcv_endstreaming(LogRepWorkerWalRcvConn, &tli); + + /* + * Cleanup the tablesync slot. + * + * This has to be done after updating the state because otherwise if + * there is an error while doing the database operations we won't be + * able to roll back the dropped slot. + */ + ReplicationSlotNameForTablesync(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + syncslotname, + sizeof(syncslotname)); + + /* + * It is important to give an error if we are unable to drop the slot, + * otherwise, it won't be dropped till the corresponding subscription + * is dropped. So passing missing_ok = false. + */ + ReplicationSlotDropAtPubNode(LogRepWorkerWalRcvConn, syncslotname, false); + + finish_sync_worker(); /* does not return: it ends in proc_exit(0) */ + } + else + SpinLockRelease(&MyLogicalRepWorker->relmutex); +} + +/* + * Handle table synchronization cooperation from the apply worker. + * + * Walk over all subscription tables that are individually tracked by the + * apply process (currently, all that have state other than + * SUBREL_STATE_READY) and manage synchronization for them.
+ * + * If there are tables that need synchronizing and are not being synchronized + * yet, start sync workers for them (if there are free slots for sync + * workers). To prevent starting the sync worker for the same relation at a + * high frequency after a failure, we store its last start time with each sync + * state info. We start the sync worker for the same relation after waiting + * at least wal_retrieve_retry_interval. + * + * For tables that are being synchronized already, check if sync workers + * either need action from the apply worker or have finished. This is the + * SYNCWAIT to CATCHUP transition. + * + * If the synchronization position is reached (SYNCDONE), then the table can + * be marked as READY and is no longer tracked. + */ +static void +process_syncing_tables_for_apply(XLogRecPtr current_lsn) +{ + struct tablesync_start_time_mapping + { + Oid relid; + TimestampTz last_start_time; + }; + static List *table_states = NIL; + static HTAB *last_start_times = NULL; + ListCell *lc; + bool started_tx = false; + + Assert(!IsTransactionState()); + + /* We need up-to-date sync state info for subscription tables here. */ + if (!table_states_valid) + { + MemoryContext oldctx; + List *rstates; + ListCell *lc; + SubscriptionRelState *rstate; + + /* Clean the old list. */ + list_free_deep(table_states); + table_states = NIL; + + StartTransactionCommand(); + started_tx = true; + + /* Fetch all non-ready tables. */ + rstates = GetSubscriptionNotReadyRelations(MySubscription->oid); + + /* Allocate the tracking info in a permanent memory context. */ + oldctx = MemoryContextSwitchTo(CacheMemoryContext); + foreach(lc, rstates) + { + rstate = palloc(sizeof(SubscriptionRelState)); + memcpy(rstate, lfirst(lc), sizeof(SubscriptionRelState)); + table_states = lappend(table_states, rstate); + } + MemoryContextSwitchTo(oldctx); + + table_states_valid = true; + } + + /* + * Prepare a hash table for tracking last start times of workers, to avoid + * immediate restarts. 
We don't need it if there are no tables that need + * syncing. + */ + if (table_states && !last_start_times) + { + HASHCTL ctl; + + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(struct tablesync_start_time_mapping); + last_start_times = hash_create("Logical replication table sync worker start times", + 256, &ctl, HASH_ELEM | HASH_BLOBS); + } + + /* + * Clean up the hash table when we're done with all tables (just to + * release the bit of memory). + */ + else if (!table_states && last_start_times) + { + hash_destroy(last_start_times); + last_start_times = NULL; + } + + /* + * Process all tables that are being synchronized. + */ + foreach(lc, table_states) + { + SubscriptionRelState *rstate = (SubscriptionRelState *) lfirst(lc); + + if (rstate->state == SUBREL_STATE_SYNCDONE) + { + /* + * Apply has caught up to the position where the table sync has + * finished. Mark the table as ready so that the apply will just + * continue to replicate it normally. + */ + if (current_lsn >= rstate->lsn) + { + char originname[NAMEDATALEN]; + + rstate->state = SUBREL_STATE_READY; + rstate->lsn = current_lsn; + if (!started_tx) + { + StartTransactionCommand(); + started_tx = true; + } + + /* + * Remove the tablesync origin tracking if exists. + * + * The normal case origin drop is done here instead of in the + * process_syncing_tables_for_sync function because we don't + * allow to drop the origin till the process owning the origin + * is alive. + * + * There is a chance that the user is concurrently performing + * refresh for the subscription where we remove the table + * state and its origin and by this time the origin might be + * already removed. So passing missing_ok = true. + */ + ReplicationOriginNameForTablesync(MyLogicalRepWorker->subid, + rstate->relid, + originname, + sizeof(originname)); + replorigin_drop_by_name(originname, true, false); + + /* + * Update the state to READY only after the origin cleanup. 
+ */ + UpdateSubscriptionRelState(MyLogicalRepWorker->subid, + rstate->relid, rstate->state, + rstate->lsn); + } + } + else + { + LogicalRepWorker *syncworker; + + /* + * Look for a sync worker for this relation. + */ + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + syncworker = logicalrep_worker_find(MyLogicalRepWorker->subid, + rstate->relid, false); + + if (syncworker) + { + /* Found one, update our copy of its state */ + SpinLockAcquire(&syncworker->relmutex); + rstate->state = syncworker->relstate; + rstate->lsn = syncworker->relstate_lsn; + if (rstate->state == SUBREL_STATE_SYNCWAIT) + { + /* + * Sync worker is waiting for apply. Tell sync worker it + * can catchup now. + */ + syncworker->relstate = SUBREL_STATE_CATCHUP; + syncworker->relstate_lsn = + Max(syncworker->relstate_lsn, current_lsn); + } + SpinLockRelease(&syncworker->relmutex); + + /* If we told worker to catch up, wait for it. */ + if (rstate->state == SUBREL_STATE_SYNCWAIT) + { + /* Signal the sync worker, as it may be waiting for us. */ + if (syncworker->proc) + logicalrep_worker_wakeup_ptr(syncworker); + + /* Now safe to release the LWLock */ + LWLockRelease(LogicalRepWorkerLock); + + /* + * Enter busy loop and wait for synchronization worker to + * reach expected state (or die trying). + */ + if (!started_tx) + { + StartTransactionCommand(); + started_tx = true; + } + + wait_for_relation_state_change(rstate->relid, + SUBREL_STATE_SYNCDONE); + } + else + LWLockRelease(LogicalRepWorkerLock); + } + else + { + /* + * If there is no sync worker for this table yet, count + * running sync workers for this subscription, while we have + * the lock. + */ + int nsyncworkers = + logicalrep_sync_worker_count(MyLogicalRepWorker->subid); + + /* Now safe to release the LWLock */ + LWLockRelease(LogicalRepWorkerLock); + + /* + * If there are free sync worker slot(s), start a new sync + * worker for the table. 
+				 */
+				if (nsyncworkers < max_sync_workers_per_subscription)
+				{
+					TimestampTz now = GetCurrentTimestamp();
+					struct tablesync_start_time_mapping *hentry;
+					bool		found;
+
+					hentry = hash_search(last_start_times, &rstate->relid,
+										 HASH_ENTER, &found);
+
+					if (!found ||
+						TimestampDifferenceExceeds(hentry->last_start_time, now,
+												   wal_retrieve_retry_interval))
+					{
+						logicalrep_worker_launch(MyLogicalRepWorker->dbid,
+												 MySubscription->oid,
+												 MySubscription->name,
+												 MyLogicalRepWorker->userid,
+												 rstate->relid);
+						hentry->last_start_time = now;
+					}
+				}
+			}
+		}
+	}
+
+	if (started_tx)
+	{
+		CommitTransactionCommand();
+		pgstat_report_stat(false);
+	}
+}
+
+/*
+ * Process possible state change(s) of tables that are being synchronized.
+ */
+void
+process_syncing_tables(XLogRecPtr current_lsn)
+{
+	if (am_tablesync_worker())
+		process_syncing_tables_for_sync(current_lsn);
+	else
+		process_syncing_tables_for_apply(current_lsn);
+}
+
+/*
+ * Create list of columns for COPY based on logical relation mapping.
+ */
+static List *
+make_copy_attnamelist(LogicalRepRelMapEntry *rel)
+{
+	List	   *attnamelist = NIL;
+	int			i;
+
+	for (i = 0; i < rel->remoterel.natts; i++)
+	{
+		attnamelist = lappend(attnamelist,
+							  makeString(rel->remoterel.attnames[i]));
+	}
+
+
+	return attnamelist;
+}
+
+/*
+ * Data source callback for the COPY FROM, which reads from the remote
+ * connection and passes the data back to our local COPY.
+ *
+ * Stores at most maxread bytes into outbuf and returns the number of bytes
+ * stored.  Data left over in copybuf from a previous call is handed out
+ * first.  After that we keep receiving from the publisher until at least
+ * minread bytes have been stored or outbuf is full.  If walrcv_receive()
+ * reports a negative length, whatever has been gathered so far is returned.
+ */
+static int
+copy_read_data(void *outbuf, int minread, int maxread)
+{
+	int			bytesread = 0;
+	int			avail;
+
+	/* If there are some leftover data from previous read, use it. */
+	avail = copybuf->len - copybuf->cursor;
+	if (avail)
+	{
+		if (avail > maxread)
+			avail = maxread;
+		memcpy(outbuf, &copybuf->data[copybuf->cursor], avail);
+
+		/*
+		 * Advance the output pointer, exactly as the receive path below
+		 * does.  Otherwise, when the leftover bytes do not satisfy minread,
+		 * the next chunk would be copied over them and the returned byte
+		 * count would cover data that is no longer in the buffer.
+		 */
+		outbuf = (void *) ((char *) outbuf + avail);
+		copybuf->cursor += avail;
+		maxread -= avail;
+		bytesread += avail;
+	}
+
+	while (maxread > 0 && bytesread < minread)
+	{
+		pgsocket	fd = PGINVALID_SOCKET;
+		int			len;
+		char	   *buf = NULL;
+
+		for (;;)
+		{
+			/* Try to read the data. */
+			len = walrcv_receive(LogRepWorkerWalRcvConn, &buf, &fd);
+
+			CHECK_FOR_INTERRUPTS();
+
+			if (len == 0)
+				break;			/* nothing received; go wait below */
+			else if (len < 0)
+				return bytesread;
+			else
+			{
+				/* Process the data */
+				copybuf->data = buf;
+				copybuf->len = len;
+				copybuf->cursor = 0;
+
+				avail = copybuf->len - copybuf->cursor;
+				if (avail > maxread)
+					avail = maxread;
+				memcpy(outbuf, &copybuf->data[copybuf->cursor], avail);
+				outbuf = (void *) ((char *) outbuf + avail);
+				copybuf->cursor += avail;
+				maxread -= avail;
+				bytesread += avail;
+			}
+
+			if (maxread <= 0 || bytesread >= minread)
+				return bytesread;
+		}
+
+		/*
+		 * Wait for more data or latch.
+		 */
+		(void) WaitLatchOrSocket(MyLatch,
+								 WL_SOCKET_READABLE | WL_LATCH_SET |
+								 WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+								 fd, 1000L, WAIT_EVENT_LOGICAL_SYNC_DATA);
+
+		ResetLatch(MyLatch);
+	}
+
+	return bytesread;
+}
+
+
+/*
+ * Get information about remote relation in similar fashion the RELATION
+ * message provides during replication.
+ */
+static void
+fetch_remote_table_info(char *nspname, char *relname,
+						LogicalRepRelation *lrel)
+{
+	WalRcvExecResult *res;
+	StringInfoData cmd;
+	TupleTableSlot *slot;
+	Oid			tableRow[] = {OIDOID, CHAROID, CHAROID};
+	Oid			attrRow[] = {TEXTOID, OIDOID, BOOLOID};
+	bool		isnull;
+	int			natt;
+
+	lrel->nspname = nspname;
+	lrel->relname = relname;
+
+	/* First fetch Oid and replica identity.
*/ + initStringInfo(&cmd); + appendStringInfo(&cmd, "SELECT c.oid, c.relreplident, c.relkind" + " FROM pg_catalog.pg_class c" + " INNER JOIN pg_catalog.pg_namespace n" + " ON (c.relnamespace = n.oid)" + " WHERE n.nspname = %s" + " AND c.relname = %s", + quote_literal_cstr(nspname), + quote_literal_cstr(relname)); + res = walrcv_exec(LogRepWorkerWalRcvConn, cmd.data, + lengthof(tableRow), tableRow); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not fetch table info for table \"%s.%s\" from publisher: %s", + nspname, relname, res->err))); + + slot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple); + if (!tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("table \"%s.%s\" not found on publisher", + nspname, relname))); + + lrel->remoteid = DatumGetObjectId(slot_getattr(slot, 1, &isnull)); + Assert(!isnull); + lrel->replident = DatumGetChar(slot_getattr(slot, 2, &isnull)); + Assert(!isnull); + lrel->relkind = DatumGetChar(slot_getattr(slot, 3, &isnull)); + Assert(!isnull); + + ExecDropSingleTupleTableSlot(slot); + walrcv_clear_result(res); + + /* Now fetch columns. */ + resetStringInfo(&cmd); + appendStringInfo(&cmd, + "SELECT a.attname," + " a.atttypid," + " a.attnum = ANY(i.indkey)" + " FROM pg_catalog.pg_attribute a" + " LEFT JOIN pg_catalog.pg_index i" + " ON (i.indexrelid = pg_get_replica_identity_index(%u))" + " WHERE a.attnum > 0::pg_catalog.int2" + " AND NOT a.attisdropped %s" + " AND a.attrelid = %u" + " ORDER BY a.attnum", + lrel->remoteid, + (walrcv_server_version(LogRepWorkerWalRcvConn) >= 120000 ? 
+ "AND a.attgenerated = ''" : ""), + lrel->remoteid); + res = walrcv_exec(LogRepWorkerWalRcvConn, cmd.data, + lengthof(attrRow), attrRow); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not fetch table info for table \"%s.%s\" from publisher: %s", + nspname, relname, res->err))); + + /* We don't know the number of rows coming, so allocate enough space. */ + lrel->attnames = palloc0(MaxTupleAttributeNumber * sizeof(char *)); + lrel->atttyps = palloc0(MaxTupleAttributeNumber * sizeof(Oid)); + lrel->attkeys = NULL; + + natt = 0; + slot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple); + while (tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + { + lrel->attnames[natt] = + TextDatumGetCString(slot_getattr(slot, 1, &isnull)); + Assert(!isnull); + lrel->atttyps[natt] = DatumGetObjectId(slot_getattr(slot, 2, &isnull)); + Assert(!isnull); + if (DatumGetBool(slot_getattr(slot, 3, &isnull))) + lrel->attkeys = bms_add_member(lrel->attkeys, natt); + + /* Should never happen. */ + if (++natt >= MaxTupleAttributeNumber) + elog(ERROR, "too many columns in remote table \"%s.%s\"", + nspname, relname); + + ExecClearTuple(slot); + } + ExecDropSingleTupleTableSlot(slot); + + lrel->natts = natt; + + walrcv_clear_result(res); + pfree(cmd.data); +} + +/* + * Copy existing data of a table from publisher. + * + * Caller is responsible for locking the local relation. + */ +static void +copy_table(Relation rel) +{ + LogicalRepRelMapEntry *relmapentry; + LogicalRepRelation lrel; + WalRcvExecResult *res; + StringInfoData cmd; + CopyFromState cstate; + List *attnamelist; + ParseState *pstate; + + /* Get the publisher relation info. */ + fetch_remote_table_info(get_namespace_name(RelationGetNamespace(rel)), + RelationGetRelationName(rel), &lrel); + + /* Put the relation into relmap. */ + logicalrep_relmap_update(&lrel); + + /* Map the publisher relation to local one. 
*/ + relmapentry = logicalrep_rel_open(lrel.remoteid, NoLock); + Assert(rel == relmapentry->localrel); + + /* Start copy on the publisher. */ + initStringInfo(&cmd); + if (lrel.relkind == RELKIND_RELATION) + appendStringInfo(&cmd, "COPY %s TO STDOUT", + quote_qualified_identifier(lrel.nspname, lrel.relname)); + else + { + /* + * For non-tables, we need to do COPY (SELECT ...), but we can't just + * do SELECT * because we need to not copy generated columns. + */ + appendStringInfoString(&cmd, "COPY (SELECT "); + for (int i = 0; i < lrel.natts; i++) + { + appendStringInfoString(&cmd, quote_identifier(lrel.attnames[i])); + if (i < lrel.natts - 1) + appendStringInfoString(&cmd, ", "); + } + appendStringInfo(&cmd, " FROM %s) TO STDOUT", + quote_qualified_identifier(lrel.nspname, lrel.relname)); + } + res = walrcv_exec(LogRepWorkerWalRcvConn, cmd.data, 0, NULL); + pfree(cmd.data); + if (res->status != WALRCV_OK_COPY_OUT) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not start initial contents copy for table \"%s.%s\": %s", + lrel.nspname, lrel.relname, res->err))); + walrcv_clear_result(res); + + copybuf = makeStringInfo(); + + pstate = make_parsestate(NULL); + (void) addRangeTableEntryForRelation(pstate, rel, AccessShareLock, + NULL, false, false); + + attnamelist = make_copy_attnamelist(relmapentry); + cstate = BeginCopyFrom(pstate, rel, NULL, NULL, false, copy_read_data, attnamelist, NIL); + + /* Do the copy */ + (void) CopyFrom(cstate); + + logicalrep_rel_close(relmapentry, NoLock); +} + +/* + * Determine the tablesync slot name. + * + * The name must not exceed NAMEDATALEN - 1 because of remote node constraints + * on slot name length. We append system_identifier to avoid slot_name + * collision with subscriptions in other clusters. With the current scheme + * pg_%u_sync_%u_UINT64_FORMAT (3 + 10 + 6 + 10 + 20 + '\0'), the maximum + * length of slot_name will be 50. 
+ * + * The returned slot name is stored in the supplied buffer (syncslotname) with + * the given size. + * + * Note: We don't use the subscription slot name as part of tablesync slot name + * because we are responsible for cleaning up these slots and it could become + * impossible to recalculate what name to cleanup if the subscription slot name + * had changed. + */ +void +ReplicationSlotNameForTablesync(Oid suboid, Oid relid, + char *syncslotname, int szslot) +{ + snprintf(syncslotname, szslot, "pg_%u_sync_%u_" UINT64_FORMAT, suboid, + relid, GetSystemIdentifier()); +} + +/* + * Form the origin name for tablesync. + * + * Return the name in the supplied buffer. + */ +void +ReplicationOriginNameForTablesync(Oid suboid, Oid relid, + char *originname, int szorgname) +{ + snprintf(originname, szorgname, "pg_%u_%u", suboid, relid); +} + +/* + * Start syncing the table in the sync worker. + * + * If nothing needs to be done to sync the table, we exit the worker without + * any further action. + * + * The returned slot name is palloc'ed in current memory context. + */ +char * +LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) +{ + char *slotname; + char *err; + char relstate; + XLogRecPtr relstate_lsn; + Relation rel; + WalRcvExecResult *res; + char originname[NAMEDATALEN]; + RepOriginId originid; + + /* Check the state of the table synchronization. */ + StartTransactionCommand(); + relstate = GetSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + &relstate_lsn); + CommitTransactionCommand(); + + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->relstate = relstate; + MyLogicalRepWorker->relstate_lsn = relstate_lsn; + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + /* + * If synchronization is already done or no longer necessary, exit now + * that we've updated shared memory state. 
+ */ + switch (relstate) + { + case SUBREL_STATE_SYNCDONE: + case SUBREL_STATE_READY: + case SUBREL_STATE_UNKNOWN: + finish_sync_worker(); /* doesn't return */ + } + + /* Calculate the name of the tablesync slot. */ + slotname = (char *) palloc(NAMEDATALEN); + ReplicationSlotNameForTablesync(MySubscription->oid, + MyLogicalRepWorker->relid, + slotname, + NAMEDATALEN); + + /* + * Here we use the slot name instead of the subscription name as the + * application_name, so that it is different from the main apply worker, + * so that synchronous replication can distinguish them. + */ + LogRepWorkerWalRcvConn = + walrcv_connect(MySubscription->conninfo, true, slotname, &err); + if (LogRepWorkerWalRcvConn == NULL) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not connect to the publisher: %s", err))); + + Assert(MyLogicalRepWorker->relstate == SUBREL_STATE_INIT || + MyLogicalRepWorker->relstate == SUBREL_STATE_DATASYNC || + MyLogicalRepWorker->relstate == SUBREL_STATE_FINISHEDCOPY); + + /* Assign the origin tracking record name. */ + ReplicationOriginNameForTablesync(MySubscription->oid, + MyLogicalRepWorker->relid, + originname, + sizeof(originname)); + + if (MyLogicalRepWorker->relstate == SUBREL_STATE_DATASYNC) + { + /* + * We have previously errored out before finishing the copy so the + * replication slot might exist. We want to remove the slot if it + * already exists and proceed. + * + * XXX We could also instead try to drop the slot, last time we failed + * but for that, we might need to clean up the copy state as it might + * be in the middle of fetching the rows. Also, if there is a network + * breakdown then it wouldn't have succeeded so trying it next time + * seems like a better bet. 
+ */ + ReplicationSlotDropAtPubNode(LogRepWorkerWalRcvConn, slotname, true); + } + else if (MyLogicalRepWorker->relstate == SUBREL_STATE_FINISHEDCOPY) + { + /* + * The COPY phase was previously done, but tablesync then crashed + * before it was able to finish normally. + */ + StartTransactionCommand(); + + /* + * The origin tracking name must already exist. It was created first + * time this tablesync was launched. + */ + originid = replorigin_by_name(originname, false); + replorigin_session_setup(originid); + replorigin_session_origin = originid; + *origin_startpos = replorigin_session_get_progress(false); + + CommitTransactionCommand(); + + goto copy_table_done; + } + + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->relstate = SUBREL_STATE_DATASYNC; + MyLogicalRepWorker->relstate_lsn = InvalidXLogRecPtr; + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + /* Update the state and make it visible to others. */ + StartTransactionCommand(); + UpdateSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + MyLogicalRepWorker->relstate, + MyLogicalRepWorker->relstate_lsn); + CommitTransactionCommand(); + pgstat_report_stat(false); + + StartTransactionCommand(); + + /* + * Use a standard write lock here. It might be better to disallow access + * to the table while it's being synchronized. But we don't want to block + * the main apply process from working and it has to open the relation in + * RowExclusiveLock when remapping remote relation id to local one. + */ + rel = table_open(MyLogicalRepWorker->relid, RowExclusiveLock); + + /* + * Start a transaction in the remote node in REPEATABLE READ mode. This + * ensures that both the replication slot we create (see below) and the + * COPY are consistent with each other. 
+ */ + res = walrcv_exec(LogRepWorkerWalRcvConn, + "BEGIN READ ONLY ISOLATION LEVEL REPEATABLE READ", + 0, NULL); + if (res->status != WALRCV_OK_COMMAND) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("table copy could not start transaction on publisher: %s", + res->err))); + walrcv_clear_result(res); + + /* + * Create a new permanent logical decoding slot. This slot will be used + * for the catchup phase after COPY is done, so tell it to use the + * snapshot to make the final data consistent. + * + * Prevent cancel/die interrupts while creating slot here because it is + * possible that before the server finishes this command, a concurrent + * drop subscription happens which would complete without removing this + * slot leading to a dangling slot on the server. + */ + HOLD_INTERRUPTS(); + walrcv_create_slot(LogRepWorkerWalRcvConn, slotname, false /* permanent */ , + CRS_USE_SNAPSHOT, origin_startpos); + RESUME_INTERRUPTS(); + + /* + * Setup replication origin tracking. The purpose of doing this before the + * copy is to avoid doing the copy again due to any error in setting up + * origin tracking. + */ + originid = replorigin_by_name(originname, true); + if (!OidIsValid(originid)) + { + /* + * Origin tracking does not exist, so create it now. + * + * Then advance to the LSN got from walrcv_create_slot. This is WAL + * logged for the purpose of recovery. Locks are to prevent the + * replication origin from vanishing while advancing. 
+ */ + originid = replorigin_create(originname); + + LockRelationOid(ReplicationOriginRelationId, RowExclusiveLock); + replorigin_advance(originid, *origin_startpos, InvalidXLogRecPtr, + true /* go backward */ , true /* WAL log */ ); + UnlockRelationOid(ReplicationOriginRelationId, RowExclusiveLock); + + replorigin_session_setup(originid); + replorigin_session_origin = originid; + } + else + { + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("replication origin \"%s\" already exists", + originname))); + } + + /* Now do the initial data copy */ + PushActiveSnapshot(GetTransactionSnapshot()); + copy_table(rel); + PopActiveSnapshot(); + + res = walrcv_exec(LogRepWorkerWalRcvConn, "COMMIT", 0, NULL); + if (res->status != WALRCV_OK_COMMAND) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("table copy could not finish transaction on publisher: %s", + res->err))); + walrcv_clear_result(res); + + table_close(rel, NoLock); + + /* Make the copy visible. */ + CommandCounterIncrement(); + + /* + * Update the persisted state to indicate the COPY phase is done; make it + * visible to others. + */ + UpdateSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + SUBREL_STATE_FINISHEDCOPY, + MyLogicalRepWorker->relstate_lsn); + + CommitTransactionCommand(); + +copy_table_done: + + elog(DEBUG1, + "LogicalRepSyncTableStart: '%s' origin_startpos lsn %X/%X", + originname, LSN_FORMAT_ARGS(*origin_startpos)); + + /* + * We are done with the initial data synchronization, update the state. + */ + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->relstate = SUBREL_STATE_SYNCWAIT; + MyLogicalRepWorker->relstate_lsn = *origin_startpos; + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + /* + * Finally, wait until the main apply worker tells us to catch up and then + * return to let LogicalRepApplyLoop do it. 
+ */ + wait_for_worker_state_change(SUBREL_STATE_CATCHUP); + return slotname; +} diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c new file mode 100644 index 0000000..8c9a4b5 --- /dev/null +++ b/src/backend/replication/logical/worker.c @@ -0,0 +1,3254 @@ +/*------------------------------------------------------------------------- + * worker.c + * PostgreSQL logical replication worker (apply) + * + * Copyright (c) 2016-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/worker.c + * + * NOTES + * This file contains the worker which applies logical changes as they come + * from remote logical replication stream. + * + * The main worker (apply) is started by logical replication worker + * launcher for every enabled subscription in a database. It uses + * walsender protocol to communicate with publisher. + * + * This module includes server facing code and shares libpqwalreceiver + * module with walreceiver for providing the libpq specific functionality. + * + * + * STREAMED TRANSACTIONS + * --------------------- + * Streamed transactions (large transactions exceeding a memory limit on the + * upstream) are not applied immediately, but instead, the data is written + * to temporary files and then applied at once when the final commit arrives. + * + * Unlike the regular (non-streamed) case, handling streamed transactions has + * to handle aborts of both the toplevel transaction and subtransactions. This + * is achieved by tracking offsets for subtransactions, which is then used + * to truncate the file with serialized changes. + * + * The files are placed in tmp file directory by default, and the filenames + * include both the XID of the toplevel transaction and OID of the + * subscription. This is necessary so that different workers processing a + * remote transaction with the same XID doesn't interfere. 
+ * + * We use BufFiles instead of using normal temporary files because (a) the + * BufFile infrastructure supports temporary files that exceed the OS file size + * limit, (b) provides a way for automatic clean up on the error and (c) provides + * a way to survive these files across local transactions and allow to open and + * close at stream start and close. We decided to use SharedFileSet + * infrastructure as without that it deletes the files on the closure of the + * file and if we decide to keep stream files open across the start/stop stream + * then it will consume a lot of memory (more than 8K for each BufFile and + * there could be multiple such BufFiles as the subscriber could receive + * multiple start/stop streams for different transactions before getting the + * commit). Moreover, if we don't use SharedFileSet then we also need to invent + * a new way to pass filenames to BufFile APIs so that we are allowed to open + * the file we desired across multiple stream-open calls for the same + * transaction. 
+ *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include +#include + +#include "access/table.h" +#include "access/tableam.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "catalog/catalog.h" +#include "catalog/namespace.h" +#include "catalog/partition.h" +#include "catalog/pg_inherits.h" +#include "catalog/pg_subscription.h" +#include "catalog/pg_subscription_rel.h" +#include "catalog/pg_tablespace.h" +#include "commands/tablecmds.h" +#include "commands/tablespace.h" +#include "commands/trigger.h" +#include "executor/executor.h" +#include "executor/execPartition.h" +#include "executor/nodeModifyTable.h" +#include "funcapi.h" +#include "libpq/pqformat.h" +#include "libpq/pqsignal.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "optimizer/optimizer.h" +#include "pgstat.h" +#include "postmaster/bgworker.h" +#include "postmaster/interrupt.h" +#include "postmaster/postmaster.h" +#include "postmaster/walwriter.h" +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/logicalproto.h" +#include "replication/logicalrelation.h" +#include "replication/logicalworker.h" +#include "replication/origin.h" +#include "replication/reorderbuffer.h" +#include "replication/snapbuild.h" +#include "replication/walreceiver.h" +#include "replication/worker_internal.h" +#include "rewrite/rewriteHandler.h" +#include "storage/buffile.h" +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lmgr.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "tcop/tcopprot.h" +#include "utils/builtins.h" +#include "utils/catcache.h" +#include "utils/dynahash.h" +#include "utils/datum.h" +#include "utils/fmgroids.h" +#include "utils/guc.h" +#include "utils/inval.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/syscache.h" 
+#include "utils/timeout.h" + +#define NAPTIME_PER_CYCLE 1000 /* max sleep time between cycles (1s) */ + +typedef struct FlushPosition +{ + dlist_node node; + XLogRecPtr local_end; + XLogRecPtr remote_end; +} FlushPosition; + +static dlist_head lsn_mapping = DLIST_STATIC_INIT(lsn_mapping); + +typedef struct SlotErrCallbackArg +{ + LogicalRepRelMapEntry *rel; + int remote_attnum; +} SlotErrCallbackArg; + +typedef struct ApplyExecutionData +{ + EState *estate; /* executor state, used to track resources */ + + LogicalRepRelMapEntry *targetRel; /* replication target rel */ + ResultRelInfo *targetRelInfo; /* ResultRelInfo for same */ + + /* These fields are used when the target relation is partitioned: */ + ModifyTableState *mtstate; /* dummy ModifyTable state */ + PartitionTupleRouting *proute; /* partition routing info */ +} ApplyExecutionData; + +/* + * Stream xid hash entry. Whenever we see a new xid we create this entry in the + * xidhash and along with it create the streaming file and store the fileset handle. + * The subxact file is created iff there is any subxact info under this xid. This + * entry is used on the subsequent streams for the xid to get the corresponding + * fileset handles, so storing them in hash makes the search faster. 
+ */ +typedef struct StreamXidHash +{ + TransactionId xid; /* xid is the hash key and must be first */ + SharedFileSet *stream_fileset; /* shared file set for stream data */ + SharedFileSet *subxact_fileset; /* shared file set for subxact info */ +} StreamXidHash; + +static MemoryContext ApplyMessageContext = NULL; +MemoryContext ApplyContext = NULL; + +/* per stream context for streaming transactions */ +static MemoryContext LogicalStreamingContext = NULL; + +WalReceiverConn *LogRepWorkerWalRcvConn = NULL; + +Subscription *MySubscription = NULL; +bool MySubscriptionValid = false; + +bool in_remote_transaction = false; +static XLogRecPtr remote_final_lsn = InvalidXLogRecPtr; + +/* fields valid only when processing streamed transaction */ +static bool in_streamed_transaction = false; + +static TransactionId stream_xid = InvalidTransactionId; + +/* + * Hash table for storing the streaming xid information along with shared file + * set for streaming and subxact files. + */ +static HTAB *xidhash = NULL; + +/* BufFile handle of the current streaming file */ +static BufFile *stream_fd = NULL; + +typedef struct SubXactInfo +{ + TransactionId xid; /* XID of the subxact */ + int fileno; /* file number in the buffile */ + off_t offset; /* offset in the file */ +} SubXactInfo; + +/* Sub-transaction data for the current streaming transaction */ +typedef struct ApplySubXactData +{ + uint32 nsubxacts; /* number of sub-transactions */ + uint32 nsubxacts_max; /* current capacity of subxacts */ + TransactionId subxact_last; /* xid of the last sub-transaction */ + SubXactInfo *subxacts; /* sub-xact offset in changes file */ +} ApplySubXactData; + +static ApplySubXactData subxact_data = {0, 0, InvalidTransactionId, NULL}; + +static inline void subxact_filename(char *path, Oid subid, TransactionId xid); +static inline void changes_filename(char *path, Oid subid, TransactionId xid); + +/* + * Information about subtransactions of a given toplevel transaction. 
+ */ +static void subxact_info_write(Oid subid, TransactionId xid); +static void subxact_info_read(Oid subid, TransactionId xid); +static void subxact_info_add(TransactionId xid); +static inline void cleanup_subxact_info(void); + +/* + * Serialize and deserialize changes for a toplevel transaction. + */ +static void stream_cleanup_files(Oid subid, TransactionId xid); +static void stream_open_file(Oid subid, TransactionId xid, bool first); +static void stream_write_change(char action, StringInfo s); +static void stream_close_file(void); + +static void send_feedback(XLogRecPtr recvpos, bool force, bool requestReply); + +static void store_flush_position(XLogRecPtr remote_lsn); + +static void maybe_reread_subscription(void); + +/* prototype needed because of stream_commit */ +static void apply_dispatch(StringInfo s); + +static void apply_handle_commit_internal(LogicalRepCommitData *commit_data); +static void apply_handle_insert_internal(ApplyExecutionData *edata, + ResultRelInfo *relinfo, + TupleTableSlot *remoteslot); +static void apply_handle_update_internal(ApplyExecutionData *edata, + ResultRelInfo *relinfo, + TupleTableSlot *remoteslot, + LogicalRepTupleData *newtup); +static void apply_handle_delete_internal(ApplyExecutionData *edata, + ResultRelInfo *relinfo, + TupleTableSlot *remoteslot); +static bool FindReplTupleInLocalRel(EState *estate, Relation localrel, + LogicalRepRelation *remoterel, + TupleTableSlot *remoteslot, + TupleTableSlot **localslot); +static void apply_handle_tuple_routing(ApplyExecutionData *edata, + TupleTableSlot *remoteslot, + LogicalRepTupleData *newtup, + CmdType operation); + +/* + * Should this worker apply changes for given relation. + * + * This is mainly needed for initial relation data sync as that runs in + * separate worker process running in parallel and we need some way to skip + * changes coming to the main apply worker during the sync of a table. 
+ *
+ * Note we need to do smaller or equals comparison for SYNCDONE state because
+ * it might hold position of end of initial slot consistent point WAL
+ * record + 1 (ie start of next record) and next record can be COMMIT of
+ * transaction we are now processing (which is what we set remote_final_lsn
+ * to in apply_handle_begin).
+ */
+static bool
+should_apply_changes_for_rel(LogicalRepRelMapEntry *rel)
+{
+	if (am_tablesync_worker())	/* tablesync worker: only its own relation */
+		return MyLogicalRepWorker->relid == rel->localreloid;
+	else						/* apply worker: READY, or SYNCDONE at or
+								 * before remote_final_lsn */
+		return (rel->state == SUBREL_STATE_READY ||
+				(rel->state == SUBREL_STATE_SYNCDONE &&
+				 rel->statelsn <= remote_final_lsn));
+}
+
+/*
+ * Begin one step (one INSERT, UPDATE, etc) of a replication transaction.
+ *
+ * Start a transaction, if this is the first step (else we keep using the
+ * existing transaction).
+ * Also provide a global snapshot and ensure we run in ApplyMessageContext.
+ *
+ * When a new transaction has to be started, maybe_reread_subscription() is
+ * called right after StartTransactionCommand().  The snapshot pushed here
+ * is popped again by end_replication_step().
+ */
+static void
+begin_replication_step(void)
+{
+	SetCurrentStatementStartTimestamp();
+
+	if (!IsTransactionState())
+	{
+		StartTransactionCommand();
+		maybe_reread_subscription();
+	}
+
+	PushActiveSnapshot(GetTransactionSnapshot());
+
+	MemoryContextSwitchTo(ApplyMessageContext);
+}
+
+/*
+ * Finish up one step of a replication transaction.
+ * Callers of begin_replication_step() must also call this.
+ *
+ * We don't close out the transaction here, but we should increment
+ * the command counter to make the effects of this step visible.
+ *
+ * The snapshot pushed by begin_replication_step() is popped first, then the
+ * command counter is incremented.
+ */
+static void
+end_replication_step(void)
+{
+	PopActiveSnapshot();
+
+	CommandCounterIncrement();
+}
+
+/*
+ * Handle streamed transactions.
+ *
+ * If in streaming mode (receiving a block of streamed transaction), we
+ * simply redirect it to a file for the proper toplevel transaction.
+ *
+ * Returns true for streamed transactions, false otherwise (regular mode).
+ */
+static bool
+handle_streamed_transaction(LogicalRepMsgType action, StringInfo s)
+{
+	TransactionId xid;
+
+	/* not in streaming mode, so there is nothing to redirect */
+	if (!in_streamed_transaction)
+		return false;
+
+	Assert(stream_fd != NULL);
+	Assert(TransactionIdIsValid(stream_xid));
+
+	/*
+	 * We should have received XID of the subxact as the first part of the
+	 * message, so extract it.  It is read from s as a 4-byte integer and
+	 * rejected with a protocol-violation error if it is not a valid XID.
+	 */
+	xid = pq_getmsgint(s, 4);
+
+	if (!TransactionIdIsValid(xid))
+		ereport(ERROR,
+				(errcode(ERRCODE_PROTOCOL_VIOLATION),
+				 errmsg_internal("invalid transaction ID in streamed replication transaction")));
+
+	/* Add the new subxact to the array (unless already there). */
+	subxact_info_add(xid);
+
+	/* write the change to the current file */
+	stream_write_change(action, s);
+
+	return true;
+}
+
+/*
+ * Executor state preparation for evaluation of constraint expressions,
+ * indexes and triggers for the specified relation.
+ *
+ * Note that the caller must open and close any indexes to be updated.
+ *
+ * Returns a zero-initialized (palloc0) ApplyExecutionData whose estate is a
+ * newly created executor state and whose targetRelInfo is a ResultRelInfo
+ * initialized for rel->localrel.  AfterTriggerBeginQuery() is called before
+ * returning; finish_edata() makes the matching AfterTriggerEndQuery() call
+ * and frees the structure.
+ */
+static ApplyExecutionData *
+create_edata_for_relation(LogicalRepRelMapEntry *rel)
+{
+	ApplyExecutionData *edata;
+	EState	   *estate;
+	RangeTblEntry *rte;
+	ResultRelInfo *resultRelInfo;
+
+	edata = (ApplyExecutionData *) palloc0(sizeof(ApplyExecutionData));
+	edata->targetRel = rel;
+
+	edata->estate = estate = CreateExecutorState();
+
+	rte = makeNode(RangeTblEntry);	/* single range table entry for localrel */
+	rte->rtekind = RTE_RELATION;
+	rte->relid = RelationGetRelid(rel->localrel);
+	rte->relkind = rel->localrel->rd_rel->relkind;
+	rte->rellockmode = AccessShareLock;
+	ExecInitRangeTable(estate, list_make1(rte));
+
+	edata->targetRelInfo = resultRelInfo = makeNode(ResultRelInfo);
+
+	/*
+	 * Use Relation opened by logicalrep_rel_open() instead of opening it
+	 * again.
+	 */
+	InitResultRelInfo(resultRelInfo, rel->localrel, 1, NULL, 0);
+
+	/*
+	 * We put the ResultRelInfo in the es_opened_result_relations list, even
+	 * though we don't populate the es_result_relations array.  That's a bit
+	 * bogus, but it's enough to make ExecGetTriggerResultRel() find them.
+	 *
+	 * ExecOpenIndices() is not called here either, each execution path doing
+	 * an apply operation being responsible for that.
+	 */
+	estate->es_opened_result_relations =
+		lappend(estate->es_opened_result_relations, resultRelInfo);
+
+	estate->es_output_cid = GetCurrentCommandId(true);
+
+	/* Prepare to catch AFTER triggers. */
+	AfterTriggerBeginQuery();
+
+	/* other fields of edata remain NULL for now */
+
+	return edata;
+}
+
+/*
+ * Finish any operations related to the executor state created by
+ * create_edata_for_relation().
+ *
+ * Runs the queued AFTER triggers, shuts down tuple routing when
+ * edata->proute is set, then resets the tuple table, frees the executor
+ * state and finally pfree's edata itself, so edata must not be used after
+ * this returns.
+ */
+static void
+finish_edata(ApplyExecutionData *edata)
+{
+	EState	   *estate = edata->estate;
+
+	/* Handle any queued AFTER triggers. */
+	AfterTriggerEndQuery(estate);
+
+	/* Shut down tuple routing, if any was done. */
+	if (edata->proute)
+		ExecCleanupTupleRouting(edata->mtstate, edata->proute);
+
+	/*
+	 * Cleanup.  It might seem that we should call ExecCloseResultRelations()
+	 * here, but we intentionally don't.  It would close the rel we added to
+	 * es_opened_result_relations above, which is wrong because we took no
+	 * corresponding refcount.  We rely on ExecCleanupTupleRouting() to close
+	 * any other relations opened during execution.
+	 */
+	ExecResetTupleTable(estate->es_tupleTable, false);
+	FreeExecutorState(estate);
+	pfree(edata);
+}
+
+/*
+ * Executes default values for columns for which we can't map to remote
+ * relation columns.
+ *
+ * This allows us to support tables which have more columns on the downstream
+ * than on the upstream.
+ */ +static void +slot_fill_defaults(LogicalRepRelMapEntry *rel, EState *estate, + TupleTableSlot *slot) +{ + TupleDesc desc = RelationGetDescr(rel->localrel); + int num_phys_attrs = desc->natts; + int i; + int attnum, + num_defaults = 0; + int *defmap; + ExprState **defexprs; + ExprContext *econtext; + + econtext = GetPerTupleExprContext(estate); + + /* We got all the data via replication, no need to evaluate anything. */ + if (num_phys_attrs == rel->remoterel.natts) + return; + + defmap = (int *) palloc(num_phys_attrs * sizeof(int)); + defexprs = (ExprState **) palloc(num_phys_attrs * sizeof(ExprState *)); + + Assert(rel->attrmap->maplen == num_phys_attrs); + for (attnum = 0; attnum < num_phys_attrs; attnum++) + { + Expr *defexpr; + + if (TupleDescAttr(desc, attnum)->attisdropped || TupleDescAttr(desc, attnum)->attgenerated) + continue; + + if (rel->attrmap->attnums[attnum] >= 0) + continue; + + defexpr = (Expr *) build_column_default(rel->localrel, attnum + 1); + + if (defexpr != NULL) + { + /* Run the expression through planner */ + defexpr = expression_planner(defexpr); + + /* Initialize executable expression in copycontext */ + defexprs[num_defaults] = ExecInitExpr(defexpr, NULL); + defmap[num_defaults] = attnum; + num_defaults++; + } + + } + + for (i = 0; i < num_defaults; i++) + slot->tts_values[defmap[i]] = + ExecEvalExpr(defexprs[i], econtext, &slot->tts_isnull[defmap[i]]); +} + +/* + * Error callback to give more context info about data conversion failures + * while reading data from the remote server. 
+ */ +static void +slot_store_error_callback(void *arg) +{ + SlotErrCallbackArg *errarg = (SlotErrCallbackArg *) arg; + LogicalRepRelMapEntry *rel; + + /* Nothing to do if remote attribute number is not set */ + if (errarg->remote_attnum < 0) + return; + + rel = errarg->rel; + errcontext("processing remote data for replication target relation \"%s.%s\" column \"%s\"", + rel->remoterel.nspname, rel->remoterel.relname, + rel->remoterel.attnames[errarg->remote_attnum]); +} + +/* + * Store tuple data into slot. + * + * Incoming data can be either text or binary format. + */ +static void +slot_store_data(TupleTableSlot *slot, LogicalRepRelMapEntry *rel, + LogicalRepTupleData *tupleData) +{ + int natts = slot->tts_tupleDescriptor->natts; + int i; + SlotErrCallbackArg errarg; + ErrorContextCallback errcallback; + + ExecClearTuple(slot); + + /* Push callback + info on the error context stack */ + errarg.rel = rel; + errarg.remote_attnum = -1; + errcallback.callback = slot_store_error_callback; + errcallback.arg = (void *) &errarg; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* Call the "in" function for each non-dropped, non-null attribute */ + Assert(natts == rel->attrmap->maplen); + for (i = 0; i < natts; i++) + { + Form_pg_attribute att = TupleDescAttr(slot->tts_tupleDescriptor, i); + int remoteattnum = rel->attrmap->attnums[i]; + + if (!att->attisdropped && remoteattnum >= 0) + { + StringInfo colvalue = &tupleData->colvalues[remoteattnum]; + + Assert(remoteattnum < tupleData->ncols); + + errarg.remote_attnum = remoteattnum; + + if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_TEXT) + { + Oid typinput; + Oid typioparam; + + getTypeInputInfo(att->atttypid, &typinput, &typioparam); + slot->tts_values[i] = + OidInputFunctionCall(typinput, colvalue->data, + typioparam, att->atttypmod); + slot->tts_isnull[i] = false; + } + else if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_BINARY) + { + Oid typreceive; + 
Oid typioparam; + + /* + * In some code paths we may be asked to re-parse the same + * tuple data. Reset the StringInfo's cursor so that works. + */ + colvalue->cursor = 0; + + getTypeBinaryInputInfo(att->atttypid, &typreceive, &typioparam); + slot->tts_values[i] = + OidReceiveFunctionCall(typreceive, colvalue, + typioparam, att->atttypmod); + + /* Trouble if it didn't eat the whole buffer */ + if (colvalue->cursor != colvalue->len) + ereport(ERROR, + (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION), + errmsg("incorrect binary data format in logical replication column %d", + remoteattnum + 1))); + slot->tts_isnull[i] = false; + } + else + { + /* + * NULL value from remote. (We don't expect to see + * LOGICALREP_COLUMN_UNCHANGED here, but if we do, treat it as + * NULL.) + */ + slot->tts_values[i] = (Datum) 0; + slot->tts_isnull[i] = true; + } + + errarg.remote_attnum = -1; + } + else + { + /* + * We assign NULL to dropped attributes and missing values + * (missing values should be later filled using + * slot_fill_defaults). + */ + slot->tts_values[i] = (Datum) 0; + slot->tts_isnull[i] = true; + } + } + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + + ExecStoreVirtualTuple(slot); +} + +/* + * Replace updated columns with data from the LogicalRepTupleData struct. + * This is somewhat similar to heap_modify_tuple but also calls the type + * input functions on the user data. + * + * "slot" is filled with a copy of the tuple in "srcslot", replacing + * columns provided in "tupleData" and leaving others as-is. + * + * Caution: unreplaced pass-by-ref columns in "slot" will point into the + * storage for "srcslot". This is OK for current usage, but someday we may + * need to materialize "slot" at the end to make it independent of "srcslot". 
+ */ +static void +slot_modify_data(TupleTableSlot *slot, TupleTableSlot *srcslot, + LogicalRepRelMapEntry *rel, + LogicalRepTupleData *tupleData) +{ + int natts = slot->tts_tupleDescriptor->natts; + int i; + SlotErrCallbackArg errarg; + ErrorContextCallback errcallback; + + /* We'll fill "slot" with a virtual tuple, so we must start with ... */ + ExecClearTuple(slot); + + /* + * Copy all the column data from srcslot, so that we'll have valid values + * for unreplaced columns. + */ + Assert(natts == srcslot->tts_tupleDescriptor->natts); + slot_getallattrs(srcslot); + memcpy(slot->tts_values, srcslot->tts_values, natts * sizeof(Datum)); + memcpy(slot->tts_isnull, srcslot->tts_isnull, natts * sizeof(bool)); + + /* For error reporting, push callback + info on the error context stack */ + errarg.rel = rel; + errarg.remote_attnum = -1; + errcallback.callback = slot_store_error_callback; + errcallback.arg = (void *) &errarg; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* Call the "in" function for each replaced attribute */ + Assert(natts == rel->attrmap->maplen); + for (i = 0; i < natts; i++) + { + Form_pg_attribute att = TupleDescAttr(slot->tts_tupleDescriptor, i); + int remoteattnum = rel->attrmap->attnums[i]; + + if (remoteattnum < 0) + continue; + + Assert(remoteattnum < tupleData->ncols); + + if (tupleData->colstatus[remoteattnum] != LOGICALREP_COLUMN_UNCHANGED) + { + StringInfo colvalue = &tupleData->colvalues[remoteattnum]; + + errarg.remote_attnum = remoteattnum; + + if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_TEXT) + { + Oid typinput; + Oid typioparam; + + getTypeInputInfo(att->atttypid, &typinput, &typioparam); + slot->tts_values[i] = + OidInputFunctionCall(typinput, colvalue->data, + typioparam, att->atttypmod); + slot->tts_isnull[i] = false; + } + else if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_BINARY) + { + Oid typreceive; + Oid typioparam; + + /* + * In some code paths we may be 
asked to re-parse the same + * tuple data. Reset the StringInfo's cursor so that works. + */ + colvalue->cursor = 0; + + getTypeBinaryInputInfo(att->atttypid, &typreceive, &typioparam); + slot->tts_values[i] = + OidReceiveFunctionCall(typreceive, colvalue, + typioparam, att->atttypmod); + + /* Trouble if it didn't eat the whole buffer */ + if (colvalue->cursor != colvalue->len) + ereport(ERROR, + (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION), + errmsg("incorrect binary data format in logical replication column %d", + remoteattnum + 1))); + slot->tts_isnull[i] = false; + } + else + { + /* must be LOGICALREP_COLUMN_NULL */ + slot->tts_values[i] = (Datum) 0; + slot->tts_isnull[i] = true; + } + + errarg.remote_attnum = -1; + } + } + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + + /* And finally, declare that "slot" contains a valid virtual tuple */ + ExecStoreVirtualTuple(slot); +} + +/* + * Handle BEGIN message. + */ +static void +apply_handle_begin(StringInfo s) +{ + LogicalRepBeginData begin_data; + + logicalrep_read_begin(s, &begin_data); + + remote_final_lsn = begin_data.final_lsn; + + in_remote_transaction = true; + + pgstat_report_activity(STATE_RUNNING, NULL); +} + +/* + * Handle COMMIT message. + * + * TODO, support tracking of multiple origins + */ +static void +apply_handle_commit(StringInfo s) +{ + LogicalRepCommitData commit_data; + + logicalrep_read_commit(s, &commit_data); + + if (commit_data.commit_lsn != remote_final_lsn) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("incorrect commit LSN %X/%X in commit message (expected %X/%X)", + LSN_FORMAT_ARGS(commit_data.commit_lsn), + LSN_FORMAT_ARGS(remote_final_lsn)))); + + apply_handle_commit_internal(&commit_data); + + /* Process any tables that are being synchronized in parallel. */ + process_syncing_tables(commit_data.end_lsn); + + pgstat_report_activity(STATE_IDLE, NULL); +} + +/* + * Handle ORIGIN message. 
+ * + * TODO, support tracking of multiple origins + */ +static void +apply_handle_origin(StringInfo s) +{ + /* + * ORIGIN message can only come inside streaming transaction or inside + * remote transaction and before any actual writes. + */ + if (!in_streamed_transaction && + (!in_remote_transaction || + (IsTransactionState() && !am_tablesync_worker()))) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("ORIGIN message sent out of order"))); +} + +/* + * Handle STREAM START message. + */ +static void +apply_handle_stream_start(StringInfo s) +{ + bool first_segment; + HASHCTL hash_ctl; + + if (in_streamed_transaction) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("duplicate STREAM START message"))); + + /* + * Start a transaction on stream start, this transaction will be committed + * on the stream stop unless it is a tablesync worker in which case it + * will be committed after processing all the messages. We need the + * transaction for handling the buffile, used for serializing the + * streaming data and subxact info. + */ + begin_replication_step(); + + /* notify handle methods we're processing a remote transaction */ + in_streamed_transaction = true; + + /* extract XID of the top-level transaction */ + stream_xid = logicalrep_read_stream_start(s, &first_segment); + + if (!TransactionIdIsValid(stream_xid)) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("invalid transaction ID in streamed replication transaction"))); + + /* + * Initialize the xidhash table if we haven't yet. This will be used for + * the entire duration of the apply worker so create it in permanent + * context. 
+ */ + if (xidhash == NULL) + { + hash_ctl.keysize = sizeof(TransactionId); + hash_ctl.entrysize = sizeof(StreamXidHash); + hash_ctl.hcxt = ApplyContext; + xidhash = hash_create("StreamXidHash", 1024, &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + } + + /* open the spool file for this transaction */ + stream_open_file(MyLogicalRepWorker->subid, stream_xid, first_segment); + + /* if this is not the first segment, open existing subxact file */ + if (!first_segment) + subxact_info_read(MyLogicalRepWorker->subid, stream_xid); + + pgstat_report_activity(STATE_RUNNING, NULL); + + end_replication_step(); +} + +/* + * Handle STREAM STOP message. + */ +static void +apply_handle_stream_stop(StringInfo s) +{ + if (!in_streamed_transaction) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("STREAM STOP message without STREAM START"))); + + /* + * Close the file with serialized changes, and serialize information about + * subxacts for the toplevel transaction. + */ + subxact_info_write(MyLogicalRepWorker->subid, stream_xid); + stream_close_file(); + + /* We must be in a valid transaction state */ + Assert(IsTransactionState()); + + /* Commit the per-stream transaction */ + CommitTransactionCommand(); + + in_streamed_transaction = false; + + /* Reset per-stream context */ + MemoryContextReset(LogicalStreamingContext); + + pgstat_report_activity(STATE_IDLE, NULL); +} + +/* + * Handle STREAM abort message. + */ +static void +apply_handle_stream_abort(StringInfo s) +{ + TransactionId xid; + TransactionId subxid; + + if (in_streamed_transaction) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("STREAM ABORT message without STREAM STOP"))); + + logicalrep_read_stream_abort(s, &xid, &subxid); + + /* + * If the two XIDs are the same, it's in fact abort of toplevel xact, so + * just delete the files with serialized info. 
+ */ + if (xid == subxid) + stream_cleanup_files(MyLogicalRepWorker->subid, xid); + else + { + /* + * OK, so it's a subxact. We need to read the subxact file for the + * toplevel transaction, determine the offset tracked for the subxact, + * and truncate the file with changes. We also remove the subxacts + * with higher offsets (or rather higher XIDs). + * + * We intentionally scan the array from the tail, because we're likely + * aborting a change for the most recent subtransactions. + * + * We can't use the binary search here as subxact XIDs won't + * necessarily arrive in sorted order, consider the case where we have + * released the savepoint for multiple subtransactions and then + * performed rollback to savepoint for one of the earlier + * sub-transaction. + */ + int64 i; + int64 subidx; + BufFile *fd; + bool found = false; + char path[MAXPGPATH]; + StreamXidHash *ent; + + subidx = -1; + begin_replication_step(); + subxact_info_read(MyLogicalRepWorker->subid, xid); + + for (i = subxact_data.nsubxacts; i > 0; i--) + { + if (subxact_data.subxacts[i - 1].xid == subxid) + { + subidx = (i - 1); + found = true; + break; + } + } + + /* + * If it's an empty sub-transaction then we will not find the subxid + * here so just cleanup the subxact info and return. 
+ */ + if (!found) + { + /* Cleanup the subxact info */ + cleanup_subxact_info(); + end_replication_step(); + CommitTransactionCommand(); + return; + } + + ent = (StreamXidHash *) hash_search(xidhash, + (void *) &xid, + HASH_FIND, + NULL); + if (!ent) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("transaction %u not found in stream XID hash table", + xid))); + + /* open the changes file */ + changes_filename(path, MyLogicalRepWorker->subid, xid); + fd = BufFileOpenShared(ent->stream_fileset, path, O_RDWR); + + /* OK, truncate the file at the right offset */ + BufFileTruncateShared(fd, subxact_data.subxacts[subidx].fileno, + subxact_data.subxacts[subidx].offset); + BufFileClose(fd); + + /* discard the subxacts added later */ + subxact_data.nsubxacts = subidx; + + /* write the updated subxact list */ + subxact_info_write(MyLogicalRepWorker->subid, xid); + + end_replication_step(); + CommitTransactionCommand(); + } +} + +/* + * Handle STREAM COMMIT message. + */ +static void +apply_handle_stream_commit(StringInfo s) +{ + TransactionId xid; + StringInfoData s2; + int nchanges; + char path[MAXPGPATH]; + char *buffer = NULL; + LogicalRepCommitData commit_data; + StreamXidHash *ent; + MemoryContext oldcxt; + BufFile *fd; + + if (in_streamed_transaction) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("STREAM COMMIT message without STREAM STOP"))); + + xid = logicalrep_read_stream_commit(s, &commit_data); + + elog(DEBUG1, "received commit for streamed transaction %u", xid); + + /* Make sure we have an open transaction */ + begin_replication_step(); + + /* + * Allocate file handle and memory required to process all the messages in + * TopTransactionContext to avoid them getting reset after each message is + * processed. 
+ */ + oldcxt = MemoryContextSwitchTo(TopTransactionContext); + + /* open the spool file for the committed transaction */ + changes_filename(path, MyLogicalRepWorker->subid, xid); + elog(DEBUG1, "replaying changes from file \"%s\"", path); + + ent = (StreamXidHash *) hash_search(xidhash, + (void *) &xid, + HASH_FIND, + NULL); + if (!ent) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("transaction %u not found in stream XID hash table", + xid))); + + fd = BufFileOpenShared(ent->stream_fileset, path, O_RDONLY); + + buffer = palloc(BLCKSZ); + initStringInfo(&s2); + + MemoryContextSwitchTo(oldcxt); + + remote_final_lsn = commit_data.commit_lsn; + + /* + * Make sure the handle apply_dispatch methods are aware we're in a remote + * transaction. + */ + in_remote_transaction = true; + pgstat_report_activity(STATE_RUNNING, NULL); + + end_replication_step(); + + /* + * Read the entries one by one and pass them through the same logic as in + * apply_dispatch. + */ + nchanges = 0; + while (true) + { + int nbytes; + int len; + + CHECK_FOR_INTERRUPTS(); + + /* read length of the on-disk record */ + nbytes = BufFileRead(fd, &len, sizeof(len)); + + /* have we reached end of the file? */ + if (nbytes == 0) + break; + + /* do we have a correct length? 
*/ + if (nbytes != sizeof(len)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from streaming transaction's changes file \"%s\": %m", + path))); + + if (len <= 0) + elog(ERROR, "incorrect length %d in streaming transaction's changes file \"%s\"", + len, path); + + /* make sure we have sufficiently large buffer */ + buffer = repalloc(buffer, len); + + /* and finally read the data into the buffer */ + if (BufFileRead(fd, buffer, len) != len) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from streaming transaction's changes file \"%s\": %m", + path))); + + /* copy the buffer to the stringinfo and call apply_dispatch */ + resetStringInfo(&s2); + appendBinaryStringInfo(&s2, buffer, len); + + /* Ensure we are reading the data into our memory context. */ + oldcxt = MemoryContextSwitchTo(ApplyMessageContext); + + apply_dispatch(&s2); + + MemoryContextReset(ApplyMessageContext); + + MemoryContextSwitchTo(oldcxt); + + nchanges++; + + if (nchanges % 1000 == 0) + elog(DEBUG1, "replayed %d changes from file \"%s\"", + nchanges, path); + } + + BufFileClose(fd); + + pfree(buffer); + pfree(s2.data); + + elog(DEBUG1, "replayed %d (all) changes from file \"%s\"", + nchanges, path); + + apply_handle_commit_internal(&commit_data); + + /* unlink the files with serialized changes and subxact info */ + stream_cleanup_files(MyLogicalRepWorker->subid, xid); + + /* Process any tables that are being synchronized in parallel. */ + process_syncing_tables(commit_data.end_lsn); + + pgstat_report_activity(STATE_IDLE, NULL); +} + +/* + * Helper function for apply_handle_commit and apply_handle_stream_commit. + */ +static void +apply_handle_commit_internal(LogicalRepCommitData *commit_data) +{ + if (IsTransactionState()) + { + /* + * Update origin state so we can restart streaming from correct + * position in case of crash. 
+ */ + replorigin_session_origin_lsn = commit_data->end_lsn; + replorigin_session_origin_timestamp = commit_data->committime; + + CommitTransactionCommand(); + pgstat_report_stat(false); + + store_flush_position(commit_data->end_lsn); + } + else + { + /* Process any invalidation messages that might have accumulated. */ + AcceptInvalidationMessages(); + maybe_reread_subscription(); + } + + in_remote_transaction = false; +} + +/* + * Handle RELATION message. + * + * Note we don't do validation against local schema here. The validation + * against local schema is postponed until first change for given relation + * comes as we only care about it when applying changes for it anyway and we + * do less locking this way. + */ +static void +apply_handle_relation(StringInfo s) +{ + LogicalRepRelation *rel; + + if (handle_streamed_transaction(LOGICAL_REP_MSG_RELATION, s)) + return; + + rel = logicalrep_read_rel(s); + logicalrep_relmap_update(rel); + + /* Also reset all entries in the partition map that refer to remoterel. */ + logicalrep_partmap_reset_relmap(rel); +} + +/* + * Handle TYPE message. + * + * This implementation pays no attention to TYPE messages; we expect the user + * to have set things up so that the incoming data is acceptable to the input + * functions for the locally subscribed tables. Hence, we just read and + * discard the message. + */ +static void +apply_handle_type(StringInfo s) +{ + LogicalRepTyp typ; + + if (handle_streamed_transaction(LOGICAL_REP_MSG_TYPE, s)) + return; + + logicalrep_read_typ(s, &typ); +} + +/* + * Get replica identity index or if it is not defined a primary key. + * + * If neither is defined, returns InvalidOid + */ +static Oid +GetRelationIdentityOrPK(Relation rel) +{ + Oid idxoid; + + idxoid = RelationGetReplicaIndex(rel); + + if (!OidIsValid(idxoid)) + idxoid = RelationGetPrimaryKeyIndex(rel); + + return idxoid; +} + +/* + * Handle INSERT message. 
+ */ + +static void +apply_handle_insert(StringInfo s) +{ + LogicalRepRelMapEntry *rel; + LogicalRepTupleData newtup; + LogicalRepRelId relid; + ApplyExecutionData *edata; + EState *estate; + TupleTableSlot *remoteslot; + MemoryContext oldctx; + + if (handle_streamed_transaction(LOGICAL_REP_MSG_INSERT, s)) + return; + + begin_replication_step(); + + relid = logicalrep_read_insert(s, &newtup); + rel = logicalrep_rel_open(relid, RowExclusiveLock); + if (!should_apply_changes_for_rel(rel)) + { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, RowExclusiveLock); + end_replication_step(); + return; + } + + /* Initialize the executor state. */ + edata = create_edata_for_relation(rel); + estate = edata->estate; + remoteslot = ExecInitExtraTupleSlot(estate, + RelationGetDescr(rel->localrel), + &TTSOpsVirtual); + + /* Process and store remote tuple in the slot */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + slot_store_data(remoteslot, rel, &newtup); + slot_fill_defaults(rel, estate, remoteslot); + MemoryContextSwitchTo(oldctx); + + /* For a partitioned table, insert the tuple into a partition. */ + if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + apply_handle_tuple_routing(edata, + remoteslot, NULL, CMD_INSERT); + else + apply_handle_insert_internal(edata, edata->targetRelInfo, + remoteslot); + + finish_edata(edata); + + logicalrep_rel_close(rel, NoLock); + + end_replication_step(); +} + +/* + * Workhorse for apply_handle_insert() + * relinfo is for the relation we're actually inserting into + * (could be a child partition of edata->targetRelInfo) + */ +static void +apply_handle_insert_internal(ApplyExecutionData *edata, + ResultRelInfo *relinfo, + TupleTableSlot *remoteslot) +{ + EState *estate = edata->estate; + + /* We must open indexes here. */ + ExecOpenIndices(relinfo, false); + + /* Do the insert. 
*/ + ExecSimpleRelationInsert(relinfo, estate, remoteslot); + + /* Cleanup. */ + ExecCloseIndices(relinfo); +} + +/* + * Check if the logical replication relation is updatable and throw + * appropriate error if it isn't. + */ +static void +check_relation_updatable(LogicalRepRelMapEntry *rel) +{ + /* + * For partitioned tables, we only need to care if the target partition is + * updatable (aka has PK or RI defined for it). + */ + if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + return; + + /* Updatable, no error. */ + if (rel->updatable) + return; + + /* + * We are in error mode so it's fine this is somewhat slow. It's better to + * give user correct error. + */ + if (OidIsValid(GetRelationIdentityOrPK(rel->localrel))) + { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("publisher did not send replica identity column " + "expected by the logical replication target relation \"%s.%s\"", + rel->remoterel.nspname, rel->remoterel.relname))); + } + + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication target relation \"%s.%s\" has " + "neither REPLICA IDENTITY index nor PRIMARY " + "KEY and published relation does not have " + "REPLICA IDENTITY FULL", + rel->remoterel.nspname, rel->remoterel.relname))); +} + +/* + * Handle UPDATE message. 
+ * + * TODO: FDW support + */ +static void +apply_handle_update(StringInfo s) +{ + LogicalRepRelMapEntry *rel; + LogicalRepRelId relid; + ApplyExecutionData *edata; + EState *estate; + LogicalRepTupleData oldtup; + LogicalRepTupleData newtup; + bool has_oldtup; + TupleTableSlot *remoteslot; + RangeTblEntry *target_rte; + MemoryContext oldctx; + + if (handle_streamed_transaction(LOGICAL_REP_MSG_UPDATE, s)) + return; + + begin_replication_step(); + + relid = logicalrep_read_update(s, &has_oldtup, &oldtup, + &newtup); + rel = logicalrep_rel_open(relid, RowExclusiveLock); + if (!should_apply_changes_for_rel(rel)) + { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, RowExclusiveLock); + end_replication_step(); + return; + } + + /* Check if we can do the update. */ + check_relation_updatable(rel); + + /* Initialize the executor state. */ + edata = create_edata_for_relation(rel); + estate = edata->estate; + remoteslot = ExecInitExtraTupleSlot(estate, + RelationGetDescr(rel->localrel), + &TTSOpsVirtual); + + /* + * Populate updatedCols so that per-column triggers can fire, and so + * executor can correctly pass down indexUnchanged hint. This could + * include more columns than were actually changed on the publisher + * because the logical replication protocol doesn't contain that + * information. But it would for example exclude columns that only exist + * on the subscriber, since we are not touching those. 
+ */ + target_rte = list_nth(estate->es_range_table, 0); + for (int i = 0; i < remoteslot->tts_tupleDescriptor->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(remoteslot->tts_tupleDescriptor, i); + int remoteattnum = rel->attrmap->attnums[i]; + + if (!att->attisdropped && remoteattnum >= 0) + { + Assert(remoteattnum < newtup.ncols); + if (newtup.colstatus[remoteattnum] != LOGICALREP_COLUMN_UNCHANGED) + target_rte->updatedCols = + bms_add_member(target_rte->updatedCols, + i + 1 - FirstLowInvalidHeapAttributeNumber); + } + } + + /* Also populate extraUpdatedCols, in case we have generated columns */ + fill_extraUpdatedCols(target_rte, rel->localrel); + + /* Build the search tuple. */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + slot_store_data(remoteslot, rel, + has_oldtup ? &oldtup : &newtup); + MemoryContextSwitchTo(oldctx); + + /* For a partitioned table, apply update to correct partition. */ + if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + apply_handle_tuple_routing(edata, + remoteslot, &newtup, CMD_UPDATE); + else + apply_handle_update_internal(edata, edata->targetRelInfo, + remoteslot, &newtup); + + finish_edata(edata); + + logicalrep_rel_close(rel, NoLock); + + end_replication_step(); +} + +/* + * Workhorse for apply_handle_update() + * relinfo is for the relation we're actually updating in + * (could be a child partition of edata->targetRelInfo) + */ +static void +apply_handle_update_internal(ApplyExecutionData *edata, + ResultRelInfo *relinfo, + TupleTableSlot *remoteslot, + LogicalRepTupleData *newtup) +{ + EState *estate = edata->estate; + LogicalRepRelMapEntry *relmapentry = edata->targetRel; + Relation localrel = relinfo->ri_RelationDesc; + EPQState epqstate; + TupleTableSlot *localslot; + bool found; + MemoryContext oldctx; + + EvalPlanQualInit(&epqstate, estate, NULL, NIL, -1); + ExecOpenIndices(relinfo, false); + + found = FindReplTupleInLocalRel(estate, localrel, + &relmapentry->remoterel, + 
remoteslot, &localslot); + ExecClearTuple(remoteslot); + + /* + * Tuple found. + * + * Note this will fail if there are other conflicting unique indexes. + */ + if (found) + { + /* Process and store remote tuple in the slot */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + slot_modify_data(remoteslot, localslot, relmapentry, newtup); + MemoryContextSwitchTo(oldctx); + + EvalPlanQualSetSlot(&epqstate, remoteslot); + + /* Do the actual update. */ + ExecSimpleRelationUpdate(relinfo, estate, &epqstate, localslot, + remoteslot); + } + else + { + /* + * The tuple to be updated could not be found. Do nothing except for + * emitting a log message. + * + * XXX should this be promoted to ereport(LOG) perhaps? + */ + elog(DEBUG1, + "logical replication did not find row to be updated " + "in replication target relation \"%s\"", + RelationGetRelationName(localrel)); + } + + /* Cleanup. */ + ExecCloseIndices(relinfo); + EvalPlanQualEnd(&epqstate); +} + +/* + * Handle DELETE message. + * + * TODO: FDW support + */ +static void +apply_handle_delete(StringInfo s) +{ + LogicalRepRelMapEntry *rel; + LogicalRepTupleData oldtup; + LogicalRepRelId relid; + ApplyExecutionData *edata; + EState *estate; + TupleTableSlot *remoteslot; + MemoryContext oldctx; + + if (handle_streamed_transaction(LOGICAL_REP_MSG_DELETE, s)) + return; + + begin_replication_step(); + + relid = logicalrep_read_delete(s, &oldtup); + rel = logicalrep_rel_open(relid, RowExclusiveLock); + if (!should_apply_changes_for_rel(rel)) + { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, RowExclusiveLock); + end_replication_step(); + return; + } + + /* Check if we can do the delete. */ + check_relation_updatable(rel); + + /* Initialize the executor state. 
*/ + edata = create_edata_for_relation(rel); + estate = edata->estate; + remoteslot = ExecInitExtraTupleSlot(estate, + RelationGetDescr(rel->localrel), + &TTSOpsVirtual); + + /* Build the search tuple. */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + slot_store_data(remoteslot, rel, &oldtup); + MemoryContextSwitchTo(oldctx); + + /* For a partitioned table, apply delete to correct partition. */ + if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + apply_handle_tuple_routing(edata, + remoteslot, NULL, CMD_DELETE); + else + apply_handle_delete_internal(edata, edata->targetRelInfo, + remoteslot); + + finish_edata(edata); + + logicalrep_rel_close(rel, NoLock); + + end_replication_step(); +} + +/* + * Workhorse for apply_handle_delete() + * relinfo is for the relation we're actually deleting from + * (could be a child partition of edata->targetRelInfo) + */ +static void +apply_handle_delete_internal(ApplyExecutionData *edata, + ResultRelInfo *relinfo, + TupleTableSlot *remoteslot) +{ + EState *estate = edata->estate; + Relation localrel = relinfo->ri_RelationDesc; + LogicalRepRelation *remoterel = &edata->targetRel->remoterel; + EPQState epqstate; + TupleTableSlot *localslot; + bool found; + + EvalPlanQualInit(&epqstate, estate, NULL, NIL, -1); + ExecOpenIndices(relinfo, false); + + found = FindReplTupleInLocalRel(estate, localrel, remoterel, + remoteslot, &localslot); + + /* If found delete it. */ + if (found) + { + EvalPlanQualSetSlot(&epqstate, localslot); + + /* Do the actual delete. */ + ExecSimpleRelationDelete(relinfo, estate, &epqstate, localslot); + } + else + { + /* + * The tuple to be deleted could not be found. Do nothing except for + * emitting a log message. + * + * XXX should this be promoted to ereport(LOG) perhaps? + */ + elog(DEBUG1, + "logical replication did not find row to be deleted " + "in replication target relation \"%s\"", + RelationGetRelationName(localrel)); + } + + /* Cleanup. 
*/ + ExecCloseIndices(relinfo); + EvalPlanQualEnd(&epqstate); +} + +/* + * Try to find a tuple received from the publication side (in 'remoteslot') in + * the corresponding local relation using either replica identity index, + * primary key or if needed, sequential scan. + * + * Local tuple, if found, is returned in '*localslot'. + */ +static bool +FindReplTupleInLocalRel(EState *estate, Relation localrel, + LogicalRepRelation *remoterel, + TupleTableSlot *remoteslot, + TupleTableSlot **localslot) +{ + Oid idxoid; + bool found; + + *localslot = table_slot_create(localrel, &estate->es_tupleTable); + + idxoid = GetRelationIdentityOrPK(localrel); + Assert(OidIsValid(idxoid) || + (remoterel->replident == REPLICA_IDENTITY_FULL)); + + if (OidIsValid(idxoid)) + found = RelationFindReplTupleByIndex(localrel, idxoid, + LockTupleExclusive, + remoteslot, *localslot); + else + found = RelationFindReplTupleSeq(localrel, LockTupleExclusive, + remoteslot, *localslot); + + return found; +} + +/* + * This handles insert, update, delete on a partitioned table. + */ +static void +apply_handle_tuple_routing(ApplyExecutionData *edata, + TupleTableSlot *remoteslot, + LogicalRepTupleData *newtup, + CmdType operation) +{ + EState *estate = edata->estate; + LogicalRepRelMapEntry *relmapentry = edata->targetRel; + ResultRelInfo *relinfo = edata->targetRelInfo; + Relation parentrel = relinfo->ri_RelationDesc; + ModifyTableState *mtstate; + PartitionTupleRouting *proute; + ResultRelInfo *partrelinfo; + Relation partrel; + TupleTableSlot *remoteslot_part; + TupleConversionMap *map; + MemoryContext oldctx; + LogicalRepRelMapEntry *part_entry = NULL; + AttrMap *attrmap = NULL; + + /* ModifyTableState is needed for ExecFindPartition(). */ + edata->mtstate = mtstate = makeNode(ModifyTableState); + mtstate->ps.plan = NULL; + mtstate->ps.state = estate; + mtstate->operation = operation; + mtstate->resultRelInfo = relinfo; + + /* ... as is PartitionTupleRouting. 
*/ + edata->proute = proute = ExecSetupPartitionTupleRouting(estate, parentrel); + + /* + * Find the partition to which the "search tuple" belongs. + */ + Assert(remoteslot != NULL); + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + partrelinfo = ExecFindPartition(mtstate, relinfo, proute, + remoteslot, estate); + Assert(partrelinfo != NULL); + partrel = partrelinfo->ri_RelationDesc; + + /* + * To perform any of the operations below, the tuple must match the + * partition's rowtype. Convert if needed or just copy, using a dedicated + * slot to store the tuple in any case. + */ + remoteslot_part = partrelinfo->ri_PartitionTupleSlot; + if (remoteslot_part == NULL) + remoteslot_part = table_slot_create(partrel, &estate->es_tupleTable); + map = partrelinfo->ri_RootToPartitionMap; + if (map != NULL) + { + attrmap = map->attrMap; + remoteslot_part = execute_attr_map_slot(attrmap, remoteslot, + remoteslot_part); + } + else + { + remoteslot_part = ExecCopySlot(remoteslot_part, remoteslot); + slot_getallattrs(remoteslot_part); + } + MemoryContextSwitchTo(oldctx); + + /* Check if we can do the update or delete on the leaf partition. */ + if (operation == CMD_UPDATE || operation == CMD_DELETE) + { + part_entry = logicalrep_partition_open(relmapentry, partrel, + attrmap); + check_relation_updatable(part_entry); + } + + switch (operation) + { + case CMD_INSERT: + apply_handle_insert_internal(edata, partrelinfo, + remoteslot_part); + break; + + case CMD_DELETE: + apply_handle_delete_internal(edata, partrelinfo, + remoteslot_part); + break; + + case CMD_UPDATE: + + /* + * For UPDATE, depending on whether or not the updated tuple + * satisfies the partition's constraint, perform a simple UPDATE + * of the partition or move the updated tuple into a different + * suitable partition. + */ + { + TupleTableSlot *localslot; + ResultRelInfo *partrelinfo_new; + bool found; + + /* Get the matching local tuple from the partition. 
*/ + found = FindReplTupleInLocalRel(estate, partrel, + &part_entry->remoterel, + remoteslot_part, &localslot); + if (!found) + { + /* + * The tuple to be updated could not be found. Do nothing + * except for emitting a log message. + * + * XXX should this be promoted to ereport(LOG) perhaps? + */ + elog(DEBUG1, + "logical replication did not find row to be updated " + "in replication target relation's partition \"%s\"", + RelationGetRelationName(partrel)); + return; + } + + /* + * Apply the update to the local tuple, putting the result in + * remoteslot_part. + */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + slot_modify_data(remoteslot_part, localslot, part_entry, + newtup); + MemoryContextSwitchTo(oldctx); + + /* + * Does the updated tuple still satisfy the current + * partition's constraint? + */ + if (!partrel->rd_rel->relispartition || + ExecPartitionCheck(partrelinfo, remoteslot_part, estate, + false)) + { + /* + * Yes, so simply UPDATE the partition. We don't call + * apply_handle_update_internal() here, which would + * normally do the following work, to avoid repeating some + * work already done above to find the local tuple in the + * partition. + */ + EPQState epqstate; + + EvalPlanQualInit(&epqstate, estate, NULL, NIL, -1); + ExecOpenIndices(partrelinfo, false); + + EvalPlanQualSetSlot(&epqstate, remoteslot_part); + ExecSimpleRelationUpdate(partrelinfo, estate, &epqstate, + localslot, remoteslot_part); + ExecCloseIndices(partrelinfo); + EvalPlanQualEnd(&epqstate); + } + else + { + /* Move the tuple into the new partition. */ + + /* + * New partition will be found using tuple routing, which + * can only occur via the parent table. We might need to + * convert the tuple to the parent's rowtype. Note that + * this is the tuple found in the partition, not the + * original search tuple received by this function. 
+ */ + if (map) + { + TupleConversionMap *PartitionToRootMap = + convert_tuples_by_name(RelationGetDescr(partrel), + RelationGetDescr(parentrel)); + + remoteslot = + execute_attr_map_slot(PartitionToRootMap->attrMap, + remoteslot_part, remoteslot); + } + else + { + remoteslot = ExecCopySlot(remoteslot, remoteslot_part); + slot_getallattrs(remoteslot); + } + + + /* Find the new partition. */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + partrelinfo_new = ExecFindPartition(mtstate, relinfo, + proute, remoteslot, + estate); + MemoryContextSwitchTo(oldctx); + Assert(partrelinfo_new != partrelinfo); + + /* DELETE old tuple found in the old partition. */ + apply_handle_delete_internal(edata, partrelinfo, + localslot); + + /* INSERT new tuple into the new partition. */ + + /* + * Convert the replacement tuple to match the destination + * partition rowtype. + */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + partrel = partrelinfo_new->ri_RelationDesc; + remoteslot_part = partrelinfo_new->ri_PartitionTupleSlot; + if (remoteslot_part == NULL) + remoteslot_part = table_slot_create(partrel, + &estate->es_tupleTable); + map = partrelinfo_new->ri_RootToPartitionMap; + if (map != NULL) + { + remoteslot_part = execute_attr_map_slot(map->attrMap, + remoteslot, + remoteslot_part); + } + else + { + remoteslot_part = ExecCopySlot(remoteslot_part, + remoteslot); + slot_getallattrs(remoteslot); + } + MemoryContextSwitchTo(oldctx); + apply_handle_insert_internal(edata, partrelinfo_new, + remoteslot_part); + } + } + break; + + default: + elog(ERROR, "unrecognized CmdType: %d", (int) operation); + break; + } +} + +/* + * Handle TRUNCATE message. 
+ * + * TODO: FDW support + */ +static void +apply_handle_truncate(StringInfo s) +{ + bool cascade = false; + bool restart_seqs = false; + List *remote_relids = NIL; + List *remote_rels = NIL; + List *rels = NIL; + List *part_rels = NIL; + List *relids = NIL; + List *relids_logged = NIL; + ListCell *lc; + LOCKMODE lockmode = AccessExclusiveLock; + + if (handle_streamed_transaction(LOGICAL_REP_MSG_TRUNCATE, s)) + return; + + begin_replication_step(); + + remote_relids = logicalrep_read_truncate(s, &cascade, &restart_seqs); + + foreach(lc, remote_relids) + { + LogicalRepRelId relid = lfirst_oid(lc); + LogicalRepRelMapEntry *rel; + + rel = logicalrep_rel_open(relid, lockmode); + if (!should_apply_changes_for_rel(rel)) + { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, lockmode); + continue; + } + + remote_rels = lappend(remote_rels, rel); + rels = lappend(rels, rel->localrel); + relids = lappend_oid(relids, rel->localreloid); + if (RelationIsLogicallyLogged(rel->localrel)) + relids_logged = lappend_oid(relids_logged, rel->localreloid); + + /* + * Truncate partitions if we got a message to truncate a partitioned + * table. + */ + if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + ListCell *child; + List *children = find_all_inheritors(rel->localreloid, + lockmode, + NULL); + + foreach(child, children) + { + Oid childrelid = lfirst_oid(child); + Relation childrel; + + if (list_member_oid(relids, childrelid)) + continue; + + /* find_all_inheritors already got lock */ + childrel = table_open(childrelid, NoLock); + + /* + * Ignore temp tables of other backends. See similar code in + * ExecuteTruncate(). 
+ */ + if (RELATION_IS_OTHER_TEMP(childrel)) + { + table_close(childrel, lockmode); + continue; + } + + rels = lappend(rels, childrel); + part_rels = lappend(part_rels, childrel); + relids = lappend_oid(relids, childrelid); + /* Log this relation only if needed for logical decoding */ + if (RelationIsLogicallyLogged(childrel)) + relids_logged = lappend_oid(relids_logged, childrelid); + } + } + } + + /* + * Even if we used CASCADE on the upstream primary we explicitly default + * to replaying changes without further cascading. This might be later + * changeable with a user specified option. + */ + ExecuteTruncateGuts(rels, + relids, + relids_logged, + DROP_RESTRICT, + restart_seqs); + foreach(lc, remote_rels) + { + LogicalRepRelMapEntry *rel = lfirst(lc); + + logicalrep_rel_close(rel, NoLock); + } + foreach(lc, part_rels) + { + Relation rel = lfirst(lc); + + table_close(rel, NoLock); + } + + end_replication_step(); +} + + +/* + * Logical replication protocol message dispatcher. + */ +static void +apply_dispatch(StringInfo s) +{ + LogicalRepMsgType action = pq_getmsgbyte(s); + + switch (action) + { + case LOGICAL_REP_MSG_BEGIN: + apply_handle_begin(s); + return; + + case LOGICAL_REP_MSG_COMMIT: + apply_handle_commit(s); + return; + + case LOGICAL_REP_MSG_INSERT: + apply_handle_insert(s); + return; + + case LOGICAL_REP_MSG_UPDATE: + apply_handle_update(s); + return; + + case LOGICAL_REP_MSG_DELETE: + apply_handle_delete(s); + return; + + case LOGICAL_REP_MSG_TRUNCATE: + apply_handle_truncate(s); + return; + + case LOGICAL_REP_MSG_RELATION: + apply_handle_relation(s); + return; + + case LOGICAL_REP_MSG_TYPE: + apply_handle_type(s); + return; + + case LOGICAL_REP_MSG_ORIGIN: + apply_handle_origin(s); + return; + + case LOGICAL_REP_MSG_MESSAGE: + + /* + * Logical replication does not use generic logical messages yet. + * Although, it could be used by other applications that use this + * output plugin. 
+ */ + return; + + case LOGICAL_REP_MSG_STREAM_START: + apply_handle_stream_start(s); + return; + + case LOGICAL_REP_MSG_STREAM_END: + apply_handle_stream_stop(s); + return; + + case LOGICAL_REP_MSG_STREAM_ABORT: + apply_handle_stream_abort(s); + return; + + case LOGICAL_REP_MSG_STREAM_COMMIT: + apply_handle_stream_commit(s); + return; + } + + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("invalid logical replication message type \"%c\"", + action))); +} + +/* + * Figure out which write/flush positions to report to the walsender process. + * + * We can't simply report back the last LSN the walsender sent us because the + * local transaction might not yet be flushed to disk locally. Instead we + * build a list that associates local with remote LSNs for every commit. When + * reporting back the flush position to the sender we iterate that list and + * check which entries on it are already locally flushed. Those we can report + * as having been flushed. + * + * The have_pending_txes is true if there are outstanding transactions that + * need to be flushed. + */ +static void +get_flush_position(XLogRecPtr *write, XLogRecPtr *flush, + bool *have_pending_txes) +{ + dlist_mutable_iter iter; + XLogRecPtr local_flush = GetFlushRecPtr(); + + *write = InvalidXLogRecPtr; + *flush = InvalidXLogRecPtr; + + dlist_foreach_modify(iter, &lsn_mapping) + { + FlushPosition *pos = + dlist_container(FlushPosition, node, iter.cur); + + *write = pos->remote_end; + + if (pos->local_end <= local_flush) + { + *flush = pos->remote_end; + dlist_delete(iter.cur); + pfree(pos); + } + else + { + /* + * Don't want to uselessly iterate over the rest of the list which + * could potentially be long. Instead get the last element and + * grab the write position from there. 
+ */ + pos = dlist_tail_element(FlushPosition, node, + &lsn_mapping); + *write = pos->remote_end; + *have_pending_txes = true; + return; + } + } + + *have_pending_txes = !dlist_is_empty(&lsn_mapping); +} + +/* + * Store current remote/local lsn pair in the tracking list. + */ +static void +store_flush_position(XLogRecPtr remote_lsn) +{ + FlushPosition *flushpos; + + /* Need to do this in permanent context */ + MemoryContextSwitchTo(ApplyContext); + + /* Track commit lsn */ + flushpos = (FlushPosition *) palloc(sizeof(FlushPosition)); + flushpos->local_end = XactLastCommitEnd; + flushpos->remote_end = remote_lsn; + + dlist_push_tail(&lsn_mapping, &flushpos->node); + MemoryContextSwitchTo(ApplyMessageContext); +} + + +/* Update statistics of the worker. */ +static void +UpdateWorkerStats(XLogRecPtr last_lsn, TimestampTz send_time, bool reply) +{ + MyLogicalRepWorker->last_lsn = last_lsn; + MyLogicalRepWorker->last_send_time = send_time; + MyLogicalRepWorker->last_recv_time = GetCurrentTimestamp(); + if (reply) + { + MyLogicalRepWorker->reply_lsn = last_lsn; + MyLogicalRepWorker->reply_time = send_time; + } +} + +/* + * Apply main loop. + */ +static void +LogicalRepApplyLoop(XLogRecPtr last_received) +{ + TimestampTz last_recv_timestamp = GetCurrentTimestamp(); + bool ping_sent = false; + TimeLineID tli; + + /* + * Init the ApplyMessageContext which we clean up after each replication + * protocol message. + */ + ApplyMessageContext = AllocSetContextCreate(ApplyContext, + "ApplyMessageContext", + ALLOCSET_DEFAULT_SIZES); + + /* + * This memory context is used for per-stream data when the streaming mode + * is enabled. This context is reset on each stream stop. + */ + LogicalStreamingContext = AllocSetContextCreate(ApplyContext, + "LogicalStreamingContext", + ALLOCSET_DEFAULT_SIZES); + + /* mark as idle, before starting to loop */ + pgstat_report_activity(STATE_IDLE, NULL); + + /* This outer loop iterates once per wait. 
*/ + for (;;) + { + pgsocket fd = PGINVALID_SOCKET; + int rc; + int len; + char *buf = NULL; + bool endofstream = false; + long wait_time; + + CHECK_FOR_INTERRUPTS(); + + MemoryContextSwitchTo(ApplyMessageContext); + + len = walrcv_receive(LogRepWorkerWalRcvConn, &buf, &fd); + + if (len != 0) + { + /* Loop to process all available data (without blocking). */ + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + if (len == 0) + { + break; + } + else if (len < 0) + { + ereport(LOG, + (errmsg("data stream from publisher has ended"))); + endofstream = true; + break; + } + else + { + int c; + StringInfoData s; + + /* Reset timeout. */ + last_recv_timestamp = GetCurrentTimestamp(); + ping_sent = false; + + /* Ensure we are reading the data into our memory context. */ + MemoryContextSwitchTo(ApplyMessageContext); + + s.data = buf; + s.len = len; + s.cursor = 0; + s.maxlen = -1; + + c = pq_getmsgbyte(&s); + + if (c == 'w') + { + XLogRecPtr start_lsn; + XLogRecPtr end_lsn; + TimestampTz send_time; + + start_lsn = pq_getmsgint64(&s); + end_lsn = pq_getmsgint64(&s); + send_time = pq_getmsgint64(&s); + + if (last_received < start_lsn) + last_received = start_lsn; + + if (last_received < end_lsn) + last_received = end_lsn; + + UpdateWorkerStats(last_received, send_time, false); + + apply_dispatch(&s); + } + else if (c == 'k') + { + XLogRecPtr end_lsn; + TimestampTz timestamp; + bool reply_requested; + + end_lsn = pq_getmsgint64(&s); + timestamp = pq_getmsgint64(&s); + reply_requested = pq_getmsgbyte(&s); + + if (last_received < end_lsn) + last_received = end_lsn; + + send_feedback(last_received, reply_requested, false); + UpdateWorkerStats(last_received, timestamp, true); + } + /* other message types are purposefully ignored */ + + MemoryContextReset(ApplyMessageContext); + } + + len = walrcv_receive(LogRepWorkerWalRcvConn, &buf, &fd); + } + } + + /* confirm all writes so far */ + send_feedback(last_received, false, false); + + if (!in_remote_transaction && !in_streamed_transaction) + { 
+ /* + * If we didn't get any transactions for a while there might be + * unconsumed invalidation messages in the queue, consume them + * now. + */ + AcceptInvalidationMessages(); + maybe_reread_subscription(); + + /* Process any table synchronization changes. */ + process_syncing_tables(last_received); + } + + /* Cleanup the memory. */ + MemoryContextResetAndDeleteChildren(ApplyMessageContext); + MemoryContextSwitchTo(TopMemoryContext); + + /* Check if we need to exit the streaming loop. */ + if (endofstream) + break; + + /* + * Wait for more data or latch. If we have unflushed transactions, + * wake up after WalWriterDelay to see if they've been flushed yet (in + * which case we should send a feedback message). Otherwise, there's + * no particular urgency about waking up unless we get data or a + * signal. + */ + if (!dlist_is_empty(&lsn_mapping)) + wait_time = WalWriterDelay; + else + wait_time = NAPTIME_PER_CYCLE; + + rc = WaitLatchOrSocket(MyLatch, + WL_SOCKET_READABLE | WL_LATCH_SET | + WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + fd, wait_time, + WAIT_EVENT_LOGICAL_APPLY_MAIN); + + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + } + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + + if (rc & WL_TIMEOUT) + { + /* + * We didn't receive anything new. If we haven't heard anything + * from the server for more than wal_receiver_timeout / 2, ping + * the server. Also, if it's been longer than + * wal_receiver_status_interval since the last update we sent, + * send a status update to the primary anyway, to report any + * progress in applying WAL. + */ + bool requestReply = false; + + /* + * Check if time since last receive from primary has reached the + * configured limit. 
+ */ + if (wal_receiver_timeout > 0) + { + TimestampTz now = GetCurrentTimestamp(); + TimestampTz timeout; + + timeout = + TimestampTzPlusMilliseconds(last_recv_timestamp, + wal_receiver_timeout); + + if (now >= timeout) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("terminating logical replication worker due to timeout"))); + + /* Check to see if it's time for a ping. */ + if (!ping_sent) + { + timeout = TimestampTzPlusMilliseconds(last_recv_timestamp, + (wal_receiver_timeout / 2)); + if (now >= timeout) + { + requestReply = true; + ping_sent = true; + } + } + } + + send_feedback(last_received, requestReply, requestReply); + } + } + + /* All done */ + walrcv_endstreaming(LogRepWorkerWalRcvConn, &tli); +} + +/* + * Send a Standby Status Update message to server. + * + * 'recvpos' is the latest LSN we've received data to, force is set if we need + * to send a response to avoid timeouts. + */ +static void +send_feedback(XLogRecPtr recvpos, bool force, bool requestReply) +{ + static StringInfo reply_message = NULL; + static TimestampTz send_time = 0; + + static XLogRecPtr last_recvpos = InvalidXLogRecPtr; + static XLogRecPtr last_writepos = InvalidXLogRecPtr; + static XLogRecPtr last_flushpos = InvalidXLogRecPtr; + + XLogRecPtr writepos; + XLogRecPtr flushpos; + TimestampTz now; + bool have_pending_txes; + + /* + * If the user doesn't want status to be reported to the publisher, be + * sure to exit before doing anything at all. + */ + if (!force && wal_receiver_status_interval <= 0) + return; + + /* It's legal to not pass a recvpos */ + if (recvpos < last_recvpos) + recvpos = last_recvpos; + + get_flush_position(&writepos, &flushpos, &have_pending_txes); + + /* + * No outstanding transactions to flush, we can report the latest received + * position. This is important for synchronous replication. 
+ */ + if (!have_pending_txes) + flushpos = writepos = recvpos; + + if (writepos < last_writepos) + writepos = last_writepos; + + if (flushpos < last_flushpos) + flushpos = last_flushpos; + + now = GetCurrentTimestamp(); + + /* if we've already reported everything and the status interval has not elapsed, we're good */ + if (!force && + writepos == last_writepos && + flushpos == last_flushpos && + !TimestampDifferenceExceeds(send_time, now, + wal_receiver_status_interval * 1000)) + return; + send_time = now; + + if (!reply_message) + { + MemoryContext oldctx = MemoryContextSwitchTo(ApplyContext); + + reply_message = makeStringInfo(); + MemoryContextSwitchTo(oldctx); + } + else + resetStringInfo(reply_message); + + pq_sendbyte(reply_message, 'r'); + pq_sendint64(reply_message, recvpos); /* write */ + pq_sendint64(reply_message, flushpos); /* flush */ + pq_sendint64(reply_message, writepos); /* apply */ + pq_sendint64(reply_message, now); /* sendTime */ + pq_sendbyte(reply_message, requestReply); /* replyRequested */ + + elog(DEBUG2, "sending feedback (force %d) to recv %X/%X, write %X/%X, flush %X/%X", + force, + LSN_FORMAT_ARGS(recvpos), + LSN_FORMAT_ARGS(writepos), + LSN_FORMAT_ARGS(flushpos)); + + walrcv_send(LogRepWorkerWalRcvConn, + reply_message->data, reply_message->len); + + if (recvpos > last_recvpos) + last_recvpos = recvpos; + if (writepos > last_writepos) + last_writepos = writepos; + if (flushpos > last_flushpos) + last_flushpos = flushpos; +} + +/* + * Reread subscription info if needed. Most changes cause the worker to exit. + */ +static void +maybe_reread_subscription(void) +{ + MemoryContext oldctx; + Subscription *newsub; + bool started_tx = false; + + /* When cache state is valid there is nothing to do here. */ + if (MySubscriptionValid) + return; + + /* This function might be called inside or outside of transaction. */ + if (!IsTransactionState()) + { + StartTransactionCommand(); + started_tx = true; + } + + /* Ensure allocations in permanent context. 
*/ + oldctx = MemoryContextSwitchTo(ApplyContext); + + newsub = GetSubscription(MyLogicalRepWorker->subid, true); + + /* + * Exit if the subscription was removed. This normally should not happen + * as the worker gets killed during DROP SUBSCRIPTION. + */ + if (!newsub) + { + ereport(LOG, + (errmsg("logical replication apply worker for subscription \"%s\" will " + "stop because the subscription was removed", + MySubscription->name))); + + proc_exit(0); + } + + /* + * Exit if the subscription was disabled. This normally should not happen + * as the worker gets killed during ALTER SUBSCRIPTION ... DISABLE. + */ + if (!newsub->enabled) + { + ereport(LOG, + (errmsg("logical replication apply worker for subscription \"%s\" will " + "stop because the subscription was disabled", + MySubscription->name))); + + proc_exit(0); + } + + /* !slotname should never happen when enabled is true. */ + Assert(newsub->slotname); + + /* + * Exit if any parameter that affects the remote connection was changed. + * The launcher will start a new worker. + */ + if (strcmp(newsub->conninfo, MySubscription->conninfo) != 0 || + strcmp(newsub->name, MySubscription->name) != 0 || + strcmp(newsub->slotname, MySubscription->slotname) != 0 || + newsub->binary != MySubscription->binary || + newsub->stream != MySubscription->stream || + !equal(newsub->publications, MySubscription->publications)) + { + ereport(LOG, + (errmsg("logical replication apply worker for subscription \"%s\" will restart because of a parameter change", + MySubscription->name))); + + proc_exit(0); + } + + /* Check for other changes that should never happen too. */ + if (newsub->dbid != MySubscription->dbid) + { + elog(ERROR, "subscription %u changed unexpectedly", + MyLogicalRepWorker->subid); + } + + /* Clean old subscription info and switch to new one. 
*/ + FreeSubscription(MySubscription); + MySubscription = newsub; + + MemoryContextSwitchTo(oldctx); + + /* Change synchronous commit according to the user's wishes */ + SetConfigOption("synchronous_commit", MySubscription->synccommit, + PGC_BACKEND, PGC_S_OVERRIDE); + + if (started_tx) + CommitTransactionCommand(); + + MySubscriptionValid = true; +} + +/* + * Callback from subscription syscache invalidation. + * + * Only marks the cached subscription as invalid; the arguments are not used. + * maybe_reread_subscription() reloads the subscription the next time it is + * called while the flag is false. + */ +static void +subscription_change_cb(Datum arg, int cacheid, uint32 hashvalue) +{ + MySubscriptionValid = false; +} + +/* + * subxact_info_write + * Store information about subxacts for a toplevel transaction. + * + * For each subxact we store the offset of its first change in the main file. + * The file is always over-written as a whole. + * + * XXX We should only store subxacts that were not aborted yet. + */ +static void +subxact_info_write(Oid subid, TransactionId xid) +{ + char path[MAXPGPATH]; + Size len; + StreamXidHash *ent; + BufFile *fd; + + Assert(TransactionIdIsValid(xid)); + + /* Find the xid entry in the xidhash */ + ent = (StreamXidHash *) hash_search(xidhash, + (void *) &xid, + HASH_FIND, + NULL); + /* By this time we must have created the transaction entry */ + Assert(ent); + + /* + * If there are no subtransactions then there is nothing to write, but if + * we already have a subxact file then delete it. + */ + if (subxact_data.nsubxacts == 0) + { + if (ent->subxact_fileset) + { + cleanup_subxact_info(); + SharedFileSetDeleteAll(ent->subxact_fileset); + pfree(ent->subxact_fileset); + ent->subxact_fileset = NULL; + } + return; + } + + subxact_filename(path, subid, xid); + + /* + * Create the subxact file if it is not already created, otherwise open the + * existing file. + */ + if (ent->subxact_fileset == NULL) + { + MemoryContext oldctx; + + /* + * We need to maintain shared fileset across multiple stream + * start/stop calls. So, need to allocate it in a persistent context. 
+ */ + oldctx = MemoryContextSwitchTo(ApplyContext); + ent->subxact_fileset = palloc(sizeof(SharedFileSet)); + SharedFileSetInit(ent->subxact_fileset, NULL); + MemoryContextSwitchTo(oldctx); + + fd = BufFileCreateShared(ent->subxact_fileset, path); + } + else + fd = BufFileOpenShared(ent->subxact_fileset, path, O_RDWR); + + len = sizeof(SubXactInfo) * subxact_data.nsubxacts; + + /* Write the subxact count and subxact info */ + BufFileWrite(fd, &subxact_data.nsubxacts, sizeof(subxact_data.nsubxacts)); + BufFileWrite(fd, subxact_data.subxacts, len); + + BufFileClose(fd); + + /* free the memory allocated for subxact info */ + cleanup_subxact_info(); +} + +/* + * subxact_info_read + * Restore information about subxacts of a streamed transaction. + * + * Read information about subxacts into the structure subxact_data so that it + * can be used later. Reports an ERROR if the transaction has no entry in + * xidhash or if the subxact file cannot be read completely. + */ +static void +subxact_info_read(Oid subid, TransactionId xid) +{ + char path[MAXPGPATH]; + Size len; + BufFile *fd; + StreamXidHash *ent; + MemoryContext oldctx; + + Assert(!subxact_data.subxacts); + Assert(subxact_data.nsubxacts == 0); + Assert(subxact_data.nsubxacts_max == 0); + + /* Find the stream xid entry in the xidhash */ + ent = (StreamXidHash *) hash_search(xidhash, + (void *) &xid, + HASH_FIND, + NULL); + if (!ent) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("transaction %u not found in stream XID hash table", + xid))); + + /* + * If subxact_fileset is not set, that means we don't have any subxact + * info, so there is nothing to read. + */ + if (ent->subxact_fileset == NULL) + return; + + subxact_filename(path, subid, xid); + + fd = BufFileOpenShared(ent->subxact_fileset, path, O_RDONLY); + + /* read number of subxact items */ + if (BufFileRead(fd, &subxact_data.nsubxacts, + sizeof(subxact_data.nsubxacts)) != + sizeof(subxact_data.nsubxacts)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from streaming transaction's subxact file \"%s\": %m", + path))); + + len = sizeof(SubXactInfo) * 
subxact_data.nsubxacts; + + /* we keep the maximum as a power of 2 */ + subxact_data.nsubxacts_max = 1 << my_log2(subxact_data.nsubxacts); + + /* + * Allocate subxact information in the logical streaming context. We need + * this information during the complete stream so that we can add the sub + * transaction info to this. On stream stop we will flush this information + * to the subxact file and reset the logical streaming context. + */ + oldctx = MemoryContextSwitchTo(LogicalStreamingContext); + subxact_data.subxacts = palloc(subxact_data.nsubxacts_max * + sizeof(SubXactInfo)); + MemoryContextSwitchTo(oldctx); + + if ((len > 0) && ((BufFileRead(fd, subxact_data.subxacts, len)) != len)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from streaming transaction's subxact file \"%s\": %m", + path))); + + BufFileClose(fd); +} + +/* + * subxact_info_add + * Add information about a subxact (offset in the main file). + */ +static void +subxact_info_add(TransactionId xid) +{ + SubXactInfo *subxacts = subxact_data.subxacts; + int64 i; + + /* We must have a valid top level stream xid and a stream fd. */ + Assert(TransactionIdIsValid(stream_xid)); + Assert(stream_fd != NULL); + + /* + * If the XID matches the toplevel transaction, we don't want to add it. + */ + if (stream_xid == xid) + return; + + /* + * In most cases we're checking the same subxact as we've already seen in + * the last call, so make sure to ignore it (this change comes later). + */ + if (subxact_data.subxact_last == xid) + return; + + /* OK, remember we're processing this XID. */ + subxact_data.subxact_last = xid; + + /* + * Check if the transaction is already present in the array of subxact. We + * intentionally scan the array from the tail, because we're likely adding + * a change for the most recent subtransactions. + * + * XXX Can we rely on the subxact XIDs arriving in sorted order? That + * would allow us to use binary search here. 
+ */ + for (i = subxact_data.nsubxacts; i > 0; i--) + { + /* found, so we're done */ + if (subxacts[i - 1].xid == xid) + return; + } + + /* This is a new subxact, so we need to add it to the array. */ + if (subxact_data.nsubxacts == 0) + { + MemoryContext oldctx; + + subxact_data.nsubxacts_max = 128; + + /* + * Allocate this memory for subxacts in per-stream context, see + * subxact_info_read. + */ + oldctx = MemoryContextSwitchTo(LogicalStreamingContext); + subxacts = palloc(subxact_data.nsubxacts_max * sizeof(SubXactInfo)); + MemoryContextSwitchTo(oldctx); + } + else if (subxact_data.nsubxacts == subxact_data.nsubxacts_max) + { + subxact_data.nsubxacts_max *= 2; + subxacts = repalloc(subxacts, + subxact_data.nsubxacts_max * sizeof(SubXactInfo)); + } + + subxacts[subxact_data.nsubxacts].xid = xid; + + /* + * Get the current offset of the stream file and store it as offset of + * this subxact. + */ + BufFileTell(stream_fd, + &subxacts[subxact_data.nsubxacts].fileno, + &subxacts[subxact_data.nsubxacts].offset); + + subxact_data.nsubxacts++; + subxact_data.subxacts = subxacts; +} + +/* format filename for file containing the info about subxacts */ +static inline void +subxact_filename(char *path, Oid subid, TransactionId xid) +{ + snprintf(path, MAXPGPATH, "%u-%u.subxacts", subid, xid); +} + +/* format filename for file containing serialized changes */ +static inline void +changes_filename(char *path, Oid subid, TransactionId xid) +{ + snprintf(path, MAXPGPATH, "%u-%u.changes", subid, xid); +} + +/* + * stream_cleanup_files + * Cleanup files for a subscription / toplevel transaction. + * + * Remove files with serialized changes and subxact info for a particular + * toplevel transaction. Each subscription has a separate set of files. 
+ */ +static void +stream_cleanup_files(Oid subid, TransactionId xid) +{ + char path[MAXPGPATH]; + StreamXidHash *ent; + + /* Find the xid entry in the xidhash; it is an error if there is none */ + ent = (StreamXidHash *) hash_search(xidhash, + (void *) &xid, + HASH_FIND, + NULL); + if (!ent) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("transaction %u not found in stream XID hash table", + xid))); + + /* Delete the change file and release the stream fileset memory (the path formatted here is not passed to the delete call, which takes only the fileset) */ + changes_filename(path, subid, xid); + SharedFileSetDeleteAll(ent->stream_fileset); + pfree(ent->stream_fileset); + ent->stream_fileset = NULL; + + /* Delete the subxact file and release the memory, if it exists */ + if (ent->subxact_fileset) + { + subxact_filename(path, subid, xid); + SharedFileSetDeleteAll(ent->subxact_fileset); + pfree(ent->subxact_fileset); + ent->subxact_fileset = NULL; + } + + /* Remove the xid entry from the stream xid hash */ + hash_search(xidhash, (void *) &xid, HASH_REMOVE, NULL); +} + +/* + * stream_open_file + * Open a file that we'll use to serialize changes for a toplevel + * transaction. + * + * Open a file for streamed changes from a toplevel transaction identified + * by stream_xid (global variable). If it's the first chunk of streamed + * changes for this transaction, initialize the shared fileset and create the + * buffile, otherwise open the previously created file. + * + * This can only be called at the beginning of a "streaming" block, i.e. + * between stream_start/stream_stop messages from the upstream. 
+ */ +static void +stream_open_file(Oid subid, TransactionId xid, bool first_segment) +{ + char path[MAXPGPATH]; + bool found; + MemoryContext oldcxt; + StreamXidHash *ent; + + Assert(in_streamed_transaction); + Assert(OidIsValid(subid)); + Assert(TransactionIdIsValid(xid)); + Assert(stream_fd == NULL); + + /* create or find the xid entry in the xidhash */ + ent = (StreamXidHash *) hash_search(xidhash, + (void *) &xid, + HASH_ENTER, + &found); + + changes_filename(path, subid, xid); + elog(DEBUG1, "opening file \"%s\" for streamed changes", path); + + /* + * Create/open the buffiles under the logical streaming context so that we + * have those files until stream stop. + */ + oldcxt = MemoryContextSwitchTo(LogicalStreamingContext); + + /* + * If this is the first streamed segment, the file must not exist, so make + * sure we're the ones creating it. Otherwise just open the file for + * writing, in append mode. + */ + if (first_segment) + { + MemoryContext savectx; + SharedFileSet *fileset; + + if (found) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("incorrect first-segment flag for streamed replication transaction"))); + + /* + * We need to maintain shared fileset across multiple stream + * start/stop calls. So, need to allocate it in a persistent context. + */ + savectx = MemoryContextSwitchTo(ApplyContext); + fileset = palloc(sizeof(SharedFileSet)); + + SharedFileSetInit(fileset, NULL); + MemoryContextSwitchTo(savectx); + + stream_fd = BufFileCreateShared(fileset, path); + + /* Remember the fileset for the next stream of the same transaction */ + ent->xid = xid; + ent->stream_fileset = fileset; + ent->subxact_fileset = NULL; + } + else + { + if (!found) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("incorrect first-segment flag for streamed replication transaction"))); + + /* + * Open the file and seek to the end of the file because we always + * append the changes file. 
+ */ + stream_fd = BufFileOpenShared(ent->stream_fileset, path, O_RDWR); + BufFileSeek(stream_fd, 0, 0, SEEK_END); + } + + MemoryContextSwitchTo(oldcxt); +} + +/* + * stream_close_file + * Close the currently open file with streamed changes. + * + * This can only be called at the end of a streaming block, i.e. at stream_stop + * message from the upstream. + */ +static void +stream_close_file(void) +{ + Assert(in_streamed_transaction); + Assert(TransactionIdIsValid(stream_xid)); + Assert(stream_fd != NULL); + + BufFileClose(stream_fd); + + stream_xid = InvalidTransactionId; + stream_fd = NULL; +} + +/* + * stream_write_change + * Serialize a change to a file for the current toplevel transaction. + * + * The change is serialized in a simple format, with length (not including + * the length), action code (identifying the message type) and message + * contents (without the subxact TransactionId value). + */ +static void +stream_write_change(char action, StringInfo s) +{ + int len; + + Assert(in_streamed_transaction); + Assert(TransactionIdIsValid(stream_xid)); + Assert(stream_fd != NULL); + + /* total on-disk size, including the action type character */ + len = (s->len - s->cursor) + sizeof(char); + + /* first write the size */ + BufFileWrite(stream_fd, &len, sizeof(len)); + + /* then the action */ + BufFileWrite(stream_fd, &action, sizeof(action)); + + /* and finally the remaining part of the buffer (after the XID) */ + len = (s->len - s->cursor); + + BufFileWrite(stream_fd, &s->data[s->cursor], len); +} + +/* + * Cleanup the memory for subxacts and reset the related variables. 
+ */
+static inline void
+cleanup_subxact_info(void)
+{
+	/*
+	 * Declared with (void) rather than an empty parameter list so that this
+	 * definition is a real prototype and stray arguments are diagnosed.
+	 */
+
+	/* Release the subxact array, if one is currently allocated. */
+	if (subxact_data.subxacts)
+		pfree(subxact_data.subxacts);
+
+	/* Reset every bookkeeping field of subxact_data to its empty state. */
+	subxact_data.subxacts = NULL;
+	subxact_data.subxact_last = InvalidTransactionId;
+	subxact_data.nsubxacts = 0;
+	subxact_data.nsubxacts_max = 0;
+}
+
+/* Logical Replication Apply worker entry point */ +void +ApplyWorkerMain(Datum main_arg) +{ + int worker_slot = DatumGetInt32(main_arg); + MemoryContext oldctx; + char originname[NAMEDATALEN]; + XLogRecPtr origin_startpos; + char *myslotname; + WalRcvStreamOptions options; + + /* Attach to slot */ + logicalrep_worker_attach(worker_slot); + + /* Setup signal handling */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); + BackgroundWorkerUnblockSignals(); + + /* + * We don't currently need any ResourceOwner in a walreceiver process, but + * if we did, we could call CreateAuxProcessResourceOwner here. + */ + + /* Initialise stats to a sanish value */ + MyLogicalRepWorker->last_send_time = MyLogicalRepWorker->last_recv_time = + MyLogicalRepWorker->reply_time = GetCurrentTimestamp(); + + /* Load the libpq-specific functions */ + load_file("libpqwalreceiver", false); + + /* Run as replica session replication role. */ + SetConfigOption("session_replication_role", "replica", + PGC_SUSET, PGC_S_OVERRIDE); + + /* Connect to our database. */ + BackgroundWorkerInitializeConnectionByOid(MyLogicalRepWorker->dbid, + MyLogicalRepWorker->userid, + 0); + + /* + * Set always-secure search path, so malicious users can't redirect user + * code (e.g. pg_index.indexprs). + */ + SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE); + + /* Load the subscription into persistent memory context.
*/ + ApplyContext = AllocSetContextCreate(TopMemoryContext, + "ApplyContext", + ALLOCSET_DEFAULT_SIZES); + StartTransactionCommand(); + oldctx = MemoryContextSwitchTo(ApplyContext); + + MySubscription = GetSubscription(MyLogicalRepWorker->subid, true); + if (!MySubscription) + { + ereport(LOG, + (errmsg("logical replication apply worker for subscription %u will not " + "start because the subscription was removed during startup", + MyLogicalRepWorker->subid))); + proc_exit(0); + } + + MySubscriptionValid = true; + MemoryContextSwitchTo(oldctx); + + if (!MySubscription->enabled) + { + ereport(LOG, + (errmsg("logical replication apply worker for subscription \"%s\" will not " + "start because the subscription was disabled during startup", + MySubscription->name))); + + proc_exit(0); + } + + /* Setup synchronous commit according to the user's wishes */ + SetConfigOption("synchronous_commit", MySubscription->synccommit, + PGC_BACKEND, PGC_S_OVERRIDE); + + /* Keep us informed about subscription changes. */ + CacheRegisterSyscacheCallback(SUBSCRIPTIONOID, + subscription_change_cb, + (Datum) 0); + + if (am_tablesync_worker()) + ereport(LOG, + (errmsg("logical replication table synchronization worker for subscription \"%s\", table \"%s\" has started", + MySubscription->name, get_rel_name(MyLogicalRepWorker->relid)))); + else + ereport(LOG, + (errmsg("logical replication apply worker for subscription \"%s\" has started", + MySubscription->name))); + + CommitTransactionCommand(); + + /* Connect to the origin and start the replication. */ + elog(DEBUG1, "connecting to publisher using connection string \"%s\"", + MySubscription->conninfo); + + if (am_tablesync_worker()) + { + char *syncslotname; + + /* This is table synchronization worker, call initial sync. 
*/ + syncslotname = LogicalRepSyncTableStart(&origin_startpos); + + /* allocate slot name in long-lived context */ + myslotname = MemoryContextStrdup(ApplyContext, syncslotname); + + pfree(syncslotname); + } + else + { + /* This is main apply worker */ + RepOriginId originid; + TimeLineID startpointTLI; + char *err; + + myslotname = MySubscription->slotname; + + /* + * This shouldn't happen if the subscription is enabled, but guard + * against DDL bugs or manual catalog changes. (libpqwalreceiver will + * crash if slot is NULL.) + */ + if (!myslotname) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("subscription has no replication slot set"))); + + /* Setup replication origin tracking. */ + StartTransactionCommand(); + snprintf(originname, sizeof(originname), "pg_%u", MySubscription->oid); + originid = replorigin_by_name(originname, true); + if (!OidIsValid(originid)) + originid = replorigin_create(originname); + replorigin_session_setup(originid); + replorigin_session_origin = originid; + origin_startpos = replorigin_session_get_progress(false); + CommitTransactionCommand(); + + LogRepWorkerWalRcvConn = walrcv_connect(MySubscription->conninfo, true, + MySubscription->name, &err); + if (LogRepWorkerWalRcvConn == NULL) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not connect to the publisher: %s", err))); + + /* + * We don't really use the output identify_system for anything but it + * does some initializations on the upstream so let's still call it. + */ + (void) walrcv_identify_system(LogRepWorkerWalRcvConn, &startpointTLI); + } + + /* + * Setup callback for syscache so that we know when something changes in + * the subscription relation state. + */ + CacheRegisterSyscacheCallback(SUBSCRIPTIONRELMAP, + invalidate_syncing_table_states, + (Datum) 0); + + /* Build logical replication streaming options. 
*/ + options.logical = true; + options.startpoint = origin_startpos; + options.slotname = myslotname; + options.proto.logical.proto_version = + walrcv_server_version(LogRepWorkerWalRcvConn) >= 140000 ? + LOGICALREP_PROTO_STREAM_VERSION_NUM : LOGICALREP_PROTO_VERSION_NUM; + options.proto.logical.publication_names = MySubscription->publications; + options.proto.logical.binary = MySubscription->binary; + options.proto.logical.streaming = MySubscription->stream; + + /* Start normal logical streaming replication. */ + walrcv_startstreaming(LogRepWorkerWalRcvConn, &options); + + /* Run the main loop. */ + LogicalRepApplyLoop(origin_startpos); + + proc_exit(0); +} + +/* + * Is current process a logical replication worker? + */ +bool +IsLogicalWorker(void) +{ + return MyLogicalRepWorker != NULL; +} diff --git a/src/backend/replication/pgoutput/Makefile b/src/backend/replication/pgoutput/Makefile new file mode 100644 index 0000000..3b41fbc --- /dev/null +++ b/src/backend/replication/pgoutput/Makefile @@ -0,0 +1,32 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for src/backend/replication/pgoutput +# +# IDENTIFICATION +# src/backend/replication/pgoutput +# +#------------------------------------------------------------------------- + +subdir = src/backend/replication/pgoutput +top_builddir = ../../../.. 
+include $(top_builddir)/src/Makefile.global + +OBJS = \ + $(WIN32RES) \ + pgoutput.o +PGFILEDESC = "pgoutput - standard logical replication output plugin" +NAME = pgoutput + +all: all-shared-lib + +include $(top_srcdir)/src/Makefile.shlib + +install: all installdirs install-lib + +installdirs: installdirs-lib + +uninstall: uninstall-lib + +clean distclean maintainer-clean: clean-lib + rm -f $(OBJS) diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c new file mode 100644 index 0000000..ff9cf5d --- /dev/null +++ b/src/backend/replication/pgoutput/pgoutput.c @@ -0,0 +1,1346 @@ +/*------------------------------------------------------------------------- + * + * pgoutput.c + * Logical Replication output plugin + * + * Copyright (c) 2012-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/pgoutput/pgoutput.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/tupconvert.h" +#include "catalog/partition.h" +#include "catalog/pg_publication.h" +#include "commands/defrem.h" +#include "fmgr.h" +#include "replication/logical.h" +#include "replication/logicalproto.h" +#include "replication/origin.h" +#include "replication/pgoutput.h" +#include "utils/int8.h" +#include "utils/inval.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/syscache.h" +#include "utils/varlena.h" + +PG_MODULE_MAGIC; + +extern void _PG_output_plugin_init(OutputPluginCallbacks *cb); + +static void pgoutput_startup(LogicalDecodingContext *ctx, + OutputPluginOptions *opt, bool is_init); +static void pgoutput_shutdown(LogicalDecodingContext *ctx); +static void pgoutput_begin_txn(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn); +static void pgoutput_commit_txn(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, XLogRecPtr commit_lsn); +static void pgoutput_change(LogicalDecodingContext *ctx, + 
ReorderBufferTXN *txn, Relation rel, + ReorderBufferChange *change); +static void pgoutput_truncate(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, int nrelations, Relation relations[], + ReorderBufferChange *change); +static void pgoutput_message(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, XLogRecPtr message_lsn, + bool transactional, const char *prefix, + Size sz, const char *message); +static bool pgoutput_origin_filter(LogicalDecodingContext *ctx, + RepOriginId origin_id); +static void pgoutput_stream_start(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn); +static void pgoutput_stream_stop(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn); +static void pgoutput_stream_abort(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + XLogRecPtr abort_lsn); +static void pgoutput_stream_commit(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + XLogRecPtr commit_lsn); + +static bool publications_valid; +static bool in_streaming; + +static List *LoadPublications(List *pubnames); +static void publication_invalidation_cb(Datum arg, int cacheid, + uint32 hashvalue); +static void send_relation_and_attrs(Relation relation, TransactionId xid, + LogicalDecodingContext *ctx); +static void update_replication_progress(LogicalDecodingContext *ctx); + +/* + * Entry in the map used to remember which relation schemas we sent. + * + * The schema_sent flag determines if the current schema record for the + * relation (and for its ancestor if publish_as_relid is set) was already + * sent to the subscriber (in which case we don't need to send it again). + * + * The schema cache on downstream is however updated only at commit time, + * and with streamed transactions the commit order may be different from + * the order the transactions are sent in. Also, the (sub) transactions + * might get aborted so we need to send the schema for each (sub) transaction + * so that we don't lose the schema information on abort. 
For handling this, + * we maintain the list of xids (streamed_txns) for those we have already sent + * the schema. + * + * For partitions, 'pubactions' considers not only the table's own + * publications, but also those of all of its ancestors. + */ +typedef struct RelationSyncEntry +{ + Oid relid; /* relation oid */ + + bool schema_sent; + List *streamed_txns; /* streamed toplevel transactions with this + * schema */ + + bool replicate_valid; + PublicationActions pubactions; + + /* + * OID of the relation to publish changes as. For a partition, this may + * be set to one of its ancestors whose schema will be used when + * replicating changes, if publish_via_partition_root is set for the + * publication. + */ + Oid publish_as_relid; + + /* + * Map used when replicating using an ancestor's schema to convert tuples + * from partition's type to the ancestor's; NULL if publish_as_relid is + * same as 'relid' or if unnecessary due to partition and the ancestor + * having identical TupleDesc. + */ + TupleConversionMap *map; +} RelationSyncEntry; + +/* Map used to remember which relation schemas we sent. 
*/ +static HTAB *RelationSyncCache = NULL; + +static void init_rel_sync_cache(MemoryContext decoding_context); +static void cleanup_rel_sync_cache(TransactionId xid, bool is_commit); +static RelationSyncEntry *get_rel_sync_entry(PGOutputData *data, Oid relid); +static void rel_sync_cache_relation_cb(Datum arg, Oid relid); +static void rel_sync_cache_publication_cb(Datum arg, int cacheid, + uint32 hashvalue); +static void set_schema_sent_in_streamed_txn(RelationSyncEntry *entry, + TransactionId xid); +static bool get_schema_sent_in_streamed_txn(RelationSyncEntry *entry, + TransactionId xid); + +/* + * Specify output plugin callbacks + */ +void +_PG_output_plugin_init(OutputPluginCallbacks *cb) +{ + AssertVariableIsOfType(&_PG_output_plugin_init, LogicalOutputPluginInit); + + cb->startup_cb = pgoutput_startup; + cb->begin_cb = pgoutput_begin_txn; + cb->change_cb = pgoutput_change; + cb->truncate_cb = pgoutput_truncate; + cb->message_cb = pgoutput_message; + cb->commit_cb = pgoutput_commit_txn; + cb->filter_by_origin_cb = pgoutput_origin_filter; + cb->shutdown_cb = pgoutput_shutdown; + + /* transaction streaming */ + cb->stream_start_cb = pgoutput_stream_start; + cb->stream_stop_cb = pgoutput_stream_stop; + cb->stream_abort_cb = pgoutput_stream_abort; + cb->stream_commit_cb = pgoutput_stream_commit; + cb->stream_change_cb = pgoutput_change; + cb->stream_message_cb = pgoutput_message; + cb->stream_truncate_cb = pgoutput_truncate; +} + +static void +parse_output_parameters(List *options, PGOutputData *data) +{ + ListCell *lc; + bool protocol_version_given = false; + bool publication_names_given = false; + bool binary_option_given = false; + bool messages_option_given = false; + bool streaming_given = false; + + data->binary = false; + data->streaming = false; + data->messages = false; + + foreach(lc, options) + { + DefElem *defel = (DefElem *) lfirst(lc); + + Assert(defel->arg == NULL || IsA(defel->arg, String)); + + /* Check each param, whether or not we recognize it 
*/ + if (strcmp(defel->defname, "proto_version") == 0) + { + int64 parsed; + + if (protocol_version_given) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + protocol_version_given = true; + + if (!scanint8(strVal(defel->arg), true, &parsed)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid proto_version"))); + + if (parsed > PG_UINT32_MAX || parsed < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("proto_version \"%s\" out of range", + strVal(defel->arg)))); + + data->protocol_version = (uint32) parsed; + } + else if (strcmp(defel->defname, "publication_names") == 0) + { + if (publication_names_given) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + publication_names_given = true; + + if (!SplitIdentifierString(strVal(defel->arg), ',', + &data->publication_names)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_NAME), + errmsg("invalid publication_names syntax"))); + } + else if (strcmp(defel->defname, "binary") == 0) + { + if (binary_option_given) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + binary_option_given = true; + + data->binary = defGetBoolean(defel); + } + else if (strcmp(defel->defname, "messages") == 0) + { + if (messages_option_given) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + messages_option_given = true; + + data->messages = defGetBoolean(defel); + } + else if (strcmp(defel->defname, "streaming") == 0) + { + if (streaming_given) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + streaming_given = true; + + data->streaming = defGetBoolean(defel); + } + else + elog(ERROR, "unrecognized pgoutput option: %s", defel->defname); + } +} + +/* + * Initialize this plugin + */ +static void +pgoutput_startup(LogicalDecodingContext *ctx, 
OutputPluginOptions *opt, + bool is_init) +{ + PGOutputData *data = palloc0(sizeof(PGOutputData)); + + /* Create our memory context for private allocations. */ + data->context = AllocSetContextCreate(ctx->context, + "logical replication output context", + ALLOCSET_DEFAULT_SIZES); + + ctx->output_plugin_private = data; + + /* This plugin uses binary protocol. */ + opt->output_type = OUTPUT_PLUGIN_BINARY_OUTPUT; + + /* + * This is replication start and not slot initialization. + * + * Parse and validate options passed by the client. + */ + if (!is_init) + { + /* Parse the params and ERROR if we see any we don't recognize */ + parse_output_parameters(ctx->output_plugin_options, data); + + /* Check if we support requested protocol */ + if (data->protocol_version > LOGICALREP_PROTO_MAX_VERSION_NUM) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("client sent proto_version=%d but we only support protocol %d or lower", + data->protocol_version, LOGICALREP_PROTO_MAX_VERSION_NUM))); + + if (data->protocol_version < LOGICALREP_PROTO_MIN_VERSION_NUM) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("client sent proto_version=%d but we only support protocol %d or higher", + data->protocol_version, LOGICALREP_PROTO_MIN_VERSION_NUM))); + + if (list_length(data->publication_names) < 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("publication_names parameter missing"))); + + /* + * Decide whether to enable streaming. It is disabled by default, in + * which case we just update the flag in decoding context. Otherwise + * we only allow it with sufficient version of the protocol, and when + * the output plugin supports it. 
+ */ + if (!data->streaming) + ctx->streaming = false; + else if (data->protocol_version < LOGICALREP_PROTO_STREAM_VERSION_NUM) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("requested proto_version=%d does not support streaming, need %d or higher", + data->protocol_version, LOGICALREP_PROTO_STREAM_VERSION_NUM))); + else if (!ctx->streaming) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("streaming requested, but not supported by output plugin"))); + + /* Also remember we're currently not streaming any transaction. */ + in_streaming = false; + + /* Init publication state. */ + data->publications = NIL; + publications_valid = false; + CacheRegisterSyscacheCallback(PUBLICATIONOID, + publication_invalidation_cb, + (Datum) 0); + + /* Initialize relation schema cache. */ + init_rel_sync_cache(CacheMemoryContext); + } + else + { + /* Disable the streaming during the slot initialization mode. */ + ctx->streaming = false; + } +} + +/* + * BEGIN callback + */ +static void +pgoutput_begin_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn) +{ + bool send_replication_origin = txn->origin_id != InvalidRepOriginId; + + OutputPluginPrepareWrite(ctx, !send_replication_origin); + logicalrep_write_begin(ctx->out, txn); + + if (send_replication_origin) + { + char *origin; + + /*---------- + * XXX: which behaviour do we want here? 
+ * + * Alternatives: + * - don't send origin message if origin name not found + * (that's what we do now) + * - throw error - that will break replication, not good + * - send some special "unknown" origin + *---------- + */ + if (replorigin_by_oid(txn->origin_id, true, &origin)) + { + /* Message boundary */ + OutputPluginWrite(ctx, false); + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_origin(ctx->out, origin, txn->origin_lsn); + } + + } + + OutputPluginWrite(ctx, true); +} + +/* + * COMMIT callback + */ +static void +pgoutput_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + update_replication_progress(ctx); + + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_commit(ctx->out, txn, commit_lsn); + OutputPluginWrite(ctx, true); +} + +/* + * Write the current schema of the relation and its ancestor (if any) if not + * done yet. + */ +static void +maybe_send_schema(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, ReorderBufferChange *change, + Relation relation, RelationSyncEntry *relentry) +{ + bool schema_sent; + TransactionId xid = InvalidTransactionId; + TransactionId topxid = InvalidTransactionId; + + /* + * Remember XID of the (sub)transaction for the change. We don't care if + * it's top-level transaction or not (we have already sent that XID in + * start of the current streaming block). + * + * If we're not in a streaming block, just use InvalidTransactionId and + * the write methods will not include it. + */ + if (in_streaming) + xid = change->txn->xid; + + if (change->txn->toptxn) + topxid = change->txn->toptxn->xid; + else + topxid = xid; + + /* + * Do we need to send the schema? We do track streamed transactions + * separately, because those may be applied later (and the regular + * transactions won't see their effects until then) and in an order that + * we don't know at this point. + * + * XXX There is a scope of optimization here. 
Currently, we always send + * the schema first time in a streaming transaction but we can probably + * avoid that by checking 'relentry->schema_sent' flag. However, before + * doing that we need to study its impact on the case where we have a mix + * of streaming and non-streaming transactions. + */ + if (in_streaming) + schema_sent = get_schema_sent_in_streamed_txn(relentry, topxid); + else + schema_sent = relentry->schema_sent; + + /* Nothing to do if we already sent the schema. */ + if (schema_sent) + return; + + /* + * Nope, so send the schema. If the changes will be published using an + * ancestor's schema, not the relation's own, send that ancestor's schema + * before sending relation's own (XXX - maybe sending only the former + * suffices?). This is also a good place to set the map that will be used + * to convert the relation's tuples into the ancestor's format, if needed. + */ + if (relentry->publish_as_relid != RelationGetRelid(relation)) + { + Relation ancestor = RelationIdGetRelation(relentry->publish_as_relid); + TupleDesc indesc = RelationGetDescr(relation); + TupleDesc outdesc = RelationGetDescr(ancestor); + MemoryContext oldctx; + + /* Map must live as long as the session does. */ + oldctx = MemoryContextSwitchTo(CacheMemoryContext); + + /* + * Make copies of the TupleDescs that will live as long as the map + * does before putting into the map. + */ + indesc = CreateTupleDescCopy(indesc); + outdesc = CreateTupleDescCopy(outdesc); + relentry->map = convert_tuples_by_name(indesc, outdesc); + if (relentry->map == NULL) + { + /* Map not necessary, so free the TupleDescs too. 
*/ + FreeTupleDesc(indesc); + FreeTupleDesc(outdesc); + } + + MemoryContextSwitchTo(oldctx); + send_relation_and_attrs(ancestor, xid, ctx); + RelationClose(ancestor); + } + + send_relation_and_attrs(relation, xid, ctx); + + if (in_streaming) + set_schema_sent_in_streamed_txn(relentry, topxid); + else + relentry->schema_sent = true; +} + +/* + * Sends a relation + */ +static void +send_relation_and_attrs(Relation relation, TransactionId xid, + LogicalDecodingContext *ctx) +{ + TupleDesc desc = RelationGetDescr(relation); + int i; + + /* + * Write out type info if needed. We do that only for user-created types. + * We use FirstGenbkiObjectId as the cutoff, so that we only consider + * objects with hand-assigned OIDs to be "built in", not for instance any + * function or type defined in the information_schema. This is important + * because only hand-assigned OIDs can be expected to remain stable across + * major versions. + */ + for (i = 0; i < desc->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(desc, i); + + if (att->attisdropped || att->attgenerated) + continue; + + if (att->atttypid < FirstGenbkiObjectId) + continue; + + OutputPluginPrepareWrite(ctx, false); + logicalrep_write_typ(ctx->out, xid, att->atttypid); + OutputPluginWrite(ctx, false); + } + + OutputPluginPrepareWrite(ctx, false); + logicalrep_write_rel(ctx->out, xid, relation); + OutputPluginWrite(ctx, false); +} + +/* + * Sends the decoded DML over wire. + * + * This is called both in streaming and non-streaming modes. + */ +static void +pgoutput_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + PGOutputData *data = (PGOutputData *) ctx->output_plugin_private; + MemoryContext old; + RelationSyncEntry *relentry; + TransactionId xid = InvalidTransactionId; + Relation ancestor = NULL; + + update_replication_progress(ctx); + + if (!is_publishable_relation(relation)) + return; + + /* + * Remember the xid for the change in streaming mode. 
We need to send xid + * with each change in the streaming mode so that subscriber can make + * their association and on aborts, it can discard the corresponding + * changes. + */ + if (in_streaming) + xid = change->txn->xid; + + relentry = get_rel_sync_entry(data, RelationGetRelid(relation)); + + /* First check the table filter */ + switch (change->action) + { + case REORDER_BUFFER_CHANGE_INSERT: + if (!relentry->pubactions.pubinsert) + return; + break; + case REORDER_BUFFER_CHANGE_UPDATE: + if (!relentry->pubactions.pubupdate) + return; + break; + case REORDER_BUFFER_CHANGE_DELETE: + if (!relentry->pubactions.pubdelete) + return; + break; + default: + Assert(false); + } + + /* Avoid leaking memory by using and resetting our own context */ + old = MemoryContextSwitchTo(data->context); + + maybe_send_schema(ctx, txn, change, relation, relentry); + + /* Send the data */ + switch (change->action) + { + case REORDER_BUFFER_CHANGE_INSERT: + { + HeapTuple tuple = &change->data.tp.newtuple->tuple; + + /* Switch relation if publishing via root. */ + if (relentry->publish_as_relid != RelationGetRelid(relation)) + { + Assert(relation->rd_rel->relispartition); + ancestor = RelationIdGetRelation(relentry->publish_as_relid); + relation = ancestor; + /* Convert tuple if needed. */ + if (relentry->map) + tuple = execute_attr_map_tuple(tuple, relentry->map); + } + + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_insert(ctx->out, xid, relation, tuple, + data->binary); + OutputPluginWrite(ctx, true); + break; + } + case REORDER_BUFFER_CHANGE_UPDATE: + { + HeapTuple oldtuple = change->data.tp.oldtuple ? + &change->data.tp.oldtuple->tuple : NULL; + HeapTuple newtuple = &change->data.tp.newtuple->tuple; + + /* Switch relation if publishing via root. 
*/ + if (relentry->publish_as_relid != RelationGetRelid(relation)) + { + Assert(relation->rd_rel->relispartition); + ancestor = RelationIdGetRelation(relentry->publish_as_relid); + relation = ancestor; + /* Convert tuples if needed. */ + if (relentry->map) + { + if (oldtuple) + oldtuple = execute_attr_map_tuple(oldtuple, + relentry->map); + newtuple = execute_attr_map_tuple(newtuple, + relentry->map); + } + } + + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_update(ctx->out, xid, relation, oldtuple, + newtuple, data->binary); + OutputPluginWrite(ctx, true); + break; + } + case REORDER_BUFFER_CHANGE_DELETE: + if (change->data.tp.oldtuple) + { + HeapTuple oldtuple = &change->data.tp.oldtuple->tuple; + + /* Switch relation if publishing via root. */ + if (relentry->publish_as_relid != RelationGetRelid(relation)) + { + Assert(relation->rd_rel->relispartition); + ancestor = RelationIdGetRelation(relentry->publish_as_relid); + relation = ancestor; + /* Convert tuple if needed. */ + if (relentry->map) + oldtuple = execute_attr_map_tuple(oldtuple, relentry->map); + } + + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_delete(ctx->out, xid, relation, oldtuple, + data->binary); + OutputPluginWrite(ctx, true); + } + else + elog(DEBUG1, "didn't send DELETE change because of missing oldtuple"); + break; + default: + Assert(false); + } + + if (RelationIsValid(ancestor)) + { + RelationClose(ancestor); + ancestor = NULL; + } + + /* Cleanup */ + MemoryContextSwitchTo(old); + MemoryContextReset(data->context); +} + +static void +pgoutput_truncate(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, + int nrelations, Relation relations[], ReorderBufferChange *change) +{ + PGOutputData *data = (PGOutputData *) ctx->output_plugin_private; + MemoryContext old; + RelationSyncEntry *relentry; + int i; + int nrelids; + Oid *relids; + TransactionId xid = InvalidTransactionId; + + update_replication_progress(ctx); + + /* Remember the xid for the change in streaming mode. 
See pgoutput_change. */ + if (in_streaming) + xid = change->txn->xid; + + old = MemoryContextSwitchTo(data->context); + + relids = palloc0(nrelations * sizeof(Oid)); + nrelids = 0; + + for (i = 0; i < nrelations; i++) + { + Relation relation = relations[i]; + Oid relid = RelationGetRelid(relation); + + if (!is_publishable_relation(relation)) + continue; + + relentry = get_rel_sync_entry(data, relid); + + if (!relentry->pubactions.pubtruncate) + continue; + + /* + * Don't send partitions if the publication wants to send only the + * root tables through it. + */ + if (relation->rd_rel->relispartition && + relentry->publish_as_relid != relid) + continue; + + relids[nrelids++] = relid; + maybe_send_schema(ctx, txn, change, relation, relentry); + } + + if (nrelids > 0) + { + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_truncate(ctx->out, + xid, + nrelids, + relids, + change->data.truncate.cascade, + change->data.truncate.restart_seqs); + OutputPluginWrite(ctx, true); + } + + MemoryContextSwitchTo(old); + MemoryContextReset(data->context); +} + +static void +pgoutput_message(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, + XLogRecPtr message_lsn, bool transactional, const char *prefix, Size sz, + const char *message) +{ + PGOutputData *data = (PGOutputData *) ctx->output_plugin_private; + TransactionId xid = InvalidTransactionId; + + update_replication_progress(ctx); + + if (!data->messages) + return; + + /* + * Remember the xid for the message in streaming mode. See + * pgoutput_change. + */ + if (in_streaming) + xid = txn->xid; + + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_message(ctx->out, + xid, + message_lsn, + transactional, + prefix, + sz, + message); + OutputPluginWrite(ctx, true); +} + +/* + * Currently we always forward. + */ +static bool +pgoutput_origin_filter(LogicalDecodingContext *ctx, + RepOriginId origin_id) +{ + return false; +} + +/* + * Shutdown the output plugin. 
+ * + * Note, we don't need to clean the data->context as it's child context + * of the ctx->context so it will be cleaned up by logical decoding machinery. + */ +static void +pgoutput_shutdown(LogicalDecodingContext *ctx) +{ + if (RelationSyncCache) + { + hash_destroy(RelationSyncCache); + RelationSyncCache = NULL; /* the invalidation callbacks test for NULL */ + } +} + +/* + * Load publications from the list of publication names. + * + * Returns a list with one Publication pointer per name, in the order of + * pubnames (NIL if pubnames is empty). + */ +static List * +LoadPublications(List *pubnames) +{ + List *result = NIL; + ListCell *lc; + + foreach(lc, pubnames) + { + char *pubname = (char *) lfirst(lc); + Publication *pub = GetPublicationByName(pubname, false); + + result = lappend(result, pub); + } + + return result; +} + +/* + * Publication cache invalidation callback. + * + * Marks the loaded publication list as stale, so that get_rel_sync_entry() + * reloads it before its next use. + */ +static void +publication_invalidation_cb(Datum arg, int cacheid, uint32 hashvalue) +{ + publications_valid = false; + + /* + * Also invalidate per-relation cache so that next time the filtering info + * is checked it will be updated with the new publication settings. + */ + rel_sync_cache_publication_cb(arg, cacheid, hashvalue); +} + +/* + * START STREAM callback + * + * Writes the stream-start message for txn and sets in_streaming. If the + * transaction has a replication origin, an origin message is written as + * well, but only along with the first streamed chunk of the transaction. + */ +static void +pgoutput_stream_start(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn) +{ + bool send_replication_origin = txn->origin_id != InvalidRepOriginId; + + /* we can't nest streaming of transactions */ + Assert(!in_streaming); + + /* + * If we already sent the first stream for this transaction then don't + * send the origin id in the subsequent streams.
+ */ + if (rbtxn_is_streamed(txn)) + send_replication_origin = false; + + OutputPluginPrepareWrite(ctx, !send_replication_origin); + logicalrep_write_stream_start(ctx->out, txn->xid, !rbtxn_is_streamed(txn)); + + if (send_replication_origin) + { + char *origin; + + if (replorigin_by_oid(txn->origin_id, true, &origin)) + { + /* Message boundary: finish the stream-start message, begin the origin message */ + OutputPluginWrite(ctx, false); + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_origin(ctx->out, origin, InvalidXLogRecPtr); + } + } + + OutputPluginWrite(ctx, true); + + /* we're streaming a chunk of transaction now */ + in_streaming = true; +} + +/* + * STOP STREAM callback + * + * Writes the stream-stop message and clears in_streaming. + */ +static void +pgoutput_stream_stop(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn) +{ + /* we should be streaming a transaction */ + Assert(in_streaming); + + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_stream_stop(ctx->out); + OutputPluginWrite(ctx, true); + + /* we've stopped streaming a transaction */ + in_streaming = false; +} + +/* + * Notify downstream to discard the streamed transaction (along with all + * its subtransactions, if it's a toplevel transaction). + */ +static void +pgoutput_stream_abort(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + XLogRecPtr abort_lsn) +{ + ReorderBufferTXN *toptxn; + + /* + * The abort should happen outside a streaming block, even for streamed + * transactions. The transaction has to be marked as streamed, though. + */ + Assert(!in_streaming); + + /* determine the toplevel transaction */ + toptxn = (txn->toptxn) ? txn->toptxn : txn; + + Assert(rbtxn_is_streamed(toptxn)); + + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_stream_abort(ctx->out, toptxn->xid, txn->xid); + OutputPluginWrite(ctx, true); + + cleanup_rel_sync_cache(toptxn->xid, false); /* drop the toplevel xid from the per-relation streamed_txns lists */ +} + +/* + * Notify downstream to apply the streamed transaction (along with all + * it's subtransactions).
+ */ +static void +pgoutput_stream_commit(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + /* + * The commit should happen outside a streaming block, even for streamed + * transactions. The transaction has to be marked as streamed, though. + */ + Assert(!in_streaming); + Assert(rbtxn_is_streamed(txn)); + + update_replication_progress(ctx); + + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_stream_commit(ctx->out, txn, commit_lsn); + OutputPluginWrite(ctx, true); + + cleanup_rel_sync_cache(txn->xid, true); /* also sets schema_sent for relations whose schema was sent in this xid */ +} + +/* + * Initialize the relation schema sync cache for a decoding session. + * + * The hash table is created in the memory context given by cachectx. This + * function does nothing if the cache already exists. + * + * The hash table is destroyed at the end of a decoding session. While + * relcache invalidations still exist and will still be invoked, they + * will just see the null hash table global and take no action. + */ +static void +init_rel_sync_cache(MemoryContext cachectx) +{ + HASHCTL ctl; + + if (RelationSyncCache != NULL) + return; + + /* Make a new hash table for the cache, keyed by an Oid */ + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(RelationSyncEntry); + ctl.hcxt = cachectx; + + RelationSyncCache = hash_create("logical replication output relation cache", + 128, &ctl, + HASH_ELEM | HASH_CONTEXT | HASH_BLOBS); + + Assert(RelationSyncCache != NULL); + /* Register the relcache and PUBLICATIONRELMAP syscache invalidation callbacks. */ + CacheRegisterRelcacheCallback(rel_sync_cache_relation_cb, (Datum) 0); + CacheRegisterSyscacheCallback(PUBLICATIONRELMAP, + rel_sync_cache_publication_cb, + (Datum) 0); +} + +/* + * Check whether xid is in the entry's list of streamed transactions, i.e. + * whether the schema of the relation was already sent in that streamed + * transaction. + * + * The list is searched linearly, as we expect a relatively small number of + * streamed transactions. + */ +static bool +get_schema_sent_in_streamed_txn(RelationSyncEntry *entry, TransactionId xid) +{ + ListCell *lc; + + foreach(lc, entry->streamed_txns) + { /* the xids are read back with lfirst_int(), hence the cast to uint32 */ + if (xid == (uint32) lfirst_int(lc)) + return true; + } + + return false; +} + +/* + * Add the xid in the rel sync entry for which we have already sent the schema + * of the relation.
+ */ +static void +set_schema_sent_in_streamed_txn(RelationSyncEntry *entry, TransactionId xid) +{ + MemoryContext oldctx; + /* NOTE(review): the list is extended in CacheMemoryContext, presumably so that it outlives shorter-lived contexts - confirm. */ + oldctx = MemoryContextSwitchTo(CacheMemoryContext); + + entry->streamed_txns = lappend_int(entry->streamed_txns, xid); + + MemoryContextSwitchTo(oldctx); +} + +/* + * Find or create entry in the relation schema cache. + * + * This looks up publications that the given relation is directly or + * indirectly part of (the latter if it's really the relation's ancestor that + * is part of a publication) and fills up the found entry with the information + * about which operations to publish and whether to use an ancestor's schema + * when publishing. + * + * The publication information is (re)computed whenever the entry's + * replicate_valid flag is not set. The returned pointer refers to the hash + * table entry itself. + */ +static RelationSyncEntry * +get_rel_sync_entry(PGOutputData *data, Oid relid) +{ + RelationSyncEntry *entry; + bool found; + MemoryContext oldctx; + + Assert(RelationSyncCache != NULL); + + /* Find cached relation info, creating if not found */ + entry = (RelationSyncEntry *) hash_search(RelationSyncCache, + (void *) &relid, + HASH_ENTER, &found); + Assert(entry != NULL); + + /* Not found means schema wasn't sent */ + if (!found) + { + /* immediately make a new entry valid enough to satisfy callbacks */ + entry->schema_sent = false; + entry->streamed_txns = NIL; + entry->replicate_valid = false; + entry->pubactions.pubinsert = entry->pubactions.pubupdate = + entry->pubactions.pubdelete = entry->pubactions.pubtruncate = false; + entry->publish_as_relid = InvalidOid; + entry->map = NULL; /* will be set by maybe_send_schema() if + * needed */ + } + + /* (Re)compute the publication information if the entry is not valid */ + if (!entry->replicate_valid) + { + List *pubids = GetRelationPublications(relid); + ListCell *lc; + Oid publish_as_relid = relid; + int publish_ancestor_level = 0; + bool am_partition = get_rel_relispartition(relid); + char relkind = get_rel_relkind(relid); + + /* Reload publications if needed before use.
*/ + if (!publications_valid) + { + oldctx = MemoryContextSwitchTo(CacheMemoryContext); + if (data->publications) + list_free_deep(data->publications); + + data->publications = LoadPublications(data->publication_names); + MemoryContextSwitchTo(oldctx); + publications_valid = true; + } + + /* + * Build publication cache. We can't use one provided by relcache as + * relcache considers all publications the given relation is in, but + * here we only need to consider ones that the subscriber requested. + */ + foreach(lc, data->publications) + { + Publication *pub = lfirst(lc); + bool publish = false; + + /* + * Under what relid should we publish changes in this publication? + * We'll use the top-most relid across all publications. Also track + * the ancestor level for this publication. + */ + Oid pub_relid = relid; + int ancestor_level = 0; + + /* + * If this is a FOR ALL TABLES publication, pick the partition root + * and set the ancestor level accordingly. + */ + if (pub->alltables) + { + publish = true; + if (pub->pubviaroot && am_partition) + { + List *ancestors = get_partition_ancestors(relid); + + pub_relid = llast_oid(ancestors); + ancestor_level = list_length(ancestors); + } + } + + if (!publish) + { + bool ancestor_published = false; + + /* + * For a partition, check if any of the ancestors are + * published. If so, note down the topmost ancestor that is + * published via this publication, which will be used as the + * relation via which to publish the partition's changes. + */ + if (am_partition) + { + List *ancestors = get_partition_ancestors(relid); + ListCell *lc2; + int level = 0; + + /* + * Find the "topmost" ancestor that is in this + * publication.
+ * + * The loop does not stop at the first match, so the + * last matching ancestor in the list is the one kept. + */ + foreach(lc2, ancestors) + { + Oid ancestor = lfirst_oid(lc2); + + level++; /* 1-based position of this ancestor in the list */ + + if (list_member_oid(GetRelationPublications(ancestor), + pub->oid)) + { + ancestor_published = true; + if (pub->pubviaroot) + { + pub_relid = ancestor; + ancestor_level = level; + } + } + } + } + + if (list_member_oid(pubids, pub->oid) || ancestor_published) + publish = true; + } + + /* + * Don't publish changes for partitioned tables, because + * publishing those of its partitions suffices, unless partition + * changes won't be published due to pubviaroot being set. + */ + if (publish && + (relkind != RELKIND_PARTITIONED_TABLE || pub->pubviaroot)) + { + entry->pubactions.pubinsert |= pub->pubactions.pubinsert; + entry->pubactions.pubupdate |= pub->pubactions.pubupdate; + entry->pubactions.pubdelete |= pub->pubactions.pubdelete; + entry->pubactions.pubtruncate |= pub->pubactions.pubtruncate; + + /* + * We want to publish the changes as the top-most ancestor + * across all publications. So we need to check if the + * already calculated level is higher than the new one. If + * yes, we can ignore the new value (as it's a child). + * Otherwise the new value is an ancestor, so we keep it. + */ + if (publish_ancestor_level > ancestor_level) + continue; + + /* The new value is an ancestor, so let's keep it. */ + publish_as_relid = pub_relid; + publish_ancestor_level = ancestor_level; + } + } + + list_free(pubids); + /* NOTE(review): only pubids is freed here; the lists obtained from get_partition_ancestors() and GetRelationPublications(ancestor) above are not freed in this function. */ + entry->publish_as_relid = publish_as_relid; + entry->replicate_valid = true; + } + + return entry; +} + +/* + * Cleanup list of streamed transactions and update the schema_sent flag. + * + * When a streamed transaction commits or aborts, we need to remove the + * toplevel XID from the schema cache. If the transaction aborted, the + * subscriber will simply throw away the schema records we streamed, so + * we don't need to do anything else. + * + * If the transaction is committed, the subscriber will update the relation + * cache - so tweak the schema_sent flag accordingly.
+ */ +static void +cleanup_rel_sync_cache(TransactionId xid, bool is_commit) +{ + HASH_SEQ_STATUS hash_seq; + RelationSyncEntry *entry; + ListCell *lc; + + Assert(RelationSyncCache != NULL); + + hash_seq_init(&hash_seq, RelationSyncCache); + while ((entry = hash_seq_search(&hash_seq)) != NULL) + { + /* + * We can set the schema_sent flag for an entry that has the committed + * xid in the list as that ensures that the subscriber would have the + * corresponding schema and we don't need to send it unless there is + * any invalidation for that relation. + */ + foreach(lc, entry->streamed_txns) + { + if (xid == (uint32) lfirst_int(lc)) + { + if (is_commit) + entry->schema_sent = true; + + entry->streamed_txns = + foreach_delete_current(entry->streamed_txns, lc); + break; /* stop scanning this entry after the first match */ + } + } + } +} + +/* + * Relcache invalidation callback + * + * Resets the schema-sent state of the cached entry for relid, if there is + * one, and frees its tuple conversion map. + */ +static void +rel_sync_cache_relation_cb(Datum arg, Oid relid) +{ + RelationSyncEntry *entry; + + /* + * We can get here if the plugin was used via the SQL interface as the + * RelationSyncCache is destroyed when the decoding finishes, but there + * is no way to unregister the relcache invalidation callback. + */ + if (RelationSyncCache == NULL) + return; + + /* + * Nobody keeps pointers to entries in this hash table around outside + * logical decoding callback calls - but invalidation events can come in + * *during* a callback if we access the relcache in the callback. Because + * of that we must mark the cache entry as invalid but not remove it from + * the hash while it could still be referenced, then prune it at a later + * safe point. + * + * Getting invalidations for relations that aren't in the table is + * entirely normal, since there's no way to unregister for an invalidation + * event. So we don't care if it's found or not. + */ + entry = (RelationSyncEntry *) hash_search(RelationSyncCache, &relid, + HASH_FIND, NULL); + + /* + * Reset schema sent status as the relation definition may have changed.
+ * Also free any objects that depended on the earlier definition. + */ + if (entry != NULL) + { + entry->schema_sent = false; + list_free(entry->streamed_txns); + entry->streamed_txns = NIL; + if (entry->map) + { + /* + * Must free the TupleDescs contained in the map explicitly, + * because free_conversion_map() doesn't. + */ + FreeTupleDesc(entry->map->indesc); + FreeTupleDesc(entry->map->outdesc); + free_conversion_map(entry->map); + } + entry->map = NULL; + } +} + +/* + * Publication relation map syscache invalidation callback + * + * Marks the publication information of every cached entry as invalid. + */ +static void +rel_sync_cache_publication_cb(Datum arg, int cacheid, uint32 hashvalue) +{ + HASH_SEQ_STATUS status; + RelationSyncEntry *entry; + + /* + * We can get here if the plugin was used via the SQL interface as the + * RelationSyncCache is destroyed when the decoding finishes, but there + * is no way to unregister the syscache invalidation callback. + */ + if (RelationSyncCache == NULL) + return; + + /* + * There is no way to find which entry in our cache the hash belongs to so + * mark the whole cache as invalid. + */ + hash_seq_init(&status, RelationSyncCache); + while ((entry = (RelationSyncEntry *) hash_seq_search(&status)) != NULL) + { + entry->replicate_valid = false; + + /* + * There might be some relations dropped from the publication so we + * don't need to publish the changes for them. + * + * The flags must be cleared here because get_rel_sync_entry() only + * ever ORs publication actions into them when it revalidates the + * entry. + */ + entry->pubactions.pubinsert = false; + entry->pubactions.pubupdate = false; + entry->pubactions.pubdelete = false; + entry->pubactions.pubtruncate = false; + } +} + +/* + * Try to update progress and send a keepalive message if too many changes were + * processed. + * + * For a large transaction, if we don't send any change to the downstream for a + * long time (exceeds the wal_receiver_timeout of standby) then it can timeout. + * This can happen when all or most of the changes are not published.
+ */ +static void +update_replication_progress(LogicalDecodingContext *ctx) +{ + static int changes_count = 0; /* static: the count carries over between calls */ + + /* + * We don't want to try sending a keepalive message after processing each + * change as that can have overhead. Tests revealed that there is no + * noticeable overhead in doing it after continuously processing 100 or so + * changes. + */ +#define CHANGES_THRESHOLD 100 + + /* + * If we are at the end of transaction LSN, update progress tracking. + * Otherwise, after continuously processing CHANGES_THRESHOLD changes, we + * try to send a keepalive message if required. + * + * Because of the short-circuit evaluation, changes_count is not + * incremented when ctx->end_xact is set; it is reset to zero in either + * case. + */ + if (ctx->end_xact || ++changes_count >= CHANGES_THRESHOLD) + { + OutputPluginUpdateProgress(ctx); + changes_count = 0; + } +} diff --git a/src/backend/replication/repl_gram.c b/src/backend/replication/repl_gram.c new file mode 100644 index 0000000..e8928c4 --- /dev/null +++ b/src/backend/replication/repl_gram.c @@ -0,0 +1,1895 @@ +/* A Bison parser, made by GNU Bison 3.7.5. */ + +/* Bison implementation for Yacc-like parsers in C + + Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2021 Free Software Foundation, + Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see .
*/ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. + + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + +/* C LALR(1) parser skeleton written by Richard Stallman, by + simplifying the original so-called "semantic" parser. */ + +/* DO NOT RELY ON FEATURES THAT ARE NOT DOCUMENTED in the manual, + especially those whose name start with YY_ or yy_. They are + private implementation details that can be changed or removed. */ + +/* All symbols defined below should begin with yy or YY, to avoid + infringing on user name space. This should be done even for local + variables, as they might otherwise be expanded by user macros. + There are some unavoidable exceptions within include files to + define necessary library symbols; they are noted "INFRINGES ON + USER NAME SPACE" below. */ + +/* Identify Bison output, and Bison version. */ +#define YYBISON 30705 + +/* Bison version string. */ +#define YYBISON_VERSION "3.7.5" + +/* Skeleton name. */ +#define YYSKELETON_NAME "yacc.c" + +/* Pure parsers. */ +#define YYPURE 0 + +/* Push parsers. */ +#define YYPUSH 0 + +/* Pull parsers. */ +#define YYPULL 1 + + +/* Substitute the variable and function names. 
*/ +#define yyparse replication_yyparse +#define yylex replication_yylex +#define yyerror replication_yyerror +#define yydebug replication_yydebug +#define yynerrs replication_yynerrs +#define yylval replication_yylval +#define yychar replication_yychar + +/* First part of user prologue. */ +#line 1 "repl_gram.y" + +/*------------------------------------------------------------------------- + * + * repl_gram.y - Parser for the replication commands + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/replication/repl_gram.y + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/xlogdefs.h" +#include "nodes/makefuncs.h" +#include "nodes/replnodes.h" +#include "replication/walsender.h" +#include "replication/walsender_private.h" + + +/* Result of the parsing is returned here */ +Node *replication_parse_result; + + +/* + * Bison doesn't allocate anything that needs to live across parser calls, + * so we can easily have it use palloc instead of malloc. This prevents + * memory leaks if we error out during parsing. Note this only works with + * bison >= 2.0. However, in bison 1.875 the default is to use alloca() + * if possible, so there's not really much problem anyhow, at least if + * you're building with gcc. 
+ */ +#define YYMALLOC palloc +#define YYFREE pfree + + +#line 119 "repl_gram.c" + +# ifndef YY_CAST +# ifdef __cplusplus +# define YY_CAST(Type, Val) static_cast (Val) +# define YY_REINTERPRET_CAST(Type, Val) reinterpret_cast (Val) +# else +# define YY_CAST(Type, Val) ((Type) (Val)) +# define YY_REINTERPRET_CAST(Type, Val) ((Type) (Val)) +# endif +# endif +# ifndef YY_NULLPTR +# if defined __cplusplus +# if 201103L <= __cplusplus +# define YY_NULLPTR nullptr +# else +# define YY_NULLPTR 0 +# endif +# else +# define YY_NULLPTR ((void*)0) +# endif +# endif + + +/* Debug traces. */ +#ifndef YYDEBUG +# define YYDEBUG 0 +#endif +#if YYDEBUG +extern int replication_yydebug; +#endif + +/* Token kinds. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + enum yytokentype + { + YYEMPTY = -2, + YYEOF = 0, /* "end of file" */ + YYerror = 256, /* error */ + YYUNDEF = 257, /* "invalid token" */ + SCONST = 258, /* SCONST */ + IDENT = 259, /* IDENT */ + UCONST = 260, /* UCONST */ + RECPTR = 261, /* RECPTR */ + K_BASE_BACKUP = 262, /* K_BASE_BACKUP */ + K_IDENTIFY_SYSTEM = 263, /* K_IDENTIFY_SYSTEM */ + K_SHOW = 264, /* K_SHOW */ + K_START_REPLICATION = 265, /* K_START_REPLICATION */ + K_CREATE_REPLICATION_SLOT = 266, /* K_CREATE_REPLICATION_SLOT */ + K_DROP_REPLICATION_SLOT = 267, /* K_DROP_REPLICATION_SLOT */ + K_TIMELINE_HISTORY = 268, /* K_TIMELINE_HISTORY */ + K_LABEL = 269, /* K_LABEL */ + K_PROGRESS = 270, /* K_PROGRESS */ + K_FAST = 271, /* K_FAST */ + K_WAIT = 272, /* K_WAIT */ + K_NOWAIT = 273, /* K_NOWAIT */ + K_MAX_RATE = 274, /* K_MAX_RATE */ + K_WAL = 275, /* K_WAL */ + K_TABLESPACE_MAP = 276, /* K_TABLESPACE_MAP */ + K_NOVERIFY_CHECKSUMS = 277, /* K_NOVERIFY_CHECKSUMS */ + K_TIMELINE = 278, /* K_TIMELINE */ + K_PHYSICAL = 279, /* K_PHYSICAL */ + K_LOGICAL = 280, /* K_LOGICAL */ + K_SLOT = 281, /* K_SLOT */ + K_RESERVE_WAL = 282, /* K_RESERVE_WAL */ + K_TEMPORARY = 283, /* K_TEMPORARY */ + K_EXPORT_SNAPSHOT = 284, /* K_EXPORT_SNAPSHOT */ + K_NOEXPORT_SNAPSHOT = 285, /* 
K_NOEXPORT_SNAPSHOT */ + K_USE_SNAPSHOT = 286, /* K_USE_SNAPSHOT */ + K_MANIFEST = 287, /* K_MANIFEST */ + K_MANIFEST_CHECKSUMS = 288 /* K_MANIFEST_CHECKSUMS */ + }; + typedef enum yytokentype yytoken_kind_t; +#endif + +/* Value type. */ +#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED +union YYSTYPE +{ +#line 45 "repl_gram.y" + + char *str; + bool boolval; + uint32 uintval; + + XLogRecPtr recptr; + Node *node; + List *list; + DefElem *defelt; + +#line 210 "repl_gram.c" + +}; +typedef union YYSTYPE YYSTYPE; +# define YYSTYPE_IS_TRIVIAL 1 +# define YYSTYPE_IS_DECLARED 1 +#endif + + +extern YYSTYPE replication_yylval; + +int replication_yyparse (void); + + +/* Symbol kind. */ +enum yysymbol_kind_t +{ + YYSYMBOL_YYEMPTY = -2, + YYSYMBOL_YYEOF = 0, /* "end of file" */ + YYSYMBOL_YYerror = 1, /* error */ + YYSYMBOL_YYUNDEF = 2, /* "invalid token" */ + YYSYMBOL_SCONST = 3, /* SCONST */ + YYSYMBOL_IDENT = 4, /* IDENT */ + YYSYMBOL_UCONST = 5, /* UCONST */ + YYSYMBOL_RECPTR = 6, /* RECPTR */ + YYSYMBOL_K_BASE_BACKUP = 7, /* K_BASE_BACKUP */ + YYSYMBOL_K_IDENTIFY_SYSTEM = 8, /* K_IDENTIFY_SYSTEM */ + YYSYMBOL_K_SHOW = 9, /* K_SHOW */ + YYSYMBOL_K_START_REPLICATION = 10, /* K_START_REPLICATION */ + YYSYMBOL_K_CREATE_REPLICATION_SLOT = 11, /* K_CREATE_REPLICATION_SLOT */ + YYSYMBOL_K_DROP_REPLICATION_SLOT = 12, /* K_DROP_REPLICATION_SLOT */ + YYSYMBOL_K_TIMELINE_HISTORY = 13, /* K_TIMELINE_HISTORY */ + YYSYMBOL_K_LABEL = 14, /* K_LABEL */ + YYSYMBOL_K_PROGRESS = 15, /* K_PROGRESS */ + YYSYMBOL_K_FAST = 16, /* K_FAST */ + YYSYMBOL_K_WAIT = 17, /* K_WAIT */ + YYSYMBOL_K_NOWAIT = 18, /* K_NOWAIT */ + YYSYMBOL_K_MAX_RATE = 19, /* K_MAX_RATE */ + YYSYMBOL_K_WAL = 20, /* K_WAL */ + YYSYMBOL_K_TABLESPACE_MAP = 21, /* K_TABLESPACE_MAP */ + YYSYMBOL_K_NOVERIFY_CHECKSUMS = 22, /* K_NOVERIFY_CHECKSUMS */ + YYSYMBOL_K_TIMELINE = 23, /* K_TIMELINE */ + YYSYMBOL_K_PHYSICAL = 24, /* K_PHYSICAL */ + YYSYMBOL_K_LOGICAL = 25, /* K_LOGICAL */ + YYSYMBOL_K_SLOT = 26, /* K_SLOT */ + 
YYSYMBOL_K_RESERVE_WAL = 27, /* K_RESERVE_WAL */ + YYSYMBOL_K_TEMPORARY = 28, /* K_TEMPORARY */ + YYSYMBOL_K_EXPORT_SNAPSHOT = 29, /* K_EXPORT_SNAPSHOT */ + YYSYMBOL_K_NOEXPORT_SNAPSHOT = 30, /* K_NOEXPORT_SNAPSHOT */ + YYSYMBOL_K_USE_SNAPSHOT = 31, /* K_USE_SNAPSHOT */ + YYSYMBOL_K_MANIFEST = 32, /* K_MANIFEST */ + YYSYMBOL_K_MANIFEST_CHECKSUMS = 33, /* K_MANIFEST_CHECKSUMS */ + YYSYMBOL_34_ = 34, /* ';' */ + YYSYMBOL_35_ = 35, /* '.' */ + YYSYMBOL_36_ = 36, /* '(' */ + YYSYMBOL_37_ = 37, /* ')' */ + YYSYMBOL_38_ = 38, /* ',' */ + YYSYMBOL_YYACCEPT = 39, /* $accept */ + YYSYMBOL_firstcmd = 40, /* firstcmd */ + YYSYMBOL_opt_semicolon = 41, /* opt_semicolon */ + YYSYMBOL_command = 42, /* command */ + YYSYMBOL_identify_system = 43, /* identify_system */ + YYSYMBOL_show = 44, /* show */ + YYSYMBOL_var_name = 45, /* var_name */ + YYSYMBOL_base_backup = 46, /* base_backup */ + YYSYMBOL_base_backup_opt_list = 47, /* base_backup_opt_list */ + YYSYMBOL_base_backup_opt = 48, /* base_backup_opt */ + YYSYMBOL_create_replication_slot = 49, /* create_replication_slot */ + YYSYMBOL_create_slot_opt_list = 50, /* create_slot_opt_list */ + YYSYMBOL_create_slot_opt = 51, /* create_slot_opt */ + YYSYMBOL_drop_replication_slot = 52, /* drop_replication_slot */ + YYSYMBOL_start_replication = 53, /* start_replication */ + YYSYMBOL_start_logical_replication = 54, /* start_logical_replication */ + YYSYMBOL_timeline_history = 55, /* timeline_history */ + YYSYMBOL_opt_physical = 56, /* opt_physical */ + YYSYMBOL_opt_temporary = 57, /* opt_temporary */ + YYSYMBOL_opt_slot = 58, /* opt_slot */ + YYSYMBOL_opt_timeline = 59, /* opt_timeline */ + YYSYMBOL_plugin_options = 60, /* plugin_options */ + YYSYMBOL_plugin_opt_list = 61, /* plugin_opt_list */ + YYSYMBOL_plugin_opt_elem = 62, /* plugin_opt_elem */ + YYSYMBOL_plugin_opt_arg = 63 /* plugin_opt_arg */ +}; +typedef enum yysymbol_kind_t yysymbol_kind_t; + + + + +#ifdef short +# undef short +#endif + +/* On compilers that do not define 
__PTRDIFF_MAX__ etc., make sure + and (if available) are included + so that the code can choose integer types of a good width. */ + +#ifndef __PTRDIFF_MAX__ +# include /* INFRINGES ON USER NAME SPACE */ +# if defined __STDC_VERSION__ && 199901 <= __STDC_VERSION__ +# include /* INFRINGES ON USER NAME SPACE */ +# define YY_STDINT_H +# endif +#endif + +/* Narrow types that promote to a signed type and that can represent a + signed or unsigned integer of at least N bits. In tables they can + save space and decrease cache pressure. Promoting to a signed type + helps avoid bugs in integer arithmetic. */ + +#ifdef __INT_LEAST8_MAX__ +typedef __INT_LEAST8_TYPE__ yytype_int8; +#elif defined YY_STDINT_H +typedef int_least8_t yytype_int8; +#else +typedef signed char yytype_int8; +#endif + +#ifdef __INT_LEAST16_MAX__ +typedef __INT_LEAST16_TYPE__ yytype_int16; +#elif defined YY_STDINT_H +typedef int_least16_t yytype_int16; +#else +typedef short yytype_int16; +#endif + +/* Work around bug in HP-UX 11.23, which defines these macros + incorrectly for preprocessor constants. This workaround can likely + be removed in 2023, as HPE has promised support for HP-UX 11.23 + (aka HP-UX 11i v2) only through the end of 2022; see Table 2 of + . 
*/ +#ifdef __hpux +# undef UINT_LEAST8_MAX +# undef UINT_LEAST16_MAX +# define UINT_LEAST8_MAX 255 +# define UINT_LEAST16_MAX 65535 +#endif + +#if defined __UINT_LEAST8_MAX__ && __UINT_LEAST8_MAX__ <= __INT_MAX__ +typedef __UINT_LEAST8_TYPE__ yytype_uint8; +#elif (!defined __UINT_LEAST8_MAX__ && defined YY_STDINT_H \ + && UINT_LEAST8_MAX <= INT_MAX) +typedef uint_least8_t yytype_uint8; +#elif !defined __UINT_LEAST8_MAX__ && UCHAR_MAX <= INT_MAX +typedef unsigned char yytype_uint8; +#else +typedef short yytype_uint8; +#endif + +#if defined __UINT_LEAST16_MAX__ && __UINT_LEAST16_MAX__ <= __INT_MAX__ +typedef __UINT_LEAST16_TYPE__ yytype_uint16; +#elif (!defined __UINT_LEAST16_MAX__ && defined YY_STDINT_H \ + && UINT_LEAST16_MAX <= INT_MAX) +typedef uint_least16_t yytype_uint16; +#elif !defined __UINT_LEAST16_MAX__ && USHRT_MAX <= INT_MAX +typedef unsigned short yytype_uint16; +#else +typedef int yytype_uint16; +#endif + +#ifndef YYPTRDIFF_T +# if defined __PTRDIFF_TYPE__ && defined __PTRDIFF_MAX__ +# define YYPTRDIFF_T __PTRDIFF_TYPE__ +# define YYPTRDIFF_MAXIMUM __PTRDIFF_MAX__ +# elif defined PTRDIFF_MAX +# ifndef ptrdiff_t +# include /* INFRINGES ON USER NAME SPACE */ +# endif +# define YYPTRDIFF_T ptrdiff_t +# define YYPTRDIFF_MAXIMUM PTRDIFF_MAX +# else +# define YYPTRDIFF_T long +# define YYPTRDIFF_MAXIMUM LONG_MAX +# endif +#endif + +#ifndef YYSIZE_T +# ifdef __SIZE_TYPE__ +# define YYSIZE_T __SIZE_TYPE__ +# elif defined size_t +# define YYSIZE_T size_t +# elif defined __STDC_VERSION__ && 199901 <= __STDC_VERSION__ +# include /* INFRINGES ON USER NAME SPACE */ +# define YYSIZE_T size_t +# else +# define YYSIZE_T unsigned +# endif +#endif + +#define YYSIZE_MAXIMUM \ + YY_CAST (YYPTRDIFF_T, \ + (YYPTRDIFF_MAXIMUM < YY_CAST (YYSIZE_T, -1) \ + ? YYPTRDIFF_MAXIMUM \ + : YY_CAST (YYSIZE_T, -1))) + +#define YYSIZEOF(X) YY_CAST (YYPTRDIFF_T, sizeof (X)) + + +/* Stored state numbers (used for stacks). 
*/ +typedef yytype_int8 yy_state_t; + +/* State numbers in computations. */ +typedef int yy_state_fast_t; + +#ifndef YY_ +# if defined YYENABLE_NLS && YYENABLE_NLS +# if ENABLE_NLS +# include /* INFRINGES ON USER NAME SPACE */ +# define YY_(Msgid) dgettext ("bison-runtime", Msgid) +# endif +# endif +# ifndef YY_ +# define YY_(Msgid) Msgid +# endif +#endif + + +#ifndef YY_ATTRIBUTE_PURE +# if defined __GNUC__ && 2 < __GNUC__ + (96 <= __GNUC_MINOR__) +# define YY_ATTRIBUTE_PURE __attribute__ ((__pure__)) +# else +# define YY_ATTRIBUTE_PURE +# endif +#endif + +#ifndef YY_ATTRIBUTE_UNUSED +# if defined __GNUC__ && 2 < __GNUC__ + (7 <= __GNUC_MINOR__) +# define YY_ATTRIBUTE_UNUSED __attribute__ ((__unused__)) +# else +# define YY_ATTRIBUTE_UNUSED +# endif +#endif + +/* Suppress unused-variable warnings by "using" E. */ +#if ! defined lint || defined __GNUC__ +# define YY_USE(E) ((void) (E)) +#else +# define YY_USE(E) /* empty */ +#endif + +#if defined __GNUC__ && ! defined __ICC && 407 <= __GNUC__ * 100 + __GNUC_MINOR__ +/* Suppress an incorrect diagnostic about yylval being uninitialized. */ +# define YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN \ + _Pragma ("GCC diagnostic push") \ + _Pragma ("GCC diagnostic ignored \"-Wuninitialized\"") \ + _Pragma ("GCC diagnostic ignored \"-Wmaybe-uninitialized\"") +# define YY_IGNORE_MAYBE_UNINITIALIZED_END \ + _Pragma ("GCC diagnostic pop") +#else +# define YY_INITIAL_VALUE(Value) Value +#endif +#ifndef YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN +# define YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN +# define YY_IGNORE_MAYBE_UNINITIALIZED_END +#endif +#ifndef YY_INITIAL_VALUE +# define YY_INITIAL_VALUE(Value) /* Nothing. */ +#endif + +#if defined __cplusplus && defined __GNUC__ && ! 
defined __ICC && 6 <= __GNUC__ +# define YY_IGNORE_USELESS_CAST_BEGIN \ + _Pragma ("GCC diagnostic push") \ + _Pragma ("GCC diagnostic ignored \"-Wuseless-cast\"") +# define YY_IGNORE_USELESS_CAST_END \ + _Pragma ("GCC diagnostic pop") +#endif +#ifndef YY_IGNORE_USELESS_CAST_BEGIN +# define YY_IGNORE_USELESS_CAST_BEGIN +# define YY_IGNORE_USELESS_CAST_END +#endif + + +#define YY_ASSERT(E) ((void) (0 && (E))) + +#if !defined yyoverflow + +/* The parser invokes alloca or malloc; define the necessary symbols. */ + +# ifdef YYSTACK_USE_ALLOCA +# if YYSTACK_USE_ALLOCA +# ifdef __GNUC__ +# define YYSTACK_ALLOC __builtin_alloca +# elif defined __BUILTIN_VA_ARG_INCR +# include /* INFRINGES ON USER NAME SPACE */ +# elif defined _AIX +# define YYSTACK_ALLOC __alloca +# elif defined _MSC_VER +# include /* INFRINGES ON USER NAME SPACE */ +# define alloca _alloca +# else +# define YYSTACK_ALLOC alloca +# if ! defined _ALLOCA_H && ! defined EXIT_SUCCESS +# include /* INFRINGES ON USER NAME SPACE */ + /* Use EXIT_SUCCESS as a witness for stdlib.h. */ +# ifndef EXIT_SUCCESS +# define EXIT_SUCCESS 0 +# endif +# endif +# endif +# endif +# endif + +# ifdef YYSTACK_ALLOC + /* Pacify GCC's 'empty if-body' warning. */ +# define YYSTACK_FREE(Ptr) do { /* empty */; } while (0) +# ifndef YYSTACK_ALLOC_MAXIMUM + /* The OS might guarantee only one guard page at the bottom of the stack, + and a page size can be as small as 4096 bytes. So we cannot safely + invoke alloca (N) if N exceeds 4096. Use a slightly smaller number + to allow for a few compiler-allocated temporary stack slots. */ +# define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */ +# endif +# else +# define YYSTACK_ALLOC YYMALLOC +# define YYSTACK_FREE YYFREE +# ifndef YYSTACK_ALLOC_MAXIMUM +# define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM +# endif +# if (defined __cplusplus && ! defined EXIT_SUCCESS \ + && ! 
((defined YYMALLOC || defined malloc) \ + && (defined YYFREE || defined free))) +# include /* INFRINGES ON USER NAME SPACE */ +# ifndef EXIT_SUCCESS +# define EXIT_SUCCESS 0 +# endif +# endif +# ifndef YYMALLOC +# define YYMALLOC malloc +# if ! defined malloc && ! defined EXIT_SUCCESS +void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# ifndef YYFREE +# define YYFREE free +# if ! defined free && ! defined EXIT_SUCCESS +void free (void *); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# endif +#endif /* !defined yyoverflow */ + +#if (! defined yyoverflow \ + && (! defined __cplusplus \ + || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL))) + +/* A type that is properly aligned for any stack member. */ +union yyalloc +{ + yy_state_t yyss_alloc; + YYSTYPE yyvs_alloc; +}; + +/* The size of the maximum gap between one aligned stack and the next. */ +# define YYSTACK_GAP_MAXIMUM (YYSIZEOF (union yyalloc) - 1) + +/* The size of an array large to enough to hold all stacks, each with + N elements. */ +# define YYSTACK_BYTES(N) \ + ((N) * (YYSIZEOF (yy_state_t) + YYSIZEOF (YYSTYPE)) \ + + YYSTACK_GAP_MAXIMUM) + +# define YYCOPY_NEEDED 1 + +/* Relocate STACK from its old location to the new one. The + local variables YYSIZE and YYSTACKSIZE give the old and new number of + elements in the stack, and YYPTR gives the new location of the + stack. Advance YYPTR to a properly aligned location for the next + stack. */ +# define YYSTACK_RELOCATE(Stack_alloc, Stack) \ + do \ + { \ + YYPTRDIFF_T yynewbytes; \ + YYCOPY (&yyptr->Stack_alloc, Stack, yysize); \ + Stack = &yyptr->Stack_alloc; \ + yynewbytes = yystacksize * YYSIZEOF (*Stack) + YYSTACK_GAP_MAXIMUM; \ + yyptr += yynewbytes / YYSIZEOF (*yyptr); \ + } \ + while (0) + +#endif + +#if defined YYCOPY_NEEDED && YYCOPY_NEEDED +/* Copy COUNT objects from SRC to DST. The source and destination do + not overlap. 
*/ +# ifndef YYCOPY +# if defined __GNUC__ && 1 < __GNUC__ +# define YYCOPY(Dst, Src, Count) \ + __builtin_memcpy (Dst, Src, YY_CAST (YYSIZE_T, (Count)) * sizeof (*(Src))) +# else +# define YYCOPY(Dst, Src, Count) \ + do \ + { \ + YYPTRDIFF_T yyi; \ + for (yyi = 0; yyi < (Count); yyi++) \ + (Dst)[yyi] = (Src)[yyi]; \ + } \ + while (0) +# endif +# endif +#endif /* !YYCOPY_NEEDED */ + +/* YYFINAL -- State number of the termination state. */ +#define YYFINAL 26 +/* YYLAST -- Last index in YYTABLE. */ +#define YYLAST 52 + +/* YYNTOKENS -- Number of terminals. */ +#define YYNTOKENS 39 +/* YYNNTS -- Number of nonterminals. */ +#define YYNNTS 25 +/* YYNRULES -- Number of rules. */ +#define YYNRULES 57 +/* YYNSTATES -- Number of states. */ +#define YYNSTATES 78 + +/* YYMAXUTOK -- Last valid token kind. */ +#define YYMAXUTOK 288 + + +/* YYTRANSLATE(TOKEN-NUM) -- Symbol number corresponding to TOKEN-NUM + as returned by yylex, with out-of-bounds checking. */ +#define YYTRANSLATE(YYX) \ + (0 <= (YYX) && (YYX) <= YYMAXUTOK \ + ? YY_CAST (yysymbol_kind_t, yytranslate[YYX]) \ + : YYSYMBOL_YYUNDEF) + +/* YYTRANSLATE[TOKEN-NUM] -- Symbol number corresponding to TOKEN-NUM + as returned by yylex. 
*/
+static const yytype_int8 yytranslate[] = /* Indexed by raw token code 0..YYMAXUTOK; codes with no grammar token map to 2, which yytname names "invalid token". */
+{
+       0,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+      36,    37,     2,     2,    38,     2,    35,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,    34,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     1,     2,     3,     4,
+       5,     6,     7,     8,     9,    10,    11,    12,    13,    14,
+      15,    16,    17,    18,    19,    20,    21,    22,    23,    24,
+      25,    26,    27,    28,    29,    30,    31,    32,    33
+};
+
+#if YYDEBUG
+  /* YYRLINE[YYN] -- Source line where rule number YYN was defined. Read only by yy_reduce_print for the debug trace. */
+static const yytype_int16 yyrline[] =
+{
+       0,   107,   107,   113,   114,   118,   119,   120,   121,   122,
+     123,   124,   125,   132,   142,   149,   150,   160,   169,   172,
+     176,   181,   186,   191,   196,   201,   206,   211,   216,   221,
+     230,   241,   255,   258,   262,   267,   272,   277,   286,   294,
+     308,   323,   338,   355,   356,   360,   361,   365,   368,   372,
+     380,   385,   386,   390,   394,   401,   408,   409
+};
+#endif
+
+/** Accessing symbol of state STATE. */
+#define YY_ACCESSING_SYMBOL(State) YY_CAST (yysymbol_kind_t, yystos[State])
+
+#if YYDEBUG || 0
+/* The user-facing name of the symbol whose (internal) number is
+   YYSYMBOL. No bounds checking. */
+static const char *yysymbol_name (yysymbol_kind_t yysymbol) YY_ATTRIBUTE_UNUSED;
+
+/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM.
+   First, the terminals, then, starting at YYNTOKENS, nonterminals. The list ends with a YY_NULLPTR sentinel. */
+static const char *const yytname[] =
+{
+  "\"end of file\"", "error", "\"invalid token\"", "SCONST", "IDENT",
+  "UCONST", "RECPTR", "K_BASE_BACKUP", "K_IDENTIFY_SYSTEM", "K_SHOW",
+  "K_START_REPLICATION", "K_CREATE_REPLICATION_SLOT",
+  "K_DROP_REPLICATION_SLOT", "K_TIMELINE_HISTORY", "K_LABEL", "K_PROGRESS",
+  "K_FAST", "K_WAIT", "K_NOWAIT", "K_MAX_RATE", "K_WAL",
+  "K_TABLESPACE_MAP", "K_NOVERIFY_CHECKSUMS", "K_TIMELINE", "K_PHYSICAL",
+  "K_LOGICAL", "K_SLOT", "K_RESERVE_WAL", "K_TEMPORARY",
+  "K_EXPORT_SNAPSHOT", "K_NOEXPORT_SNAPSHOT", "K_USE_SNAPSHOT",
+  "K_MANIFEST", "K_MANIFEST_CHECKSUMS", "';'", "'.'", "'('", "')'", "','",
+  "$accept", "firstcmd", "opt_semicolon", "command", "identify_system",
+  "show", "var_name", "base_backup", "base_backup_opt_list",
+  "base_backup_opt", "create_replication_slot", "create_slot_opt_list",
+  "create_slot_opt", "drop_replication_slot", "start_replication",
+  "start_logical_replication", "timeline_history", "opt_physical",
+  "opt_temporary", "opt_slot", "opt_timeline", "plugin_options",
+  "plugin_opt_list", "plugin_opt_elem", "plugin_opt_arg", YY_NULLPTR
+};
+
+static const char *
+yysymbol_name (yysymbol_kind_t yysymbol)
+{
+  return yytname[yysymbol]; /* direct table lookup; the caller must pass a valid symbol number */
+}
+#endif
+
+#ifdef YYPRINT
+/* YYTOKNUM[NUM] -- (External) token number corresponding to the
+   (internal) symbol number NUM (which must be that of a token). */
+static const yytype_int16 yytoknum[] =
+{
+       0,   256,   257,   258,   259,   260,   261,   262,   263,   264,
+     265,   266,   267,   268,   269,   270,   271,   272,   273,   274,
+     275,   276,   277,   278,   279,   280,   281,   282,   283,   284,
+     285,   286,   287,   288,    59,    46,    40,    41,    44
+};
+#endif
+
+#define YYPACT_NINF (-25) /* yypact entry meaning: take the default action from yydefact without reading a lookahead (see yybackup in yyparse) */
+
+#define yypact_value_is_default(Yyn) \
+  ((Yyn) == YYPACT_NINF)
+
+#define YYTABLE_NINF (-1)
+
+#define yytable_value_is_error(Yyn) \
+  0 /* constant 0: the explicit-error branch in yybackup is never taken for this grammar */
+
+  /* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing
+     STATE-NUM. */
+static const yytype_int8 yypact[] =
+{
+       2,   -25,   -25,    -1,    -5,    23,    24,    25,    29,    -3,
+     -25,   -25,   -25,   -25,   -25,   -25,   -25,   -25,   -14,   -25,
+      -2,    28,    10,     7,    19,   -25,   -25,   -25,   -25,    34,
+     -25,   -25,   -25,    33,   -25,   -25,   -25,    36,    37,   -25,
+      38,    16,   -25,    39,   -25,    -8,   -25,   -25,   -25,   -25,
+     -25,   -25,    40,    20,   -25,    43,     8,    44,   -25,    -7,
+     -25,    46,   -25,   -25,   -25,   -25,   -25,   -25,   -25,    -7,
+      45,   -12,   -25,   -25,   -25,   -25,    46,   -25
+};
+
+  /* YYDEFACT[STATE-NUM] -- Default reduction number in state STATE-NUM.
+     Performed when YYTABLE does not specify something else to do. Zero
+     means the default is an error. */
+static const yytype_int8 yydefact[] =
+{
+       0,    19,    13,     0,    48,     0,     0,     0,     0,     4,
+       5,    12,     6,     9,    10,     7,     8,    11,    17,    15,
+      14,     0,    44,    46,    38,    42,     1,     3,     2,     0,
+      21,    22,    24,     0,    23,    26,    27,     0,     0,    18,
+       0,    47,    43,     0,    45,     0,    39,    20,    25,    28,
+      29,    16,     0,    50,    33,     0,    52,     0,    40,    30,
+      33,     0,    41,    49,    37,    34,    35,    36,    32,    31,
+      57,     0,    53,    56,    55,    51,     0,    54
+};
+
+  /* YYPGOTO[NTERM-NUM]. Added to the uncovered state number to index YYTABLE after a reduction (see the end of yyreduce). */
+static const yytype_int8 yypgoto[] =
+{
+     -25,   -25,   -25,   -25,   -25,   -25,   -25,   -25,   -25,   -25,
+     -25,    -9,   -25,   -25,   -25,   -25,   -25,   -25,   -25,   -25,
+     -25,   -25,   -25,   -24,   -25
+};
+
+  /* YYDEFGOTO[NTERM-NUM]. State entered after a reduction when the YYPGOTO/YYCHECK lookup does not match. */
+static const yytype_int8 yydefgoto[] =
+{
+       0,     8,    28,     9,    10,    11,    20,    12,    18,    39,
+      13,    59,    68,    14,    15,    16,    17,    43,    45,    22,
+      58,    62,    71,    72,    74
+};
+
+  /* YYTABLE[YYPACT[STATE-NUM]] -- What to do in state STATE-NUM. If
+     positive, shift that token. If negative, reduce the rule whose
+     number is the opposite. If YYTABLE_NINF, syntax error. */
+static const yytype_int8 yytable[] =
+{
+      29,    30,    31,    19,    32,    33,    34,    35,    36,     1,
+       2,     3,     4,     5,     6,     7,    54,    55,    37,    38,
+      64,    21,    65,    66,    67,    75,    76,    23,    24,    26,
+      25,    27,    41,    40,    42,    44,    46,    47,    48,    49,
+      50,    52,    51,    57,    61,    53,    56,    60,    73,    63,
+      70,    69,    77
+};
+
+static const yytype_int8 yycheck[] = /* Guard for yytable: entry i applies only when yycheck[i] equals the token (action lookup) or the state (goto lookup) being looked up. */
+{
+      14,    15,    16,     4,    18,    19,    20,    21,    22,     7,
+       8,     9,    10,    11,    12,    13,    24,    25,    32,    33,
+      27,    26,    29,    30,    31,    37,    38,     4,     4,     0,
+       5,    34,     4,    35,    24,    28,    17,     3,     5,     3,
+       3,    25,     4,    23,    36,     6,     6,     4,     3,     5,
+       4,    60,    76
+};
+
+  /* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
+     symbol of state STATE-NUM. */
+static const yytype_int8 yystos[] =
+{
+       0,     7,     8,     9,    10,    11,    12,    13,    40,    42,
+      43,    44,    46,    49,    52,    53,    54,    55,    47,     4,
+      45,    26,    58,     4,     4,     5,     0,    34,    41,    14,
+      15,    16,    18,    19,    20,    21,    22,    32,    33,    48,
+      35,     4,    24,    56,    28,    57,    17,     3,     5,     3,
+       3,     4,    25,     6,    24,    25,     6,    23,    59,    50,
+       4,    36,    60,     5,    27,    29,    30,    31,    51,    50,
+       4,    61,    62,     3,    63,    37,    38,    62
+};
+
+  /* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. */
+static const yytype_int8 yyr1[] =
+{
+       0,    39,    40,    41,    41,    42,    42,    42,    42,    42,
+      42,    42,    42,    43,    44,    45,    45,    46,    47,    47,
+      48,    48,    48,    48,    48,    48,    48,    48,    48,    48,
+      49,    49,    50,    50,    51,    51,    51,    51,    52,    52,
+      53,    54,    55,    56,    56,    57,    57,    58,    58,    59,
+      59,    60,    60,    61,    61,    62,    63,    63
+};
+
+  /* YYR2[YYN] -- Number of symbols on the right hand side of rule YYN. 
*/ +static const yytype_int8 yyr2[] = +{ + 0, 2, 2, 1, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 2, 1, 3, 2, 2, 0, + 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, + 5, 6, 2, 0, 1, 1, 1, 1, 2, 3, + 5, 6, 2, 1, 0, 1, 0, 2, 0, 2, + 0, 3, 0, 1, 3, 2, 1, 0 +}; + + +enum { YYENOMEM = -2 }; + +#define yyerrok (yyerrstatus = 0) +#define yyclearin (yychar = YYEMPTY) + +#define YYACCEPT goto yyacceptlab +#define YYABORT goto yyabortlab +#define YYERROR goto yyerrorlab + + +#define YYRECOVERING() (!!yyerrstatus) + +#define YYBACKUP(Token, Value) \ + do \ + if (yychar == YYEMPTY) \ + { \ + yychar = (Token); \ + yylval = (Value); \ + YYPOPSTACK (yylen); \ + yystate = *yyssp; \ + goto yybackup; \ + } \ + else \ + { \ + yyerror (YY_("syntax error: cannot back up")); \ + YYERROR; \ + } \ + while (0) + +/* Backward compatibility with an undocumented macro. + Use YYerror or YYUNDEF. */ +#define YYERRCODE YYUNDEF + + +/* Enable debugging if requested. */ +#if YYDEBUG + +# ifndef YYFPRINTF +# include /* INFRINGES ON USER NAME SPACE */ +# define YYFPRINTF fprintf +# endif + +# define YYDPRINTF(Args) \ +do { \ + if (yydebug) \ + YYFPRINTF Args; \ +} while (0) + +/* This macro is provided for backward compatibility. */ +# ifndef YY_LOCATION_PRINT +# define YY_LOCATION_PRINT(File, Loc) ((void) 0) +# endif + + +# define YY_SYMBOL_PRINT(Title, Kind, Value, Location) \ +do { \ + if (yydebug) \ + { \ + YYFPRINTF (stderr, "%s ", Title); \ + yy_symbol_print (stderr, \ + Kind, Value); \ + YYFPRINTF (stderr, "\n"); \ + } \ +} while (0) + + +/*-----------------------------------. +| Print this symbol's value on YYO. 
|
+`-----------------------------------*/
+
+static void
+yy_symbol_value_print (FILE *yyo,
+                       yysymbol_kind_t yykind, YYSTYPE const * const yyvaluep)
+{ /* Debug-trace helper: output is produced only through the optional YYPRINT hook, and only for tokens (yykind < YYNTOKENS). */
+  FILE *yyoutput = yyo;
+  YY_USE (yyoutput);
+  if (!yyvaluep)
+    return;
+# ifdef YYPRINT
+  if (yykind < YYNTOKENS)
+    YYPRINT (yyo, yytoknum[yykind], *yyvaluep);
+# endif
+  YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
+  YY_USE (yykind);
+  YY_IGNORE_MAYBE_UNINITIALIZED_END
+}
+
+
+/*---------------------------.
+| Print this symbol on YYO.  | Format: token-or-nterm, symbol name, then the value in parentheses.
+`---------------------------*/
+
+static void
+yy_symbol_print (FILE *yyo,
+                 yysymbol_kind_t yykind, YYSTYPE const * const yyvaluep)
+{
+  YYFPRINTF (yyo, "%s %s (",
+             yykind < YYNTOKENS ? "token" : "nterm", yysymbol_name (yykind));
+
+  yy_symbol_value_print (yyo, yykind, yyvaluep);
+  YYFPRINTF (yyo, ")");
+}
+
+/*------------------------------------------------------------------.
+| yy_stack_print -- Print the state stack from its BOTTOM up to its |
+| TOP (included).                                                   |
+`------------------------------------------------------------------*/
+
+static void
+yy_stack_print (yy_state_t *yybottom, yy_state_t *yytop)
+{
+  YYFPRINTF (stderr, "Stack now");
+  for (; yybottom <= yytop; yybottom++)
+    {
+      int yybot = *yybottom;
+      YYFPRINTF (stderr, " %d", yybot);
+    }
+  YYFPRINTF (stderr, "\n");
+}
+
+# define YY_STACK_PRINT(Bottom, Top)                            \
+do {                                                            \
+  if (yydebug)                                                  \
+    yy_stack_print ((Bottom), (Top));                           \
+} while (0)
+
+
+/*------------------------------------------------.
+| Report that the YYRULE is going to be reduced.  |
+`------------------------------------------------*/
+
+static void
+yy_reduce_print (yy_state_t *yyssp, YYSTYPE *yyvsp,
+                 int yyrule)
+{
+  int yylno = yyrline[yyrule];
+  int yynrhs = yyr2[yyrule];
+  int yyi;
+  YYFPRINTF (stderr, "Reducing stack by rule %d (line %d):\n",
+             yyrule - 1, yylno); /* the trace shows the internal rule number minus one */
+  /* The symbols being reduced. */
+  for (yyi = 0; yyi < yynrhs; yyi++)
+    {
+      YYFPRINTF (stderr, " $%d = ", yyi + 1);
+      yy_symbol_print (stderr,
+                       YY_ACCESSING_SYMBOL (+yyssp[yyi + 1 - yynrhs]),
+                       &yyvsp[(yyi + 1) - (yynrhs)]);
+      YYFPRINTF (stderr, "\n");
+    }
+}
+
+# define YY_REDUCE_PRINT(Rule)          \
+do {                                    \
+  if (yydebug)                          \
+    yy_reduce_print (yyssp, yyvsp, Rule); \
+} while (0)
+
+/* Nonzero means print parse trace. It is left uninitialized so that
+   multiple parsers can coexist. */
+int yydebug;
+#else /* !YYDEBUG */
+# define YYDPRINTF(Args) ((void) 0)
+# define YY_SYMBOL_PRINT(Title, Kind, Value, Location)
+# define YY_STACK_PRINT(Bottom, Top)
+# define YY_REDUCE_PRINT(Rule)
+#endif /* !YYDEBUG */
+
+
+/* YYINITDEPTH -- initial size of the parser's stacks. */
+#ifndef YYINITDEPTH
+# define YYINITDEPTH 200
+#endif
+
+/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only
+   if the built-in stack extension method is used).
+
+   Do not make this value too large; the results are undefined if
+   YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH)
+   evaluated with infinite-precision integer arithmetic. */
+
+#ifndef YYMAXDEPTH
+# define YYMAXDEPTH 10000
+#endif
+
+
+
+
+
+
+/*-----------------------------------------------.
+| Release the memory associated to this symbol.  |
+`-----------------------------------------------*/
+
+static void
+yydestruct (const char *yymsg,
+            yysymbol_kind_t yykind, YYSTYPE *yyvaluep)
+{ /* In this parser the body only emits the optional debug trace; no symbol value is freed here. */
+  YY_USE (yyvaluep);
+  if (!yymsg)
+    yymsg = "Deleting";
+  YY_SYMBOL_PRINT (yymsg, yykind, yyvaluep, yylocationp);
+
+  YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
+  YY_USE (yykind);
+  YY_IGNORE_MAYBE_UNINITIALIZED_END
+}
+
+
+/* Lookahead token kind. */
+int yychar;
+
+/* The semantic value of the lookahead symbol. */
+YYSTYPE yylval;
+/* Number of syntax errors so far. */
+int yynerrs;
+
+
+
+
+/*----------------------------------------------------------------------------------------------------------.
+| yyparse -- table-driven parse loop fed by yylex ().  Returns 0 on accept, 1 on abort, 2 on memory exhaustion. |
+`----------------------------------------------------------------------------------------------------------*/
+
+int
+yyparse (void)
+{
+    yy_state_fast_t yystate = 0;
+    /* Number of tokens to shift before error messages enabled. */
+    int yyerrstatus = 0;
+
+    /* Refer to the stacks through separate pointers, to allow yyoverflow
+       to reallocate them elsewhere. */
+
+    /* Their size. */
+    YYPTRDIFF_T yystacksize = YYINITDEPTH;
+
+    /* The state stack: array, bottom, top. */
+    yy_state_t yyssa[YYINITDEPTH];
+    yy_state_t *yyss = yyssa;
+    yy_state_t *yyssp = yyss;
+
+    /* The semantic value stack: array, bottom, top. */
+    YYSTYPE yyvsa[YYINITDEPTH];
+    YYSTYPE *yyvs = yyvsa;
+    YYSTYPE *yyvsp = yyvs;
+
+  int yyn;
+  /* The return value of yyparse. */
+  int yyresult;
+  /* Lookahead symbol kind. */
+  yysymbol_kind_t yytoken = YYSYMBOL_YYEMPTY;
+  /* The variables used to return semantic value and location from the
+     action routines. */
+  YYSTYPE yyval;
+
+
+
+#define YYPOPSTACK(N)   (yyvsp -= (N), yyssp -= (N))
+
+  /* The number of symbols on the RHS of the reduced rule.
+     Keep to zero when no symbol should be popped. */
+  int yylen = 0;
+
+  YYDPRINTF ((stderr, "Starting parse\n"));
+
+  yychar = YYEMPTY; /* Cause a token to be read. */
+  goto yysetstate;
+
+
+/*------------------------------------------------------------.
+| yynewstate -- push a new state, which is found in yystate.  |
+`------------------------------------------------------------*/
+yynewstate:
+  /* In all cases, when you get here, the value and location stacks
+     have just been pushed. So pushing a state here evens the stacks. */
+  yyssp++;
+
+
+/*--------------------------------------------------------------------.
+| yysetstate -- set current state (the top of the stack) to yystate.  |
+`--------------------------------------------------------------------*/
+yysetstate:
+  YYDPRINTF ((stderr, "Entering state %d\n", yystate));
+  YY_ASSERT (0 <= yystate && yystate < YYNSTATES);
+  YY_IGNORE_USELESS_CAST_BEGIN
+  *yyssp = YY_CAST (yy_state_t, yystate);
+  YY_IGNORE_USELESS_CAST_END
+  YY_STACK_PRINT (yyss, yyssp);
+
+  if (yyss + yystacksize - 1 <= yyssp)
+#if !defined yyoverflow && !defined YYSTACK_RELOCATE
+    goto yyexhaustedlab;
+#else
+    {
+      /* Get the current used size of the three stacks, in elements. */
+      YYPTRDIFF_T yysize = yyssp - yyss + 1;
+
+# if defined yyoverflow
+      {
+        /* Give user a chance to reallocate the stack. Use copies of
+           these so that the &'s don't force the real ones into
+           memory. */
+        yy_state_t *yyss1 = yyss;
+        YYSTYPE *yyvs1 = yyvs;
+
+        /* Each stack pointer address is followed by the size of the
+           data in use in that stack, in bytes. This used to be a
+           conditional around just the two extra args, but that might
+           be undefined if yyoverflow is a macro. */
+        yyoverflow (YY_("memory exhausted"),
+                    &yyss1, yysize * YYSIZEOF (*yyssp),
+                    &yyvs1, yysize * YYSIZEOF (*yyvsp),
+                    &yystacksize);
+        yyss = yyss1;
+        yyvs = yyvs1;
+      }
+# else /* defined YYSTACK_RELOCATE */
+      /* Extend the stack our own way: double the size, capped at YYMAXDEPTH. */
+      if (YYMAXDEPTH <= yystacksize)
+        goto yyexhaustedlab;
+      yystacksize *= 2;
+      if (YYMAXDEPTH < yystacksize)
+        yystacksize = YYMAXDEPTH;
+
+      {
+        yy_state_t *yyss1 = yyss;
+        union yyalloc *yyptr =
+          YY_CAST (union yyalloc *,
+                   YYSTACK_ALLOC (YY_CAST (YYSIZE_T, YYSTACK_BYTES (yystacksize))));
+        if (! yyptr)
+          goto yyexhaustedlab;
+        YYSTACK_RELOCATE (yyss_alloc, yyss);
+        YYSTACK_RELOCATE (yyvs_alloc, yyvs);
+#  undef YYSTACK_RELOCATE
+        if (yyss1 != yyssa)
+          YYSTACK_FREE (yyss1);
+      }
+# endif
+
+      yyssp = yyss + yysize - 1;
+      yyvsp = yyvs + yysize - 1;
+
+      YY_IGNORE_USELESS_CAST_BEGIN
+      YYDPRINTF ((stderr, "Stack size increased to %ld\n",
+                  YY_CAST (long, yystacksize)));
+      YY_IGNORE_USELESS_CAST_END
+
+      if (yyss + yystacksize - 1 <= yyssp)
+        YYABORT;
+    }
+#endif /* !defined yyoverflow && !defined YYSTACK_RELOCATE */
+
+  if (yystate == YYFINAL)
+    YYACCEPT;
+
+  goto yybackup;
+
+
+/*-----------.
+| yybackup.  |
+`-----------*/
+yybackup:
+  /* Do appropriate processing given the current state. Read a
+     lookahead token if we need one and don't already have one. */
+
+  /* First try to decide what to do without reference to lookahead token. */
+  yyn = yypact[yystate];
+  if (yypact_value_is_default (yyn))
+    goto yydefault;
+
+  /* Not known => get a lookahead token if don't already have one. */
+
+  /* YYCHAR is either empty, or end-of-input, or a valid lookahead. */
+  if (yychar == YYEMPTY)
+    {
+      YYDPRINTF ((stderr, "Reading a token\n"));
+      yychar = yylex ();
+    }
+
+  if (yychar <= YYEOF)
+    {
+      yychar = YYEOF;
+      yytoken = YYSYMBOL_YYEOF;
+      YYDPRINTF ((stderr, "Now at end of input.\n"));
+    }
+  else if (yychar == YYerror)
+    {
+      /* The scanner already issued an error message, process directly
+         to error recovery. But do not keep the error token as
+         lookahead, it is too special and may lead us to an endless
+         loop in error recovery. */
+      yychar = YYUNDEF;
+      yytoken = YYSYMBOL_YYerror;
+      goto yyerrlab1;
+    }
+  else
+    {
+      yytoken = YYTRANSLATE (yychar);
+      YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc);
+    }
+
+  /* If the proper action on seeing token YYTOKEN is to reduce or to
+     detect an error, take that action. */
+  yyn += yytoken;
+  if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken)
+    goto yydefault;
+  yyn = yytable[yyn];
+  if (yyn <= 0)
+    {
+      if (yytable_value_is_error (yyn))
+        goto yyerrlab;
+      yyn = -yyn;
+      goto yyreduce;
+    }
+
+  /* Count tokens shifted since error; after three, turn off error
+     status. */
+  if (yyerrstatus)
+    yyerrstatus--;
+
+  /* Shift the lookahead token. */
+  YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc);
+  yystate = yyn;
+  YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
+  *++yyvsp = yylval;
+  YY_IGNORE_MAYBE_UNINITIALIZED_END
+
+  /* Discard the shifted token. */
+  yychar = YYEMPTY;
+  goto yynewstate;
+
+
+/*-----------------------------------------------------------.
+| yydefault -- do the default action for the current state.  |
+`-----------------------------------------------------------*/
+yydefault:
+  yyn = yydefact[yystate];
+  if (yyn == 0)
+    goto yyerrlab;
+  goto yyreduce;
+
+
+/*-----------------------------.
+| yyreduce -- do a reduction.  |
+`-----------------------------*/
+yyreduce:
+  /* yyn is the number of a rule to reduce with. */
+  yylen = yyr2[yyn];
+
+  /* If YYLEN is nonzero, implement the default value of the action:
+     '$$ = $1'.
+
+     Otherwise, the following line sets YYVAL to garbage.
+     This behavior is undocumented and Bison
+     users should not rely upon it. Assigning to YYVAL
+     unconditionally makes the parser a bit smaller, and it avoids a
+     GCC warning that YYVAL may be used uninitialized. */
+  yyval = yyvsp[1-yylen];
+
+
+  YY_REDUCE_PRINT (yyn);
+  switch (yyn)
+    {
+  case 2: /* firstcmd: command opt_semicolon */
+#line 108 "repl_gram.y"
+                {
+                    replication_parse_result = (yyvsp[-1].node); /* publish the parsed command node through the file-level result variable */
+                }
+#line 1315 "repl_gram.c"
+    break;
+
+  case 13: /* identify_system: K_IDENTIFY_SYSTEM */
+#line 133 "repl_gram.y"
+                {
+                    (yyval.node) = (Node *) makeNode(IdentifySystemCmd);
+                }
+#line 1323 "repl_gram.c"
+    break;
+
+  case 14: /* show: K_SHOW var_name */
+#line 143 "repl_gram.y"
+                {
+                    VariableShowStmt *n = makeNode(VariableShowStmt);
+                    n->name = (yyvsp[0].str);
+                    (yyval.node) = (Node *) n;
+                }
+#line 1333 "repl_gram.c"
+    break;
+
+  case 15: /* var_name: IDENT */
+#line 149 "repl_gram.y"
+                { (yyval.str) = (yyvsp[0].str); }
+#line 1339 "repl_gram.c"
+    break;
+
+  case 16: /* var_name: var_name '.' IDENT */
+#line 151 "repl_gram.y"
+                { (yyval.str) = psprintf("%s.%s", (yyvsp[-2].str), (yyvsp[0].str)); }
+#line 1345 "repl_gram.c"
+    break;
+
+  case 17: /* base_backup: K_BASE_BACKUP base_backup_opt_list */
+#line 161 "repl_gram.y"
+                {
+                    BaseBackupCmd *cmd = makeNode(BaseBackupCmd);
+                    cmd->options = (yyvsp[0].list);
+                    (yyval.node) = (Node *) cmd;
+                }
+#line 1355 "repl_gram.c"
+    break;
+
+  case 18: /* base_backup_opt_list: base_backup_opt_list base_backup_opt */
+#line 170 "repl_gram.y"
+                { (yyval.list) = lappend((yyvsp[-1].list), (yyvsp[0].defelt)); }
+#line 1361 "repl_gram.c"
+    break;
+
+  case 19: /* base_backup_opt_list: %empty */
+#line 172 "repl_gram.y"
+                { (yyval.list) = NIL; }
+#line 1367 "repl_gram.c"
+    break;
+
+  case 20: /* base_backup_opt: K_LABEL SCONST */
+#line 177 "repl_gram.y"
+                {
+                  (yyval.defelt) = makeDefElem("label",
+                                   (Node *)makeString((yyvsp[0].str)), -1);
+                }
+#line 1376 "repl_gram.c"
+    break;
+
+  case 21: /* base_backup_opt: K_PROGRESS */
+#line 182 "repl_gram.y"
+                {
+                  (yyval.defelt) = makeDefElem("progress",
+                                   (Node *)makeInteger(true), -1);
+                }
+#line 1385 "repl_gram.c"
+    break;
+
+  case 22: /* base_backup_opt: K_FAST */
+#line 187 "repl_gram.y"
+                {
+                  (yyval.defelt) = makeDefElem("fast",
+                                   (Node *)makeInteger(true), -1);
+                }
+#line 1394 "repl_gram.c"
+    break;
+
+  case 23: /* base_backup_opt: K_WAL */
+#line 192 "repl_gram.y"
+                {
+                  (yyval.defelt) = makeDefElem("wal",
+                                   (Node *)makeInteger(true), -1);
+                }
+#line 1403 "repl_gram.c"
+    break;
+
+  case 24: /* base_backup_opt: K_NOWAIT */
+#line 197 "repl_gram.y"
+                {
+                  (yyval.defelt) = makeDefElem("nowait",
+                                   (Node *)makeInteger(true), -1);
+                }
+#line 1412 "repl_gram.c"
+    break;
+
+  case 25: /* base_backup_opt: K_MAX_RATE UCONST */
+#line 202 "repl_gram.y"
+                {
+                  (yyval.defelt) = makeDefElem("max_rate",
+                                   (Node *)makeInteger((yyvsp[0].uintval)), -1);
+                }
+#line 1421 "repl_gram.c"
+    break;
+
+  case 26: /* base_backup_opt: K_TABLESPACE_MAP */
+#line 207 "repl_gram.y"
+                {
+                  (yyval.defelt) = makeDefElem("tablespace_map",
+                                   (Node *)makeInteger(true), -1);
+                }
+#line 1430 "repl_gram.c"
+    break;
+
+  case 27: /* base_backup_opt: K_NOVERIFY_CHECKSUMS */
+#line 212 "repl_gram.y"
+                {
+                  (yyval.defelt) = makeDefElem("noverify_checksums",
+                                   (Node *)makeInteger(true), -1);
+                }
+#line 1439 "repl_gram.c"
+    break;
+
+  case 28: /* base_backup_opt: K_MANIFEST SCONST */
+#line 217 "repl_gram.y"
+                {
+                  (yyval.defelt) = makeDefElem("manifest",
+                                   (Node *)makeString((yyvsp[0].str)), -1);
+                }
+#line 1448 "repl_gram.c"
+    break;
+
+  case 29: /* base_backup_opt: K_MANIFEST_CHECKSUMS SCONST */
+#line 222 "repl_gram.y"
+                {
+                  (yyval.defelt) = makeDefElem("manifest_checksums",
+                                   (Node *)makeString((yyvsp[0].str)), -1);
+                }
+#line 1457 "repl_gram.c"
+    break;
+
+  case 30: /* create_replication_slot: K_CREATE_REPLICATION_SLOT IDENT opt_temporary K_PHYSICAL create_slot_opt_list */
+#line 231 "repl_gram.y"
+                {
+                    CreateReplicationSlotCmd *cmd;
+                    cmd = makeNode(CreateReplicationSlotCmd);
+                    cmd->kind = REPLICATION_KIND_PHYSICAL;
+                    cmd->slotname = (yyvsp[-3].str);
+                    cmd->temporary = (yyvsp[-2].boolval);
+                    cmd->options = (yyvsp[0].list);
+                    (yyval.node) = (Node *) cmd;
+                }
+#line 1471 "repl_gram.c"
+    break;
+
+  case 31: /* create_replication_slot: K_CREATE_REPLICATION_SLOT IDENT opt_temporary K_LOGICAL IDENT create_slot_opt_list */
+#line 242 "repl_gram.y"
+                {
+                    CreateReplicationSlotCmd *cmd;
+                    cmd = makeNode(CreateReplicationSlotCmd);
+                    cmd->kind = REPLICATION_KIND_LOGICAL;
+                    cmd->slotname = (yyvsp[-4].str);
+                    cmd->temporary = (yyvsp[-3].boolval);
+                    cmd->plugin = (yyvsp[-1].str);
+                    cmd->options = (yyvsp[0].list);
+                    (yyval.node) = (Node *) cmd;
+                }
+#line 1486 "repl_gram.c"
+    break;
+
+  case 32: /* create_slot_opt_list: create_slot_opt_list create_slot_opt */
+#line 256 "repl_gram.y"
+                { (yyval.list) = lappend((yyvsp[-1].list), (yyvsp[0].defelt)); }
+#line 1492 "repl_gram.c"
+    break;
+
+  case 33: /* create_slot_opt_list: %empty */
+#line 258 "repl_gram.y"
+                { (yyval.list) = NIL; }
+#line 1498 "repl_gram.c"
+    break;
+
+  case 34: /* create_slot_opt: K_EXPORT_SNAPSHOT */
+#line 263 "repl_gram.y"
+                {
+                  (yyval.defelt) = makeDefElem("export_snapshot",
+                                   (Node *)makeInteger(true), -1);
+                }
+#line 1507 "repl_gram.c"
+    break;
+
+  case 35: /* create_slot_opt: K_NOEXPORT_SNAPSHOT */
+#line 268 "repl_gram.y"
+                {
+                  (yyval.defelt) = makeDefElem("export_snapshot",
+                                   (Node *)makeInteger(false), -1);
+                }
+#line 1516 "repl_gram.c"
+    break;
+
+  case 36: /* create_slot_opt: K_USE_SNAPSHOT */
+#line 273 "repl_gram.y"
+                {
+                  (yyval.defelt) = makeDefElem("use_snapshot",
+                                   (Node *)makeInteger(true), -1);
+                }
+#line 1525 "repl_gram.c"
+    break;
+
+  case 37: /* create_slot_opt: K_RESERVE_WAL */
+#line 278 "repl_gram.y"
+                {
+                  (yyval.defelt) = makeDefElem("reserve_wal",
+                                   (Node *)makeInteger(true), -1);
+                }
+#line 1534 "repl_gram.c"
+    break;
+
+  case 38: /* drop_replication_slot: K_DROP_REPLICATION_SLOT IDENT */
+#line 287 "repl_gram.y"
+                {
+                    DropReplicationSlotCmd *cmd;
+                    cmd = makeNode(DropReplicationSlotCmd);
+                    cmd->slotname = (yyvsp[0].str);
+                    cmd->wait = false;
+                    (yyval.node) = (Node *) cmd;
+                }
+#line 1546 "repl_gram.c"
+    break;
+
+  case 39: /* drop_replication_slot: K_DROP_REPLICATION_SLOT IDENT K_WAIT */
+#line 295 "repl_gram.y"
+                {
+                    DropReplicationSlotCmd *cmd;
+                    cmd = makeNode(DropReplicationSlotCmd);
+                    cmd->slotname = (yyvsp[-1].str);
+                    cmd->wait = true;
+                    (yyval.node) = (Node *) cmd;
+                }
+#line 1558 "repl_gram.c"
+    break;
+
+  case 40: /* start_replication: K_START_REPLICATION opt_slot opt_physical RECPTR opt_timeline */
+#line 309 "repl_gram.y"
+                {
+                    StartReplicationCmd *cmd;
+
+                    cmd = makeNode(StartReplicationCmd);
+                    cmd->kind = REPLICATION_KIND_PHYSICAL;
+                    cmd->slotname = (yyvsp[-3].str);
+                    cmd->startpoint = (yyvsp[-1].recptr);
+                    cmd->timeline = (yyvsp[0].uintval);
+                    (yyval.node) = (Node *) cmd;
+                }
+#line 1573 "repl_gram.c"
+    break;
+
+  case 41: /* start_logical_replication: K_START_REPLICATION K_SLOT IDENT K_LOGICAL RECPTR plugin_options */
+#line 324 "repl_gram.y"
+                {
+                    StartReplicationCmd *cmd;
+                    cmd = makeNode(StartReplicationCmd);
+                    cmd->kind = REPLICATION_KIND_LOGICAL;
+                    cmd->slotname = (yyvsp[-3].str);
+                    cmd->startpoint = (yyvsp[-1].recptr);
+                    cmd->options = (yyvsp[0].list);
+                    (yyval.node) = (Node *) cmd;
+                }
+#line 1587 "repl_gram.c"
+    break;
+
+  case 42: /* timeline_history: K_TIMELINE_HISTORY UCONST */
+#line 339 "repl_gram.y"
+                {
+                    TimeLineHistoryCmd *cmd;
+
+                    if ((yyvsp[0].uintval) <= 0) /* NOTE(review): if uintval is unsigned this test can only catch 0 -- confirm against the %union in repl_gram.y */
+                        ereport(ERROR,
+                                (errcode(ERRCODE_SYNTAX_ERROR),
+                                 errmsg("invalid timeline %u", (yyvsp[0].uintval))));
+
+                    cmd = makeNode(TimeLineHistoryCmd);
+                    cmd->timeline = (yyvsp[0].uintval);
+
+                    (yyval.node) = (Node *) cmd;
+                }
+#line 1605 "repl_gram.c"
+    break;
+
+  case 45: /* opt_temporary: K_TEMPORARY */
+#line 360 "repl_gram.y"
+                              { (yyval.boolval) = true; }
+#line 1611 "repl_gram.c"
+    break;
+
+  case 46: /* opt_temporary: %empty */
+#line 361 "repl_gram.y"
+                              { (yyval.boolval) = false; }
+#line 1617 "repl_gram.c"
+    break;
+
+  case 47: /* opt_slot: K_SLOT IDENT */
+#line 366 "repl_gram.y"
+                { (yyval.str) = (yyvsp[0].str); }
+#line 1623 "repl_gram.c"
+    break;
+
+  case 48: /* opt_slot: %empty */
+#line 368 "repl_gram.y"
+                { (yyval.str) = NULL; }
+#line 1629 "repl_gram.c"
+    break;
+
+  case 49: /* opt_timeline: K_TIMELINE UCONST */
+#line 373 "repl_gram.y"
+                {
+                    if ((yyvsp[0].uintval) <= 0) /* same check as in rule 42 (timeline_history) */
+                        ereport(ERROR,
+                                (errcode(ERRCODE_SYNTAX_ERROR),
+                                 errmsg("invalid timeline %u", (yyvsp[0].uintval))));
+                    (yyval.uintval) = (yyvsp[0].uintval);
+                }
+#line 1641 "repl_gram.c"
+    break;
+
+  case 50: /* opt_timeline: %empty */
+#line 380 "repl_gram.y"
+                { (yyval.uintval) = 0; }
+#line 1647 "repl_gram.c"
+    break;
+
+  case 51: /* plugin_options: '(' plugin_opt_list ')' */
+#line 385 "repl_gram.y"
+                { (yyval.list) = (yyvsp[-1].list); }
+#line 1653 "repl_gram.c"
+    break;
+
+  case 52: /* plugin_options: %empty */
+#line 386 "repl_gram.y"
+                { (yyval.list) = NIL; }
+#line 1659 "repl_gram.c"
+    break;
+
+  case 53: /* plugin_opt_list: plugin_opt_elem */
+#line 391 "repl_gram.y"
+                {
+                    (yyval.list) = list_make1((yyvsp[0].defelt));
+                }
+#line 1667 "repl_gram.c"
+    break;
+
+  case 54: /* plugin_opt_list: plugin_opt_list ',' plugin_opt_elem */
+#line 395 "repl_gram.y"
+                {
+                    (yyval.list) = lappend((yyvsp[-2].list), (yyvsp[0].defelt));
+                }
+#line 1675 "repl_gram.c"
+    break;
+
+  case 55: /* plugin_opt_elem: IDENT plugin_opt_arg */
+#line 402 "repl_gram.y"
+                {
+                    (yyval.defelt) = makeDefElem((yyvsp[-1].str), (yyvsp[0].node), -1);
+                }
+#line 1683 "repl_gram.c"
+    break;
+
+  case 56: /* plugin_opt_arg: SCONST */
+#line 408 "repl_gram.y"
+                { (yyval.node) = (Node *) makeString((yyvsp[0].str)); }
+#line 1689 "repl_gram.c"
+    break;
+
+  case 57: /* plugin_opt_arg: %empty */
+#line 409 "repl_gram.y"
+                { (yyval.node) = NULL; }
+#line 1695 "repl_gram.c"
+    break;
+
+
+#line 1699 "repl_gram.c"
+
+      default: break; /* rules with no case above keep the default value yyval = yyvsp[1-yylen] assigned before the switch */
+    }
+  /* User semantic actions sometimes alter yychar, and that requires
+     that yytoken be updated with the new translation. We take the
+     approach of translating immediately before every use of yytoken.
+     One alternative is translating here after every semantic action,
+     but that translation would be missed if the semantic action invokes
+     YYABORT, YYACCEPT, or YYERROR immediately after altering yychar or
+     if it invokes YYBACKUP. In the case of YYABORT or YYACCEPT, an
+     incorrect destructor might then be invoked immediately. In the
+     case of YYERROR or YYBACKUP, subsequent parser actions might lead
+     to an incorrect destructor call or verbose syntax error message
+     before the lookahead is translated. */
+  YY_SYMBOL_PRINT ("-> $$ =", YY_CAST (yysymbol_kind_t, yyr1[yyn]), &yyval, &yyloc);
+
+  YYPOPSTACK (yylen);
+  yylen = 0;
+
+  *++yyvsp = yyval;
+
+  /* Now 'shift' the result of the reduction. Determine what state
+     that goes to, based on the state we popped back to and the rule
+     number reduced by. */
+  {
+    const int yylhs = yyr1[yyn] - YYNTOKENS;
+    const int yyi = yypgoto[yylhs] + *yyssp;
+    yystate = (0 <= yyi && yyi <= YYLAST && yycheck[yyi] == *yyssp
+               ? yytable[yyi]
+               : yydefgoto[yylhs]);
+  }
+
+  goto yynewstate;
+
+
+/*--------------------------------------.
+| yyerrlab -- here on detecting error.  |
+`--------------------------------------*/
+yyerrlab:
+  /* Make sure we have latest lookahead translation. See comments at
+     user semantic actions for why this is necessary. */
+  yytoken = yychar == YYEMPTY ? YYSYMBOL_YYEMPTY : YYTRANSLATE (yychar);
+  /* If not already recovering from an error, report this error. */
+  if (!yyerrstatus)
+    {
+      ++yynerrs;
+      yyerror (YY_("syntax error"));
+    }
+
+  if (yyerrstatus == 3)
+    {
+      /* If just tried and failed to reuse lookahead token after an
+         error, discard it. */
+
+      if (yychar <= YYEOF)
+        {
+          /* Return failure if at end of input. */
+          if (yychar == YYEOF)
+            YYABORT;
+        }
+      else
+        {
+          yydestruct ("Error: discarding",
+                      yytoken, &yylval);
+          yychar = YYEMPTY;
+        }
+    }
+
+  /* Else will try to reuse lookahead token after shifting the error
+     token. */
+  goto yyerrlab1;
+
+
+/*---------------------------------------------------.
+| yyerrorlab -- error raised explicitly by YYERROR.  |
+`---------------------------------------------------*/
+yyerrorlab:
+  /* Pacify compilers when the user code never invokes YYERROR and the
+     label yyerrorlab therefore never appears in user code. */
+  if (0)
+    YYERROR;
+
+  /* Do not reclaim the symbols of the rule whose action triggered
+     this YYERROR. */
+  YYPOPSTACK (yylen);
+  yylen = 0;
+  YY_STACK_PRINT (yyss, yyssp);
+  yystate = *yyssp;
+  goto yyerrlab1;
+
+
+/*-------------------------------------------------------------.
+| yyerrlab1 -- common code for both syntax error and YYERROR.  |
+`-------------------------------------------------------------*/
+yyerrlab1:
+  yyerrstatus = 3;      /* Each real token shifted decrements this. */
+
+  /* Pop stack until we find a state that shifts the error token. */
+  for (;;)
+    {
+      yyn = yypact[yystate];
+      if (!yypact_value_is_default (yyn))
+        {
+          yyn += YYSYMBOL_YYerror;
+          if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYSYMBOL_YYerror)
+            {
+              yyn = yytable[yyn];
+              if (0 < yyn)
+                break;
+            }
+        }
+
+      /* Pop the current state because it cannot handle the error token. */
+      if (yyssp == yyss)
+        YYABORT;
+
+
+      yydestruct ("Error: popping",
+                  YY_ACCESSING_SYMBOL (yystate), yyvsp);
+      YYPOPSTACK (1);
+      yystate = *yyssp;
+      YY_STACK_PRINT (yyss, yyssp);
+    }
+
+  YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
+  *++yyvsp = yylval;
+  YY_IGNORE_MAYBE_UNINITIALIZED_END
+
+
+  /* Shift the error token. */
+  YY_SYMBOL_PRINT ("Shifting", YY_ACCESSING_SYMBOL (yyn), yyvsp, yylsp);
+
+  yystate = yyn;
+  goto yynewstate;
+
+
+/*-------------------------------------.
+| yyacceptlab -- YYACCEPT comes here.  |
+`-------------------------------------*/
+yyacceptlab:
+  yyresult = 0;
+  goto yyreturn;
+
+
+/*-----------------------------------.
+| yyabortlab -- YYABORT comes here.  |
+`-----------------------------------*/
+yyabortlab:
+  yyresult = 1;
+  goto yyreturn;
+
+
+#if !defined yyoverflow
+/*-------------------------------------------------.
+| yyexhaustedlab -- memory exhaustion comes here.  |
+`-------------------------------------------------*/
+yyexhaustedlab:
+  yyerror (YY_("memory exhausted"));
+  yyresult = 2;
+  goto yyreturn;
+#endif
+
+
+/*-------------------------------------------------------.
+| yyreturn -- parsing is finished, clean up and return.  |
+`-------------------------------------------------------*/
+yyreturn:
+  if (yychar != YYEMPTY)
+    {
+      /* Make sure we have latest lookahead translation. See comments at
+         user semantic actions for why this is necessary. */
+      yytoken = YYTRANSLATE (yychar);
+      yydestruct ("Cleanup: discarding lookahead",
+                  yytoken, &yylval);
+    }
+  /* Do not reclaim the symbols of the rule whose action triggered
+     this YYABORT or YYACCEPT. */
+  YYPOPSTACK (yylen);
+  YY_STACK_PRINT (yyss, yyssp);
+  while (yyssp != yyss)
+    {
+      yydestruct ("Cleanup: popping",
+                  YY_ACCESSING_SYMBOL (+*yyssp), yyvsp);
+      YYPOPSTACK (1);
+    }
+#ifndef yyoverflow
+  if (yyss != yyssa) /* the stack was moved off the initial automatic array yyssa, so release it */
+    YYSTACK_FREE (yyss);
+#endif
+
+  return yyresult;
+}
+
+#line 412 "repl_gram.y"
+
+
+#include "repl_scanner.c"
diff --git a/src/backend/replication/repl_gram.y b/src/backend/replication/repl_gram.y
new file mode 100644
index 0000000..802c0ad
--- /dev/null
+++ b/src/backend/replication/repl_gram.y
@@ -0,0 +1,414 @@
+%{
+/*-------------------------------------------------------------------------
+ *
+ * repl_gram.y				- Parser for the replication commands
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/replication/repl_gram.y
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/xlogdefs.h"
+#include "nodes/makefuncs.h"
+#include "nodes/replnodes.h"
+#include "replication/walsender.h"
+#include "replication/walsender_private.h"
+
+
+/* Result of the parsing is returned here */
+Node
*replication_parse_result; + + +/* + * Bison doesn't allocate anything that needs to live across parser calls, + * so we can easily have it use palloc instead of malloc. This prevents + * memory leaks if we error out during parsing. Note this only works with + * bison >= 2.0. However, in bison 1.875 the default is to use alloca() + * if possible, so there's not really much problem anyhow, at least if + * you're building with gcc. + */ +#define YYMALLOC palloc +#define YYFREE pfree + +%} + +%expect 0 +%name-prefix="replication_yy" + +%union { + char *str; + bool boolval; + uint32 uintval; + + XLogRecPtr recptr; + Node *node; + List *list; + DefElem *defelt; +} + +/* Non-keyword tokens; the bracketed tag names the %union member holding the value */ +%token <str> SCONST IDENT +%token <uintval> UCONST +%token <recptr> RECPTR + +/* Keyword tokens. */ +%token K_BASE_BACKUP +%token K_IDENTIFY_SYSTEM +%token K_SHOW +%token K_START_REPLICATION +%token K_CREATE_REPLICATION_SLOT +%token K_DROP_REPLICATION_SLOT +%token K_TIMELINE_HISTORY +%token K_LABEL +%token K_PROGRESS +%token K_FAST +%token K_WAIT +%token K_NOWAIT +%token K_MAX_RATE +%token K_WAL +%token K_TABLESPACE_MAP +%token K_NOVERIFY_CHECKSUMS +%token K_TIMELINE +%token K_PHYSICAL +%token K_LOGICAL +%token K_SLOT +%token K_RESERVE_WAL +%token K_TEMPORARY +%token K_EXPORT_SNAPSHOT +%token K_NOEXPORT_SNAPSHOT +%token K_USE_SNAPSHOT +%token K_MANIFEST +%token K_MANIFEST_CHECKSUMS + +%type <node> command +%type <node> base_backup start_replication start_logical_replication + create_replication_slot drop_replication_slot identify_system + timeline_history show +%type <list> base_backup_opt_list +%type <defelt> base_backup_opt +%type <uintval> opt_timeline +%type <list> plugin_options plugin_opt_list +%type <defelt> plugin_opt_elem +%type <node> plugin_opt_arg +%type <str> opt_slot var_name +%type <boolval> opt_temporary +%type <list> create_slot_opt_list +%type <defelt> create_slot_opt + +%% + +firstcmd: command opt_semicolon + { + replication_parse_result = $1; + } + ; + +opt_semicolon: ';' + | /* EMPTY */ + ; + +command: + identify_system + | base_backup + | start_replication + |
start_logical_replication + | create_replication_slot + | drop_replication_slot + | timeline_history + | show + ; + +/* + * IDENTIFY_SYSTEM -- takes no arguments; yields an IdentifySystemCmd node. + */ +identify_system: + K_IDENTIFY_SYSTEM + { + $$ = (Node *) makeNode(IdentifySystemCmd); + } + ; + +/* + * SHOW setting -- yields a VariableShowStmt named by var_name (dot-separated IDENTs). + */ +show: + K_SHOW var_name + { + VariableShowStmt *n = makeNode(VariableShowStmt); + n->name = $2; + $$ = (Node *) n; + } + +var_name: IDENT { $$ = $1; } + | var_name '.' IDENT + { $$ = psprintf("%s.%s", $1, $3); } + ; + +/* + * BASE_BACKUP [LABEL '