diff options
Diffstat (limited to 'src/backend/replication')
41 files changed, 47772 insertions, 0 deletions
diff --git a/src/backend/replication/.gitignore b/src/backend/replication/.gitignore new file mode 100644 index 0000000..77d5a51 --- /dev/null +++ b/src/backend/replication/.gitignore @@ -0,0 +1,6 @@ +/repl_gram.h +/repl_gram.c +/repl_scanner.c +/syncrep_gram.h +/syncrep_gram.c +/syncrep_scanner.c diff --git a/src/backend/replication/Makefile b/src/backend/replication/Makefile new file mode 100644 index 0000000..23f29ba --- /dev/null +++ b/src/backend/replication/Makefile @@ -0,0 +1,53 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for src/backend/replication +# +# IDENTIFICATION +# src/backend/replication/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/replication +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +override CPPFLAGS := -I. -I$(srcdir) $(CPPFLAGS) + +OBJS = \ + repl_gram.o \ + repl_scanner.o \ + slot.o \ + slotfuncs.o \ + syncrep.o \ + syncrep_gram.o \ + syncrep_scanner.o \ + walreceiver.o \ + walreceiverfuncs.o \ + walsender.o + +SUBDIRS = logical + +include $(top_srcdir)/src/backend/common.mk + +# See notes in src/backend/parser/Makefile about the following two rules +repl_gram.h: repl_gram.c + touch $@ + +repl_gram.c: BISONFLAGS += -d + +# Force these dependencies to be known even without dependency info built: +repl_gram.o repl_scanner.o: repl_gram.h + +# See notes in src/backend/parser/Makefile about the following two rules +syncrep_gram.h: syncrep_gram.c + touch $@ + +syncrep_gram.c: BISONFLAGS += -d + +# Force these dependencies to be known even without dependency info built: +syncrep_gram.o syncrep_scanner.o: syncrep_gram.h + +# repl_gram.c, repl_scanner.c, syncrep_gram.c and syncrep_scanner.c +# are in the distribution tarball, so they are not cleaned here. +# (Our parent Makefile takes care of them during maintainer-clean.) diff --git a/src/backend/replication/README b/src/backend/replication/README new file mode 100644 index 0000000..8fcd78d --- /dev/null +++ b/src/backend/replication/README @@ -0,0 +1,76 @@ +src/backend/replication/README + +Walreceiver - libpqwalreceiver API +---------------------------------- + +The transport-specific part of walreceiver, responsible for connecting to +the primary server, receiving WAL files and sending messages, is loaded +dynamically to avoid having to link the main server binary with libpq. +The dynamically loaded module is in libpqwalreceiver subdirectory. + +The dynamically loaded module implements a set of functions with details +about each one of them provided in src/include/replication/walreceiver.h. + +This API should be considered internal at the moment, but we could open it +up for 3rd party replacements of libpqwalreceiver in the future, allowing +pluggable methods for receiving WAL. + +Walreceiver IPC +--------------- + +When the WAL replay in startup process has reached the end of archived WAL, +restorable using restore_command, it starts up the walreceiver process +to fetch more WAL (if streaming replication is configured). + +Walreceiver is a postmaster subprocess, so the startup process can't fork it +directly. Instead, it sends a signal to postmaster, asking postmaster to launch +it. Before that, however, startup process fills in WalRcvData->conninfo +and WalRcvData->slotname, and initializes the starting point in +WalRcvData->receiveStart. + +As walreceiver receives WAL from the primary server, and writes and flushes +it to disk (in pg_wal), it updates WalRcvData->flushedUpto and signals +the startup process to know how far WAL replay can advance. + +Walreceiver sends information about replication progress to the primary server +whenever it either writes or flushes new WAL, or the specified interval elapses. +This is used for reporting purpose. + +Walsender IPC +------------- + +At shutdown, postmaster handles walsender processes differently from regular +backends. It waits for regular backends to die before writing the +shutdown checkpoint and terminating pgarch and other auxiliary processes, but +that's not desirable for walsenders, because we want the standby servers to +receive all the WAL, including the shutdown checkpoint, before the primary +is shut down. Therefore postmaster treats walsenders like the pgarch process, +and instructs them to terminate at PM_SHUTDOWN_2 phase, after all regular +backends have died and checkpointer has issued the shutdown checkpoint. + +When postmaster accepts a connection, it immediately forks a new process +to handle the handshake and authentication, and the process initializes to +become a backend. Postmaster doesn't know if the process becomes a regular +backend or a walsender process at that time - that's indicated in the +connection handshake - so we need some extra signaling to let postmaster +identify walsender processes. + +When walsender process starts up, it marks itself as a walsender process in +the PMSignal array. That way postmaster can tell it apart from regular +backends. + +Note that no big harm is done if postmaster thinks that a walsender is a +regular backend; it will just terminate the walsender earlier in the shutdown +phase. A walsender will look like a regular backend until it's done with the +initialization and has marked itself in PMSignal array, and at process +termination, after unmarking the PMSignal slot. + +Each walsender allocates an entry from the WalSndCtl array, and tracks +information about replication progress. User can monitor them via +statistics views. + + +Walsender - walreceiver protocol +-------------------------------- + +See manual. diff --git a/src/backend/replication/libpqwalreceiver/Makefile b/src/backend/replication/libpqwalreceiver/Makefile new file mode 100644 index 0000000..f26daa1 --- /dev/null +++ b/src/backend/replication/libpqwalreceiver/Makefile @@ -0,0 +1,37 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for src/backend/replication/libpqwalreceiver +# +# IDENTIFICATION +# src/backend/replication/libpqwalreceiver/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/replication/libpqwalreceiver +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +override CPPFLAGS := -I$(srcdir) -I$(libpq_srcdir) $(CPPFLAGS) + +OBJS = \ + $(WIN32RES) \ + libpqwalreceiver.o +SHLIB_LINK_INTERNAL = $(libpq) +SHLIB_LINK = $(filter -lintl, $(LIBS)) +SHLIB_PREREQS = submake-libpq +PGFILEDESC = "libpqwalreceiver - receive WAL during streaming replication" +NAME = libpqwalreceiver + +all: all-shared-lib + +include $(top_srcdir)/src/Makefile.shlib + +install: all installdirs install-lib + +installdirs: installdirs-lib + +uninstall: uninstall-lib + +clean distclean maintainer-clean: clean-lib + rm -f $(OBJS) diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c new file mode 100644 index 0000000..b4038e1 --- /dev/null +++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c @@ -0,0 +1,1239 @@ +/*------------------------------------------------------------------------- + * + * libpqwalreceiver.c + * + * This file contains the libpq-specific parts of walreceiver. It's + * loaded as a dynamic module to avoid linking the main server binary with + * libpq. + * + * Portions Copyright (c) 2010-2023, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <unistd.h> +#include <sys/time.h> + +#include "access/xlog.h" +#include "catalog/pg_type.h" +#include "common/connect.h" +#include "funcapi.h" +#include "libpq-fe.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "pqexpbuffer.h" +#include "replication/walreceiver.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/pg_lsn.h" +#include "utils/tuplestore.h" + +PG_MODULE_MAGIC; + +struct WalReceiverConn +{ + /* Current connection to the primary, if any */ + PGconn *streamConn; + /* Used to remember if the connection is logical or physical */ + bool logical; + /* Buffer for currently read records */ + char *recvBuf; +}; + +/* Prototypes for interface functions */ +static WalReceiverConn *libpqrcv_connect(const char *conninfo, + bool logical, bool must_use_password, + const char *appname, char **err); +static void libpqrcv_check_conninfo(const char *conninfo, + bool must_use_password); +static char *libpqrcv_get_conninfo(WalReceiverConn *conn); +static void libpqrcv_get_senderinfo(WalReceiverConn *conn, + char **sender_host, int *sender_port); +static char *libpqrcv_identify_system(WalReceiverConn *conn, + TimeLineID *primary_tli); +static int libpqrcv_server_version(WalReceiverConn *conn); +static void libpqrcv_readtimelinehistoryfile(WalReceiverConn *conn, + TimeLineID tli, char **filename, + char **content, int *len); +static bool libpqrcv_startstreaming(WalReceiverConn *conn, + const WalRcvStreamOptions *options); +static void libpqrcv_endstreaming(WalReceiverConn *conn, + TimeLineID *next_tli); +static int libpqrcv_receive(WalReceiverConn *conn, char **buffer, + pgsocket *wait_fd); +static void libpqrcv_send(WalReceiverConn *conn, const char *buffer, + int nbytes); +static char *libpqrcv_create_slot(WalReceiverConn *conn, + const char *slotname, + bool temporary, + bool two_phase, + CRSSnapshotAction snapshot_action, + XLogRecPtr *lsn); +static pid_t libpqrcv_get_backend_pid(WalReceiverConn *conn); +static WalRcvExecResult *libpqrcv_exec(WalReceiverConn *conn, + const char *query, + const int nRetTypes, + const Oid *retTypes); +static void libpqrcv_disconnect(WalReceiverConn *conn); + +static WalReceiverFunctionsType PQWalReceiverFunctions = { + .walrcv_connect = libpqrcv_connect, + .walrcv_check_conninfo = libpqrcv_check_conninfo, + .walrcv_get_conninfo = libpqrcv_get_conninfo, + .walrcv_get_senderinfo = libpqrcv_get_senderinfo, + .walrcv_identify_system = libpqrcv_identify_system, + .walrcv_server_version = libpqrcv_server_version, + .walrcv_readtimelinehistoryfile = libpqrcv_readtimelinehistoryfile, + .walrcv_startstreaming = libpqrcv_startstreaming, + .walrcv_endstreaming = libpqrcv_endstreaming, + .walrcv_receive = libpqrcv_receive, + .walrcv_send = libpqrcv_send, + .walrcv_create_slot = libpqrcv_create_slot, + .walrcv_get_backend_pid = libpqrcv_get_backend_pid, + .walrcv_exec = libpqrcv_exec, + .walrcv_disconnect = libpqrcv_disconnect +}; + +/* Prototypes for private functions */ +static PGresult *libpqrcv_PQexec(PGconn *streamConn, const char *query); +static PGresult *libpqrcv_PQgetResult(PGconn *streamConn); +static char *stringlist_to_identifierstr(PGconn *conn, List *strings); + +/* + * Module initialization function + */ +void +_PG_init(void) +{ + if (WalReceiverFunctions != NULL) + elog(ERROR, "libpqwalreceiver already loaded"); + WalReceiverFunctions = &PQWalReceiverFunctions; +} + +/* + * Establish the connection to the primary server for XLOG streaming + * + * If an error occurs, this function will normally return NULL and set *err + * to a palloc'ed error message. However, if must_use_password is true and + * the connection fails to use the password, this function will ereport(ERROR). + * We do this because in that case the error includes a detail and a hint for + * consistency with other parts of the system, and it's not worth adding the + * machinery to pass all of those back to the caller just to cover this one + * case. + */ +static WalReceiverConn * +libpqrcv_connect(const char *conninfo, bool logical, bool must_use_password, + const char *appname, char **err) +{ + WalReceiverConn *conn; + PostgresPollingStatusType status; + const char *keys[6]; + const char *vals[6]; + int i = 0; + + /* + * Re-validate connection string. The validation already happened at DDL + * time, but the subscription owner may have changed. If we don't recheck + * with the correct must_use_password, it's possible that the connection + * will obtain the password from a different source, such as PGPASSFILE or + * PGPASSWORD. + */ + libpqrcv_check_conninfo(conninfo, must_use_password); + + /* + * We use the expand_dbname parameter to process the connection string (or + * URI), and pass some extra options. + */ + keys[i] = "dbname"; + vals[i] = conninfo; + keys[++i] = "replication"; + vals[i] = logical ? "database" : "true"; + if (!logical) + { + /* + * The database name is ignored by the server in replication mode, but + * specify "replication" for .pgpass lookup. + */ + keys[++i] = "dbname"; + vals[i] = "replication"; + } + keys[++i] = "fallback_application_name"; + vals[i] = appname; + if (logical) + { + /* Tell the publisher to translate to our encoding */ + keys[++i] = "client_encoding"; + vals[i] = GetDatabaseEncodingName(); + + /* + * Force assorted GUC parameters to settings that ensure that the + * publisher will output data values in a form that is unambiguous to + * the subscriber. (We don't want to modify the subscriber's GUC + * settings, since that might surprise user-defined code running in + * the subscriber, such as triggers.) This should match what pg_dump + * does. + */ + keys[++i] = "options"; + vals[i] = "-c datestyle=ISO -c intervalstyle=postgres -c extra_float_digits=3"; + } + keys[++i] = NULL; + vals[i] = NULL; + + Assert(i < sizeof(keys)); + + conn = palloc0(sizeof(WalReceiverConn)); + conn->streamConn = PQconnectStartParams(keys, vals, + /* expand_dbname = */ true); + if (PQstatus(conn->streamConn) == CONNECTION_BAD) + goto bad_connection_errmsg; + + /* + * Poll connection until we have OK or FAILED status. + * + * Per spec for PQconnectPoll, first wait till socket is write-ready. + */ + status = PGRES_POLLING_WRITING; + do + { + int io_flag; + int rc; + + if (status == PGRES_POLLING_READING) + io_flag = WL_SOCKET_READABLE; +#ifdef WIN32 + /* Windows needs a different test while waiting for connection-made */ + else if (PQstatus(conn->streamConn) == CONNECTION_STARTED) + io_flag = WL_SOCKET_CONNECTED; +#endif + else + io_flag = WL_SOCKET_WRITEABLE; + + rc = WaitLatchOrSocket(MyLatch, + WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | io_flag, + PQsocket(conn->streamConn), + 0, + WAIT_EVENT_LIBPQWALRECEIVER_CONNECT); + + /* Interrupted? */ + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + ProcessWalRcvInterrupts(); + } + + /* If socket is ready, advance the libpq state machine */ + if (rc & io_flag) + status = PQconnectPoll(conn->streamConn); + } while (status != PGRES_POLLING_OK && status != PGRES_POLLING_FAILED); + + if (PQstatus(conn->streamConn) != CONNECTION_OK) + goto bad_connection_errmsg; + + if (must_use_password && !PQconnectionUsedPassword(conn->streamConn)) + { + PQfinish(conn->streamConn); + pfree(conn); + + ereport(ERROR, + (errcode(ERRCODE_S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), + errmsg("password is required"), + errdetail("Non-superuser cannot connect if the server does not request a password."), + errhint("Target server's authentication method must be changed, or set password_required=false in the subscription parameters."))); + } + + if (logical) + { + PGresult *res; + + res = libpqrcv_PQexec(conn->streamConn, + ALWAYS_SECURE_SEARCH_PATH_SQL); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + PQclear(res); + *err = psprintf(_("could not clear search path: %s"), + pchomp(PQerrorMessage(conn->streamConn))); + goto bad_connection; + } + PQclear(res); + } + + conn->logical = logical; + + return conn; + + /* error path, using libpq's error message */ +bad_connection_errmsg: + *err = pchomp(PQerrorMessage(conn->streamConn)); + + /* error path, error already set */ +bad_connection: + PQfinish(conn->streamConn); + pfree(conn); + return NULL; +} + +/* + * Validate connection info string, and determine whether it might cause + * local filesystem access to be attempted. + * + * If the connection string can't be parsed, this function will raise + * an error and will not return. If it can, it will return true if this + * connection string specifies a password and false otherwise. + */ +static void +libpqrcv_check_conninfo(const char *conninfo, bool must_use_password) +{ + PQconninfoOption *opts = NULL; + PQconninfoOption *opt; + char *err = NULL; + + opts = PQconninfoParse(conninfo, &err); + if (opts == NULL) + { + /* The error string is malloc'd, so we must free it explicitly */ + char *errcopy = err ? pstrdup(err) : "out of memory"; + + PQfreemem(err); + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid connection string syntax: %s", errcopy))); + } + + if (must_use_password) + { + bool uses_password = false; + + for (opt = opts; opt->keyword != NULL; ++opt) + { + /* Ignore connection options that are not present. */ + if (opt->val == NULL) + continue; + + if (strcmp(opt->keyword, "password") == 0 && opt->val[0] != '\0') + { + uses_password = true; + break; + } + } + + if (!uses_password) + { + /* malloc'd, so we must free it explicitly */ + PQconninfoFree(opts); + + ereport(ERROR, + (errcode(ERRCODE_S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), + errmsg("password is required"), + errdetail("Non-superusers must provide a password in the connection string."))); + } + } + + PQconninfoFree(opts); +} + +/* + * Return a user-displayable conninfo string. Any security-sensitive fields + * are obfuscated. + */ +static char * +libpqrcv_get_conninfo(WalReceiverConn *conn) +{ + PQconninfoOption *conn_opts; + PQconninfoOption *conn_opt; + PQExpBufferData buf; + char *retval; + + Assert(conn->streamConn != NULL); + + initPQExpBuffer(&buf); + conn_opts = PQconninfo(conn->streamConn); + + if (conn_opts == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("could not parse connection string: %s", + _("out of memory")))); + + /* build a clean connection string from pieces */ + for (conn_opt = conn_opts; conn_opt->keyword != NULL; conn_opt++) + { + bool obfuscate; + + /* Skip debug and empty options */ + if (strchr(conn_opt->dispchar, 'D') || + conn_opt->val == NULL || + conn_opt->val[0] == '\0') + continue; + + /* Obfuscate security-sensitive options */ + obfuscate = strchr(conn_opt->dispchar, '*') != NULL; + + appendPQExpBuffer(&buf, "%s%s=%s", + buf.len == 0 ? "" : " ", + conn_opt->keyword, + obfuscate ? "********" : conn_opt->val); + } + + PQconninfoFree(conn_opts); + + retval = PQExpBufferDataBroken(buf) ? NULL : pstrdup(buf.data); + termPQExpBuffer(&buf); + return retval; +} + +/* + * Provides information of sender this WAL receiver is connected to. + */ +static void +libpqrcv_get_senderinfo(WalReceiverConn *conn, char **sender_host, + int *sender_port) +{ + char *ret = NULL; + + *sender_host = NULL; + *sender_port = 0; + + Assert(conn->streamConn != NULL); + + ret = PQhost(conn->streamConn); + if (ret && strlen(ret) != 0) + *sender_host = pstrdup(ret); + + ret = PQport(conn->streamConn); + if (ret && strlen(ret) != 0) + *sender_port = atoi(ret); +} + +/* + * Check that primary's system identifier matches ours, and fetch the current + * timeline ID of the primary. + */ +static char * +libpqrcv_identify_system(WalReceiverConn *conn, TimeLineID *primary_tli) +{ + PGresult *res; + char *primary_sysid; + + /* + * Get the system identifier and timeline ID as a DataRow message from the + * primary server. + */ + res = libpqrcv_PQexec(conn->streamConn, "IDENTIFY_SYSTEM"); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("could not receive database system identifier and timeline ID from " + "the primary server: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + } + /* + * IDENTIFY_SYSTEM returns 3 columns in 9.3 and earlier, and 4 columns in + * 9.4 and onwards. + */ + if (PQnfields(res) < 3 || PQntuples(res) != 1) + { + int ntuples = PQntuples(res); + int nfields = PQnfields(res); + + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid response from primary server"), + errdetail("Could not identify system: got %d rows and %d fields, expected %d rows and %d or more fields.", + ntuples, nfields, 1, 3))); + } + primary_sysid = pstrdup(PQgetvalue(res, 0, 0)); + *primary_tli = pg_strtoint32(PQgetvalue(res, 0, 1)); + PQclear(res); + + return primary_sysid; +} + +/* + * Thin wrapper around libpq to obtain server version. + */ +static int +libpqrcv_server_version(WalReceiverConn *conn) +{ + return PQserverVersion(conn->streamConn); +} + +/* + * Start streaming WAL data from given streaming options. + * + * Returns true if we switched successfully to copy-both mode. False + * means the server received the command and executed it successfully, but + * didn't switch to copy-mode. That means that there was no WAL on the + * requested timeline and starting point, because the server switched to + * another timeline at or before the requested starting point. On failure, + * throws an ERROR. + */ +static bool +libpqrcv_startstreaming(WalReceiverConn *conn, + const WalRcvStreamOptions *options) +{ + StringInfoData cmd; + PGresult *res; + + Assert(options->logical == conn->logical); + Assert(options->slotname || !options->logical); + + initStringInfo(&cmd); + + /* Build the command. */ + appendStringInfoString(&cmd, "START_REPLICATION"); + if (options->slotname != NULL) + appendStringInfo(&cmd, " SLOT \"%s\"", + options->slotname); + + if (options->logical) + appendStringInfoString(&cmd, " LOGICAL"); + + appendStringInfo(&cmd, " %X/%X", LSN_FORMAT_ARGS(options->startpoint)); + + /* + * Additional options are different depending on if we are doing logical + * or physical replication. + */ + if (options->logical) + { + char *pubnames_str; + List *pubnames; + char *pubnames_literal; + + appendStringInfoString(&cmd, " ("); + + appendStringInfo(&cmd, "proto_version '%u'", + options->proto.logical.proto_version); + + if (options->proto.logical.streaming_str) + appendStringInfo(&cmd, ", streaming '%s'", + options->proto.logical.streaming_str); + + if (options->proto.logical.twophase && + PQserverVersion(conn->streamConn) >= 150000) + appendStringInfoString(&cmd, ", two_phase 'on'"); + + if (options->proto.logical.origin && + PQserverVersion(conn->streamConn) >= 160000) + appendStringInfo(&cmd, ", origin '%s'", + options->proto.logical.origin); + + pubnames = options->proto.logical.publication_names; + pubnames_str = stringlist_to_identifierstr(conn->streamConn, pubnames); + if (!pubnames_str) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), /* likely guess */ + errmsg("could not start WAL streaming: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + pubnames_literal = PQescapeLiteral(conn->streamConn, pubnames_str, + strlen(pubnames_str)); + if (!pubnames_literal) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), /* likely guess */ + errmsg("could not start WAL streaming: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + appendStringInfo(&cmd, ", publication_names %s", pubnames_literal); + PQfreemem(pubnames_literal); + pfree(pubnames_str); + + if (options->proto.logical.binary && + PQserverVersion(conn->streamConn) >= 140000) + appendStringInfoString(&cmd, ", binary 'true'"); + + appendStringInfoChar(&cmd, ')'); + } + else + appendStringInfo(&cmd, " TIMELINE %u", + options->proto.physical.startpointTLI); + + /* Start streaming. */ + res = libpqrcv_PQexec(conn->streamConn, cmd.data); + pfree(cmd.data); + + if (PQresultStatus(res) == PGRES_COMMAND_OK) + { + PQclear(res); + return false; + } + else if (PQresultStatus(res) != PGRES_COPY_BOTH) + { + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("could not start WAL streaming: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + } + PQclear(res); + return true; +} + +/* + * Stop streaming WAL data. Returns the next timeline's ID in *next_tli, as + * reported by the server, or 0 if it did not report it. + */ +static void +libpqrcv_endstreaming(WalReceiverConn *conn, TimeLineID *next_tli) +{ + PGresult *res; + + /* + * Send copy-end message. As in libpqrcv_PQexec, this could theoretically + * block, but the risk seems small. + */ + if (PQputCopyEnd(conn->streamConn, NULL) <= 0 || + PQflush(conn->streamConn)) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not send end-of-streaming message to primary: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + + *next_tli = 0; + + /* + * After COPY is finished, we should receive a result set indicating the + * next timeline's ID, or just CommandComplete if the server was shut + * down. + * + * If we had not yet received CopyDone from the backend, PGRES_COPY_OUT is + * also possible in case we aborted the copy in mid-stream. + */ + res = libpqrcv_PQgetResult(conn->streamConn); + if (PQresultStatus(res) == PGRES_TUPLES_OK) + { + /* + * Read the next timeline's ID. The server also sends the timeline's + * starting point, but it is ignored. + */ + if (PQnfields(res) < 2 || PQntuples(res) != 1) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected result set after end-of-streaming"))); + *next_tli = pg_strtoint32(PQgetvalue(res, 0, 0)); + PQclear(res); + + /* the result set should be followed by CommandComplete */ + res = libpqrcv_PQgetResult(conn->streamConn); + } + else if (PQresultStatus(res) == PGRES_COPY_OUT) + { + PQclear(res); + + /* End the copy */ + if (PQendcopy(conn->streamConn)) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("error while shutting down streaming COPY: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + + /* CommandComplete should follow */ + res = libpqrcv_PQgetResult(conn->streamConn); + } + + if (PQresultStatus(res) != PGRES_COMMAND_OK) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("error reading result of streaming command: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + PQclear(res); + + /* Verify that there are no more results */ + res = libpqrcv_PQgetResult(conn->streamConn); + if (res != NULL) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected result after CommandComplete: %s", + pchomp(PQerrorMessage(conn->streamConn))))); +} + +/* + * Fetch the timeline history file for 'tli' from primary. + */ +static void +libpqrcv_readtimelinehistoryfile(WalReceiverConn *conn, + TimeLineID tli, char **filename, + char **content, int *len) +{ + PGresult *res; + char cmd[64]; + + Assert(!conn->logical); + + /* + * Request the primary to send over the history file for given timeline. + */ + snprintf(cmd, sizeof(cmd), "TIMELINE_HISTORY %u", tli); + res = libpqrcv_PQexec(conn->streamConn, cmd); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("could not receive timeline history file from " + "the primary server: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + } + if (PQnfields(res) != 2 || PQntuples(res) != 1) + { + int ntuples = PQntuples(res); + int nfields = PQnfields(res); + + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid response from primary server"), + errdetail("Expected 1 tuple with 2 fields, got %d tuples with %d fields.", + ntuples, nfields))); + } + *filename = pstrdup(PQgetvalue(res, 0, 0)); + + *len = PQgetlength(res, 0, 1); + *content = palloc(*len); + memcpy(*content, PQgetvalue(res, 0, 1), *len); + PQclear(res); +} + +/* + * Send a query and wait for the results by using the asynchronous libpq + * functions and socket readiness events. + * + * We must not use the regular blocking libpq functions like PQexec() + * since they are uninterruptible by signals on some platforms, such as + * Windows. + * + * The function is modeled on PQexec() in libpq, but only implements + * those parts that are in use in the walreceiver api. + * + * May return NULL, rather than an error result, on failure. + */ +static PGresult * +libpqrcv_PQexec(PGconn *streamConn, const char *query) +{ + PGresult *lastResult = NULL; + + /* + * PQexec() silently discards any prior query results on the connection. + * This is not required for this function as it's expected that the caller + * (which is this library in all cases) will behave correctly and we don't + * have to be backwards compatible with old libpq. + */ + + /* + * Submit the query. Since we don't use non-blocking mode, this could + * theoretically block. In practice, since we don't send very long query + * strings, the risk seems negligible. + */ + if (!PQsendQuery(streamConn, query)) + return NULL; + + for (;;) + { + /* Wait for, and collect, the next PGresult. */ + PGresult *result; + + result = libpqrcv_PQgetResult(streamConn); + if (result == NULL) + break; /* query is complete, or failure */ + + /* + * Emulate PQexec()'s behavior of returning the last result when there + * are many. We are fine with returning just last error message. + */ + PQclear(lastResult); + lastResult = result; + + if (PQresultStatus(lastResult) == PGRES_COPY_IN || + PQresultStatus(lastResult) == PGRES_COPY_OUT || + PQresultStatus(lastResult) == PGRES_COPY_BOTH || + PQstatus(streamConn) == CONNECTION_BAD) + break; + } + + return lastResult; +} + +/* + * Perform the equivalent of PQgetResult(), but watch for interrupts. + */ +static PGresult * +libpqrcv_PQgetResult(PGconn *streamConn) +{ + /* + * Collect data until PQgetResult is ready to get the result without + * blocking. + */ + while (PQisBusy(streamConn)) + { + int rc; + + /* + * We don't need to break down the sleep into smaller increments, + * since we'll get interrupted by signals and can handle any + * interrupts here. + */ + rc = WaitLatchOrSocket(MyLatch, + WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE | + WL_LATCH_SET, + PQsocket(streamConn), + 0, + WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE); + + /* Interrupted? */ + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + ProcessWalRcvInterrupts(); + } + + /* Consume whatever data is available from the socket */ + if (PQconsumeInput(streamConn) == 0) + { + /* trouble; return NULL */ + return NULL; + } + } + + /* Now we can collect and return the next PGresult */ + return PQgetResult(streamConn); +} + +/* + * Disconnect connection to primary, if any. + */ +static void +libpqrcv_disconnect(WalReceiverConn *conn) +{ + PQfinish(conn->streamConn); + PQfreemem(conn->recvBuf); + pfree(conn); +} + +/* + * Receive a message available from XLOG stream. + * + * Returns: + * + * If data was received, returns the length of the data. *buffer is set to + * point to a buffer holding the received message. The buffer is only valid + * until the next libpqrcv_* call. + * + * If no data was available immediately, returns 0, and *wait_fd is set to a + * socket descriptor which can be waited on before trying again. + * + * -1 if the server ended the COPY. + * + * ereports on error. + */ +static int +libpqrcv_receive(WalReceiverConn *conn, char **buffer, + pgsocket *wait_fd) +{ + int rawlen; + + PQfreemem(conn->recvBuf); + conn->recvBuf = NULL; + + /* Try to receive a CopyData message */ + rawlen = PQgetCopyData(conn->streamConn, &conn->recvBuf, 1); + if (rawlen == 0) + { + /* Try consuming some data. */ + if (PQconsumeInput(conn->streamConn) == 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not receive data from WAL stream: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + + /* Now that we've consumed some input, try again */ + rawlen = PQgetCopyData(conn->streamConn, &conn->recvBuf, 1); + if (rawlen == 0) + { + /* Tell caller to try again when our socket is ready. */ + *wait_fd = PQsocket(conn->streamConn); + return 0; + } + } + if (rawlen == -1) /* end-of-streaming or error */ + { + PGresult *res; + + res = libpqrcv_PQgetResult(conn->streamConn); + if (PQresultStatus(res) == PGRES_COMMAND_OK) + { + PQclear(res); + + /* Verify that there are no more results. */ + res = libpqrcv_PQgetResult(conn->streamConn); + if (res != NULL) + { + PQclear(res); + + /* + * If the other side closed the connection orderly (otherwise + * we'd seen an error, or PGRES_COPY_IN) don't report an error + * here, but let callers deal with it. + */ + if (PQstatus(conn->streamConn) == CONNECTION_BAD) + return -1; + + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected result after CommandComplete: %s", + PQerrorMessage(conn->streamConn)))); + } + + return -1; + } + else if (PQresultStatus(res) == PGRES_COPY_IN) + { + PQclear(res); + return -1; + } + else + { + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("could not receive data from WAL stream: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + } + } + if (rawlen < -1) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("could not receive data from WAL stream: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + + /* Return received messages to caller */ + *buffer = conn->recvBuf; + return rawlen; +} + +/* + * Send a message to XLOG stream. + * + * ereports on error. + */ +static void +libpqrcv_send(WalReceiverConn *conn, const char *buffer, int nbytes) +{ + if (PQputCopyData(conn->streamConn, buffer, nbytes) <= 0 || + PQflush(conn->streamConn)) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not send data to WAL stream: %s", + pchomp(PQerrorMessage(conn->streamConn))))); +} + +/* + * Create new replication slot. + * Returns the name of the exported snapshot for logical slot or NULL for + * physical slot. + */ +static char * +libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname, + bool temporary, bool two_phase, CRSSnapshotAction snapshot_action, + XLogRecPtr *lsn) +{ + PGresult *res; + StringInfoData cmd; + char *snapshot; + int use_new_options_syntax; + + use_new_options_syntax = (PQserverVersion(conn->streamConn) >= 150000); + + initStringInfo(&cmd); + + appendStringInfo(&cmd, "CREATE_REPLICATION_SLOT \"%s\"", slotname); + + if (temporary) + appendStringInfoString(&cmd, " TEMPORARY"); + + if (conn->logical) + { + appendStringInfoString(&cmd, " LOGICAL pgoutput "); + if (use_new_options_syntax) + appendStringInfoChar(&cmd, '('); + if (two_phase) + { + appendStringInfoString(&cmd, "TWO_PHASE"); + if (use_new_options_syntax) + appendStringInfoString(&cmd, ", "); + else + appendStringInfoChar(&cmd, ' '); + } + + if (use_new_options_syntax) + { + switch (snapshot_action) + { + case CRS_EXPORT_SNAPSHOT: + appendStringInfoString(&cmd, "SNAPSHOT 'export'"); + break; + case CRS_NOEXPORT_SNAPSHOT: + appendStringInfoString(&cmd, "SNAPSHOT 'nothing'"); + break; + case CRS_USE_SNAPSHOT: + appendStringInfoString(&cmd, "SNAPSHOT 'use'"); + break; + } + } + else + { + switch (snapshot_action) + { + case CRS_EXPORT_SNAPSHOT: + appendStringInfoString(&cmd, "EXPORT_SNAPSHOT"); + break; + case CRS_NOEXPORT_SNAPSHOT: + appendStringInfoString(&cmd, "NOEXPORT_SNAPSHOT"); + break; + case CRS_USE_SNAPSHOT: + appendStringInfoString(&cmd, "USE_SNAPSHOT"); + break; + } + } + + if (use_new_options_syntax) + appendStringInfoChar(&cmd, ')'); + } + else + { + if (use_new_options_syntax) + appendStringInfoString(&cmd, " PHYSICAL (RESERVE_WAL)"); + else + appendStringInfoString(&cmd, " PHYSICAL RESERVE_WAL"); + } + + res = libpqrcv_PQexec(conn->streamConn, cmd.data); + pfree(cmd.data); + + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("could not create replication slot \"%s\": %s", + slotname, pchomp(PQerrorMessage(conn->streamConn))))); + } + + if (lsn) + *lsn = DatumGetLSN(DirectFunctionCall1Coll(pg_lsn_in, InvalidOid, + CStringGetDatum(PQgetvalue(res, 0, 1)))); + + if (!PQgetisnull(res, 0, 2)) + snapshot = pstrdup(PQgetvalue(res, 0, 2)); + else + snapshot = NULL; + + PQclear(res); + + return snapshot; +} + +/* + * Return PID of remote backend process. + */ +static pid_t +libpqrcv_get_backend_pid(WalReceiverConn *conn) +{ + return PQbackendPID(conn->streamConn); +} + +/* + * Convert tuple query result to tuplestore. + */ +static void +libpqrcv_processTuples(PGresult *pgres, WalRcvExecResult *walres, + const int nRetTypes, const Oid *retTypes) +{ + int tupn; + int coln; + int nfields = PQnfields(pgres); + HeapTuple tuple; + AttInMetadata *attinmeta; + MemoryContext rowcontext; + MemoryContext oldcontext; + + /* Make sure we got expected number of fields. */ + if (nfields != nRetTypes) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid query response"), + errdetail("Expected %d fields, got %d fields.", + nRetTypes, nfields))); + + walres->tuplestore = tuplestore_begin_heap(true, false, work_mem); + + /* Create tuple descriptor corresponding to expected result. */ + walres->tupledesc = CreateTemplateTupleDesc(nRetTypes); + for (coln = 0; coln < nRetTypes; coln++) + TupleDescInitEntry(walres->tupledesc, (AttrNumber) coln + 1, + PQfname(pgres, coln), retTypes[coln], -1, 0); + attinmeta = TupleDescGetAttInMetadata(walres->tupledesc); + + /* No point in doing more here if there were no tuples returned. */ + if (PQntuples(pgres) == 0) + return; + + /* Create temporary context for local allocations. */ + rowcontext = AllocSetContextCreate(CurrentMemoryContext, + "libpqrcv query result context", + ALLOCSET_DEFAULT_SIZES); + + /* Process returned rows. */ + for (tupn = 0; tupn < PQntuples(pgres); tupn++) + { + char *cstrs[MaxTupleAttributeNumber]; + + ProcessWalRcvInterrupts(); + + /* Do the allocations in temporary context. */ + oldcontext = MemoryContextSwitchTo(rowcontext); + + /* + * Fill cstrs with null-terminated strings of column values. + */ + for (coln = 0; coln < nfields; coln++) + { + if (PQgetisnull(pgres, tupn, coln)) + cstrs[coln] = NULL; + else + cstrs[coln] = PQgetvalue(pgres, tupn, coln); + } + + /* Convert row to a tuple, and add it to the tuplestore */ + tuple = BuildTupleFromCStrings(attinmeta, cstrs); + tuplestore_puttuple(walres->tuplestore, tuple); + + /* Clean up */ + MemoryContextSwitchTo(oldcontext); + MemoryContextReset(rowcontext); + } + + MemoryContextDelete(rowcontext); +} + +/* + * Public interface for sending generic queries (and commands). + * + * This can only be called from process connected to database. + */ +static WalRcvExecResult * +libpqrcv_exec(WalReceiverConn *conn, const char *query, + const int nRetTypes, const Oid *retTypes) +{ + PGresult *pgres = NULL; + WalRcvExecResult *walres = palloc0(sizeof(WalRcvExecResult)); + char *diag_sqlstate; + + if (MyDatabaseId == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("the query interface requires a database connection"))); + + pgres = libpqrcv_PQexec(conn->streamConn, query); + + switch (PQresultStatus(pgres)) + { + case PGRES_SINGLE_TUPLE: + case PGRES_TUPLES_OK: + walres->status = WALRCV_OK_TUPLES; + libpqrcv_processTuples(pgres, walres, nRetTypes, retTypes); + break; + + case PGRES_COPY_IN: + walres->status = WALRCV_OK_COPY_IN; + break; + + case PGRES_COPY_OUT: + walres->status = WALRCV_OK_COPY_OUT; + break; + + case PGRES_COPY_BOTH: + walres->status = WALRCV_OK_COPY_BOTH; + break; + + case PGRES_COMMAND_OK: + walres->status = WALRCV_OK_COMMAND; + break; + + /* Empty query is considered error. */ + case PGRES_EMPTY_QUERY: + walres->status = WALRCV_ERROR; + walres->err = _("empty query"); + break; + + case PGRES_PIPELINE_SYNC: + case PGRES_PIPELINE_ABORTED: + walres->status = WALRCV_ERROR; + walres->err = _("unexpected pipeline mode"); + break; + + case PGRES_NONFATAL_ERROR: + case PGRES_FATAL_ERROR: + case PGRES_BAD_RESPONSE: + walres->status = WALRCV_ERROR; + walres->err = pchomp(PQerrorMessage(conn->streamConn)); + diag_sqlstate = PQresultErrorField(pgres, PG_DIAG_SQLSTATE); + if (diag_sqlstate) + walres->sqlstate = MAKE_SQLSTATE(diag_sqlstate[0], + diag_sqlstate[1], + diag_sqlstate[2], + diag_sqlstate[3], + diag_sqlstate[4]); + break; + } + + PQclear(pgres); + + return walres; +} + +/* + * Given a List of strings, return it as single comma separated + * string, quoting identifiers as needed. + * + * This is essentially the reverse of SplitIdentifierString. + * + * The caller should free the result. + */ +static char * +stringlist_to_identifierstr(PGconn *conn, List *strings) +{ + ListCell *lc; + StringInfoData res; + bool first = true; + + initStringInfo(&res); + + foreach(lc, strings) + { + char *val = strVal(lfirst(lc)); + char *val_escaped; + + if (first) + first = false; + else + appendStringInfoChar(&res, ','); + + val_escaped = PQescapeIdentifier(conn, val, strlen(val)); + if (!val_escaped) + { + free(res.data); + return NULL; + } + appendStringInfoString(&res, val_escaped); + PQfreemem(val_escaped); + } + + return res.data; +} diff --git a/src/backend/replication/libpqwalreceiver/meson.build b/src/backend/replication/libpqwalreceiver/meson.build new file mode 100644 index 0000000..ec439fb --- /dev/null +++ b/src/backend/replication/libpqwalreceiver/meson.build @@ -0,0 +1,21 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +libpqwalreceiver_sources = files( + 'libpqwalreceiver.c', +) + +if host_system == 'windows' + libpqwalreceiver_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pqwalreceiver', + '--FILEDESC', 'libpqwalreceiver - receive WAL during streaming replication',]) +endif + +libpqwalreceiver = shared_module('pqwalreceiver', + libpqwalreceiver_sources, + kwargs: pg_mod_args + { + 'name_prefix': 'lib', + 'dependencies': pg_mod_args['dependencies'] + [libpq], + } +) + +backend_targets += libpqwalreceiver diff --git a/src/backend/replication/logical/Makefile b/src/backend/replication/logical/Makefile new file mode 100644 index 0000000..2dc25e3 --- /dev/null +++ b/src/backend/replication/logical/Makefile @@ -0,0 +1,32 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for src/backend/replication/logical +# +# IDENTIFICATION +# src/backend/replication/logical/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/replication/logical +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +override CPPFLAGS := -I$(srcdir) $(CPPFLAGS) + +OBJS = \ + applyparallelworker.o \ + decode.o \ + launcher.o \ + logical.o \ + logicalfuncs.o \ + message.o \ + origin.o \ + proto.o \ + relation.o \ + reorderbuffer.o \ + snapbuild.o \ + tablesync.o \ + worker.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/replication/logical/applyparallelworker.c b/src/backend/replication/logical/applyparallelworker.c new file mode 100644 index 0000000..a24709e --- /dev/null +++ b/src/backend/replication/logical/applyparallelworker.c @@ -0,0 +1,1642 @@ +/*------------------------------------------------------------------------- + * applyparallelworker.c + * Support routines for applying xact by parallel apply worker + * + * Copyright (c) 2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/applyparallelworker.c + * + * This file contains the code to launch, set up, and teardown a parallel apply + * worker which receives the changes from the leader worker and invokes routines + * to apply those on the subscriber database. Additionally, this file contains + * routines that are intended to support setting up, using, and tearing down a + * ParallelApplyWorkerInfo which is required so the leader worker and parallel + * apply workers can communicate with each other. + * + * The parallel apply workers are assigned (if available) as soon as xact's + * first stream is received for subscriptions that have set their 'streaming' + * option as parallel. The leader apply worker will send changes to this new + * worker via shared memory. We keep this worker assigned till the transaction + * commit is received and also wait for the worker to finish at commit. This + * preserves commit ordering and avoid file I/O in most cases, although we + * still need to spill to a file if there is no worker available. See comments + * atop logical/worker to know more about streamed xacts whose changes are + * spilled to disk. It is important to maintain commit order to avoid failures + * due to: (a) transaction dependencies - say if we insert a row in the first + * transaction and update it in the second transaction on publisher then + * allowing the subscriber to apply both in parallel can lead to failure in the + * update; (b) deadlocks - allowing transactions that update the same set of + * rows/tables in the opposite order to be applied in parallel can lead to + * deadlocks. + * + * A worker pool is used to avoid restarting workers for each streaming + * transaction. We maintain each worker's information (ParallelApplyWorkerInfo) + * in the ParallelApplyWorkerPool. After successfully launching a new worker, + * its information is added to the ParallelApplyWorkerPool. Once the worker + * finishes applying the transaction, it is marked as available for re-use. + * Now, before starting a new worker to apply the streaming transaction, we + * check the list for any available worker. Note that we retain a maximum of + * half the max_parallel_apply_workers_per_subscription workers in the pool and + * after that, we simply exit the worker after applying the transaction. + * + * XXX This worker pool threshold is arbitrary and we can provide a GUC + * variable for this in the future if required. + * + * The leader apply worker will create a separate dynamic shared memory segment + * when each parallel apply worker starts. The reason for this design is that + * we cannot predict how many workers will be needed. It may be possible to + * allocate enough shared memory in one segment based on the maximum number of + * parallel apply workers (max_parallel_apply_workers_per_subscription), but + * this would waste memory if no process is actually started. + * + * The dynamic shared memory segment contains: (a) a shm_mq that is used to + * send changes in the transaction from leader apply worker to parallel apply + * worker; (b) another shm_mq that is used to send errors (and other messages + * reported via elog/ereport) from the parallel apply worker to leader apply + * worker; (c) necessary information to be shared among parallel apply workers + * and the leader apply worker (i.e. members of ParallelApplyWorkerShared). + * + * Locking Considerations + * ---------------------- + * We have a risk of deadlock due to concurrently applying the transactions in + * parallel mode that were independent on the publisher side but became + * dependent on the subscriber side due to the different database structures + * (like schema of subscription tables, constraints, etc.) on each side. This + * can happen even without parallel mode when there are concurrent operations + * on the subscriber. In order to detect the deadlocks among leader (LA) and + * parallel apply (PA) workers, we used lmgr locks when the PA waits for the + * next stream (set of changes) and LA waits for PA to finish the transaction. + * An alternative approach could be to not allow parallelism when the schema of + * tables is different between the publisher and subscriber but that would be + * too restrictive and would require the publisher to send much more + * information than it is currently sending. + * + * Consider a case where the subscribed table does not have a unique key on the + * publisher and has a unique key on the subscriber. The deadlock can happen in + * the following ways: + * + * 1) Deadlock between the leader apply worker and a parallel apply worker + * + * Consider that the parallel apply worker (PA) is executing TX-1 and the + * leader apply worker (LA) is executing TX-2 concurrently on the subscriber. + * Now, LA is waiting for PA because of the unique key constraint of the + * subscribed table while PA is waiting for LA to send the next stream of + * changes or transaction finish command message. + * + * In order for lmgr to detect this, we have LA acquire a session lock on the + * remote transaction (by pa_lock_stream()) and have PA wait on the lock before + * trying to receive the next stream of changes. Specifically, LA will acquire + * the lock in AccessExclusive mode before sending the STREAM_STOP and will + * release it if already acquired after sending the STREAM_START, STREAM_ABORT + * (for toplevel transaction), STREAM_PREPARE, and STREAM_COMMIT. The PA will + * acquire the lock in AccessShare mode after processing STREAM_STOP and + * STREAM_ABORT (for subtransaction) and then release the lock immediately + * after acquiring it. + * + * The lock graph for the above example will look as follows: + * LA (waiting to acquire the lock on the unique index) -> PA (waiting to + * acquire the stream lock) -> LA + * + * This way, when PA is waiting for LA for the next stream of changes, we can + * have a wait-edge from PA to LA in lmgr, which will make us detect the + * deadlock between LA and PA. + * + * 2) Deadlock between the leader apply worker and parallel apply workers + * + * This scenario is similar to the first case but TX-1 and TX-2 are executed by + * two parallel apply workers (PA-1 and PA-2 respectively). In this scenario, + * PA-2 is waiting for PA-1 to complete its transaction while PA-1 is waiting + * for subsequent input from LA. Also, LA is waiting for PA-2 to complete its + * transaction in order to preserve the commit order. There is a deadlock among + * the three processes. + * + * In order for lmgr to detect this, we have PA acquire a session lock (this is + * a different lock than referred in the previous case, see + * pa_lock_transaction()) on the transaction being applied and have LA wait on + * the lock before proceeding in the transaction finish commands. Specifically, + * PA will acquire this lock in AccessExclusive mode before executing the first + * message of the transaction and release it at the xact end. LA will acquire + * this lock in AccessShare mode at transaction finish commands and release it + * immediately. + * + * The lock graph for the above example will look as follows: + * LA (waiting to acquire the transaction lock) -> PA-2 (waiting to acquire the + * lock due to unique index constraint) -> PA-1 (waiting to acquire the stream + * lock) -> LA + * + * This way when LA is waiting to finish the transaction end command to preserve + * the commit order, we will be able to detect deadlock, if any. + * + * One might think we can use XactLockTableWait(), but XactLockTableWait() + * considers PREPARED TRANSACTION as still in progress which means the lock + * won't be released even after the parallel apply worker has prepared the + * transaction. + * + * 3) Deadlock when the shm_mq buffer is full + * + * In the previous scenario (ie. PA-1 and PA-2 are executing transactions + * concurrently), if the shm_mq buffer between LA and PA-2 is full, LA has to + * wait to send messages, and this wait doesn't appear in lmgr. + * + * To avoid this wait, we use a non-blocking write and wait with a timeout. If + * the timeout is exceeded, the LA will serialize all the pending messages to + * a file and indicate PA-2 that it needs to read that file for the remaining + * messages. Then LA will start waiting for commit as in the previous case + * which will detect deadlock if any. See pa_send_data() and + * enum TransApplyAction. + * + * Lock types + * ---------- + * Both the stream lock and the transaction lock mentioned above are + * session-level locks because both locks could be acquired outside the + * transaction, and the stream lock in the leader needs to persist across + * transaction boundaries i.e. until the end of the streaming transaction. + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "libpq/pqformat.h" +#include "libpq/pqmq.h" +#include "pgstat.h" +#include "postmaster/interrupt.h" +#include "replication/logicallauncher.h" +#include "replication/logicalworker.h" +#include "replication/origin.h" +#include "replication/worker_internal.h" +#include "storage/ipc.h" +#include "storage/lmgr.h" +#include "tcop/tcopprot.h" +#include "utils/inval.h" +#include "utils/memutils.h" +#include "utils/syscache.h" + +#define PG_LOGICAL_APPLY_SHM_MAGIC 0x787ca067 + +/* + * DSM keys for parallel apply worker. Unlike other parallel execution code, + * since we don't need to worry about DSM keys conflicting with plan_node_id we + * can use small integers. + */ +#define PARALLEL_APPLY_KEY_SHARED 1 +#define PARALLEL_APPLY_KEY_MQ 2 +#define PARALLEL_APPLY_KEY_ERROR_QUEUE 3 + +/* Queue size of DSM, 16 MB for now. */ +#define DSM_QUEUE_SIZE (16 * 1024 * 1024) + +/* + * Error queue size of DSM. It is desirable to make it large enough that a + * typical ErrorResponse can be sent without blocking. That way, a worker that + * errors out can write the whole message into the queue and terminate without + * waiting for the user backend. + */ +#define DSM_ERROR_QUEUE_SIZE (16 * 1024) + +/* + * There are three fields in each message received by the parallel apply + * worker: start_lsn, end_lsn and send_time. Because we have updated these + * statistics in the leader apply worker, we can ignore these fields in the + * parallel apply worker (see function LogicalRepApplyLoop). + */ +#define SIZE_STATS_MESSAGE (2 * sizeof(XLogRecPtr) + sizeof(TimestampTz)) + +/* + * The type of session-level lock on a transaction being applied on a logical + * replication subscriber. + */ +#define PARALLEL_APPLY_LOCK_STREAM 0 +#define PARALLEL_APPLY_LOCK_XACT 1 + +/* + * Hash table entry to map xid to the parallel apply worker state. + */ +typedef struct ParallelApplyWorkerEntry +{ + TransactionId xid; /* Hash key -- must be first */ + ParallelApplyWorkerInfo *winfo; +} ParallelApplyWorkerEntry; + +/* + * A hash table used to cache the state of streaming transactions being applied + * by the parallel apply workers. + */ +static HTAB *ParallelApplyTxnHash = NULL; + +/* +* A list (pool) of active parallel apply workers. The information for +* the new worker is added to the list after successfully launching it. The +* list entry is removed if there are already enough workers in the worker +* pool at the end of the transaction. For more information about the worker +* pool, see comments atop this file. + */ +static List *ParallelApplyWorkerPool = NIL; + +/* + * Information shared between leader apply worker and parallel apply worker. + */ +ParallelApplyWorkerShared *MyParallelShared = NULL; + +/* + * Is there a message sent by a parallel apply worker that the leader apply + * worker needs to receive? + */ +volatile sig_atomic_t ParallelApplyMessagePending = false; + +/* + * Cache the parallel apply worker information required for applying the + * current streaming transaction. It is used to save the cost of searching the + * hash table when applying the changes between STREAM_START and STREAM_STOP. + */ +static ParallelApplyWorkerInfo *stream_apply_worker = NULL; + +/* A list to maintain subtransactions, if any. */ +static List *subxactlist = NIL; + +static void pa_free_worker_info(ParallelApplyWorkerInfo *winfo); +static ParallelTransState pa_get_xact_state(ParallelApplyWorkerShared *wshared); +static PartialFileSetState pa_get_fileset_state(void); + +/* + * Returns true if it is OK to start a parallel apply worker, false otherwise. + */ +static bool +pa_can_start(void) +{ + /* Only leader apply workers can start parallel apply workers. */ + if (!am_leader_apply_worker()) + return false; + + /* + * It is good to check for any change in the subscription parameter to + * avoid the case where for a very long time the change doesn't get + * reflected. This can happen when there is a constant flow of streaming + * transactions that are handled by parallel apply workers. + * + * It is better to do it before the below checks so that the latest values + * of subscription can be used for the checks. + */ + maybe_reread_subscription(); + + /* + * Don't start a new parallel apply worker if the subscription is not + * using parallel streaming mode, or if the publisher does not support + * parallel apply. + */ + if (!MyLogicalRepWorker->parallel_apply) + return false; + + /* + * Don't start a new parallel worker if user has set skiplsn as it's + * possible that they want to skip the streaming transaction. For + * streaming transactions, we need to serialize the transaction to a file + * so that we can get the last LSN of the transaction to judge whether to + * skip before starting to apply the change. + * + * One might think that we could allow parallelism if the first lsn of the + * transaction is greater than skiplsn, but we don't send it with the + * STREAM START message, and it doesn't seem worth sending the extra eight + * bytes with the STREAM START to enable parallelism for this case. + */ + if (!XLogRecPtrIsInvalid(MySubscription->skiplsn)) + return false; + + /* + * For streaming transactions that are being applied using a parallel + * apply worker, we cannot decide whether to apply the change for a + * relation that is not in the READY state (see + * should_apply_changes_for_rel) as we won't know remote_final_lsn by that + * time. So, we don't start the new parallel apply worker in this case. + */ + if (!AllTablesyncsReady()) + return false; + + return true; +} + +/* + * Set up a dynamic shared memory segment. + * + * We set up a control region that contains a fixed-size worker info + * (ParallelApplyWorkerShared), a message queue, and an error queue. + * + * Returns true on success, false on failure. + */ +static bool +pa_setup_dsm(ParallelApplyWorkerInfo *winfo) +{ + shm_toc_estimator e; + Size segsize; + dsm_segment *seg; + shm_toc *toc; + ParallelApplyWorkerShared *shared; + shm_mq *mq; + Size queue_size = DSM_QUEUE_SIZE; + Size error_queue_size = DSM_ERROR_QUEUE_SIZE; + + /* + * Estimate how much shared memory we need. + * + * Because the TOC machinery may choose to insert padding of oddly-sized + * requests, we must estimate each chunk separately. + * + * We need one key to register the location of the header, and two other + * keys to track the locations of the message queue and the error message + * queue. + */ + shm_toc_initialize_estimator(&e); + shm_toc_estimate_chunk(&e, sizeof(ParallelApplyWorkerShared)); + shm_toc_estimate_chunk(&e, queue_size); + shm_toc_estimate_chunk(&e, error_queue_size); + + shm_toc_estimate_keys(&e, 3); + segsize = shm_toc_estimate(&e); + + /* Create the shared memory segment and establish a table of contents. */ + seg = dsm_create(shm_toc_estimate(&e), 0); + if (!seg) + return false; + + toc = shm_toc_create(PG_LOGICAL_APPLY_SHM_MAGIC, dsm_segment_address(seg), + segsize); + + /* Set up the header region. */ + shared = shm_toc_allocate(toc, sizeof(ParallelApplyWorkerShared)); + SpinLockInit(&shared->mutex); + + shared->xact_state = PARALLEL_TRANS_UNKNOWN; + pg_atomic_init_u32(&(shared->pending_stream_count), 0); + shared->last_commit_end = InvalidXLogRecPtr; + shared->fileset_state = FS_EMPTY; + + shm_toc_insert(toc, PARALLEL_APPLY_KEY_SHARED, shared); + + /* Set up message queue for the worker. */ + mq = shm_mq_create(shm_toc_allocate(toc, queue_size), queue_size); + shm_toc_insert(toc, PARALLEL_APPLY_KEY_MQ, mq); + shm_mq_set_sender(mq, MyProc); + + /* Attach the queue. */ + winfo->mq_handle = shm_mq_attach(mq, seg, NULL); + + /* Set up error queue for the worker. */ + mq = shm_mq_create(shm_toc_allocate(toc, error_queue_size), + error_queue_size); + shm_toc_insert(toc, PARALLEL_APPLY_KEY_ERROR_QUEUE, mq); + shm_mq_set_receiver(mq, MyProc); + + /* Attach the queue. */ + winfo->error_mq_handle = shm_mq_attach(mq, seg, NULL); + + /* Return results to caller. */ + winfo->dsm_seg = seg; + winfo->shared = shared; + + return true; +} + +/* + * Try to get a parallel apply worker from the pool. If none is available then + * start a new one. + */ +static ParallelApplyWorkerInfo * +pa_launch_parallel_worker(void) +{ + MemoryContext oldcontext; + bool launched; + ParallelApplyWorkerInfo *winfo; + ListCell *lc; + + /* Try to get an available parallel apply worker from the worker pool. */ + foreach(lc, ParallelApplyWorkerPool) + { + winfo = (ParallelApplyWorkerInfo *) lfirst(lc); + + if (!winfo->in_use) + return winfo; + } + + /* + * Start a new parallel apply worker. + * + * The worker info can be used for the lifetime of the worker process, so + * create it in a permanent context. + */ + oldcontext = MemoryContextSwitchTo(ApplyContext); + + winfo = (ParallelApplyWorkerInfo *) palloc0(sizeof(ParallelApplyWorkerInfo)); + + /* Setup shared memory. */ + if (!pa_setup_dsm(winfo)) + { + MemoryContextSwitchTo(oldcontext); + pfree(winfo); + return NULL; + } + + launched = logicalrep_worker_launch(MyLogicalRepWorker->dbid, + MySubscription->oid, + MySubscription->name, + MyLogicalRepWorker->userid, + InvalidOid, + dsm_segment_handle(winfo->dsm_seg)); + + if (launched) + { + ParallelApplyWorkerPool = lappend(ParallelApplyWorkerPool, winfo); + } + else + { + pa_free_worker_info(winfo); + winfo = NULL; + } + + MemoryContextSwitchTo(oldcontext); + + return winfo; +} + +/* + * Allocate a parallel apply worker that will be used for the specified xid. + * + * We first try to get an available worker from the pool, if any and then try + * to launch a new worker. On successful allocation, remember the worker + * information in the hash table so that we can get it later for processing the + * streaming changes. + */ +void +pa_allocate_worker(TransactionId xid) +{ + bool found; + ParallelApplyWorkerInfo *winfo = NULL; + ParallelApplyWorkerEntry *entry; + + if (!pa_can_start()) + return; + + winfo = pa_launch_parallel_worker(); + if (!winfo) + return; + + /* First time through, initialize parallel apply worker state hashtable. */ + if (!ParallelApplyTxnHash) + { + HASHCTL ctl; + + MemSet(&ctl, 0, sizeof(ctl)); + ctl.keysize = sizeof(TransactionId); + ctl.entrysize = sizeof(ParallelApplyWorkerEntry); + ctl.hcxt = ApplyContext; + + ParallelApplyTxnHash = hash_create("logical replication parallel apply workers hash", + 16, &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + } + + /* Create an entry for the requested transaction. */ + entry = hash_search(ParallelApplyTxnHash, &xid, HASH_ENTER, &found); + if (found) + elog(ERROR, "hash table corrupted"); + + /* Update the transaction information in shared memory. */ + SpinLockAcquire(&winfo->shared->mutex); + winfo->shared->xact_state = PARALLEL_TRANS_UNKNOWN; + winfo->shared->xid = xid; + SpinLockRelease(&winfo->shared->mutex); + + winfo->in_use = true; + winfo->serialize_changes = false; + entry->winfo = winfo; + entry->xid = xid; +} + +/* + * Find the assigned worker for the given transaction, if any. + */ +ParallelApplyWorkerInfo * +pa_find_worker(TransactionId xid) +{ + bool found; + ParallelApplyWorkerEntry *entry; + + if (!TransactionIdIsValid(xid)) + return NULL; + + if (!ParallelApplyTxnHash) + return NULL; + + /* Return the cached parallel apply worker if valid. */ + if (stream_apply_worker) + return stream_apply_worker; + + /* Find an entry for the requested transaction. */ + entry = hash_search(ParallelApplyTxnHash, &xid, HASH_FIND, &found); + if (found) + { + /* The worker must not have exited. */ + Assert(entry->winfo->in_use); + return entry->winfo; + } + + return NULL; +} + +/* + * Makes the worker available for reuse. + * + * This removes the parallel apply worker entry from the hash table so that it + * can't be used. If there are enough workers in the pool, it stops the worker + * and frees the corresponding info. Otherwise it just marks the worker as + * available for reuse. + * + * For more information about the worker pool, see comments atop this file. + */ +static void +pa_free_worker(ParallelApplyWorkerInfo *winfo) +{ + Assert(!am_parallel_apply_worker()); + Assert(winfo->in_use); + Assert(pa_get_xact_state(winfo->shared) == PARALLEL_TRANS_FINISHED); + + if (!hash_search(ParallelApplyTxnHash, &winfo->shared->xid, HASH_REMOVE, NULL)) + elog(ERROR, "hash table corrupted"); + + /* + * Stop the worker if there are enough workers in the pool. + * + * XXX Additionally, we also stop the worker if the leader apply worker + * serialize part of the transaction data due to a send timeout. This is + * because the message could be partially written to the queue and there + * is no way to clean the queue other than resending the message until it + * succeeds. Instead of trying to send the data which anyway would have + * been serialized and then letting the parallel apply worker deal with + * the spurious message, we stop the worker. + */ + if (winfo->serialize_changes || + list_length(ParallelApplyWorkerPool) > + (max_parallel_apply_workers_per_subscription / 2)) + { + logicalrep_pa_worker_stop(winfo); + pa_free_worker_info(winfo); + + return; + } + + winfo->in_use = false; + winfo->serialize_changes = false; +} + +/* + * Free the parallel apply worker information and unlink the files with + * serialized changes if any. + */ +static void +pa_free_worker_info(ParallelApplyWorkerInfo *winfo) +{ + Assert(winfo); + + if (winfo->mq_handle) + shm_mq_detach(winfo->mq_handle); + + if (winfo->error_mq_handle) + shm_mq_detach(winfo->error_mq_handle); + + /* Unlink the files with serialized changes. */ + if (winfo->serialize_changes) + stream_cleanup_files(MyLogicalRepWorker->subid, winfo->shared->xid); + + if (winfo->dsm_seg) + dsm_detach(winfo->dsm_seg); + + /* Remove from the worker pool. */ + ParallelApplyWorkerPool = list_delete_ptr(ParallelApplyWorkerPool, winfo); + + pfree(winfo); +} + +/* + * Detach the error queue for all parallel apply workers. + */ +void +pa_detach_all_error_mq(void) +{ + ListCell *lc; + + foreach(lc, ParallelApplyWorkerPool) + { + ParallelApplyWorkerInfo *winfo = (ParallelApplyWorkerInfo *) lfirst(lc); + + if (winfo->error_mq_handle) + { + shm_mq_detach(winfo->error_mq_handle); + winfo->error_mq_handle = NULL; + } + } +} + +/* + * Check if there are any pending spooled messages. + */ +static bool +pa_has_spooled_message_pending() +{ + PartialFileSetState fileset_state; + + fileset_state = pa_get_fileset_state(); + + return (fileset_state != FS_EMPTY); +} + +/* + * Replay the spooled messages once the leader apply worker has finished + * serializing changes to the file. + * + * Returns false if there aren't any pending spooled messages, true otherwise. + */ +static bool +pa_process_spooled_messages_if_required(void) +{ + PartialFileSetState fileset_state; + + fileset_state = pa_get_fileset_state(); + + if (fileset_state == FS_EMPTY) + return false; + + /* + * If the leader apply worker is busy serializing the partial changes then + * acquire the stream lock now and wait for the leader worker to finish + * serializing the changes. Otherwise, the parallel apply worker won't get + * a chance to receive a STREAM_STOP (and acquire the stream lock) until + * the leader had serialized all changes which can lead to undetected + * deadlock. + * + * Note that the fileset state can be FS_SERIALIZE_DONE once the leader + * worker has finished serializing the changes. + */ + if (fileset_state == FS_SERIALIZE_IN_PROGRESS) + { + pa_lock_stream(MyParallelShared->xid, AccessShareLock); + pa_unlock_stream(MyParallelShared->xid, AccessShareLock); + + fileset_state = pa_get_fileset_state(); + } + + /* + * We cannot read the file immediately after the leader has serialized all + * changes to the file because there may still be messages in the memory + * queue. We will apply all spooled messages the next time we call this + * function and that will ensure there are no messages left in the memory + * queue. + */ + if (fileset_state == FS_SERIALIZE_DONE) + { + pa_set_fileset_state(MyParallelShared, FS_READY); + } + else if (fileset_state == FS_READY) + { + apply_spooled_messages(&MyParallelShared->fileset, + MyParallelShared->xid, + InvalidXLogRecPtr); + pa_set_fileset_state(MyParallelShared, FS_EMPTY); + } + + return true; +} + +/* + * Interrupt handler for main loop of parallel apply worker. + */ +static void +ProcessParallelApplyInterrupts(void) +{ + CHECK_FOR_INTERRUPTS(); + + if (ShutdownRequestPending) + { + ereport(LOG, + (errmsg("logical replication parallel apply worker for subscription \"%s\" has finished", + MySubscription->name))); + + proc_exit(0); + } + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } +} + +/* Parallel apply worker main loop. */ +static void +LogicalParallelApplyLoop(shm_mq_handle *mqh) +{ + shm_mq_result shmq_res; + ErrorContextCallback errcallback; + MemoryContext oldcxt = CurrentMemoryContext; + + /* + * Init the ApplyMessageContext which we clean up after each replication + * protocol message. + */ + ApplyMessageContext = AllocSetContextCreate(ApplyContext, + "ApplyMessageContext", + ALLOCSET_DEFAULT_SIZES); + + /* + * Push apply error context callback. Fields will be filled while applying + * a change. + */ + errcallback.callback = apply_error_callback; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + for (;;) + { + void *data; + Size len; + + ProcessParallelApplyInterrupts(); + + /* Ensure we are reading the data into our memory context. */ + MemoryContextSwitchTo(ApplyMessageContext); + + shmq_res = shm_mq_receive(mqh, &len, &data, true); + + if (shmq_res == SHM_MQ_SUCCESS) + { + StringInfoData s; + int c; + + if (len == 0) + elog(ERROR, "invalid message length"); + + s.cursor = 0; + s.maxlen = -1; + s.data = (char *) data; + s.len = len; + + /* + * The first byte of messages sent from leader apply worker to + * parallel apply workers can only be 'w'. + */ + c = pq_getmsgbyte(&s); + if (c != 'w') + elog(ERROR, "unexpected message \"%c\"", c); + + /* + * Ignore statistics fields that have been updated by the leader + * apply worker. + * + * XXX We can avoid sending the statistics fields from the leader + * apply worker but for that, it needs to rebuild the entire + * message by removing these fields which could be more work than + * simply ignoring these fields in the parallel apply worker. + */ + s.cursor += SIZE_STATS_MESSAGE; + + apply_dispatch(&s); + } + else if (shmq_res == SHM_MQ_WOULD_BLOCK) + { + /* Replay the changes from the file, if any. */ + if (!pa_process_spooled_messages_if_required()) + { + int rc; + + /* Wait for more work. */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 1000L, + WAIT_EVENT_LOGICAL_PARALLEL_APPLY_MAIN); + + if (rc & WL_LATCH_SET) + ResetLatch(MyLatch); + } + } + else + { + Assert(shmq_res == SHM_MQ_DETACHED); + + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("lost connection to the logical replication apply worker"))); + } + + MemoryContextReset(ApplyMessageContext); + MemoryContextSwitchTo(oldcxt); + } + + /* Pop the error context stack. */ + error_context_stack = errcallback.previous; + + MemoryContextSwitchTo(oldcxt); +} + +/* + * Make sure the leader apply worker tries to read from our error queue one more + * time. This guards against the case where we exit uncleanly without sending + * an ErrorResponse, for example because some code calls proc_exit directly. + * + * Also explicitly detach from dsm segment to invoke on_dsm_detach callbacks, + * if any. See ParallelWorkerShutdown for details. + */ +static void +pa_shutdown(int code, Datum arg) +{ + SendProcSignal(MyLogicalRepWorker->leader_pid, + PROCSIG_PARALLEL_APPLY_MESSAGE, + InvalidBackendId); + + dsm_detach((dsm_segment *) DatumGetPointer(arg)); +} + +/* + * Parallel apply worker entry point. + */ +void +ParallelApplyWorkerMain(Datum main_arg) +{ + ParallelApplyWorkerShared *shared; + dsm_handle handle; + dsm_segment *seg; + shm_toc *toc; + shm_mq *mq; + shm_mq_handle *mqh; + shm_mq_handle *error_mqh; + RepOriginId originid; + int worker_slot = DatumGetInt32(main_arg); + char originname[NAMEDATALEN]; + + InitializingApplyWorker = true; + + /* Setup signal handling. */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGINT, SignalHandlerForShutdownRequest); + pqsignal(SIGTERM, die); + BackgroundWorkerUnblockSignals(); + + /* + * Attach to the dynamic shared memory segment for the parallel apply, and + * find its table of contents. + * + * Like parallel query, we don't need resource owner by this time. See + * ParallelWorkerMain. + */ + memcpy(&handle, MyBgworkerEntry->bgw_extra, sizeof(dsm_handle)); + seg = dsm_attach(handle); + if (!seg) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not map dynamic shared memory segment"))); + + toc = shm_toc_attach(PG_LOGICAL_APPLY_SHM_MAGIC, dsm_segment_address(seg)); + if (!toc) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid magic number in dynamic shared memory segment"))); + + /* Look up the shared information. */ + shared = shm_toc_lookup(toc, PARALLEL_APPLY_KEY_SHARED, false); + MyParallelShared = shared; + + /* + * Attach to the message queue. + */ + mq = shm_toc_lookup(toc, PARALLEL_APPLY_KEY_MQ, false); + shm_mq_set_receiver(mq, MyProc); + mqh = shm_mq_attach(mq, seg, NULL); + + /* + * Primary initialization is complete. Now, we can attach to our slot. + * This is to ensure that the leader apply worker does not write data to + * the uninitialized memory queue. + */ + logicalrep_worker_attach(worker_slot); + + /* + * Register the shutdown callback after we are attached to the worker + * slot. This is to ensure that MyLogicalRepWorker remains valid when this + * callback is invoked. + */ + before_shmem_exit(pa_shutdown, PointerGetDatum(seg)); + + SpinLockAcquire(&MyParallelShared->mutex); + MyParallelShared->logicalrep_worker_generation = MyLogicalRepWorker->generation; + MyParallelShared->logicalrep_worker_slot_no = worker_slot; + SpinLockRelease(&MyParallelShared->mutex); + + /* + * Attach to the error queue. + */ + mq = shm_toc_lookup(toc, PARALLEL_APPLY_KEY_ERROR_QUEUE, false); + shm_mq_set_sender(mq, MyProc); + error_mqh = shm_mq_attach(mq, seg, NULL); + + pq_redirect_to_shm_mq(seg, error_mqh); + pq_set_parallel_leader(MyLogicalRepWorker->leader_pid, + InvalidBackendId); + + MyLogicalRepWorker->last_send_time = MyLogicalRepWorker->last_recv_time = + MyLogicalRepWorker->reply_time = 0; + + InitializeApplyWorker(); + + InitializingApplyWorker = false; + + /* Setup replication origin tracking. */ + StartTransactionCommand(); + ReplicationOriginNameForLogicalRep(MySubscription->oid, InvalidOid, + originname, sizeof(originname)); + originid = replorigin_by_name(originname, false); + + /* + * The parallel apply worker doesn't need to monopolize this replication + * origin which was already acquired by its leader process. + */ + replorigin_session_setup(originid, MyLogicalRepWorker->leader_pid); + replorigin_session_origin = originid; + CommitTransactionCommand(); + + /* + * Setup callback for syscache so that we know when something changes in + * the subscription relation state. + */ + CacheRegisterSyscacheCallback(SUBSCRIPTIONRELMAP, + invalidate_syncing_table_states, + (Datum) 0); + + set_apply_error_context_origin(originname); + + LogicalParallelApplyLoop(mqh); + + /* + * The parallel apply worker must not get here because the parallel apply + * worker will only stop when it receives a SIGTERM or SIGINT from the + * leader, or when there is an error. None of these cases will allow the + * code to reach here. + */ + Assert(false); +} + +/* + * Handle receipt of an interrupt indicating a parallel apply worker message. + * + * Note: this is called within a signal handler! All we can do is set a flag + * that will cause the next CHECK_FOR_INTERRUPTS() to invoke + * HandleParallelApplyMessages(). + */ +void +HandleParallelApplyMessageInterrupt(void) +{ + InterruptPending = true; + ParallelApplyMessagePending = true; + SetLatch(MyLatch); +} + +/* + * Handle a single protocol message received from a single parallel apply + * worker. + */ +static void +HandleParallelApplyMessage(StringInfo msg) +{ + char msgtype; + + msgtype = pq_getmsgbyte(msg); + + switch (msgtype) + { + case 'E': /* ErrorResponse */ + { + ErrorData edata; + + /* Parse ErrorResponse. */ + pq_parse_errornotice(msg, &edata); + + /* + * If desired, add a context line to show that this is a + * message propagated from a parallel apply worker. Otherwise, + * it can sometimes be confusing to understand what actually + * happened. + */ + if (edata.context) + edata.context = psprintf("%s\n%s", edata.context, + _("logical replication parallel apply worker")); + else + edata.context = pstrdup(_("logical replication parallel apply worker")); + + /* + * Context beyond that should use the error context callbacks + * that were in effect in LogicalRepApplyLoop(). + */ + error_context_stack = apply_error_context_stack; + + /* + * The actual error must have been reported by the parallel + * apply worker. + */ + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication parallel apply worker exited due to error"), + errcontext("%s", edata.context))); + } + + /* + * Don't need to do anything about NoticeResponse and + * NotifyResponse as the logical replication worker doesn't need + * to send messages to the client. + */ + case 'N': + case 'A': + break; + + default: + elog(ERROR, "unrecognized message type received from logical replication parallel apply worker: %c (message length %d bytes)", + msgtype, msg->len); + } +} + +/* + * Handle any queued protocol messages received from parallel apply workers. + */ +void +HandleParallelApplyMessages(void) +{ + ListCell *lc; + MemoryContext oldcontext; + + static MemoryContext hpam_context = NULL; + + /* + * This is invoked from ProcessInterrupts(), and since some of the + * functions it calls contain CHECK_FOR_INTERRUPTS(), there is a potential + * for recursive calls if more signals are received while this runs. It's + * unclear that recursive entry would be safe, and it doesn't seem useful + * even if it is safe, so let's block interrupts until done. + */ + HOLD_INTERRUPTS(); + + /* + * Moreover, CurrentMemoryContext might be pointing almost anywhere. We + * don't want to risk leaking data into long-lived contexts, so let's do + * our work here in a private context that we can reset on each use. + */ + if (!hpam_context) /* first time through? */ + hpam_context = AllocSetContextCreate(TopMemoryContext, + "HandleParallelApplyMessages", + ALLOCSET_DEFAULT_SIZES); + else + MemoryContextReset(hpam_context); + + oldcontext = MemoryContextSwitchTo(hpam_context); + + ParallelApplyMessagePending = false; + + foreach(lc, ParallelApplyWorkerPool) + { + shm_mq_result res; + Size nbytes; + void *data; + ParallelApplyWorkerInfo *winfo = (ParallelApplyWorkerInfo *) lfirst(lc); + + /* + * The leader will detach from the error queue and set it to NULL + * before preparing to stop all parallel apply workers, so we don't + * need to handle error messages anymore. See + * logicalrep_worker_detach. + */ + if (!winfo->error_mq_handle) + continue; + + res = shm_mq_receive(winfo->error_mq_handle, &nbytes, &data, true); + + if (res == SHM_MQ_WOULD_BLOCK) + continue; + else if (res == SHM_MQ_SUCCESS) + { + StringInfoData msg; + + initStringInfo(&msg); + appendBinaryStringInfo(&msg, data, nbytes); + HandleParallelApplyMessage(&msg); + pfree(msg.data); + } + else + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("lost connection to the logical replication parallel apply worker"))); + } + + MemoryContextSwitchTo(oldcontext); + + /* Might as well clear the context on our way out */ + MemoryContextReset(hpam_context); + + RESUME_INTERRUPTS(); +} + +/* + * Send the data to the specified parallel apply worker via shared-memory + * queue. + * + * Returns false if the attempt to send data via shared memory times out, true + * otherwise. + */ +bool +pa_send_data(ParallelApplyWorkerInfo *winfo, Size nbytes, const void *data) +{ + int rc; + shm_mq_result result; + TimestampTz startTime = 0; + + Assert(!IsTransactionState()); + Assert(!winfo->serialize_changes); + + /* + * We don't try to send data to parallel worker for 'immediate' mode. This + * is primarily used for testing purposes. + */ + if (unlikely(debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE)) + return false; + +/* + * This timeout is a bit arbitrary but testing revealed that it is sufficient + * to send the message unless the parallel apply worker is waiting on some + * lock or there is a serious resource crunch. See the comments atop this file + * to know why we are using a non-blocking way to send the message. + */ +#define SHM_SEND_RETRY_INTERVAL_MS 1000 +#define SHM_SEND_TIMEOUT_MS (10000 - SHM_SEND_RETRY_INTERVAL_MS) + + for (;;) + { + result = shm_mq_send(winfo->mq_handle, nbytes, data, true, true); + + if (result == SHM_MQ_SUCCESS) + return true; + else if (result == SHM_MQ_DETACHED) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not send data to shared-memory queue"))); + + Assert(result == SHM_MQ_WOULD_BLOCK); + + /* Wait before retrying. */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + SHM_SEND_RETRY_INTERVAL_MS, + WAIT_EVENT_LOGICAL_APPLY_SEND_DATA); + + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + } + + if (startTime == 0) + startTime = GetCurrentTimestamp(); + else if (TimestampDifferenceExceeds(startTime, GetCurrentTimestamp(), + SHM_SEND_TIMEOUT_MS)) + return false; + } +} + +/* + * Switch to PARTIAL_SERIALIZE mode for the current transaction -- this means + * that the current data and any subsequent data for this transaction will be + * serialized to a file. This is done to prevent possible deadlocks with + * another parallel apply worker (refer to the comments atop this file). + */ +void +pa_switch_to_partial_serialize(ParallelApplyWorkerInfo *winfo, + bool stream_locked) +{ + ereport(LOG, + (errmsg("logical replication apply worker will serialize the remaining changes of remote transaction %u to a file", + winfo->shared->xid))); + + /* + * The parallel apply worker could be stuck for some reason (say waiting + * on some lock by other backend), so stop trying to send data directly to + * it and start serializing data to the file instead. + */ + winfo->serialize_changes = true; + + /* Initialize the stream fileset. */ + stream_start_internal(winfo->shared->xid, true); + + /* + * Acquires the stream lock if not already to make sure that the parallel + * apply worker will wait for the leader to release the stream lock until + * the end of the transaction. + */ + if (!stream_locked) + pa_lock_stream(winfo->shared->xid, AccessExclusiveLock); + + pa_set_fileset_state(winfo->shared, FS_SERIALIZE_IN_PROGRESS); +} + +/* + * Wait until the parallel apply worker's transaction state has reached or + * exceeded the given xact_state. + */ +static void +pa_wait_for_xact_state(ParallelApplyWorkerInfo *winfo, + ParallelTransState xact_state) +{ + for (;;) + { + /* + * Stop if the transaction state has reached or exceeded the given + * xact_state. + */ + if (pa_get_xact_state(winfo->shared) >= xact_state) + break; + + /* Wait to be signalled. */ + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 10L, + WAIT_EVENT_LOGICAL_PARALLEL_APPLY_STATE_CHANGE); + + /* Reset the latch so we don't spin. */ + ResetLatch(MyLatch); + + /* An interrupt may have occurred while we were waiting. */ + CHECK_FOR_INTERRUPTS(); + } +} + +/* + * Wait until the parallel apply worker's transaction finishes. + */ +static void +pa_wait_for_xact_finish(ParallelApplyWorkerInfo *winfo) +{ + /* + * Wait until the parallel apply worker set the state to + * PARALLEL_TRANS_STARTED which means it has acquired the transaction + * lock. This is to prevent leader apply worker from acquiring the + * transaction lock earlier than the parallel apply worker. + */ + pa_wait_for_xact_state(winfo, PARALLEL_TRANS_STARTED); + + /* + * Wait for the transaction lock to be released. This is required to + * detect deadlock among leader and parallel apply workers. Refer to the + * comments atop this file. + */ + pa_lock_transaction(winfo->shared->xid, AccessShareLock); + pa_unlock_transaction(winfo->shared->xid, AccessShareLock); + + /* + * Check if the state becomes PARALLEL_TRANS_FINISHED in case the parallel + * apply worker failed while applying changes causing the lock to be + * released. + */ + if (pa_get_xact_state(winfo->shared) != PARALLEL_TRANS_FINISHED) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("lost connection to the logical replication parallel apply worker"))); +} + +/* + * Set the transaction state for a given parallel apply worker. + */ +void +pa_set_xact_state(ParallelApplyWorkerShared *wshared, + ParallelTransState xact_state) +{ + SpinLockAcquire(&wshared->mutex); + wshared->xact_state = xact_state; + SpinLockRelease(&wshared->mutex); +} + +/* + * Get the transaction state for a given parallel apply worker. + */ +static ParallelTransState +pa_get_xact_state(ParallelApplyWorkerShared *wshared) +{ + ParallelTransState xact_state; + + SpinLockAcquire(&wshared->mutex); + xact_state = wshared->xact_state; + SpinLockRelease(&wshared->mutex); + + return xact_state; +} + +/* + * Cache the parallel apply worker information. + */ +void +pa_set_stream_apply_worker(ParallelApplyWorkerInfo *winfo) +{ + stream_apply_worker = winfo; +} + +/* + * Form a unique savepoint name for the streaming transaction. + * + * Note that different subscriptions for publications on different nodes can + * receive same remote xid, so we need to use subscription id along with it. + * + * Returns the name in the supplied buffer. + */ +static void +pa_savepoint_name(Oid suboid, TransactionId xid, char *spname, Size szsp) +{ + snprintf(spname, szsp, "pg_sp_%u_%u", suboid, xid); +} + +/* + * Define a savepoint for a subxact in parallel apply worker if needed. + * + * The parallel apply worker can figure out if a new subtransaction was + * started by checking if the new change arrived with a different xid. In that + * case define a named savepoint, so that we are able to rollback to it + * if required. + */ +void +pa_start_subtrans(TransactionId current_xid, TransactionId top_xid) +{ + if (current_xid != top_xid && + !list_member_xid(subxactlist, current_xid)) + { + MemoryContext oldctx; + char spname[NAMEDATALEN]; + + pa_savepoint_name(MySubscription->oid, current_xid, + spname, sizeof(spname)); + + elog(DEBUG1, "defining savepoint %s in logical replication parallel apply worker", spname); + + /* We must be in transaction block to define the SAVEPOINT. */ + if (!IsTransactionBlock()) + { + if (!IsTransactionState()) + StartTransactionCommand(); + + BeginTransactionBlock(); + CommitTransactionCommand(); + } + + DefineSavepoint(spname); + + /* + * CommitTransactionCommand is needed to start a subtransaction after + * issuing a SAVEPOINT inside a transaction block (see + * StartSubTransaction()). + */ + CommitTransactionCommand(); + + oldctx = MemoryContextSwitchTo(TopTransactionContext); + subxactlist = lappend_xid(subxactlist, current_xid); + MemoryContextSwitchTo(oldctx); + } +} + +/* Reset the list that maintains subtransactions. */ +void +pa_reset_subtrans(void) +{ + /* + * We don't need to free this explicitly as the allocated memory will be + * freed at the transaction end. + */ + subxactlist = NIL; +} + +/* + * Handle STREAM ABORT message when the transaction was applied in a parallel + * apply worker. + */ +void +pa_stream_abort(LogicalRepStreamAbortData *abort_data) +{ + TransactionId xid = abort_data->xid; + TransactionId subxid = abort_data->subxid; + + /* + * Update origin state so we can restart streaming from correct position + * in case of crash. + */ + replorigin_session_origin_lsn = abort_data->abort_lsn; + replorigin_session_origin_timestamp = abort_data->abort_time; + + /* + * If the two XIDs are the same, it's in fact abort of toplevel xact, so + * just free the subxactlist. + */ + if (subxid == xid) + { + pa_set_xact_state(MyParallelShared, PARALLEL_TRANS_FINISHED); + + /* + * Release the lock as we might be processing an empty streaming + * transaction in which case the lock won't be released during + * transaction rollback. + * + * Note that it's ok to release the transaction lock before aborting + * the transaction because even if the parallel apply worker dies due + * to crash or some other reason, such a transaction would still be + * considered aborted. + */ + pa_unlock_transaction(xid, AccessExclusiveLock); + + AbortCurrentTransaction(); + + if (IsTransactionBlock()) + { + EndTransactionBlock(false); + CommitTransactionCommand(); + } + + pa_reset_subtrans(); + + pgstat_report_activity(STATE_IDLE, NULL); + } + else + { + /* OK, so it's a subxact. Rollback to the savepoint. */ + int i; + char spname[NAMEDATALEN]; + + pa_savepoint_name(MySubscription->oid, subxid, spname, sizeof(spname)); + + elog(DEBUG1, "rolling back to savepoint %s in logical replication parallel apply worker", spname); + + /* + * Search the subxactlist, determine the offset tracked for the + * subxact, and truncate the list. + * + * Note that for an empty sub-transaction we won't find the subxid + * here. + */ + for (i = list_length(subxactlist) - 1; i >= 0; i--) + { + TransactionId xid_tmp = lfirst_xid(list_nth_cell(subxactlist, i)); + + if (xid_tmp == subxid) + { + RollbackToSavepoint(spname); + CommitTransactionCommand(); + subxactlist = list_truncate(subxactlist, i); + break; + } + } + } +} + +/* + * Set the fileset state for a particular parallel apply worker. The fileset + * will be set once the leader worker serialized all changes to the file + * so that it can be used by parallel apply worker. + */ +void +pa_set_fileset_state(ParallelApplyWorkerShared *wshared, + PartialFileSetState fileset_state) +{ + SpinLockAcquire(&wshared->mutex); + wshared->fileset_state = fileset_state; + + if (fileset_state == FS_SERIALIZE_DONE) + { + Assert(am_leader_apply_worker()); + Assert(MyLogicalRepWorker->stream_fileset); + wshared->fileset = *MyLogicalRepWorker->stream_fileset; + } + + SpinLockRelease(&wshared->mutex); +} + +/* + * Get the fileset state for the current parallel apply worker. + */ +static PartialFileSetState +pa_get_fileset_state(void) +{ + PartialFileSetState fileset_state; + + Assert(am_parallel_apply_worker()); + + SpinLockAcquire(&MyParallelShared->mutex); + fileset_state = MyParallelShared->fileset_state; + SpinLockRelease(&MyParallelShared->mutex); + + return fileset_state; +} + +/* + * Helper functions to acquire and release a lock for each stream block. + * + * Set locktag_field4 to PARALLEL_APPLY_LOCK_STREAM to indicate that it's a + * stream lock. + * + * Refer to the comments atop this file to see how the stream lock is used. + */ +void +pa_lock_stream(TransactionId xid, LOCKMODE lockmode) +{ + LockApplyTransactionForSession(MyLogicalRepWorker->subid, xid, + PARALLEL_APPLY_LOCK_STREAM, lockmode); +} + +void +pa_unlock_stream(TransactionId xid, LOCKMODE lockmode) +{ + UnlockApplyTransactionForSession(MyLogicalRepWorker->subid, xid, + PARALLEL_APPLY_LOCK_STREAM, lockmode); +} + +/* + * Helper functions to acquire and release a lock for each local transaction + * apply. + * + * Set locktag_field4 to PARALLEL_APPLY_LOCK_XACT to indicate that it's a + * transaction lock. + * + * Note that all the callers must pass a remote transaction ID instead of a + * local transaction ID as xid. This is because the local transaction ID will + * only be assigned while applying the first change in the parallel apply but + * it's possible that the first change in the parallel apply worker is blocked + * by a concurrently executing transaction in another parallel apply worker. We + * can only communicate the local transaction id to the leader after applying + * the first change so it won't be able to wait after sending the xact finish + * command using this lock. + * + * Refer to the comments atop this file to see how the transaction lock is + * used. + */ +void +pa_lock_transaction(TransactionId xid, LOCKMODE lockmode) +{ + LockApplyTransactionForSession(MyLogicalRepWorker->subid, xid, + PARALLEL_APPLY_LOCK_XACT, lockmode); +} + +void +pa_unlock_transaction(TransactionId xid, LOCKMODE lockmode) +{ + UnlockApplyTransactionForSession(MyLogicalRepWorker->subid, xid, + PARALLEL_APPLY_LOCK_XACT, lockmode); +} + +/* + * Decrement the number of pending streaming blocks and wait on the stream lock + * if there is no pending block available. + */ +void +pa_decr_and_wait_stream_block(void) +{ + Assert(am_parallel_apply_worker()); + + /* + * It is only possible to not have any pending stream chunks when we are + * applying spooled messages. + */ + if (pg_atomic_read_u32(&MyParallelShared->pending_stream_count) == 0) + { + if (pa_has_spooled_message_pending()) + return; + + elog(ERROR, "invalid pending streaming chunk 0"); + } + + if (pg_atomic_sub_fetch_u32(&MyParallelShared->pending_stream_count, 1) == 0) + { + pa_lock_stream(MyParallelShared->xid, AccessShareLock); + pa_unlock_stream(MyParallelShared->xid, AccessShareLock); + } +} + +/* + * Finish processing the streaming transaction in the leader apply worker. + */ +void +pa_xact_finish(ParallelApplyWorkerInfo *winfo, XLogRecPtr remote_lsn) +{ + Assert(am_leader_apply_worker()); + + /* + * Unlock the shared object lock so that parallel apply worker can + * continue to receive and apply changes. + */ + pa_unlock_stream(winfo->shared->xid, AccessExclusiveLock); + + /* + * Wait for that worker to finish. This is necessary to maintain commit + * order which avoids failures due to transaction dependencies and + * deadlocks. + */ + pa_wait_for_xact_finish(winfo); + + if (!XLogRecPtrIsInvalid(remote_lsn)) + store_flush_position(remote_lsn, winfo->shared->last_commit_end); + + pa_free_worker(winfo); +} diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c new file mode 100644 index 0000000..d91055a --- /dev/null +++ b/src/backend/replication/logical/decode.c @@ -0,0 +1,1292 @@ +/* ------------------------------------------------------------------------- + * + * decode.c + * This module decodes WAL records read using xlogreader.h's APIs for the + * purpose of logical decoding by passing information to the + * reorderbuffer module (containing the actual changes) and to the + * snapbuild module to build a fitting catalog snapshot (to be able to + * properly decode the changes in the reorderbuffer). + * + * NOTE: + * This basically tries to handle all low level xlog stuff for + * reorderbuffer.c and snapbuild.c. There's some minor leakage where a + * specific record's struct is used to pass data along, but those just + * happen to contain the right amount of data in a convenient + * format. There isn't and shouldn't be much intelligence about the + * contents of records in here except turning them into a more usable + * format. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/replication/logical/decode.c + * + * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "access/heapam_xlog.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "access/xlogreader.h" +#include "access/xlogrecord.h" +#include "access/xlogutils.h" +#include "catalog/pg_control.h" +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/message.h" +#include "replication/origin.h" +#include "replication/reorderbuffer.h" +#include "replication/snapbuild.h" +#include "storage/standby.h" + +/* individual record(group)'s handlers */ +static void DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeTruncate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeSpecConfirm(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); + +static void DecodeCommit(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + xl_xact_parsed_commit *parsed, TransactionId xid, + bool two_phase); +static void DecodeAbort(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + xl_xact_parsed_abort *parsed, TransactionId xid, + bool two_phase); +static void DecodePrepare(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + xl_xact_parsed_prepare *parsed); + + +/* common function to decode tuples */ +static void DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple); + +/* helper functions for decoding transactions */ +static inline bool FilterPrepare(LogicalDecodingContext *ctx, + TransactionId xid, const char *gid); +static bool DecodeTXNNeedSkip(LogicalDecodingContext *ctx, + XLogRecordBuffer *buf, Oid txn_dbid, + RepOriginId origin_id); + +/* + * Take every XLogReadRecord()ed record and perform the actions required to + * decode it using the output plugin already setup in the logical decoding + * context. + * + * NB: Note that every record's xid needs to be processed by reorderbuffer + * (xids contained in the content of records are not relevant for this rule). + * That means that for records which'd otherwise not go through the + * reorderbuffer ReorderBufferProcessXid() has to be called. We don't want to + * call ReorderBufferProcessXid for each record type by default, because + * e.g. empty xacts can be handled more efficiently if there's no previous + * state for them. + * + * We also support the ability to fast forward thru records, skipping some + * record types completely - see individual record types for details. + */ +void +LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogReaderState *record) +{ + XLogRecordBuffer buf; + TransactionId txid; + RmgrData rmgr; + + buf.origptr = ctx->reader->ReadRecPtr; + buf.endptr = ctx->reader->EndRecPtr; + buf.record = record; + + txid = XLogRecGetTopXid(record); + + /* + * If the top-level xid is valid, we need to assign the subxact to the + * top-level xact. We need to do this for all records, hence we do it + * before the switch. + */ + if (TransactionIdIsValid(txid)) + { + ReorderBufferAssignChild(ctx->reorder, + txid, + XLogRecGetXid(record), + buf.origptr); + } + + rmgr = GetRmgr(XLogRecGetRmid(record)); + + if (rmgr.rm_decode != NULL) + rmgr.rm_decode(ctx, &buf); + else + { + /* just deal with xid, and done */ + ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(record), + buf.origptr); + } +} + +/* + * Handle rmgr XLOG_ID records for LogicalDecodingProcessRecord(). + */ +void +xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + SnapBuild *builder = ctx->snapshot_builder; + uint8 info = XLogRecGetInfo(buf->record) & ~XLR_INFO_MASK; + + ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(buf->record), + buf->origptr); + + switch (info) + { + /* this is also used in END_OF_RECOVERY checkpoints */ + case XLOG_CHECKPOINT_SHUTDOWN: + case XLOG_END_OF_RECOVERY: + SnapBuildSerializationPoint(builder, buf->origptr); + + break; + case XLOG_CHECKPOINT_ONLINE: + + /* + * a RUNNING_XACTS record will have been logged near to this, we + * can restart from there. + */ + break; + case XLOG_PARAMETER_CHANGE: + { + xl_parameter_change *xlrec = + (xl_parameter_change *) XLogRecGetData(buf->record); + + /* + * If wal_level on the primary is reduced to less than + * logical, we want to prevent existing logical slots from + * being used. Existing logical slots on the standby get + * invalidated when this WAL record is replayed; and further, + * slot creation fails when wal_level is not sufficient; but + * all these operations are not synchronized, so a logical + * slot may creep in while the wal_level is being reduced. + * Hence this extra check. + */ + if (xlrec->wal_level < WAL_LEVEL_LOGICAL) + { + /* + * This can occur only on a standby, as a primary would + * not allow to restart after changing wal_level < logical + * if there is pre-existing logical slot. + */ + Assert(RecoveryInProgress()); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical decoding on standby requires wal_level >= logical on the primary"))); + } + break; + } + case XLOG_NOOP: + case XLOG_NEXTOID: + case XLOG_SWITCH: + case XLOG_BACKUP_END: + case XLOG_RESTORE_POINT: + case XLOG_FPW_CHANGE: + case XLOG_FPI_FOR_HINT: + case XLOG_FPI: + case XLOG_OVERWRITE_CONTRECORD: + break; + default: + elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info); + } +} + +/* + * Handle rmgr XACT_ID records for LogicalDecodingProcessRecord(). + */ +void +xact_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + SnapBuild *builder = ctx->snapshot_builder; + ReorderBuffer *reorder = ctx->reorder; + XLogReaderState *r = buf->record; + uint8 info = XLogRecGetInfo(r) & XLOG_XACT_OPMASK; + + /* + * If the snapshot isn't yet fully built, we cannot decode anything, so + * bail out. + */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT) + return; + + switch (info) + { + case XLOG_XACT_COMMIT: + case XLOG_XACT_COMMIT_PREPARED: + { + xl_xact_commit *xlrec; + xl_xact_parsed_commit parsed; + TransactionId xid; + bool two_phase = false; + + xlrec = (xl_xact_commit *) XLogRecGetData(r); + ParseCommitRecord(XLogRecGetInfo(buf->record), xlrec, &parsed); + + if (!TransactionIdIsValid(parsed.twophase_xid)) + xid = XLogRecGetXid(r); + else + xid = parsed.twophase_xid; + + /* + * We would like to process the transaction in a two-phase + * manner iff output plugin supports two-phase commits and + * doesn't filter the transaction at prepare time. + */ + if (info == XLOG_XACT_COMMIT_PREPARED) + two_phase = !(FilterPrepare(ctx, xid, + parsed.twophase_gid)); + + DecodeCommit(ctx, buf, &parsed, xid, two_phase); + break; + } + case XLOG_XACT_ABORT: + case XLOG_XACT_ABORT_PREPARED: + { + xl_xact_abort *xlrec; + xl_xact_parsed_abort parsed; + TransactionId xid; + bool two_phase = false; + + xlrec = (xl_xact_abort *) XLogRecGetData(r); + ParseAbortRecord(XLogRecGetInfo(buf->record), xlrec, &parsed); + + if (!TransactionIdIsValid(parsed.twophase_xid)) + xid = XLogRecGetXid(r); + else + xid = parsed.twophase_xid; + + /* + * We would like to process the transaction in a two-phase + * manner iff output plugin supports two-phase commits and + * doesn't filter the transaction at prepare time. + */ + if (info == XLOG_XACT_ABORT_PREPARED) + two_phase = !(FilterPrepare(ctx, xid, + parsed.twophase_gid)); + + DecodeAbort(ctx, buf, &parsed, xid, two_phase); + break; + } + case XLOG_XACT_ASSIGNMENT: + + /* + * We assign subxact to the toplevel xact while processing each + * record if required. So, we don't need to do anything here. See + * LogicalDecodingProcessRecord. + */ + break; + case XLOG_XACT_INVALIDATIONS: + { + TransactionId xid; + xl_xact_invals *invals; + + xid = XLogRecGetXid(r); + invals = (xl_xact_invals *) XLogRecGetData(r); + + /* + * Execute the invalidations for xid-less transactions, + * otherwise, accumulate them so that they can be processed at + * the commit time. + */ + if (TransactionIdIsValid(xid)) + { + if (!ctx->fast_forward) + ReorderBufferAddInvalidations(reorder, xid, + buf->origptr, + invals->nmsgs, + invals->msgs); + ReorderBufferXidSetCatalogChanges(ctx->reorder, xid, + buf->origptr); + } + else if ((!ctx->fast_forward)) + ReorderBufferImmediateInvalidation(ctx->reorder, + invals->nmsgs, + invals->msgs); + } + break; + case XLOG_XACT_PREPARE: + { + xl_xact_parsed_prepare parsed; + xl_xact_prepare *xlrec; + + /* ok, parse it */ + xlrec = (xl_xact_prepare *) XLogRecGetData(r); + ParsePrepareRecord(XLogRecGetInfo(buf->record), + xlrec, &parsed); + + /* + * We would like to process the transaction in a two-phase + * manner iff output plugin supports two-phase commits and + * doesn't filter the transaction at prepare time. + */ + if (FilterPrepare(ctx, parsed.twophase_xid, + parsed.twophase_gid)) + { + ReorderBufferProcessXid(reorder, parsed.twophase_xid, + buf->origptr); + break; + } + + /* + * Note that if the prepared transaction has locked [user] + * catalog tables exclusively then decoding prepare can block + * till the main transaction is committed because it needs to + * lock the catalog tables. + * + * XXX Now, this can even lead to a deadlock if the prepare + * transaction is waiting to get it logically replicated for + * distributed 2PC. This can be avoided by disallowing + * preparing transactions that have locked [user] catalog + * tables exclusively but as of now, we ask users not to do + * such an operation. + */ + DecodePrepare(ctx, buf, &parsed); + break; + } + default: + elog(ERROR, "unexpected RM_XACT_ID record type: %u", info); + } +} + +/* + * Handle rmgr STANDBY_ID records for LogicalDecodingProcessRecord(). + */ +void +standby_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + SnapBuild *builder = ctx->snapshot_builder; + XLogReaderState *r = buf->record; + uint8 info = XLogRecGetInfo(r) & ~XLR_INFO_MASK; + + ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(r), buf->origptr); + + switch (info) + { + case XLOG_RUNNING_XACTS: + { + xl_running_xacts *running = (xl_running_xacts *) XLogRecGetData(r); + + SnapBuildProcessRunningXacts(builder, buf->origptr, running); + + /* + * Abort all transactions that we keep track of, that are + * older than the record's oldestRunningXid. This is the most + * convenient spot for doing so since, in contrast to shutdown + * or end-of-recovery checkpoints, we have information about + * all running transactions which includes prepared ones, + * while shutdown checkpoints just know that no non-prepared + * transactions are in progress. + */ + ReorderBufferAbortOld(ctx->reorder, running->oldestRunningXid); + } + break; + case XLOG_STANDBY_LOCK: + break; + case XLOG_INVALIDATIONS: + + /* + * We are processing the invalidations at the command level via + * XLOG_XACT_INVALIDATIONS. So we don't need to do anything here. + */ + break; + default: + elog(ERROR, "unexpected RM_STANDBY_ID record type: %u", info); + } +} + +/* + * Handle rmgr HEAP2_ID records for LogicalDecodingProcessRecord(). + */ +void +heap2_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + uint8 info = XLogRecGetInfo(buf->record) & XLOG_HEAP_OPMASK; + TransactionId xid = XLogRecGetXid(buf->record); + SnapBuild *builder = ctx->snapshot_builder; + + ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr); + + /* + * If we don't have snapshot or we are just fast-forwarding, there is no + * point in decoding changes. + */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT || + ctx->fast_forward) + return; + + switch (info) + { + case XLOG_HEAP2_MULTI_INSERT: + if (!ctx->fast_forward && + SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeMultiInsert(ctx, buf); + break; + case XLOG_HEAP2_NEW_CID: + { + xl_heap_new_cid *xlrec; + + xlrec = (xl_heap_new_cid *) XLogRecGetData(buf->record); + SnapBuildProcessNewCid(builder, xid, buf->origptr, xlrec); + + break; + } + case XLOG_HEAP2_REWRITE: + + /* + * Although these records only exist to serve the needs of logical + * decoding, all the work happens as part of crash or archive + * recovery, so we don't need to do anything here. + */ + break; + + /* + * Everything else here is just low level physical stuff we're not + * interested in. + */ + case XLOG_HEAP2_FREEZE_PAGE: + case XLOG_HEAP2_PRUNE: + case XLOG_HEAP2_VACUUM: + case XLOG_HEAP2_VISIBLE: + case XLOG_HEAP2_LOCK_UPDATED: + break; + default: + elog(ERROR, "unexpected RM_HEAP2_ID record type: %u", info); + } +} + +/* + * Handle rmgr HEAP_ID records for LogicalDecodingProcessRecord(). + */ +void +heap_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + uint8 info = XLogRecGetInfo(buf->record) & XLOG_HEAP_OPMASK; + TransactionId xid = XLogRecGetXid(buf->record); + SnapBuild *builder = ctx->snapshot_builder; + + ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr); + + /* + * If we don't have snapshot or we are just fast-forwarding, there is no + * point in decoding data changes. + */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT || + ctx->fast_forward) + return; + + switch (info) + { + case XLOG_HEAP_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeInsert(ctx, buf); + break; + + /* + * Treat HOT update as normal updates. There is no useful + * information in the fact that we could make it a HOT update + * locally and the WAL layout is compatible. + */ + case XLOG_HEAP_HOT_UPDATE: + case XLOG_HEAP_UPDATE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeUpdate(ctx, buf); + break; + + case XLOG_HEAP_DELETE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeDelete(ctx, buf); + break; + + case XLOG_HEAP_TRUNCATE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeTruncate(ctx, buf); + break; + + case XLOG_HEAP_INPLACE: + + /* + * Inplace updates are only ever performed on catalog tuples and + * can, per definition, not change tuple visibility. Since we + * don't decode catalog tuples, we're not interested in the + * record's contents. + * + * In-place updates can be used either by XID-bearing transactions + * (e.g. in CREATE INDEX CONCURRENTLY) or by XID-less + * transactions (e.g. VACUUM). In the former case, the commit + * record will include cache invalidations, so we mark the + * transaction as catalog modifying here. Currently that's + * redundant because the commit will do that as well, but once we + * support decoding in-progress relations, this will be important. + */ + if (!TransactionIdIsValid(xid)) + break; + + (void) SnapBuildProcessChange(builder, xid, buf->origptr); + ReorderBufferXidSetCatalogChanges(ctx->reorder, xid, buf->origptr); + break; + + case XLOG_HEAP_CONFIRM: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeSpecConfirm(ctx, buf); + break; + + case XLOG_HEAP_LOCK: + /* we don't care about row level locks for now */ + break; + + default: + elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info); + break; + } +} + +/* + * Ask output plugin whether we want to skip this PREPARE and send + * this transaction as a regular commit later. + */ +static inline bool +FilterPrepare(LogicalDecodingContext *ctx, TransactionId xid, + const char *gid) +{ + /* + * Skip if decoding of two-phase transactions at PREPARE time is not + * enabled. In that case, all two-phase transactions are considered + * filtered out and will be applied as regular transactions at COMMIT + * PREPARED. + */ + if (!ctx->twophase) + return true; + + /* + * The filter_prepare callback is optional. When not supplied, all + * prepared transactions should go through. + */ + if (ctx->callbacks.filter_prepare_cb == NULL) + return false; + + return filter_prepare_cb_wrapper(ctx, xid, gid); +} + +static inline bool +FilterByOrigin(LogicalDecodingContext *ctx, RepOriginId origin_id) +{ + if (ctx->callbacks.filter_by_origin_cb == NULL) + return false; + + return filter_by_origin_cb_wrapper(ctx, origin_id); +} + +/* + * Handle rmgr LOGICALMSG_ID records for LogicalDecodingProcessRecord(). + */ +void +logicalmsg_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + SnapBuild *builder = ctx->snapshot_builder; + XLogReaderState *r = buf->record; + TransactionId xid = XLogRecGetXid(r); + uint8 info = XLogRecGetInfo(r) & ~XLR_INFO_MASK; + RepOriginId origin_id = XLogRecGetOrigin(r); + Snapshot snapshot = NULL; + xl_logical_message *message; + + if (info != XLOG_LOGICAL_MESSAGE) + elog(ERROR, "unexpected RM_LOGICALMSG_ID record type: %u", info); + + ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(r), buf->origptr); + + /* + * If we don't have snapshot or we are just fast-forwarding, there is no + * point in decoding messages. + */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT || + ctx->fast_forward) + return; + + message = (xl_logical_message *) XLogRecGetData(r); + + if (message->dbId != ctx->slot->data.database || + FilterByOrigin(ctx, origin_id)) + return; + + if (message->transactional && + !SnapBuildProcessChange(builder, xid, buf->origptr)) + return; + else if (!message->transactional && + (SnapBuildCurrentState(builder) != SNAPBUILD_CONSISTENT || + SnapBuildXactNeedsSkip(builder, buf->origptr))) + return; + + /* + * If this is a non-transactional change, get the snapshot we're expected + * to use. We only get here when the snapshot is consistent, and the + * change is not meant to be skipped. + * + * For transactional changes we don't need a snapshot, we'll use the + * regular snapshot maintained by ReorderBuffer. We just leave it NULL. + */ + if (!message->transactional) + snapshot = SnapBuildGetOrBuildSnapshot(builder); + + ReorderBufferQueueMessage(ctx->reorder, xid, snapshot, buf->endptr, + message->transactional, + message->message, /* first part of message is + * prefix */ + message->message_size, + message->message + message->prefix_size); +} + +/* + * Consolidated commit record handling between the different form of commit + * records. + * + * 'two_phase' indicates that caller wants to process the transaction in two + * phases, first process prepare if not already done and then process + * commit_prepared. + */ +static void +DecodeCommit(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + xl_xact_parsed_commit *parsed, TransactionId xid, + bool two_phase) +{ + XLogRecPtr origin_lsn = InvalidXLogRecPtr; + TimestampTz commit_time = parsed->xact_time; + RepOriginId origin_id = XLogRecGetOrigin(buf->record); + int i; + + if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN) + { + origin_lsn = parsed->origin_lsn; + commit_time = parsed->origin_timestamp; + } + + SnapBuildCommitTxn(ctx->snapshot_builder, buf->origptr, xid, + parsed->nsubxacts, parsed->subxacts, + parsed->xinfo); + + /* ---- + * Check whether we are interested in this specific transaction, and tell + * the reorderbuffer to forget the content of the (sub-)transactions + * if not. + * + * We can't just use ReorderBufferAbort() here, because we need to execute + * the transaction's invalidations. This currently won't be needed if + * we're just skipping over the transaction because currently we only do + * so during startup, to get to the first transaction the client needs. As + * we have reset the catalog caches before starting to read WAL, and we + * haven't yet touched any catalogs, there can't be anything to invalidate. + * But if we're "forgetting" this commit because it happened in another + * database, the invalidations might be important, because they could be + * for shared catalogs and we might have loaded data into the relevant + * syscaches. + * --- + */ + if (DecodeTXNNeedSkip(ctx, buf, parsed->dbId, origin_id)) + { + for (i = 0; i < parsed->nsubxacts; i++) + { + ReorderBufferForget(ctx->reorder, parsed->subxacts[i], buf->origptr); + } + ReorderBufferForget(ctx->reorder, xid, buf->origptr); + + return; + } + + /* tell the reorderbuffer about the surviving subtransactions */ + for (i = 0; i < parsed->nsubxacts; i++) + { + ReorderBufferCommitChild(ctx->reorder, xid, parsed->subxacts[i], + buf->origptr, buf->endptr); + } + + /* + * Send the final commit record if the transaction data is already + * decoded, otherwise, process the entire transaction. + */ + if (two_phase) + { + ReorderBufferFinishPrepared(ctx->reorder, xid, buf->origptr, buf->endptr, + SnapBuildGetTwoPhaseAt(ctx->snapshot_builder), + commit_time, origin_id, origin_lsn, + parsed->twophase_gid, true); + } + else + { + ReorderBufferCommit(ctx->reorder, xid, buf->origptr, buf->endptr, + commit_time, origin_id, origin_lsn); + } + + /* + * Update the decoding stats at transaction prepare/commit/abort. + * Additionally we send the stats when we spill or stream the changes to + * avoid losing them in case the decoding is interrupted. It is not clear + * that sending more or less frequently than this would be better. + */ + UpdateDecodingStats(ctx); +} + +/* + * Decode PREPARE record. Similar logic as in DecodeCommit. + * + * Note that we don't skip prepare even if have detected concurrent abort + * because it is quite possible that we had already sent some changes before we + * detect abort in which case we need to abort those changes in the subscriber. + * To abort such changes, we do send the prepare and then the rollback prepared + * which is what happened on the publisher-side as well. Now, we can invent a + * new abort API wherein in such cases we send abort and skip sending prepared + * and rollback prepared but then it is not that straightforward because we + * might have streamed this transaction by that time in which case it is + * handled when the rollback is encountered. It is not impossible to optimize + * the concurrent abort case but it can introduce design complexity w.r.t + * handling different cases so leaving it for now as it doesn't seem worth it. + */ +static void +DecodePrepare(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + xl_xact_parsed_prepare *parsed) +{ + SnapBuild *builder = ctx->snapshot_builder; + XLogRecPtr origin_lsn = parsed->origin_lsn; + TimestampTz prepare_time = parsed->xact_time; + RepOriginId origin_id = XLogRecGetOrigin(buf->record); + int i; + TransactionId xid = parsed->twophase_xid; + + if (parsed->origin_timestamp != 0) + prepare_time = parsed->origin_timestamp; + + /* + * Remember the prepare info for a txn so that it can be used later in + * commit prepared if required. See ReorderBufferFinishPrepared. + */ + if (!ReorderBufferRememberPrepareInfo(ctx->reorder, xid, buf->origptr, + buf->endptr, prepare_time, origin_id, + origin_lsn)) + return; + + /* We can't start streaming unless a consistent state is reached. */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT) + { + ReorderBufferSkipPrepare(ctx->reorder, xid); + return; + } + + /* + * Check whether we need to process this transaction. See + * DecodeTXNNeedSkip for the reasons why we sometimes want to skip the + * transaction. + * + * We can't call ReorderBufferForget as we did in DecodeCommit as the txn + * hasn't yet been committed, removing this txn before a commit might + * result in the computation of an incorrect restart_lsn. See + * SnapBuildProcessRunningXacts. But we need to process cache + * invalidations if there are any for the reasons mentioned in + * DecodeCommit. + */ + if (DecodeTXNNeedSkip(ctx, buf, parsed->dbId, origin_id)) + { + ReorderBufferSkipPrepare(ctx->reorder, xid); + ReorderBufferInvalidate(ctx->reorder, xid, buf->origptr); + return; + } + + /* Tell the reorderbuffer about the surviving subtransactions. */ + for (i = 0; i < parsed->nsubxacts; i++) + { + ReorderBufferCommitChild(ctx->reorder, xid, parsed->subxacts[i], + buf->origptr, buf->endptr); + } + + /* replay actions of all transaction + subtransactions in order */ + ReorderBufferPrepare(ctx->reorder, xid, parsed->twophase_gid); + + /* + * Update the decoding stats at transaction prepare/commit/abort. + * Additionally we send the stats when we spill or stream the changes to + * avoid losing them in case the decoding is interrupted. It is not clear + * that sending more or less frequently than this would be better. + */ + UpdateDecodingStats(ctx); +} + + +/* + * Get the data from the various forms of abort records and pass it on to + * snapbuild.c and reorderbuffer.c. + * + * 'two_phase' indicates to finish prepared transaction. + */ +static void +DecodeAbort(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + xl_xact_parsed_abort *parsed, TransactionId xid, + bool two_phase) +{ + int i; + XLogRecPtr origin_lsn = InvalidXLogRecPtr; + TimestampTz abort_time = parsed->xact_time; + RepOriginId origin_id = XLogRecGetOrigin(buf->record); + bool skip_xact; + + if (parsed->xinfo & XACT_XINFO_HAS_ORIGIN) + { + origin_lsn = parsed->origin_lsn; + abort_time = parsed->origin_timestamp; + } + + /* + * Check whether we need to process this transaction. See + * DecodeTXNNeedSkip for the reasons why we sometimes want to skip the + * transaction. + */ + skip_xact = DecodeTXNNeedSkip(ctx, buf, parsed->dbId, origin_id); + + /* + * Send the final rollback record for a prepared transaction unless we + * need to skip it. For non-two-phase xacts, simply forget the xact. + */ + if (two_phase && !skip_xact) + { + ReorderBufferFinishPrepared(ctx->reorder, xid, buf->origptr, buf->endptr, + InvalidXLogRecPtr, + abort_time, origin_id, origin_lsn, + parsed->twophase_gid, false); + } + else + { + for (i = 0; i < parsed->nsubxacts; i++) + { + ReorderBufferAbort(ctx->reorder, parsed->subxacts[i], + buf->record->EndRecPtr, abort_time); + } + + ReorderBufferAbort(ctx->reorder, xid, buf->record->EndRecPtr, + abort_time); + } + + /* update the decoding stats */ + UpdateDecodingStats(ctx); +} + +/* + * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs. + * + * Deletes can contain the new tuple. + */ +static void +DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + Size datalen; + char *tupledata; + Size tuplelen; + XLogReaderState *r = buf->record; + xl_heap_insert *xlrec; + ReorderBufferChange *change; + RelFileLocator target_locator; + + xlrec = (xl_heap_insert *) XLogRecGetData(r); + + /* + * Ignore insert records without new tuples (this does happen when + * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL). + */ + if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) + return; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE)) + change->action = REORDER_BUFFER_CHANGE_INSERT; + else + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + tupledata = XLogRecGetBlockData(r, 0, &datalen); + tuplelen = datalen - SizeOfHeapHeader; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple); + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, + xlrec->flags & XLH_INSERT_ON_TOAST_RELATION); +} + +/* + * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout + * in the record, from wal into proper tuplebufs. + * + * Updates can possibly contain a new tuple and the old primary key. + */ +static void +DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_heap_update *xlrec; + ReorderBufferChange *change; + char *data; + RelFileLocator target_locator; + + xlrec = (xl_heap_update *) XLogRecGetData(r); + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_UPDATE; + change->origin_id = XLogRecGetOrigin(r); + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + if (xlrec->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE) + { + Size datalen; + Size tuplelen; + + data = XLogRecGetBlockData(r, 0, &datalen); + + tuplelen = datalen - SizeOfHeapHeader; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(data, datalen, change->data.tp.newtuple); + } + + if (xlrec->flags & XLH_UPDATE_CONTAINS_OLD) + { + Size datalen; + Size tuplelen; + + /* caution, remaining data in record is not aligned */ + data = XLogRecGetData(r) + SizeOfHeapUpdate; + datalen = XLogRecGetDataLen(r) - SizeOfHeapUpdate; + tuplelen = datalen - SizeOfHeapHeader; + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(data, datalen, change->data.tp.oldtuple); + } + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + +/* + * Parse XLOG_HEAP_DELETE from wal into proper tuplebufs. + * + * Deletes can possibly contain the old primary key. + */ +static void +DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_heap_delete *xlrec; + ReorderBufferChange *change; + RelFileLocator target_locator; + + xlrec = (xl_heap_delete *) XLogRecGetData(r); + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + + if (xlrec->flags & XLH_DELETE_IS_SUPER) + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT; + else + change->action = REORDER_BUFFER_CHANGE_DELETE; + + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + /* old primary key stored */ + if (xlrec->flags & XLH_DELETE_CONTAINS_OLD) + { + Size datalen = XLogRecGetDataLen(r) - SizeOfHeapDelete; + Size tuplelen = datalen - SizeOfHeapHeader; + + Assert(XLogRecGetDataLen(r) > (SizeOfHeapDelete + SizeOfHeapHeader)); + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple((char *) xlrec + SizeOfHeapDelete, + datalen, change->data.tp.oldtuple); + } + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + +/* + * Parse XLOG_HEAP_TRUNCATE from wal + */ +static void +DecodeTruncate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_heap_truncate *xlrec; + ReorderBufferChange *change; + + xlrec = (xl_heap_truncate *) XLogRecGetData(r); + + /* only interested in our database */ + if (xlrec->dbId != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_TRUNCATE; + change->origin_id = XLogRecGetOrigin(r); + if (xlrec->flags & XLH_TRUNCATE_CASCADE) + change->data.truncate.cascade = true; + if (xlrec->flags & XLH_TRUNCATE_RESTART_SEQS) + change->data.truncate.restart_seqs = true; + change->data.truncate.nrelids = xlrec->nrelids; + change->data.truncate.relids = ReorderBufferGetRelids(ctx->reorder, + xlrec->nrelids); + memcpy(change->data.truncate.relids, xlrec->relids, + xlrec->nrelids * sizeof(Oid)); + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), + buf->origptr, change, false); +} + +/* + * Decode XLOG_HEAP2_MULTI_INSERT_insert record into multiple tuplebufs. + * + * Currently MULTI_INSERT will always contain the full tuples. + */ +static void +DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_heap_multi_insert *xlrec; + int i; + char *data; + char *tupledata; + Size tuplelen; + RelFileLocator rlocator; + + xlrec = (xl_heap_multi_insert *) XLogRecGetData(r); + + /* + * Ignore insert records without new tuples. This happens when a + * multi_insert is done on a catalog or on a non-persistent relation. + */ + if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) + return; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &rlocator, NULL, NULL); + if (rlocator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + /* + * We know that this multi_insert isn't for a catalog, so the block should + * always have data even if a full-page write of it is taken. + */ + tupledata = XLogRecGetBlockData(r, 0, &tuplelen); + Assert(tupledata != NULL); + + data = tupledata; + for (i = 0; i < xlrec->ntuples; i++) + { + ReorderBufferChange *change; + xl_multi_insert_tuple *xlhdr; + int datalen; + ReorderBufferTupleBuf *tuple; + HeapTupleHeader header; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_INSERT; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &rlocator, sizeof(RelFileLocator)); + + xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(data); + data = ((char *) xlhdr) + SizeOfMultiInsertTuple; + datalen = xlhdr->datalen; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, datalen); + + tuple = change->data.tp.newtuple; + header = tuple->tuple.t_data; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->tuple.t_self); + + /* + * We can only figure this out after reassembling the transactions. + */ + tuple->tuple.t_tableOid = InvalidOid; + + tuple->tuple.t_len = datalen + SizeofHeapTupleHeader; + + memset(header, 0, SizeofHeapTupleHeader); + + memcpy((char *) tuple->tuple.t_data + SizeofHeapTupleHeader, + (char *) data, + datalen); + header->t_infomask = xlhdr->t_infomask; + header->t_infomask2 = xlhdr->t_infomask2; + header->t_hoff = xlhdr->t_hoff; + + /* + * Reset toast reassembly state only after the last row in the last + * xl_multi_insert_tuple record emitted by one heap_multi_insert() + * call. + */ + if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI && + (i + 1) == xlrec->ntuples) + change->data.tp.clear_toast_afterwards = true; + else + change->data.tp.clear_toast_afterwards = false; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), + buf->origptr, change, false); + + /* move to the next xl_multi_insert_tuple entry */ + data += datalen; + } + Assert(data == tupledata + tuplelen); +} + +/* + * Parse XLOG_HEAP_CONFIRM from wal into a confirmation change. + * + * This is pretty trivial, all the state essentially already setup by the + * speculative insertion. + */ +static void +DecodeSpecConfirm(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + ReorderBufferChange *change; + RelFileLocator target_locator; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + + +/* + * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete + * (but not by heap_multi_insert) into a tuplebuf. + * + * The size 'len' and the pointer 'data' in the record need to be + * computed outside as they are record specific. + */ +static void +DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple) +{ + xl_heap_header xlhdr; + int datalen = len - SizeOfHeapHeader; + HeapTupleHeader header; + + Assert(datalen >= 0); + + tuple->tuple.t_len = datalen + SizeofHeapTupleHeader; + header = tuple->tuple.t_data; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->tuple.t_self); + + /* we can only figure this out after reassembling the transactions */ + tuple->tuple.t_tableOid = InvalidOid; + + /* data is not stored aligned, copy to aligned storage */ + memcpy((char *) &xlhdr, + data, + SizeOfHeapHeader); + + memset(header, 0, SizeofHeapTupleHeader); + + memcpy(((char *) tuple->tuple.t_data) + SizeofHeapTupleHeader, + data + SizeOfHeapHeader, + datalen); + + header->t_infomask = xlhdr.t_infomask; + header->t_infomask2 = xlhdr.t_infomask2; + header->t_hoff = xlhdr.t_hoff; +} + +/* + * Check whether we are interested in this specific transaction. + * + * There can be several reasons we might not be interested in this + * transaction: + * 1) We might not be interested in decoding transactions up to this + * LSN. This can happen because we previously decoded it and now just + * are restarting or if we haven't assembled a consistent snapshot yet. + * 2) The transaction happened in another database. + * 3) The output plugin is not interested in the origin. + * 4) We are doing fast-forwarding + */ +static bool +DecodeTXNNeedSkip(LogicalDecodingContext *ctx, XLogRecordBuffer *buf, + Oid txn_dbid, RepOriginId origin_id) +{ + return (SnapBuildXactNeedsSkip(ctx->snapshot_builder, buf->origptr) || + (txn_dbid != InvalidOid && txn_dbid != ctx->slot->data.database) || + ctx->fast_forward || FilterByOrigin(ctx, origin_id)); +} diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c new file mode 100644 index 0000000..8395ae7 --- /dev/null +++ b/src/backend/replication/logical/launcher.c @@ -0,0 +1,1333 @@ +/*------------------------------------------------------------------------- + * launcher.c + * PostgreSQL logical replication worker launcher process + * + * Copyright (c) 2016-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/launcher.c + * + * NOTES + * This module contains the logical replication worker launcher which + * uses the background worker infrastructure to start the logical + * replication workers for every enabled subscription. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup.h" +#include "access/htup_details.h" +#include "access/tableam.h" +#include "access/xact.h" +#include "catalog/pg_subscription.h" +#include "catalog/pg_subscription_rel.h" +#include "funcapi.h" +#include "lib/dshash.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgworker.h" +#include "postmaster/fork_process.h" +#include "postmaster/interrupt.h" +#include "postmaster/postmaster.h" +#include "replication/logicallauncher.h" +#include "replication/logicalworker.h" +#include "replication/slot.h" +#include "replication/walreceiver.h" +#include "replication/worker_internal.h" +#include "storage/ipc.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/procsignal.h" +#include "tcop/tcopprot.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/pg_lsn.h" +#include "utils/ps_status.h" +#include "utils/snapmgr.h" +#include "utils/timeout.h" + +/* max sleep time between cycles (3min) */ +#define DEFAULT_NAPTIME_PER_CYCLE 180000L + +/* GUC variables */ +int max_logical_replication_workers = 4; +int max_sync_workers_per_subscription = 2; +int max_parallel_apply_workers_per_subscription = 2; + +LogicalRepWorker *MyLogicalRepWorker = NULL; + +typedef struct LogicalRepCtxStruct +{ + /* Supervisor process. */ + pid_t launcher_pid; + + /* Hash table holding last start times of subscriptions' apply workers. */ + dsa_handle last_start_dsa; + dshash_table_handle last_start_dsh; + + /* Background workers. */ + LogicalRepWorker workers[FLEXIBLE_ARRAY_MEMBER]; +} LogicalRepCtxStruct; + +static LogicalRepCtxStruct *LogicalRepCtx; + +/* an entry in the last-start-times shared hash table */ +typedef struct LauncherLastStartTimesEntry +{ + Oid subid; /* OID of logrep subscription (hash key) */ + TimestampTz last_start_time; /* last time its apply worker was started */ +} LauncherLastStartTimesEntry; + +/* parameters for the last-start-times shared hash table */ +static const dshash_parameters dsh_params = { + sizeof(Oid), + sizeof(LauncherLastStartTimesEntry), + dshash_memcmp, + dshash_memhash, + LWTRANCHE_LAUNCHER_HASH +}; + +static dsa_area *last_start_times_dsa = NULL; +static dshash_table *last_start_times = NULL; + +static bool on_commit_launcher_wakeup = false; + + +static void ApplyLauncherWakeup(void); +static void logicalrep_launcher_onexit(int code, Datum arg); +static void logicalrep_worker_onexit(int code, Datum arg); +static void logicalrep_worker_detach(void); +static void logicalrep_worker_cleanup(LogicalRepWorker *worker); +static int logicalrep_pa_worker_count(Oid subid); +static void logicalrep_launcher_attach_dshmem(void); +static void ApplyLauncherSetWorkerStartTime(Oid subid, TimestampTz start_time); +static TimestampTz ApplyLauncherGetWorkerStartTime(Oid subid); + + +/* + * Load the list of subscriptions. + * + * Only the fields interesting for worker start/stop functions are filled for + * each subscription. + */ +static List * +get_subscription_list(void) +{ + List *res = NIL; + Relation rel; + TableScanDesc scan; + HeapTuple tup; + MemoryContext resultcxt; + + /* This is the context that we will allocate our output data in */ + resultcxt = CurrentMemoryContext; + + /* + * Start a transaction so we can access pg_database, and get a snapshot. + * We don't have a use for the snapshot itself, but we're interested in + * the secondary effect that it sets RecentGlobalXmin. (This is critical + * for anything that reads heap pages, because HOT may decide to prune + * them even if the process doesn't attempt to modify any tuples.) + * + * FIXME: This comment is inaccurate / the code buggy. A snapshot that is + * not pushed/active does not reliably prevent HOT pruning (->xmin could + * e.g. be cleared when cache invalidations are processed). + */ + StartTransactionCommand(); + (void) GetTransactionSnapshot(); + + rel = table_open(SubscriptionRelationId, AccessShareLock); + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_subscription subform = (Form_pg_subscription) GETSTRUCT(tup); + Subscription *sub; + MemoryContext oldcxt; + + /* + * Allocate our results in the caller's context, not the + * transaction's. We do this inside the loop, and restore the original + * context at the end, so that leaky things like heap_getnext() are + * not called in a potentially long-lived context. + */ + oldcxt = MemoryContextSwitchTo(resultcxt); + + sub = (Subscription *) palloc0(sizeof(Subscription)); + sub->oid = subform->oid; + sub->dbid = subform->subdbid; + sub->owner = subform->subowner; + sub->enabled = subform->subenabled; + sub->name = pstrdup(NameStr(subform->subname)); + /* We don't fill fields we are not interested in. */ + + res = lappend(res, sub); + MemoryContextSwitchTo(oldcxt); + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return res; +} + +/* + * Wait for a background worker to start up and attach to the shmem context. + * + * This is only needed for cleaning up the shared memory in case the worker + * fails to attach. + * + * Returns whether the attach was successful. + */ +static bool +WaitForReplicationWorkerAttach(LogicalRepWorker *worker, + uint16 generation, + BackgroundWorkerHandle *handle) +{ + BgwHandleStatus status; + int rc; + + for (;;) + { + pid_t pid; + + CHECK_FOR_INTERRUPTS(); + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + /* Worker either died or has started. Return false if died. */ + if (!worker->in_use || worker->proc) + { + LWLockRelease(LogicalRepWorkerLock); + return worker->in_use; + } + + LWLockRelease(LogicalRepWorkerLock); + + /* Check if worker has died before attaching, and clean up after it. */ + status = GetBackgroundWorkerPid(handle, &pid); + + if (status == BGWH_STOPPED) + { + LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE); + /* Ensure that this was indeed the worker we waited for. */ + if (generation == worker->generation) + logicalrep_worker_cleanup(worker); + LWLockRelease(LogicalRepWorkerLock); + return false; + } + + /* + * We need timeout because we generally don't get notified via latch + * about the worker attach. But we don't expect to have to wait long. + */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 10L, WAIT_EVENT_BGWORKER_STARTUP); + + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + } + } +} + +/* + * Walks the workers array and searches for one that matches given + * subscription id and relid. + * + * We are only interested in the leader apply worker or table sync worker. + */ +LogicalRepWorker * +logicalrep_worker_find(Oid subid, Oid relid, bool only_running) +{ + int i; + LogicalRepWorker *res = NULL; + + Assert(LWLockHeldByMe(LogicalRepWorkerLock)); + + /* Search for attached worker for a given subscription id. */ + for (i = 0; i < max_logical_replication_workers; i++) + { + LogicalRepWorker *w = &LogicalRepCtx->workers[i]; + + /* Skip parallel apply workers. */ + if (isParallelApplyWorker(w)) + continue; + + if (w->in_use && w->subid == subid && w->relid == relid && + (!only_running || w->proc)) + { + res = w; + break; + } + } + + return res; +} + +/* + * Similar to logicalrep_worker_find(), but returns a list of all workers for + * the subscription, instead of just one. + */ +List * +logicalrep_workers_find(Oid subid, bool only_running) +{ + int i; + List *res = NIL; + + Assert(LWLockHeldByMe(LogicalRepWorkerLock)); + + /* Search for attached worker for a given subscription id. */ + for (i = 0; i < max_logical_replication_workers; i++) + { + LogicalRepWorker *w = &LogicalRepCtx->workers[i]; + + if (w->in_use && w->subid == subid && (!only_running || w->proc)) + res = lappend(res, w); + } + + return res; +} + +/* + * Start new logical replication background worker, if possible. + * + * Returns true on success, false on failure. + */ +bool +logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid, + Oid relid, dsm_handle subworker_dsm) +{ + BackgroundWorker bgw; + BackgroundWorkerHandle *bgw_handle; + uint16 generation; + int i; + int slot = 0; + LogicalRepWorker *worker = NULL; + int nsyncworkers; + int nparallelapplyworkers; + TimestampTz now; + bool is_parallel_apply_worker = (subworker_dsm != DSM_HANDLE_INVALID); + + /* Sanity check - tablesync worker cannot be a subworker */ + Assert(!(is_parallel_apply_worker && OidIsValid(relid))); + + ereport(DEBUG1, + (errmsg_internal("starting logical replication worker for subscription \"%s\"", + subname))); + + /* Report this after the initial starting message for consistency. */ + if (max_replication_slots == 0) + ereport(ERROR, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("cannot start logical replication workers when max_replication_slots = 0"))); + + /* + * We need to do the modification of the shared memory under lock so that + * we have consistent view. + */ + LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE); + +retry: + /* Find unused worker slot. */ + for (i = 0; i < max_logical_replication_workers; i++) + { + LogicalRepWorker *w = &LogicalRepCtx->workers[i]; + + if (!w->in_use) + { + worker = w; + slot = i; + break; + } + } + + nsyncworkers = logicalrep_sync_worker_count(subid); + + now = GetCurrentTimestamp(); + + /* + * If we didn't find a free slot, try to do garbage collection. The + * reason we do this is because if some worker failed to start up and its + * parent has crashed while waiting, the in_use state was never cleared. + */ + if (worker == NULL || nsyncworkers >= max_sync_workers_per_subscription) + { + bool did_cleanup = false; + + for (i = 0; i < max_logical_replication_workers; i++) + { + LogicalRepWorker *w = &LogicalRepCtx->workers[i]; + + /* + * If the worker was marked in use but didn't manage to attach in + * time, clean it up. + */ + if (w->in_use && !w->proc && + TimestampDifferenceExceeds(w->launch_time, now, + wal_receiver_timeout)) + { + elog(WARNING, + "logical replication worker for subscription %u took too long to start; canceled", + w->subid); + + logicalrep_worker_cleanup(w); + did_cleanup = true; + } + } + + if (did_cleanup) + goto retry; + } + + /* + * We don't allow to invoke more sync workers once we have reached the + * sync worker limit per subscription. So, just return silently as we + * might get here because of an otherwise harmless race condition. + */ + if (OidIsValid(relid) && nsyncworkers >= max_sync_workers_per_subscription) + { + LWLockRelease(LogicalRepWorkerLock); + return false; + } + + nparallelapplyworkers = logicalrep_pa_worker_count(subid); + + /* + * Return false if the number of parallel apply workers reached the limit + * per subscription. + */ + if (is_parallel_apply_worker && + nparallelapplyworkers >= max_parallel_apply_workers_per_subscription) + { + LWLockRelease(LogicalRepWorkerLock); + return false; + } + + /* + * However if there are no more free worker slots, inform user about it + * before exiting. + */ + if (worker == NULL) + { + LWLockRelease(LogicalRepWorkerLock); + ereport(WARNING, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("out of logical replication worker slots"), + errhint("You might need to increase %s.", "max_logical_replication_workers"))); + return false; + } + + /* Prepare the worker slot. */ + worker->launch_time = now; + worker->in_use = true; + worker->generation++; + worker->proc = NULL; + worker->dbid = dbid; + worker->userid = userid; + worker->subid = subid; + worker->relid = relid; + worker->relstate = SUBREL_STATE_UNKNOWN; + worker->relstate_lsn = InvalidXLogRecPtr; + worker->stream_fileset = NULL; + worker->leader_pid = is_parallel_apply_worker ? MyProcPid : InvalidPid; + worker->parallel_apply = is_parallel_apply_worker; + worker->last_lsn = InvalidXLogRecPtr; + TIMESTAMP_NOBEGIN(worker->last_send_time); + TIMESTAMP_NOBEGIN(worker->last_recv_time); + worker->reply_lsn = InvalidXLogRecPtr; + TIMESTAMP_NOBEGIN(worker->reply_time); + + /* Before releasing lock, remember generation for future identification. */ + generation = worker->generation; + + LWLockRelease(LogicalRepWorkerLock); + + /* Register the new dynamic worker. */ + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | + BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + + if (is_parallel_apply_worker) + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "ParallelApplyWorkerMain"); + else + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "ApplyWorkerMain"); + + if (OidIsValid(relid)) + snprintf(bgw.bgw_name, BGW_MAXLEN, + "logical replication worker for subscription %u sync %u", subid, relid); + else if (is_parallel_apply_worker) + snprintf(bgw.bgw_name, BGW_MAXLEN, + "logical replication parallel apply worker for subscription %u", subid); + else + snprintf(bgw.bgw_name, BGW_MAXLEN, + "logical replication apply worker for subscription %u", subid); + + if (is_parallel_apply_worker) + snprintf(bgw.bgw_type, BGW_MAXLEN, "logical replication parallel worker"); + else + snprintf(bgw.bgw_type, BGW_MAXLEN, "logical replication worker"); + + bgw.bgw_restart_time = BGW_NEVER_RESTART; + bgw.bgw_notify_pid = MyProcPid; + bgw.bgw_main_arg = Int32GetDatum(slot); + + if (is_parallel_apply_worker) + memcpy(bgw.bgw_extra, &subworker_dsm, sizeof(dsm_handle)); + + if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle)) + { + /* Failed to start worker, so clean up the worker slot. */ + LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE); + Assert(generation == worker->generation); + logicalrep_worker_cleanup(worker); + LWLockRelease(LogicalRepWorkerLock); + + ereport(WARNING, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("out of background worker slots"), + errhint("You might need to increase %s.", "max_worker_processes"))); + return false; + } + + /* Now wait until it attaches. */ + return WaitForReplicationWorkerAttach(worker, generation, bgw_handle); +} + +/* + * Internal function to stop the worker and wait until it detaches from the + * slot. + */ +static void +logicalrep_worker_stop_internal(LogicalRepWorker *worker, int signo) +{ + uint16 generation; + + Assert(LWLockHeldByMeInMode(LogicalRepWorkerLock, LW_SHARED)); + + /* + * Remember which generation was our worker so we can check if what we see + * is still the same one. + */ + generation = worker->generation; + + /* + * If we found a worker but it does not have proc set then it is still + * starting up; wait for it to finish starting and then kill it. + */ + while (worker->in_use && !worker->proc) + { + int rc; + + LWLockRelease(LogicalRepWorkerLock); + + /* Wait a bit --- we don't expect to have to wait long. */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 10L, WAIT_EVENT_BGWORKER_STARTUP); + + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + } + + /* Recheck worker status. */ + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + /* + * Check whether the worker slot is no longer used, which would mean + * that the worker has exited, or whether the worker generation is + * different, meaning that a different worker has taken the slot. + */ + if (!worker->in_use || worker->generation != generation) + return; + + /* Worker has assigned proc, so it has started. */ + if (worker->proc) + break; + } + + /* Now terminate the worker ... */ + kill(worker->proc->pid, signo); + + /* ... and wait for it to die. */ + for (;;) + { + int rc; + + /* is it gone? */ + if (!worker->proc || worker->generation != generation) + break; + + LWLockRelease(LogicalRepWorkerLock); + + /* Wait a bit --- we don't expect to have to wait long. */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 10L, WAIT_EVENT_BGWORKER_SHUTDOWN); + + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + } + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + } +} + +/* + * Stop the logical replication worker for subid/relid, if any. + */ +void +logicalrep_worker_stop(Oid subid, Oid relid) +{ + LogicalRepWorker *worker; + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + worker = logicalrep_worker_find(subid, relid, false); + + if (worker) + { + Assert(!isParallelApplyWorker(worker)); + logicalrep_worker_stop_internal(worker, SIGTERM); + } + + LWLockRelease(LogicalRepWorkerLock); +} + +/* + * Stop the given logical replication parallel apply worker. + * + * Node that the function sends SIGINT instead of SIGTERM to the parallel apply + * worker so that the worker exits cleanly. + */ +void +logicalrep_pa_worker_stop(ParallelApplyWorkerInfo *winfo) +{ + int slot_no; + uint16 generation; + LogicalRepWorker *worker; + + SpinLockAcquire(&winfo->shared->mutex); + generation = winfo->shared->logicalrep_worker_generation; + slot_no = winfo->shared->logicalrep_worker_slot_no; + SpinLockRelease(&winfo->shared->mutex); + + Assert(slot_no >= 0 && slot_no < max_logical_replication_workers); + + /* + * Detach from the error_mq_handle for the parallel apply worker before + * stopping it. This prevents the leader apply worker from trying to + * receive the message from the error queue that might already be detached + * by the parallel apply worker. + */ + if (winfo->error_mq_handle) + { + shm_mq_detach(winfo->error_mq_handle); + winfo->error_mq_handle = NULL; + } + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + worker = &LogicalRepCtx->workers[slot_no]; + Assert(isParallelApplyWorker(worker)); + + /* + * Only stop the worker if the generation matches and the worker is alive. + */ + if (worker->generation == generation && worker->proc) + logicalrep_worker_stop_internal(worker, SIGINT); + + LWLockRelease(LogicalRepWorkerLock); +} + +/* + * Wake up (using latch) any logical replication worker for specified sub/rel. + */ +void +logicalrep_worker_wakeup(Oid subid, Oid relid) +{ + LogicalRepWorker *worker; + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + worker = logicalrep_worker_find(subid, relid, true); + + if (worker) + logicalrep_worker_wakeup_ptr(worker); + + LWLockRelease(LogicalRepWorkerLock); +} + +/* + * Wake up (using latch) the specified logical replication worker. + * + * Caller must hold lock, else worker->proc could change under us. + */ +void +logicalrep_worker_wakeup_ptr(LogicalRepWorker *worker) +{ + Assert(LWLockHeldByMe(LogicalRepWorkerLock)); + + SetLatch(&worker->proc->procLatch); +} + +/* + * Attach to a slot. + */ +void +logicalrep_worker_attach(int slot) +{ + /* Block concurrent access. */ + LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE); + + Assert(slot >= 0 && slot < max_logical_replication_workers); + MyLogicalRepWorker = &LogicalRepCtx->workers[slot]; + + if (!MyLogicalRepWorker->in_use) + { + LWLockRelease(LogicalRepWorkerLock); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication worker slot %d is empty, cannot attach", + slot))); + } + + if (MyLogicalRepWorker->proc) + { + LWLockRelease(LogicalRepWorkerLock); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication worker slot %d is already used by " + "another worker, cannot attach", slot))); + } + + MyLogicalRepWorker->proc = MyProc; + before_shmem_exit(logicalrep_worker_onexit, (Datum) 0); + + LWLockRelease(LogicalRepWorkerLock); +} + +/* + * Stop the parallel apply workers if any, and detach the leader apply worker + * (cleans up the worker info). + */ +static void +logicalrep_worker_detach(void) +{ + /* Stop the parallel apply workers. */ + if (am_leader_apply_worker()) + { + List *workers; + ListCell *lc; + + /* + * Detach from the error_mq_handle for all parallel apply workers + * before terminating them. This prevents the leader apply worker from + * receiving the worker termination message and sending it to logs + * when the same is already done by the parallel worker. + */ + pa_detach_all_error_mq(); + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + workers = logicalrep_workers_find(MyLogicalRepWorker->subid, true); + foreach(lc, workers) + { + LogicalRepWorker *w = (LogicalRepWorker *) lfirst(lc); + + if (isParallelApplyWorker(w)) + logicalrep_worker_stop_internal(w, SIGTERM); + } + + LWLockRelease(LogicalRepWorkerLock); + } + + /* Block concurrent access. */ + LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE); + + logicalrep_worker_cleanup(MyLogicalRepWorker); + + LWLockRelease(LogicalRepWorkerLock); +} + +/* + * Clean up worker info. + */ +static void +logicalrep_worker_cleanup(LogicalRepWorker *worker) +{ + Assert(LWLockHeldByMeInMode(LogicalRepWorkerLock, LW_EXCLUSIVE)); + + worker->in_use = false; + worker->proc = NULL; + worker->dbid = InvalidOid; + worker->userid = InvalidOid; + worker->subid = InvalidOid; + worker->relid = InvalidOid; + worker->leader_pid = InvalidPid; + worker->parallel_apply = false; +} + +/* + * Cleanup function for logical replication launcher. + * + * Called on logical replication launcher exit. + */ +static void +logicalrep_launcher_onexit(int code, Datum arg) +{ + LogicalRepCtx->launcher_pid = 0; +} + +/* + * Cleanup function. + * + * Called on logical replication worker exit. + */ +static void +logicalrep_worker_onexit(int code, Datum arg) +{ + /* Disconnect gracefully from the remote side. */ + if (LogRepWorkerWalRcvConn) + walrcv_disconnect(LogRepWorkerWalRcvConn); + + logicalrep_worker_detach(); + + /* Cleanup fileset used for streaming transactions. */ + if (MyLogicalRepWorker->stream_fileset != NULL) + FileSetDeleteAll(MyLogicalRepWorker->stream_fileset); + + /* + * Session level locks may be acquired outside of a transaction in + * parallel apply mode and will not be released when the worker + * terminates, so manually release all locks before the worker exits. + * + * The locks will be acquired once the worker is initialized. + */ + if (!InitializingApplyWorker) + LockReleaseAll(DEFAULT_LOCKMETHOD, true); + + ApplyLauncherWakeup(); +} + +/* + * Count the number of registered (not necessarily running) sync workers + * for a subscription. + */ +int +logicalrep_sync_worker_count(Oid subid) +{ + int i; + int res = 0; + + Assert(LWLockHeldByMe(LogicalRepWorkerLock)); + + /* Search for attached worker for a given subscription id. */ + for (i = 0; i < max_logical_replication_workers; i++) + { + LogicalRepWorker *w = &LogicalRepCtx->workers[i]; + + if (w->subid == subid && OidIsValid(w->relid)) + res++; + } + + return res; +} + +/* + * Count the number of registered (but not necessarily running) parallel apply + * workers for a subscription. + */ +static int +logicalrep_pa_worker_count(Oid subid) +{ + int i; + int res = 0; + + Assert(LWLockHeldByMe(LogicalRepWorkerLock)); + + /* + * Scan all attached parallel apply workers, only counting those which + * have the given subscription id. + */ + for (i = 0; i < max_logical_replication_workers; i++) + { + LogicalRepWorker *w = &LogicalRepCtx->workers[i]; + + if (w->subid == subid && isParallelApplyWorker(w)) + res++; + } + + return res; +} + +/* + * ApplyLauncherShmemSize + * Compute space needed for replication launcher shared memory + */ +Size +ApplyLauncherShmemSize(void) +{ + Size size; + + /* + * Need the fixed struct and the array of LogicalRepWorker. + */ + size = sizeof(LogicalRepCtxStruct); + size = MAXALIGN(size); + size = add_size(size, mul_size(max_logical_replication_workers, + sizeof(LogicalRepWorker))); + return size; +} + +/* + * ApplyLauncherRegister + * Register a background worker running the logical replication launcher. + */ +void +ApplyLauncherRegister(void) +{ + BackgroundWorker bgw; + + if (max_logical_replication_workers == 0) + return; + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | + BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "ApplyLauncherMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, + "logical replication launcher"); + snprintf(bgw.bgw_type, BGW_MAXLEN, + "logical replication launcher"); + bgw.bgw_restart_time = 5; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); +} + +/* + * ApplyLauncherShmemInit + * Allocate and initialize replication launcher shared memory + */ +void +ApplyLauncherShmemInit(void) +{ + bool found; + + LogicalRepCtx = (LogicalRepCtxStruct *) + ShmemInitStruct("Logical Replication Launcher Data", + ApplyLauncherShmemSize(), + &found); + + if (!found) + { + int slot; + + memset(LogicalRepCtx, 0, ApplyLauncherShmemSize()); + + LogicalRepCtx->last_start_dsa = DSA_HANDLE_INVALID; + LogicalRepCtx->last_start_dsh = DSHASH_HANDLE_INVALID; + + /* Initialize memory and spin locks for each worker slot. */ + for (slot = 0; slot < max_logical_replication_workers; slot++) + { + LogicalRepWorker *worker = &LogicalRepCtx->workers[slot]; + + memset(worker, 0, sizeof(LogicalRepWorker)); + SpinLockInit(&worker->relmutex); + } + } +} + +/* + * Initialize or attach to the dynamic shared hash table that stores the + * last-start times, if not already done. + * This must be called before accessing the table. + */ +static void +logicalrep_launcher_attach_dshmem(void) +{ + MemoryContext oldcontext; + + /* Quick exit if we already did this. */ + if (LogicalRepCtx->last_start_dsh != DSHASH_HANDLE_INVALID && + last_start_times != NULL) + return; + + /* Otherwise, use a lock to ensure only one process creates the table. */ + LWLockAcquire(LogicalRepWorkerLock, LW_EXCLUSIVE); + + /* Be sure any local memory allocated by DSA routines is persistent. */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + if (LogicalRepCtx->last_start_dsh == DSHASH_HANDLE_INVALID) + { + /* Initialize dynamic shared hash table for last-start times. */ + last_start_times_dsa = dsa_create(LWTRANCHE_LAUNCHER_DSA); + dsa_pin(last_start_times_dsa); + dsa_pin_mapping(last_start_times_dsa); + last_start_times = dshash_create(last_start_times_dsa, &dsh_params, 0); + + /* Store handles in shared memory for other backends to use. */ + LogicalRepCtx->last_start_dsa = dsa_get_handle(last_start_times_dsa); + LogicalRepCtx->last_start_dsh = dshash_get_hash_table_handle(last_start_times); + } + else if (!last_start_times) + { + /* Attach to existing dynamic shared hash table. */ + last_start_times_dsa = dsa_attach(LogicalRepCtx->last_start_dsa); + dsa_pin_mapping(last_start_times_dsa); + last_start_times = dshash_attach(last_start_times_dsa, &dsh_params, + LogicalRepCtx->last_start_dsh, 0); + } + + MemoryContextSwitchTo(oldcontext); + LWLockRelease(LogicalRepWorkerLock); +} + +/* + * Set the last-start time for the subscription. + */ +static void +ApplyLauncherSetWorkerStartTime(Oid subid, TimestampTz start_time) +{ + LauncherLastStartTimesEntry *entry; + bool found; + + logicalrep_launcher_attach_dshmem(); + + entry = dshash_find_or_insert(last_start_times, &subid, &found); + entry->last_start_time = start_time; + dshash_release_lock(last_start_times, entry); +} + +/* + * Return the last-start time for the subscription, or 0 if there isn't one. + */ +static TimestampTz +ApplyLauncherGetWorkerStartTime(Oid subid) +{ + LauncherLastStartTimesEntry *entry; + TimestampTz ret; + + logicalrep_launcher_attach_dshmem(); + + entry = dshash_find(last_start_times, &subid, false); + if (entry == NULL) + return 0; + + ret = entry->last_start_time; + dshash_release_lock(last_start_times, entry); + + return ret; +} + +/* + * Remove the last-start-time entry for the subscription, if one exists. + * + * This has two use-cases: to remove the entry related to a subscription + * that's been deleted or disabled (just to avoid leaking shared memory), + * and to allow immediate restart of an apply worker that has exited + * due to subscription parameter changes. + */ +void +ApplyLauncherForgetWorkerStartTime(Oid subid) +{ + logicalrep_launcher_attach_dshmem(); + + (void) dshash_delete_key(last_start_times, &subid); +} + +/* + * Wakeup the launcher on commit if requested. + */ +void +AtEOXact_ApplyLauncher(bool isCommit) +{ + if (isCommit) + { + if (on_commit_launcher_wakeup) + ApplyLauncherWakeup(); + } + + on_commit_launcher_wakeup = false; +} + +/* + * Request wakeup of the launcher on commit of the transaction. + * + * This is used to send launcher signal to stop sleeping and process the + * subscriptions when current transaction commits. Should be used when new + * tuple was added to the pg_subscription catalog. +*/ +void +ApplyLauncherWakeupAtCommit(void) +{ + if (!on_commit_launcher_wakeup) + on_commit_launcher_wakeup = true; +} + +static void +ApplyLauncherWakeup(void) +{ + if (LogicalRepCtx->launcher_pid != 0) + kill(LogicalRepCtx->launcher_pid, SIGUSR1); +} + +/* + * Main loop for the apply launcher process. + */ +void +ApplyLauncherMain(Datum main_arg) +{ + ereport(DEBUG1, + (errmsg_internal("logical replication launcher started"))); + + before_shmem_exit(logicalrep_launcher_onexit, (Datum) 0); + + Assert(LogicalRepCtx->launcher_pid == 0); + LogicalRepCtx->launcher_pid = MyProcPid; + + /* Establish signal handlers. */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); + BackgroundWorkerUnblockSignals(); + + /* + * Establish connection to nailed catalogs (we only ever access + * pg_subscription). + */ + BackgroundWorkerInitializeConnection(NULL, NULL, 0); + + /* Enter main loop */ + for (;;) + { + int rc; + List *sublist; + ListCell *lc; + MemoryContext subctx; + MemoryContext oldctx; + long wait_time = DEFAULT_NAPTIME_PER_CYCLE; + + CHECK_FOR_INTERRUPTS(); + + /* Use temporary context to avoid leaking memory across cycles. */ + subctx = AllocSetContextCreate(TopMemoryContext, + "Logical Replication Launcher sublist", + ALLOCSET_DEFAULT_SIZES); + oldctx = MemoryContextSwitchTo(subctx); + + /* Start any missing workers for enabled subscriptions. */ + sublist = get_subscription_list(); + foreach(lc, sublist) + { + Subscription *sub = (Subscription *) lfirst(lc); + LogicalRepWorker *w; + TimestampTz last_start; + TimestampTz now; + long elapsed; + + if (!sub->enabled) + continue; + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + w = logicalrep_worker_find(sub->oid, InvalidOid, false); + LWLockRelease(LogicalRepWorkerLock); + + if (w != NULL) + continue; /* worker is running already */ + + /* + * If the worker is eligible to start now, launch it. Otherwise, + * adjust wait_time so that we'll wake up as soon as it can be + * started. + * + * Each subscription's apply worker can only be restarted once per + * wal_retrieve_retry_interval, so that errors do not cause us to + * repeatedly restart the worker as fast as possible. In cases + * where a restart is expected (e.g., subscription parameter + * changes), another process should remove the last-start entry + * for the subscription so that the worker can be restarted + * without waiting for wal_retrieve_retry_interval to elapse. + */ + last_start = ApplyLauncherGetWorkerStartTime(sub->oid); + now = GetCurrentTimestamp(); + if (last_start == 0 || + (elapsed = TimestampDifferenceMilliseconds(last_start, now)) >= wal_retrieve_retry_interval) + { + ApplyLauncherSetWorkerStartTime(sub->oid, now); + logicalrep_worker_launch(sub->dbid, sub->oid, sub->name, + sub->owner, InvalidOid, + DSM_HANDLE_INVALID); + } + else + { + wait_time = Min(wait_time, + wal_retrieve_retry_interval - elapsed); + } + } + + /* Switch back to original memory context. */ + MemoryContextSwitchTo(oldctx); + /* Clean the temporary memory. */ + MemoryContextDelete(subctx); + + /* Wait for more work. */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + wait_time, + WAIT_EVENT_LOGICAL_LAUNCHER_MAIN); + + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + } + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + } + + /* Not reachable */ +} + +/* + * Is current process the logical replication launcher? + */ +bool +IsLogicalLauncher(void) +{ + return LogicalRepCtx->launcher_pid == MyProcPid; +} + +/* + * Return the pid of the leader apply worker if the given pid is the pid of a + * parallel apply worker, otherwise, return InvalidPid. + */ +pid_t +GetLeaderApplyWorkerPid(pid_t pid) +{ + int leader_pid = InvalidPid; + int i; + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + for (i = 0; i < max_logical_replication_workers; i++) + { + LogicalRepWorker *w = &LogicalRepCtx->workers[i]; + + if (isParallelApplyWorker(w) && w->proc && pid == w->proc->pid) + { + leader_pid = w->leader_pid; + break; + } + } + + LWLockRelease(LogicalRepWorkerLock); + + return leader_pid; +} + +/* + * Returns state of the subscriptions. + */ +Datum +pg_stat_get_subscription(PG_FUNCTION_ARGS) +{ +#define PG_STAT_GET_SUBSCRIPTION_COLS 9 + Oid subid = PG_ARGISNULL(0) ? InvalidOid : PG_GETARG_OID(0); + int i; + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + + InitMaterializedSRF(fcinfo, 0); + + /* Make sure we get consistent view of the workers. */ + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + for (i = 0; i < max_logical_replication_workers; i++) + { + /* for each row */ + Datum values[PG_STAT_GET_SUBSCRIPTION_COLS] = {0}; + bool nulls[PG_STAT_GET_SUBSCRIPTION_COLS] = {0}; + int worker_pid; + LogicalRepWorker worker; + + memcpy(&worker, &LogicalRepCtx->workers[i], + sizeof(LogicalRepWorker)); + if (!worker.proc || !IsBackendPid(worker.proc->pid)) + continue; + + if (OidIsValid(subid) && worker.subid != subid) + continue; + + worker_pid = worker.proc->pid; + + values[0] = ObjectIdGetDatum(worker.subid); + if (OidIsValid(worker.relid)) + values[1] = ObjectIdGetDatum(worker.relid); + else + nulls[1] = true; + values[2] = Int32GetDatum(worker_pid); + + if (isParallelApplyWorker(&worker)) + values[3] = Int32GetDatum(worker.leader_pid); + else + nulls[3] = true; + + if (XLogRecPtrIsInvalid(worker.last_lsn)) + nulls[4] = true; + else + values[4] = LSNGetDatum(worker.last_lsn); + if (worker.last_send_time == 0) + nulls[5] = true; + else + values[5] = TimestampTzGetDatum(worker.last_send_time); + if (worker.last_recv_time == 0) + nulls[6] = true; + else + values[6] = TimestampTzGetDatum(worker.last_recv_time); + if (XLogRecPtrIsInvalid(worker.reply_lsn)) + nulls[7] = true; + else + values[7] = LSNGetDatum(worker.reply_lsn); + if (worker.reply_time == 0) + nulls[8] = true; + else + values[8] = TimestampTzGetDatum(worker.reply_time); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + + /* + * If only a single subscription was requested, and we found it, + * break. + */ + if (OidIsValid(subid)) + break; + } + + LWLockRelease(LogicalRepWorkerLock); + + return (Datum) 0; +} diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c new file mode 100644 index 0000000..41243d0 --- /dev/null +++ b/src/backend/replication/logical/logical.c @@ -0,0 +1,1951 @@ +/*------------------------------------------------------------------------- + * logical.c + * PostgreSQL logical decoding coordination + * + * Copyright (c) 2012-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/logical.c + * + * NOTES + * This file coordinates interaction between the various modules that + * together provide logical decoding, primarily by providing so + * called LogicalDecodingContexts. The goal is to encapsulate most of the + * internal complexity for consumers of logical decoding, so they can + * create and consume a changestream with a low amount of code. Builtin + * consumers are the walsender and SQL SRF interface, but it's possible to + * add further ones without changing core code, e.g. to consume changes in + * a bgworker. + * + * The idea is that a consumer provides three callbacks, one to read WAL, + * one to prepare a data write, and a final one for actually writing since + * their implementation depends on the type of consumer. Check + * logicalfuncs.c for an example implementation of a fairly simple consumer + * and an implementation of a WAL reading callback that's suitable for + * simple consumers. + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "fmgr.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/origin.h" +#include "replication/reorderbuffer.h" +#include "replication/snapbuild.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "utils/builtins.h" +#include "utils/memutils.h" + +/* data for errcontext callback */ +typedef struct LogicalErrorCallbackState +{ + LogicalDecodingContext *ctx; + const char *callback_name; + XLogRecPtr report_location; +} LogicalErrorCallbackState; + +/* wrappers around output plugin callbacks */ +static void output_plugin_error_callback(void *arg); +static void startup_cb_wrapper(LogicalDecodingContext *ctx, OutputPluginOptions *opt, + bool is_init); +static void shutdown_cb_wrapper(LogicalDecodingContext *ctx); +static void begin_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn); +static void commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn); +static void begin_prepare_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn); +static void prepare_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr prepare_lsn); +static void commit_prepared_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn); +static void rollback_prepared_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr prepare_end_lsn, TimestampTz prepare_time); +static void change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change); +static void truncate_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + int nrelations, Relation relations[], ReorderBufferChange *change); +static void message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr message_lsn, bool transactional, + const char *prefix, Size message_size, const char *message); + +/* streaming callbacks */ +static void stream_start_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr first_lsn); +static void stream_stop_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr last_lsn); +static void stream_abort_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr abort_lsn); +static void stream_prepare_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr prepare_lsn); +static void stream_commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn); +static void stream_change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change); +static void stream_message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr message_lsn, bool transactional, + const char *prefix, Size message_size, const char *message); +static void stream_truncate_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + int nrelations, Relation relations[], ReorderBufferChange *change); + +/* callback to update txn's progress */ +static void update_progress_txn_cb_wrapper(ReorderBuffer *cache, + ReorderBufferTXN *txn, + XLogRecPtr lsn); + +static void LoadOutputPlugin(OutputPluginCallbacks *callbacks, const char *plugin); + +/* + * Make sure the current settings & environment are capable of doing logical + * decoding. + */ +void +CheckLogicalDecodingRequirements(void) +{ + CheckSlotRequirements(); + + /* + * NB: Adding a new requirement likely means that RestoreSlotFromDisk() + * needs the same check. + */ + + if (wal_level < WAL_LEVEL_LOGICAL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical decoding requires wal_level >= logical"))); + + if (MyDatabaseId == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical decoding requires a database connection"))); + + if (RecoveryInProgress()) + { + /* + * This check may have race conditions, but whenever + * XLOG_PARAMETER_CHANGE indicates that wal_level has changed, we + * verify that there are no existing logical replication slots. And to + * avoid races around creating a new slot, + * CheckLogicalDecodingRequirements() is called once before creating + * the slot, and once when logical decoding is initially starting up. + */ + if (GetActiveWalLevelOnStandby() < WAL_LEVEL_LOGICAL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical decoding on standby requires wal_level >= logical on the primary"))); + } +} + +/* + * Helper function for CreateInitDecodingContext() and + * CreateDecodingContext() performing common tasks. + */ +static LogicalDecodingContext * +StartupDecodingContext(List *output_plugin_options, + XLogRecPtr start_lsn, + TransactionId xmin_horizon, + bool need_full_snapshot, + bool fast_forward, + XLogReaderRoutine *xl_routine, + LogicalOutputPluginWriterPrepareWrite prepare_write, + LogicalOutputPluginWriterWrite do_write, + LogicalOutputPluginWriterUpdateProgress update_progress) +{ + ReplicationSlot *slot; + MemoryContext context, + old_context; + LogicalDecodingContext *ctx; + + /* shorter lines... */ + slot = MyReplicationSlot; + + context = AllocSetContextCreate(CurrentMemoryContext, + "Logical decoding context", + ALLOCSET_DEFAULT_SIZES); + old_context = MemoryContextSwitchTo(context); + ctx = palloc0(sizeof(LogicalDecodingContext)); + + ctx->context = context; + + /* + * (re-)load output plugins, so we detect a bad (removed) output plugin + * now. + */ + if (!fast_forward) + LoadOutputPlugin(&ctx->callbacks, NameStr(slot->data.plugin)); + + /* + * Now that the slot's xmin has been set, we can announce ourselves as a + * logical decoding backend which doesn't need to be checked individually + * when computing the xmin horizon because the xmin is enforced via + * replication slots. + * + * We can only do so if we're outside of a transaction (i.e. the case when + * streaming changes via walsender), otherwise an already setup + * snapshot/xid would end up being ignored. That's not a particularly + * bothersome restriction since the SQL interface can't be used for + * streaming anyway. + */ + if (!IsTransactionOrTransactionBlock()) + { + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + MyProc->statusFlags |= PROC_IN_LOGICAL_DECODING; + ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags; + LWLockRelease(ProcArrayLock); + } + + ctx->slot = slot; + + ctx->reader = XLogReaderAllocate(wal_segment_size, NULL, xl_routine, ctx); + if (!ctx->reader) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed while allocating a WAL reading processor."))); + + ctx->reorder = ReorderBufferAllocate(); + ctx->snapshot_builder = + AllocateSnapshotBuilder(ctx->reorder, xmin_horizon, start_lsn, + need_full_snapshot, slot->data.two_phase_at); + + ctx->reorder->private_data = ctx; + + /* wrap output plugin callbacks, so we can add error context information */ + ctx->reorder->begin = begin_cb_wrapper; + ctx->reorder->apply_change = change_cb_wrapper; + ctx->reorder->apply_truncate = truncate_cb_wrapper; + ctx->reorder->commit = commit_cb_wrapper; + ctx->reorder->message = message_cb_wrapper; + + /* + * To support streaming, we require start/stop/abort/commit/change + * callbacks. The message and truncate callbacks are optional, similar to + * regular output plugins. We however enable streaming when at least one + * of the methods is enabled so that we can easily identify missing + * methods. + * + * We decide it here, but only check it later in the wrappers. + */ + ctx->streaming = (ctx->callbacks.stream_start_cb != NULL) || + (ctx->callbacks.stream_stop_cb != NULL) || + (ctx->callbacks.stream_abort_cb != NULL) || + (ctx->callbacks.stream_commit_cb != NULL) || + (ctx->callbacks.stream_change_cb != NULL) || + (ctx->callbacks.stream_message_cb != NULL) || + (ctx->callbacks.stream_truncate_cb != NULL); + + /* + * streaming callbacks + * + * stream_message and stream_truncate callbacks are optional, so we do not + * fail with ERROR when missing, but the wrappers simply do nothing. We + * must set the ReorderBuffer callbacks to something, otherwise the calls + * from there will crash (we don't want to move the checks there). + */ + ctx->reorder->stream_start = stream_start_cb_wrapper; + ctx->reorder->stream_stop = stream_stop_cb_wrapper; + ctx->reorder->stream_abort = stream_abort_cb_wrapper; + ctx->reorder->stream_prepare = stream_prepare_cb_wrapper; + ctx->reorder->stream_commit = stream_commit_cb_wrapper; + ctx->reorder->stream_change = stream_change_cb_wrapper; + ctx->reorder->stream_message = stream_message_cb_wrapper; + ctx->reorder->stream_truncate = stream_truncate_cb_wrapper; + + + /* + * To support two-phase logical decoding, we require + * begin_prepare/prepare/commit-prepare/abort-prepare callbacks. The + * filter_prepare callback is optional. We however enable two-phase + * logical decoding when at least one of the methods is enabled so that we + * can easily identify missing methods. + * + * We decide it here, but only check it later in the wrappers. + */ + ctx->twophase = (ctx->callbacks.begin_prepare_cb != NULL) || + (ctx->callbacks.prepare_cb != NULL) || + (ctx->callbacks.commit_prepared_cb != NULL) || + (ctx->callbacks.rollback_prepared_cb != NULL) || + (ctx->callbacks.stream_prepare_cb != NULL) || + (ctx->callbacks.filter_prepare_cb != NULL); + + /* + * Callback to support decoding at prepare time. + */ + ctx->reorder->begin_prepare = begin_prepare_cb_wrapper; + ctx->reorder->prepare = prepare_cb_wrapper; + ctx->reorder->commit_prepared = commit_prepared_cb_wrapper; + ctx->reorder->rollback_prepared = rollback_prepared_cb_wrapper; + + /* + * Callback to support updating progress during sending data of a + * transaction (and its subtransactions) to the output plugin. + */ + ctx->reorder->update_progress_txn = update_progress_txn_cb_wrapper; + + ctx->out = makeStringInfo(); + ctx->prepare_write = prepare_write; + ctx->write = do_write; + ctx->update_progress = update_progress; + + ctx->output_plugin_options = output_plugin_options; + + ctx->fast_forward = fast_forward; + + MemoryContextSwitchTo(old_context); + + return ctx; +} + +/* + * Create a new decoding context, for a new logical slot. + * + * plugin -- contains the name of the output plugin + * output_plugin_options -- contains options passed to the output plugin + * need_full_snapshot -- if true, must obtain a snapshot able to read all + * tables; if false, one that can read only catalogs is acceptable. + * restart_lsn -- if given as invalid, it's this routine's responsibility to + * mark WAL as reserved by setting a convenient restart_lsn for the slot. + * Otherwise, we set for decoding to start from the given LSN without + * marking WAL reserved beforehand. In that scenario, it's up to the + * caller to guarantee that WAL remains available. + * xl_routine -- XLogReaderRoutine for underlying XLogReader + * prepare_write, do_write, update_progress -- + * callbacks that perform the use-case dependent, actual, work. + * + * Needs to be called while in a memory context that's at least as long lived + * as the decoding context because further memory contexts will be created + * inside it. + * + * Returns an initialized decoding context after calling the output plugin's + * startup function. + */ +LogicalDecodingContext * +CreateInitDecodingContext(const char *plugin, + List *output_plugin_options, + bool need_full_snapshot, + XLogRecPtr restart_lsn, + XLogReaderRoutine *xl_routine, + LogicalOutputPluginWriterPrepareWrite prepare_write, + LogicalOutputPluginWriterWrite do_write, + LogicalOutputPluginWriterUpdateProgress update_progress) +{ + TransactionId xmin_horizon = InvalidTransactionId; + ReplicationSlot *slot; + NameData plugin_name; + LogicalDecodingContext *ctx; + MemoryContext old_context; + + /* + * On a standby, this check is also required while creating the slot. + * Check the comments in the function. + */ + CheckLogicalDecodingRequirements(); + + /* shorter lines... */ + slot = MyReplicationSlot; + + /* first some sanity checks that are unlikely to be violated */ + if (slot == NULL) + elog(ERROR, "cannot perform logical decoding without an acquired slot"); + + if (plugin == NULL) + elog(ERROR, "cannot initialize logical decoding without a specified plugin"); + + /* Make sure the passed slot is suitable. These are user facing errors. */ + if (SlotIsPhysical(slot)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot use physical replication slot for logical decoding"))); + + if (slot->data.database != MyDatabaseId) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("replication slot \"%s\" was not created in this database", + NameStr(slot->data.name)))); + + if (IsTransactionState() && + GetTopTransactionIdIfAny() != InvalidTransactionId) + ereport(ERROR, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + errmsg("cannot create logical replication slot in transaction that has performed writes"))); + + /* + * Register output plugin name with slot. We need the mutex to avoid + * concurrent reading of a partially copied string. But we don't want any + * complicated code while holding a spinlock, so do namestrcpy() outside. + */ + namestrcpy(&plugin_name, plugin); + SpinLockAcquire(&slot->mutex); + slot->data.plugin = plugin_name; + SpinLockRelease(&slot->mutex); + + if (XLogRecPtrIsInvalid(restart_lsn)) + ReplicationSlotReserveWal(); + else + { + SpinLockAcquire(&slot->mutex); + slot->data.restart_lsn = restart_lsn; + SpinLockRelease(&slot->mutex); + } + + /* ---- + * This is a bit tricky: We need to determine a safe xmin horizon to start + * decoding from, to avoid starting from a running xacts record referring + * to xids whose rows have been vacuumed or pruned + * already. GetOldestSafeDecodingTransactionId() returns such a value, but + * without further interlock its return value might immediately be out of + * date. + * + * So we have to acquire the ProcArrayLock to prevent computation of new + * xmin horizons by other backends, get the safe decoding xid, and inform + * the slot machinery about the new limit. Once that's done the + * ProcArrayLock can be released as the slot machinery now is + * protecting against vacuum. + * + * Note that, temporarily, the data, not just the catalog, xmin has to be + * reserved if a data snapshot is to be exported. Otherwise the initial + * data snapshot created here is not guaranteed to be valid. After that + * the data xmin doesn't need to be managed anymore and the global xmin + * should be recomputed. As we are fine with losing the pegged data xmin + * after crash - no chance a snapshot would get exported anymore - we can + * get away with just setting the slot's + * effective_xmin. ReplicationSlotRelease will reset it again. + * + * ---- + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + xmin_horizon = GetOldestSafeDecodingTransactionId(!need_full_snapshot); + + SpinLockAcquire(&slot->mutex); + slot->effective_catalog_xmin = xmin_horizon; + slot->data.catalog_xmin = xmin_horizon; + if (need_full_snapshot) + slot->effective_xmin = xmin_horizon; + SpinLockRelease(&slot->mutex); + + ReplicationSlotsComputeRequiredXmin(true); + + LWLockRelease(ProcArrayLock); + + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + + ctx = StartupDecodingContext(NIL, restart_lsn, xmin_horizon, + need_full_snapshot, false, + xl_routine, prepare_write, do_write, + update_progress); + + /* call output plugin initialization callback */ + old_context = MemoryContextSwitchTo(ctx->context); + if (ctx->callbacks.startup_cb != NULL) + startup_cb_wrapper(ctx, &ctx->options, true); + MemoryContextSwitchTo(old_context); + + /* + * We allow decoding of prepared transactions when the two_phase is + * enabled at the time of slot creation, or when the two_phase option is + * given at the streaming start, provided the plugin supports all the + * callbacks for two-phase. + */ + ctx->twophase &= slot->data.two_phase; + + ctx->reorder->output_rewrites = ctx->options.receive_rewrites; + + return ctx; +} + +/* + * Create a new decoding context, for a logical slot that has previously been + * used already. + * + * start_lsn + * The LSN at which to start decoding. If InvalidXLogRecPtr, restart + * from the slot's confirmed_flush; otherwise, start from the specified + * location (but move it forwards to confirmed_flush if it's older than + * that, see below). + * + * output_plugin_options + * options passed to the output plugin. + * + * fast_forward + * bypass the generation of logical changes. + * + * xl_routine + * XLogReaderRoutine used by underlying xlogreader + * + * prepare_write, do_write, update_progress + * callbacks that have to be filled to perform the use-case dependent, + * actual work. + * + * Needs to be called while in a memory context that's at least as long lived + * as the decoding context because further memory contexts will be created + * inside it. + * + * Returns an initialized decoding context after calling the output plugin's + * startup function. + */ +LogicalDecodingContext * +CreateDecodingContext(XLogRecPtr start_lsn, + List *output_plugin_options, + bool fast_forward, + XLogReaderRoutine *xl_routine, + LogicalOutputPluginWriterPrepareWrite prepare_write, + LogicalOutputPluginWriterWrite do_write, + LogicalOutputPluginWriterUpdateProgress update_progress) +{ + LogicalDecodingContext *ctx; + ReplicationSlot *slot; + MemoryContext old_context; + + /* shorter lines... */ + slot = MyReplicationSlot; + + /* first some sanity checks that are unlikely to be violated */ + if (slot == NULL) + elog(ERROR, "cannot perform logical decoding without an acquired slot"); + + /* make sure the passed slot is suitable, these are user facing errors */ + if (SlotIsPhysical(slot)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot use physical replication slot for logical decoding"))); + + if (slot->data.database != MyDatabaseId) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("replication slot \"%s\" was not created in this database", + NameStr(slot->data.name)))); + + /* + * Check if slot has been invalidated due to max_slot_wal_keep_size. Avoid + * "cannot get changes" wording in this errmsg because that'd be + * confusingly ambiguous about no changes being available when called from + * pg_logical_slot_get_changes_guts(). + */ + if (MyReplicationSlot->data.invalidated == RS_INVAL_WAL_REMOVED) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("can no longer get changes from replication slot \"%s\"", + NameStr(MyReplicationSlot->data.name)), + errdetail("This slot has been invalidated because it exceeded the maximum reserved size."))); + + if (MyReplicationSlot->data.invalidated != RS_INVAL_NONE) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("can no longer get changes from replication slot \"%s\"", + NameStr(MyReplicationSlot->data.name)), + errdetail("This slot has been invalidated because it was conflicting with recovery."))); + + Assert(MyReplicationSlot->data.invalidated == RS_INVAL_NONE); + Assert(MyReplicationSlot->data.restart_lsn != InvalidXLogRecPtr); + + if (start_lsn == InvalidXLogRecPtr) + { + /* continue from last position */ + start_lsn = slot->data.confirmed_flush; + } + else if (start_lsn < slot->data.confirmed_flush) + { + /* + * It might seem like we should error out in this case, but it's + * pretty common for a client to acknowledge a LSN it doesn't have to + * do anything for, and thus didn't store persistently, because the + * xlog records didn't result in anything relevant for logical + * decoding. Clients have to be able to do that to support synchronous + * replication. + * + * Starting at a different LSN than requested might not catch certain + * kinds of client errors; so the client may wish to check that + * confirmed_flush_lsn matches its expectations. + */ + elog(LOG, "%X/%X has been already streamed, forwarding to %X/%X", + LSN_FORMAT_ARGS(start_lsn), + LSN_FORMAT_ARGS(slot->data.confirmed_flush)); + + start_lsn = slot->data.confirmed_flush; + } + + ctx = StartupDecodingContext(output_plugin_options, + start_lsn, InvalidTransactionId, false, + fast_forward, xl_routine, prepare_write, + do_write, update_progress); + + /* call output plugin initialization callback */ + old_context = MemoryContextSwitchTo(ctx->context); + if (ctx->callbacks.startup_cb != NULL) + startup_cb_wrapper(ctx, &ctx->options, false); + MemoryContextSwitchTo(old_context); + + /* + * We allow decoding of prepared transactions when the two_phase is + * enabled at the time of slot creation, or when the two_phase option is + * given at the streaming start, provided the plugin supports all the + * callbacks for two-phase. + */ + ctx->twophase &= (slot->data.two_phase || ctx->twophase_opt_given); + + /* Mark slot to allow two_phase decoding if not already marked */ + if (ctx->twophase && !slot->data.two_phase) + { + SpinLockAcquire(&slot->mutex); + slot->data.two_phase = true; + slot->data.two_phase_at = start_lsn; + SpinLockRelease(&slot->mutex); + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + SnapBuildSetTwoPhaseAt(ctx->snapshot_builder, start_lsn); + } + + ctx->reorder->output_rewrites = ctx->options.receive_rewrites; + + ereport(LOG, + (errmsg("starting logical decoding for slot \"%s\"", + NameStr(slot->data.name)), + errdetail("Streaming transactions committing after %X/%X, reading WAL from %X/%X.", + LSN_FORMAT_ARGS(slot->data.confirmed_flush), + LSN_FORMAT_ARGS(slot->data.restart_lsn)))); + + return ctx; +} + +/* + * Returns true if a consistent initial decoding snapshot has been built. + */ +bool +DecodingContextReady(LogicalDecodingContext *ctx) +{ + return SnapBuildCurrentState(ctx->snapshot_builder) == SNAPBUILD_CONSISTENT; +} + +/* + * Read from the decoding slot, until it is ready to start extracting changes. + */ +void +DecodingContextFindStartpoint(LogicalDecodingContext *ctx) +{ + ReplicationSlot *slot = ctx->slot; + + /* Initialize from where to start reading WAL. */ + XLogBeginRead(ctx->reader, slot->data.restart_lsn); + + elog(DEBUG1, "searching for logical decoding starting point, starting at %X/%X", + LSN_FORMAT_ARGS(slot->data.restart_lsn)); + + /* Wait for a consistent starting point */ + for (;;) + { + XLogRecord *record; + char *err = NULL; + + /* the read_page callback waits for new WAL */ + record = XLogReadRecord(ctx->reader, &err); + if (err) + elog(ERROR, "could not find logical decoding starting point: %s", err); + if (!record) + elog(ERROR, "could not find logical decoding starting point"); + + LogicalDecodingProcessRecord(ctx, ctx->reader); + + /* only continue till we found a consistent spot */ + if (DecodingContextReady(ctx)) + break; + + CHECK_FOR_INTERRUPTS(); + } + + SpinLockAcquire(&slot->mutex); + slot->data.confirmed_flush = ctx->reader->EndRecPtr; + if (slot->data.two_phase) + slot->data.two_phase_at = ctx->reader->EndRecPtr; + SpinLockRelease(&slot->mutex); +} + +/* + * Free a previously allocated decoding context, invoking the shutdown + * callback if necessary. + */ +void +FreeDecodingContext(LogicalDecodingContext *ctx) +{ + if (ctx->callbacks.shutdown_cb != NULL) + shutdown_cb_wrapper(ctx); + + ReorderBufferFree(ctx->reorder); + FreeSnapshotBuilder(ctx->snapshot_builder); + XLogReaderFree(ctx->reader); + MemoryContextDelete(ctx->context); +} + +/* + * Prepare a write using the context's output routine. + */ +void +OutputPluginPrepareWrite(struct LogicalDecodingContext *ctx, bool last_write) +{ + if (!ctx->accept_writes) + elog(ERROR, "writes are only accepted in commit, begin and change callbacks"); + + ctx->prepare_write(ctx, ctx->write_location, ctx->write_xid, last_write); + ctx->prepared_write = true; +} + +/* + * Perform a write using the context's output routine. + */ +void +OutputPluginWrite(struct LogicalDecodingContext *ctx, bool last_write) +{ + if (!ctx->prepared_write) + elog(ERROR, "OutputPluginPrepareWrite needs to be called before OutputPluginWrite"); + + ctx->write(ctx, ctx->write_location, ctx->write_xid, last_write); + ctx->prepared_write = false; +} + +/* + * Update progress tracking (if supported). + */ +void +OutputPluginUpdateProgress(struct LogicalDecodingContext *ctx, + bool skipped_xact) +{ + if (!ctx->update_progress) + return; + + ctx->update_progress(ctx, ctx->write_location, ctx->write_xid, + skipped_xact); +} + +/* + * Load the output plugin, lookup its output plugin init function, and check + * that it provides the required callbacks. + */ +static void +LoadOutputPlugin(OutputPluginCallbacks *callbacks, const char *plugin) +{ + LogicalOutputPluginInit plugin_init; + + plugin_init = (LogicalOutputPluginInit) + load_external_function(plugin, "_PG_output_plugin_init", false, NULL); + + if (plugin_init == NULL) + elog(ERROR, "output plugins have to declare the _PG_output_plugin_init symbol"); + + /* ask the output plugin to fill the callback struct */ + plugin_init(callbacks); + + if (callbacks->begin_cb == NULL) + elog(ERROR, "output plugins have to register a begin callback"); + if (callbacks->change_cb == NULL) + elog(ERROR, "output plugins have to register a change callback"); + if (callbacks->commit_cb == NULL) + elog(ERROR, "output plugins have to register a commit callback"); +} + +static void +output_plugin_error_callback(void *arg) +{ + LogicalErrorCallbackState *state = (LogicalErrorCallbackState *) arg; + + /* not all callbacks have an associated LSN */ + if (state->report_location != InvalidXLogRecPtr) + errcontext("slot \"%s\", output plugin \"%s\", in the %s callback, associated LSN %X/%X", + NameStr(state->ctx->slot->data.name), + NameStr(state->ctx->slot->data.plugin), + state->callback_name, + LSN_FORMAT_ARGS(state->report_location)); + else + errcontext("slot \"%s\", output plugin \"%s\", in the %s callback", + NameStr(state->ctx->slot->data.name), + NameStr(state->ctx->slot->data.plugin), + state->callback_name); +} + +static void +startup_cb_wrapper(LogicalDecodingContext *ctx, OutputPluginOptions *opt, bool is_init) +{ + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "startup"; + state.report_location = InvalidXLogRecPtr; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = false; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ctx->callbacks.startup_cb(ctx, opt, is_init); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +shutdown_cb_wrapper(LogicalDecodingContext *ctx) +{ + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "shutdown"; + state.report_location = InvalidXLogRecPtr; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = false; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ctx->callbacks.shutdown_cb(ctx); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + + +/* + * Callbacks for ReorderBuffer which add in some more information and then call + * output_plugin.h plugins. + */ +static void +begin_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "begin"; + state.report_location = txn->first_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->first_lsn; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ctx->callbacks.begin_cb(ctx, txn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "commit"; + state.report_location = txn->final_lsn; /* beginning of commit record */ + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->end_lsn; /* points to the end of the record */ + ctx->end_xact = true; + + /* do the actual work: call callback */ + ctx->callbacks.commit_cb(ctx, txn, commit_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +/* + * The functionality of begin_prepare is quite similar to begin with the + * exception that this will have gid (global transaction id) information which + * can be used by plugin. Now, we thought about extending the existing begin + * but that would break the replication protocol and additionally this looks + * cleaner. + */ +static void +begin_prepare_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when two-phase commits are supported */ + Assert(ctx->twophase); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "begin_prepare"; + state.report_location = txn->first_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->first_lsn; + ctx->end_xact = false; + + /* + * If the plugin supports two-phase commits then begin prepare callback is + * mandatory + */ + if (ctx->callbacks.begin_prepare_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication at prepare time requires a %s callback", + "begin_prepare_cb"))); + + /* do the actual work: call callback */ + ctx->callbacks.begin_prepare_cb(ctx, txn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +prepare_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr prepare_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when two-phase commits are supported */ + Assert(ctx->twophase); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "prepare"; + state.report_location = txn->final_lsn; /* beginning of prepare record */ + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->end_lsn; /* points to the end of the record */ + ctx->end_xact = true; + + /* + * If the plugin supports two-phase commits then prepare callback is + * mandatory + */ + if (ctx->callbacks.prepare_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication at prepare time requires a %s callback", + "prepare_cb"))); + + /* do the actual work: call callback */ + ctx->callbacks.prepare_cb(ctx, txn, prepare_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +commit_prepared_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when two-phase commits are supported */ + Assert(ctx->twophase); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "commit_prepared"; + state.report_location = txn->final_lsn; /* beginning of commit record */ + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->end_lsn; /* points to the end of the record */ + ctx->end_xact = true; + + /* + * If the plugin support two-phase commits then commit prepared callback + * is mandatory + */ + if (ctx->callbacks.commit_prepared_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication at prepare time requires a %s callback", + "commit_prepared_cb"))); + + /* do the actual work: call callback */ + ctx->callbacks.commit_prepared_cb(ctx, txn, commit_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +rollback_prepared_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr prepare_end_lsn, + TimestampTz prepare_time) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when two-phase commits are supported */ + Assert(ctx->twophase); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "rollback_prepared"; + state.report_location = txn->final_lsn; /* beginning of commit record */ + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->end_lsn; /* points to the end of the record */ + ctx->end_xact = true; + + /* + * If the plugin support two-phase commits then rollback prepared callback + * is mandatory + */ + if (ctx->callbacks.rollback_prepared_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication at prepare time requires a %s callback", + "rollback_prepared_cb"))); + + /* do the actual work: call callback */ + ctx->callbacks.rollback_prepared_cb(ctx, txn, prepare_end_lsn, + prepare_time); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "change"; + state.report_location = change->lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * Report this change's lsn so replies from clients can give an up-to-date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = change->lsn; + + ctx->end_xact = false; + + ctx->callbacks.change_cb(ctx, txn, relation, change); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +truncate_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + int nrelations, Relation relations[], ReorderBufferChange *change) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + if (!ctx->callbacks.truncate_cb) + return; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "truncate"; + state.report_location = change->lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * Report this change's lsn so replies from clients can give an up-to-date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = change->lsn; + + ctx->end_xact = false; + + ctx->callbacks.truncate_cb(ctx, txn, nrelations, relations, change); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +bool +filter_prepare_cb_wrapper(LogicalDecodingContext *ctx, TransactionId xid, + const char *gid) +{ + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + bool ret; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "filter_prepare"; + state.report_location = InvalidXLogRecPtr; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = false; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ret = ctx->callbacks.filter_prepare_cb(ctx, xid, gid); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + + return ret; +} + +bool +filter_by_origin_cb_wrapper(LogicalDecodingContext *ctx, RepOriginId origin_id) +{ + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + bool ret; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "filter_by_origin"; + state.report_location = InvalidXLogRecPtr; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = false; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ret = ctx->callbacks.filter_by_origin_cb(ctx, origin_id); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + + return ret; +} + +static void +message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr message_lsn, bool transactional, + const char *prefix, Size message_size, const char *message) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + if (ctx->callbacks.message_cb == NULL) + return; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "message"; + state.report_location = message_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn != NULL ? txn->xid : InvalidTransactionId; + ctx->write_location = message_lsn; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ctx->callbacks.message_cb(ctx, txn, message_lsn, transactional, prefix, + message_size, message); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_start_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr first_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_start"; + state.report_location = first_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * Report this message's lsn so replies from clients can give an + * up-to-date answer. This won't ever be enough (and shouldn't be!) to + * confirm receipt of this transaction, but it might allow another + * transaction's commit to be confirmed with one message. + */ + ctx->write_location = first_lsn; + + ctx->end_xact = false; + + /* in streaming mode, stream_start_cb is required */ + if (ctx->callbacks.stream_start_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical streaming requires a %s callback", + "stream_start_cb"))); + + ctx->callbacks.stream_start_cb(ctx, txn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_stop_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr last_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_stop"; + state.report_location = last_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * Report this message's lsn so replies from clients can give an + * up-to-date answer. This won't ever be enough (and shouldn't be!) to + * confirm receipt of this transaction, but it might allow another + * transaction's commit to be confirmed with one message. + */ + ctx->write_location = last_lsn; + + ctx->end_xact = false; + + /* in streaming mode, stream_stop_cb is required */ + if (ctx->callbacks.stream_stop_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical streaming requires a %s callback", + "stream_stop_cb"))); + + ctx->callbacks.stream_stop_cb(ctx, txn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_abort_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr abort_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_abort"; + state.report_location = abort_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = abort_lsn; + ctx->end_xact = true; + + /* in streaming mode, stream_abort_cb is required */ + if (ctx->callbacks.stream_abort_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical streaming requires a %s callback", + "stream_abort_cb"))); + + ctx->callbacks.stream_abort_cb(ctx, txn, abort_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_prepare_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr prepare_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* + * We're only supposed to call this when streaming and two-phase commits + * are supported. + */ + Assert(ctx->streaming); + Assert(ctx->twophase); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_prepare"; + state.report_location = txn->final_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->end_lsn; + ctx->end_xact = true; + + /* in streaming mode with two-phase commits, stream_prepare_cb is required */ + if (ctx->callbacks.stream_prepare_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical streaming at prepare time requires a %s callback", + "stream_prepare_cb"))); + + ctx->callbacks.stream_prepare_cb(ctx, txn, prepare_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_commit_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_commit"; + state.report_location = txn->final_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + ctx->write_location = txn->end_lsn; + ctx->end_xact = true; + + /* in streaming mode, stream_commit_cb is required */ + if (ctx->callbacks.stream_commit_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical streaming requires a %s callback", + "stream_commit_cb"))); + + ctx->callbacks.stream_commit_cb(ctx, txn, commit_lsn); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_change_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_change"; + state.report_location = change->lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * Report this change's lsn so replies from clients can give an up-to-date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = change->lsn; + + ctx->end_xact = false; + + /* in streaming mode, stream_change_cb is required */ + if (ctx->callbacks.stream_change_cb == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical streaming requires a %s callback", + "stream_change_cb"))); + + ctx->callbacks.stream_change_cb(ctx, txn, relation, change); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr message_lsn, bool transactional, + const char *prefix, Size message_size, const char *message) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* this callback is optional */ + if (ctx->callbacks.stream_message_cb == NULL) + return; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_message"; + state.report_location = message_lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn != NULL ? txn->xid : InvalidTransactionId; + ctx->write_location = message_lsn; + ctx->end_xact = false; + + /* do the actual work: call callback */ + ctx->callbacks.stream_message_cb(ctx, txn, message_lsn, transactional, prefix, + message_size, message); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +stream_truncate_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + int nrelations, Relation relations[], + ReorderBufferChange *change) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* We're only supposed to call this when streaming is supported. */ + Assert(ctx->streaming); + + /* this callback is optional */ + if (!ctx->callbacks.stream_truncate_cb) + return; + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "stream_truncate"; + state.report_location = change->lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = true; + ctx->write_xid = txn->xid; + + /* + * Report this change's lsn so replies from clients can give an up-to-date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = change->lsn; + + ctx->end_xact = false; + + ctx->callbacks.stream_truncate_cb(ctx, txn, nrelations, relations, change); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +static void +update_progress_txn_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, + XLogRecPtr lsn) +{ + LogicalDecodingContext *ctx = cache->private_data; + LogicalErrorCallbackState state; + ErrorContextCallback errcallback; + + Assert(!ctx->fast_forward); + + /* Push callback + info on the error context stack */ + state.ctx = ctx; + state.callback_name = "update_progress_txn"; + state.report_location = lsn; + errcallback.callback = output_plugin_error_callback; + errcallback.arg = (void *) &state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* set output state */ + ctx->accept_writes = false; + ctx->write_xid = txn->xid; + + /* + * Report this change's lsn so replies from clients can give an up-to-date + * answer. This won't ever be enough (and shouldn't be!) to confirm + * receipt of this transaction, but it might allow another transaction's + * commit to be confirmed with one message. + */ + ctx->write_location = lsn; + + ctx->end_xact = false; + + OutputPluginUpdateProgress(ctx, false); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +/* + * Set the required catalog xmin horizon for historic snapshots in the current + * replication slot. + * + * Note that in the most cases, we won't be able to immediately use the xmin + * to increase the xmin horizon: we need to wait till the client has confirmed + * receiving current_lsn with LogicalConfirmReceivedLocation(). + */ +void +LogicalIncreaseXminForSlot(XLogRecPtr current_lsn, TransactionId xmin) +{ + bool updated_xmin = false; + ReplicationSlot *slot; + bool got_new_xmin = false; + + slot = MyReplicationSlot; + + Assert(slot != NULL); + + SpinLockAcquire(&slot->mutex); + + /* + * don't overwrite if we already have a newer xmin. This can happen if we + * restart decoding in a slot. + */ + if (TransactionIdPrecedesOrEquals(xmin, slot->data.catalog_xmin)) + { + } + + /* + * If the client has already confirmed up to this lsn, we directly can + * mark this as accepted. This can happen if we restart decoding in a + * slot. + */ + else if (current_lsn <= slot->data.confirmed_flush) + { + slot->candidate_catalog_xmin = xmin; + slot->candidate_xmin_lsn = current_lsn; + + /* our candidate can directly be used */ + updated_xmin = true; + } + + /* + * Only increase if the previous values have been applied, otherwise we + * might never end up updating if the receiver acks too slowly. + */ + else if (slot->candidate_xmin_lsn == InvalidXLogRecPtr) + { + slot->candidate_catalog_xmin = xmin; + slot->candidate_xmin_lsn = current_lsn; + + /* + * Log new xmin at an appropriate log level after releasing the + * spinlock. + */ + got_new_xmin = true; + } + SpinLockRelease(&slot->mutex); + + if (got_new_xmin) + elog(DEBUG1, "got new catalog xmin %u at %X/%X", xmin, + LSN_FORMAT_ARGS(current_lsn)); + + /* candidate already valid with the current flush position, apply */ + if (updated_xmin) + LogicalConfirmReceivedLocation(slot->data.confirmed_flush); +} + +/* + * Mark the minimal LSN (restart_lsn) we need to read to replay all + * transactions that have not yet committed at current_lsn. + * + * Just like LogicalIncreaseXminForSlot this only takes effect when the + * client has confirmed to have received current_lsn. + */ +void +LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart_lsn) +{ + bool updated_lsn = false; + ReplicationSlot *slot; + + slot = MyReplicationSlot; + + Assert(slot != NULL); + Assert(restart_lsn != InvalidXLogRecPtr); + Assert(current_lsn != InvalidXLogRecPtr); + + SpinLockAcquire(&slot->mutex); + + /* don't overwrite if have a newer restart lsn */ + if (restart_lsn <= slot->data.restart_lsn) + { + } + + /* + * We might have already flushed far enough to directly accept this lsn, + * in this case there is no need to check for existing candidate LSNs + */ + else if (current_lsn <= slot->data.confirmed_flush) + { + slot->candidate_restart_valid = current_lsn; + slot->candidate_restart_lsn = restart_lsn; + + /* our candidate can directly be used */ + updated_lsn = true; + } + + /* + * Only increase if the previous values have been applied, otherwise we + * might never end up updating if the receiver acks too slowly. A missed + * value here will just cause some extra effort after reconnecting. + */ + if (slot->candidate_restart_valid == InvalidXLogRecPtr) + { + slot->candidate_restart_valid = current_lsn; + slot->candidate_restart_lsn = restart_lsn; + SpinLockRelease(&slot->mutex); + + elog(DEBUG1, "got new restart lsn %X/%X at %X/%X", + LSN_FORMAT_ARGS(restart_lsn), + LSN_FORMAT_ARGS(current_lsn)); + } + else + { + XLogRecPtr candidate_restart_lsn; + XLogRecPtr candidate_restart_valid; + XLogRecPtr confirmed_flush; + + candidate_restart_lsn = slot->candidate_restart_lsn; + candidate_restart_valid = slot->candidate_restart_valid; + confirmed_flush = slot->data.confirmed_flush; + SpinLockRelease(&slot->mutex); + + elog(DEBUG1, "failed to increase restart lsn: proposed %X/%X, after %X/%X, current candidate %X/%X, current after %X/%X, flushed up to %X/%X", + LSN_FORMAT_ARGS(restart_lsn), + LSN_FORMAT_ARGS(current_lsn), + LSN_FORMAT_ARGS(candidate_restart_lsn), + LSN_FORMAT_ARGS(candidate_restart_valid), + LSN_FORMAT_ARGS(confirmed_flush)); + } + + /* candidates are already valid with the current flush position, apply */ + if (updated_lsn) + LogicalConfirmReceivedLocation(slot->data.confirmed_flush); +} + +/* + * Handle a consumer's confirmation having received all changes up to lsn. + */ +void +LogicalConfirmReceivedLocation(XLogRecPtr lsn) +{ + Assert(lsn != InvalidXLogRecPtr); + + /* Do an unlocked check for candidate_lsn first. */ + if (MyReplicationSlot->candidate_xmin_lsn != InvalidXLogRecPtr || + MyReplicationSlot->candidate_restart_valid != InvalidXLogRecPtr) + { + bool updated_xmin = false; + bool updated_restart = false; + + SpinLockAcquire(&MyReplicationSlot->mutex); + + MyReplicationSlot->data.confirmed_flush = lsn; + + /* if we're past the location required for bumping xmin, do so */ + if (MyReplicationSlot->candidate_xmin_lsn != InvalidXLogRecPtr && + MyReplicationSlot->candidate_xmin_lsn <= lsn) + { + /* + * We have to write the changed xmin to disk *before* we change + * the in-memory value, otherwise after a crash we wouldn't know + * that some catalog tuples might have been removed already. + * + * Ensure that by first writing to ->xmin and only update + * ->effective_xmin once the new state is synced to disk. After a + * crash ->effective_xmin is set to ->xmin. + */ + if (TransactionIdIsValid(MyReplicationSlot->candidate_catalog_xmin) && + MyReplicationSlot->data.catalog_xmin != MyReplicationSlot->candidate_catalog_xmin) + { + MyReplicationSlot->data.catalog_xmin = MyReplicationSlot->candidate_catalog_xmin; + MyReplicationSlot->candidate_catalog_xmin = InvalidTransactionId; + MyReplicationSlot->candidate_xmin_lsn = InvalidXLogRecPtr; + updated_xmin = true; + } + } + + if (MyReplicationSlot->candidate_restart_valid != InvalidXLogRecPtr && + MyReplicationSlot->candidate_restart_valid <= lsn) + { + Assert(MyReplicationSlot->candidate_restart_lsn != InvalidXLogRecPtr); + + MyReplicationSlot->data.restart_lsn = MyReplicationSlot->candidate_restart_lsn; + MyReplicationSlot->candidate_restart_lsn = InvalidXLogRecPtr; + MyReplicationSlot->candidate_restart_valid = InvalidXLogRecPtr; + updated_restart = true; + } + + SpinLockRelease(&MyReplicationSlot->mutex); + + /* first write new xmin to disk, so we know what's up after a crash */ + if (updated_xmin || updated_restart) + { + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + elog(DEBUG1, "updated xmin: %u restart: %u", updated_xmin, updated_restart); + } + + /* + * Now the new xmin is safely on disk, we can let the global value + * advance. We do not take ProcArrayLock or similar since we only + * advance xmin here and there's not much harm done by a concurrent + * computation missing that. + */ + if (updated_xmin) + { + SpinLockAcquire(&MyReplicationSlot->mutex); + MyReplicationSlot->effective_catalog_xmin = MyReplicationSlot->data.catalog_xmin; + SpinLockRelease(&MyReplicationSlot->mutex); + + ReplicationSlotsComputeRequiredXmin(false); + ReplicationSlotsComputeRequiredLSN(); + } + } + else + { + SpinLockAcquire(&MyReplicationSlot->mutex); + MyReplicationSlot->data.confirmed_flush = lsn; + SpinLockRelease(&MyReplicationSlot->mutex); + } +} + +/* + * Clear logical streaming state during (sub)transaction abort. + */ +void +ResetLogicalStreamingState(void) +{ + CheckXidAlive = InvalidTransactionId; + bsysscan = false; +} + +/* + * Report stats for a slot. + */ +void +UpdateDecodingStats(LogicalDecodingContext *ctx) +{ + ReorderBuffer *rb = ctx->reorder; + PgStat_StatReplSlotEntry repSlotStat; + + /* Nothing to do if we don't have any replication stats to be sent. */ + if (rb->spillBytes <= 0 && rb->streamBytes <= 0 && rb->totalBytes <= 0) + return; + + elog(DEBUG2, "UpdateDecodingStats: updating stats %p %lld %lld %lld %lld %lld %lld %lld %lld", + rb, + (long long) rb->spillTxns, + (long long) rb->spillCount, + (long long) rb->spillBytes, + (long long) rb->streamTxns, + (long long) rb->streamCount, + (long long) rb->streamBytes, + (long long) rb->totalTxns, + (long long) rb->totalBytes); + + repSlotStat.spill_txns = rb->spillTxns; + repSlotStat.spill_count = rb->spillCount; + repSlotStat.spill_bytes = rb->spillBytes; + repSlotStat.stream_txns = rb->streamTxns; + repSlotStat.stream_count = rb->streamCount; + repSlotStat.stream_bytes = rb->streamBytes; + repSlotStat.total_txns = rb->totalTxns; + repSlotStat.total_bytes = rb->totalBytes; + + pgstat_report_replslot(ctx->slot, &repSlotStat); + + rb->spillTxns = 0; + rb->spillCount = 0; + rb->spillBytes = 0; + rb->streamTxns = 0; + rb->streamCount = 0; + rb->streamBytes = 0; + rb->totalTxns = 0; + rb->totalBytes = 0; +} diff --git a/src/backend/replication/logical/logicalfuncs.c b/src/backend/replication/logical/logicalfuncs.c new file mode 100644 index 0000000..55a24c0 --- /dev/null +++ b/src/backend/replication/logical/logicalfuncs.c @@ -0,0 +1,377 @@ +/*------------------------------------------------------------------------- + * + * logicalfuncs.c + * + * Support functions for using logical decoding and management of + * logical replication slots via SQL. + * + * + * Copyright (c) 2012-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/logicalfuncs.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <unistd.h> + +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "access/xlogrecovery.h" +#include "access/xlogutils.h" +#include "catalog/pg_type.h" +#include "fmgr.h" +#include "funcapi.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/message.h" +#include "storage/fd.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/inval.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/pg_lsn.h" +#include "utils/regproc.h" +#include "utils/resowner.h" + +/* Private data for writing out data */ +typedef struct DecodingOutputState +{ + Tuplestorestate *tupstore; + TupleDesc tupdesc; + bool binary_output; + int64 returned_rows; +} DecodingOutputState; + +/* + * Prepare for an output plugin write. + */ +static void +LogicalOutputPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, + bool last_write) +{ + resetStringInfo(ctx->out); +} + +/* + * Perform output plugin write into tuplestore. + */ +static void +LogicalOutputWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, + bool last_write) +{ + Datum values[3]; + bool nulls[3]; + DecodingOutputState *p; + + /* SQL Datums can only be of a limited length... */ + if (ctx->out->len > MaxAllocSize - VARHDRSZ) + elog(ERROR, "too much output for sql interface"); + + p = (DecodingOutputState *) ctx->output_writer_private; + + memset(nulls, 0, sizeof(nulls)); + values[0] = LSNGetDatum(lsn); + values[1] = TransactionIdGetDatum(xid); + + /* + * Assert ctx->out is in database encoding when we're writing textual + * output. + */ + if (!p->binary_output) + Assert(pg_verify_mbstr(GetDatabaseEncoding(), + ctx->out->data, ctx->out->len, + false)); + + /* ick, but cstring_to_text_with_len works for bytea perfectly fine */ + values[2] = PointerGetDatum(cstring_to_text_with_len(ctx->out->data, ctx->out->len)); + + tuplestore_putvalues(p->tupstore, p->tupdesc, values, nulls); + p->returned_rows++; +} + +/* + * Helper function for the various SQL callable logical decoding functions. + */ +static Datum +pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool binary) +{ + Name name; + XLogRecPtr upto_lsn; + int32 upto_nchanges; + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + XLogRecPtr end_of_wal; + LogicalDecodingContext *ctx; + ResourceOwner old_resowner = CurrentResourceOwner; + ArrayType *arr; + Size ndim; + List *options = NIL; + DecodingOutputState *p; + + CheckSlotPermissions(); + + CheckLogicalDecodingRequirements(); + + if (PG_ARGISNULL(0)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("slot name must not be null"))); + name = PG_GETARG_NAME(0); + + if (PG_ARGISNULL(1)) + upto_lsn = InvalidXLogRecPtr; + else + upto_lsn = PG_GETARG_LSN(1); + + if (PG_ARGISNULL(2)) + upto_nchanges = InvalidXLogRecPtr; + else + upto_nchanges = PG_GETARG_INT32(2); + + if (PG_ARGISNULL(3)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("options array must not be null"))); + arr = PG_GETARG_ARRAYTYPE_P(3); + + /* state to write output to */ + p = palloc0(sizeof(DecodingOutputState)); + + p->binary_output = binary; + + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + /* Deconstruct options array */ + ndim = ARR_NDIM(arr); + if (ndim > 1) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("array must be one-dimensional"))); + } + else if (array_contains_nulls(arr)) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("array must not contain nulls"))); + } + else if (ndim == 1) + { + int nelems; + Datum *datum_opts; + int i; + + Assert(ARR_ELEMTYPE(arr) == TEXTOID); + + deconstruct_array_builtin(arr, TEXTOID, &datum_opts, NULL, &nelems); + + if (nelems % 2 != 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("array must have even number of elements"))); + + for (i = 0; i < nelems; i += 2) + { + char *name = TextDatumGetCString(datum_opts[i]); + char *opt = TextDatumGetCString(datum_opts[i + 1]); + + options = lappend(options, makeDefElem(name, (Node *) makeString(opt), -1)); + } + } + + InitMaterializedSRF(fcinfo, 0); + p->tupstore = rsinfo->setResult; + p->tupdesc = rsinfo->setDesc; + + /* + * Compute the current end-of-wal. + */ + if (!RecoveryInProgress()) + end_of_wal = GetFlushRecPtr(NULL); + else + end_of_wal = GetXLogReplayRecPtr(NULL); + + ReplicationSlotAcquire(NameStr(*name), true); + + PG_TRY(); + { + /* restart at slot's confirmed_flush */ + ctx = CreateDecodingContext(InvalidXLogRecPtr, + options, + false, + XL_ROUTINE(.page_read = read_local_xlog_page, + .segment_open = wal_segment_open, + .segment_close = wal_segment_close), + LogicalOutputPrepareWrite, + LogicalOutputWrite, NULL); + + MemoryContextSwitchTo(oldcontext); + + /* + * Check whether the output plugin writes textual output if that's + * what we need. + */ + if (!binary && + ctx->options.output_type !=OUTPUT_PLUGIN_TEXTUAL_OUTPUT) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("logical decoding output plugin \"%s\" produces binary output, but function \"%s\" expects textual data", + NameStr(MyReplicationSlot->data.plugin), + format_procedure(fcinfo->flinfo->fn_oid)))); + + ctx->output_writer_private = p; + + /* + * Decoding of WAL must start at restart_lsn so that the entirety of + * xacts that committed after the slot's confirmed_flush can be + * accumulated into reorder buffers. + */ + XLogBeginRead(ctx->reader, MyReplicationSlot->data.restart_lsn); + + /* invalidate non-timetravel entries */ + InvalidateSystemCaches(); + + /* Decode until we run out of records */ + while (ctx->reader->EndRecPtr < end_of_wal) + { + XLogRecord *record; + char *errm = NULL; + + record = XLogReadRecord(ctx->reader, &errm); + if (errm) + elog(ERROR, "could not find record for logical decoding: %s", errm); + + /* + * The {begin_txn,change,commit_txn}_wrapper callbacks above will + * store the description into our tuplestore. + */ + if (record != NULL) + LogicalDecodingProcessRecord(ctx, ctx->reader); + + /* check limits */ + if (upto_lsn != InvalidXLogRecPtr && + upto_lsn <= ctx->reader->EndRecPtr) + break; + if (upto_nchanges != 0 && + upto_nchanges <= p->returned_rows) + break; + CHECK_FOR_INTERRUPTS(); + } + + /* + * Logical decoding could have clobbered CurrentResourceOwner during + * transaction management, so restore the executor's value. (This is + * a kluge, but it's not worth cleaning up right now.) + */ + CurrentResourceOwner = old_resowner; + + /* + * Next time, start where we left off. (Hunting things, the family + * business..) + */ + if (ctx->reader->EndRecPtr != InvalidXLogRecPtr && confirm) + { + LogicalConfirmReceivedLocation(ctx->reader->EndRecPtr); + + /* + * If only the confirmed_flush_lsn has changed the slot won't get + * marked as dirty by the above. Callers on the walsender + * interface are expected to keep track of their own progress and + * don't need it written out. But SQL-interface users cannot + * specify their own start positions and it's harder for them to + * keep track of their progress, so we should make more of an + * effort to save it for them. + * + * Dirty the slot so it's written out at the next checkpoint. + * We'll still lose its position on crash, as documented, but it's + * better than always losing the position even on clean restart. + */ + ReplicationSlotMarkDirty(); + } + + /* free context, call shutdown callback */ + FreeDecodingContext(ctx); + + ReplicationSlotRelease(); + InvalidateSystemCaches(); + } + PG_CATCH(); + { + /* clear all timetravel entries */ + InvalidateSystemCaches(); + + PG_RE_THROW(); + } + PG_END_TRY(); + + return (Datum) 0; +} + +/* + * SQL function returning the changestream as text, consuming the data. + */ +Datum +pg_logical_slot_get_changes(PG_FUNCTION_ARGS) +{ + return pg_logical_slot_get_changes_guts(fcinfo, true, false); +} + +/* + * SQL function returning the changestream as text, only peeking ahead. + */ +Datum +pg_logical_slot_peek_changes(PG_FUNCTION_ARGS) +{ + return pg_logical_slot_get_changes_guts(fcinfo, false, false); +} + +/* + * SQL function returning the changestream in binary, consuming the data. + */ +Datum +pg_logical_slot_get_binary_changes(PG_FUNCTION_ARGS) +{ + return pg_logical_slot_get_changes_guts(fcinfo, true, true); +} + +/* + * SQL function returning the changestream in binary, only peeking ahead. + */ +Datum +pg_logical_slot_peek_binary_changes(PG_FUNCTION_ARGS) +{ + return pg_logical_slot_get_changes_guts(fcinfo, false, true); +} + + +/* + * SQL function for writing logical decoding message into WAL. + */ +Datum +pg_logical_emit_message_bytea(PG_FUNCTION_ARGS) +{ + bool transactional = PG_GETARG_BOOL(0); + char *prefix = text_to_cstring(PG_GETARG_TEXT_PP(1)); + bytea *data = PG_GETARG_BYTEA_PP(2); + XLogRecPtr lsn; + + lsn = LogLogicalMessage(prefix, VARDATA_ANY(data), VARSIZE_ANY_EXHDR(data), + transactional); + PG_RETURN_LSN(lsn); +} + +Datum +pg_logical_emit_message_text(PG_FUNCTION_ARGS) +{ + /* bytea and text are compatible */ + return pg_logical_emit_message_bytea(fcinfo); +} diff --git a/src/backend/replication/logical/meson.build b/src/backend/replication/logical/meson.build new file mode 100644 index 0000000..d48cd4c --- /dev/null +++ b/src/backend/replication/logical/meson.build @@ -0,0 +1,17 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +backend_sources += files( + 'applyparallelworker.c', + 'decode.c', + 'launcher.c', + 'logical.c', + 'logicalfuncs.c', + 'message.c', + 'origin.c', + 'proto.c', + 'relation.c', + 'reorderbuffer.c', + 'snapbuild.c', + 'tablesync.c', + 'worker.c', +) diff --git a/src/backend/replication/logical/message.c b/src/backend/replication/logical/message.c new file mode 100644 index 0000000..c5de14a --- /dev/null +++ b/src/backend/replication/logical/message.c @@ -0,0 +1,89 @@ +/*------------------------------------------------------------------------- + * + * message.c + * Generic logical messages. + * + * Copyright (c) 2013-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/message.c + * + * NOTES + * + * Generic logical messages allow XLOG logging of arbitrary binary blobs that + * get passed to the logical decoding plugin. In normal XLOG processing they + * are same as NOOP. + * + * These messages can be either transactional or non-transactional. + * Transactional messages are part of current transaction and will be sent to + * decoding plugin using in a same way as DML operations. + * Non-transactional messages are sent to the plugin at the time when the + * logical decoding reads them from XLOG. This also means that transactional + * messages won't be delivered if the transaction was rolled back but the + * non-transactional one will always be delivered. + * + * Every message carries prefix to avoid conflicts between different decoding + * plugins. The plugin authors must take extra care to use unique prefix, + * good options seems to be for example to use the name of the extension. + * + * --------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/xact.h" +#include "access/xloginsert.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "replication/logical.h" +#include "replication/message.h" +#include "utils/memutils.h" + +/* + * Write logical decoding message into XLog. + */ +XLogRecPtr +LogLogicalMessage(const char *prefix, const char *message, size_t size, + bool transactional) +{ + xl_logical_message xlrec; + + /* + * Force xid to be allocated if we're emitting a transactional message. + */ + if (transactional) + { + Assert(IsTransactionState()); + GetCurrentTransactionId(); + } + + xlrec.dbId = MyDatabaseId; + xlrec.transactional = transactional; + /* trailing zero is critical; see logicalmsg_desc */ + xlrec.prefix_size = strlen(prefix) + 1; + xlrec.message_size = size; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfLogicalMessage); + XLogRegisterData(unconstify(char *, prefix), xlrec.prefix_size); + XLogRegisterData(unconstify(char *, message), size); + + /* allow origin filtering */ + XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); + + return XLogInsert(RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE); +} + +/* + * Redo is basically just noop for logical decoding messages. + */ +void +logicalmsg_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info != XLOG_LOGICAL_MESSAGE) + elog(PANIC, "logicalmsg_redo: unknown op code %u", info); + + /* This is only interesting for logical decoding, see decode.c. */ +} diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c new file mode 100644 index 0000000..b0255ff --- /dev/null +++ b/src/backend/replication/logical/origin.c @@ -0,0 +1,1581 @@ +/*------------------------------------------------------------------------- + * + * origin.c + * Logical replication progress tracking support. + * + * Copyright (c) 2013-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/origin.c + * + * NOTES + * + * This file provides the following: + * * An infrastructure to name nodes in a replication setup + * * A facility to efficiently store and persist replication progress in an + * efficient and durable manner. + * + * Replication origin consist out of a descriptive, user defined, external + * name and a short, thus space efficient, internal 2 byte one. This split + * exists because replication origin have to be stored in WAL and shared + * memory and long descriptors would be inefficient. For now only use 2 bytes + * for the internal id of a replication origin as it seems unlikely that there + * soon will be more than 65k nodes in one replication setup; and using only + * two bytes allow us to be more space efficient. + * + * Replication progress is tracked in a shared memory table + * (ReplicationState) that's dumped to disk every checkpoint. Entries + * ('slots') in this table are identified by the internal id. That's the case + * because it allows to increase replication progress during crash + * recovery. To allow doing so we store the original LSN (from the originating + * system) of a transaction in the commit record. That allows to recover the + * precise replayed state after crash recovery; without requiring synchronous + * commits. Allowing logical replication to use asynchronous commit is + * generally good for performance, but especially important as it allows a + * single threaded replay process to keep up with a source that has multiple + * backends generating changes concurrently. For efficiency and simplicity + * reasons a backend can setup one replication origin that's from then used as + * the source of changes produced by the backend, until reset again. + * + * This infrastructure is intended to be used in cooperation with logical + * decoding. When replaying from a remote system the configured origin is + * provided to output plugins, allowing prevention of replication loops and + * other filtering. + * + * There are several levels of locking at work: + * + * * To create and drop replication origins an exclusive lock on + * pg_replication_slot is required for the duration. That allows us to + * safely and conflict free assign new origins using a dirty snapshot. + * + * * When creating an in-memory replication progress slot the ReplicationOrigin + * LWLock has to be held exclusively; when iterating over the replication + * progress a shared lock has to be held, the same when advancing the + * replication progress of an individual backend that has not setup as the + * session's replication origin. + * + * * When manipulating or looking at the remote_lsn and local_lsn fields of a + * replication progress slot that slot's lwlock has to be held. That's + * primarily because we do not assume 8 byte writes (the LSN) is atomic on + * all our platforms, but it also simplifies memory ordering concerns + * between the remote and local lsn. We use a lwlock instead of a spinlock + * so it's less harmful to hold the lock over a WAL write + * (cf. AdvanceReplicationProgress). + * + * --------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <unistd.h> +#include <sys/stat.h> + +#include "access/genam.h" +#include "access/htup_details.h" +#include "access/table.h" +#include "access/xact.h" +#include "access/xloginsert.h" +#include "catalog/catalog.h" +#include "catalog/indexing.h" +#include "catalog/pg_subscription.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "pgstat.h" +#include "replication/logical.h" +#include "replication/origin.h" +#include "storage/condition_variable.h" +#include "storage/copydir.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lmgr.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" +#include "utils/pg_lsn.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" + +/* + * Replay progress of a single remote node. + */ +typedef struct ReplicationState +{ + /* + * Local identifier for the remote node. + */ + RepOriginId roident; + + /* + * Location of the latest commit from the remote side. + */ + XLogRecPtr remote_lsn; + + /* + * Remember the local lsn of the commit record so we can XLogFlush() to it + * during a checkpoint so we know the commit record actually is safe on + * disk. + */ + XLogRecPtr local_lsn; + + /* + * PID of backend that's acquired slot, or 0 if none. + */ + int acquired_by; + + /* + * Condition variable that's signaled when acquired_by changes. + */ + ConditionVariable origin_cv; + + /* + * Lock protecting remote_lsn and local_lsn. + */ + LWLock lock; +} ReplicationState; + +/* + * On disk version of ReplicationState. + */ +typedef struct ReplicationStateOnDisk +{ + RepOriginId roident; + XLogRecPtr remote_lsn; +} ReplicationStateOnDisk; + + +typedef struct ReplicationStateCtl +{ + /* Tranche to use for per-origin LWLocks */ + int tranche_id; + /* Array of length max_replication_slots */ + ReplicationState states[FLEXIBLE_ARRAY_MEMBER]; +} ReplicationStateCtl; + +/* external variables */ +RepOriginId replorigin_session_origin = InvalidRepOriginId; /* assumed identity */ +XLogRecPtr replorigin_session_origin_lsn = InvalidXLogRecPtr; +TimestampTz replorigin_session_origin_timestamp = 0; + +/* + * Base address into a shared memory array of replication states of size + * max_replication_slots. + * + * XXX: Should we use a separate variable to size this rather than + * max_replication_slots? + */ +static ReplicationState *replication_states; + +/* + * Actual shared memory block (replication_states[] is now part of this). + */ +static ReplicationStateCtl *replication_states_ctl; + +/* + * Backend-local, cached element from ReplicationState for use in a backend + * replaying remote commits, so we don't have to search ReplicationState for + * the backends current RepOriginId. + */ +static ReplicationState *session_replication_state = NULL; + +/* Magic for on disk files. */ +#define REPLICATION_STATE_MAGIC ((uint32) 0x1257DADE) + +static void +replorigin_check_prerequisites(bool check_slots, bool recoveryOK) +{ + if (check_slots && max_replication_slots == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot query or manipulate replication origin when max_replication_slots = 0"))); + + if (!recoveryOK && RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION), + errmsg("cannot manipulate replication origins during recovery"))); +} + + +/* + * IsReservedOriginName + * True iff name is either "none" or "any". + */ +static bool +IsReservedOriginName(const char *name) +{ + return ((pg_strcasecmp(name, LOGICALREP_ORIGIN_NONE) == 0) || + (pg_strcasecmp(name, LOGICALREP_ORIGIN_ANY) == 0)); +} + +/* --------------------------------------------------------------------------- + * Functions for working with replication origins themselves. + * --------------------------------------------------------------------------- + */ + +/* + * Check for a persistent replication origin identified by name. + * + * Returns InvalidOid if the node isn't known yet and missing_ok is true. + */ +RepOriginId +replorigin_by_name(const char *roname, bool missing_ok) +{ + Form_pg_replication_origin ident; + Oid roident = InvalidOid; + HeapTuple tuple; + Datum roname_d; + + roname_d = CStringGetTextDatum(roname); + + tuple = SearchSysCache1(REPLORIGNAME, roname_d); + if (HeapTupleIsValid(tuple)) + { + ident = (Form_pg_replication_origin) GETSTRUCT(tuple); + roident = ident->roident; + ReleaseSysCache(tuple); + } + else if (!missing_ok) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("replication origin \"%s\" does not exist", + roname))); + + return roident; +} + +/* + * Create a replication origin. + * + * Needs to be called in a transaction. + */ +RepOriginId +replorigin_create(const char *roname) +{ + Oid roident; + HeapTuple tuple = NULL; + Relation rel; + Datum roname_d; + SnapshotData SnapshotDirty; + SysScanDesc scan; + ScanKeyData key; + + roname_d = CStringGetTextDatum(roname); + + Assert(IsTransactionState()); + + /* + * We need the numeric replication origin to be 16bit wide, so we cannot + * rely on the normal oid allocation. Instead we simply scan + * pg_replication_origin for the first unused id. That's not particularly + * efficient, but this should be a fairly infrequent operation - we can + * easily spend a bit more code on this when it turns out it needs to be + * faster. + * + * We handle concurrency by taking an exclusive lock (allowing reads!) + * over the table for the duration of the search. Because we use a "dirty + * snapshot" we can read rows that other in-progress sessions have + * written, even though they would be invisible with normal snapshots. Due + * to the exclusive lock there's no danger that new rows can appear while + * we're checking. + */ + InitDirtySnapshot(SnapshotDirty); + + rel = table_open(ReplicationOriginRelationId, ExclusiveLock); + + for (roident = InvalidOid + 1; roident < PG_UINT16_MAX; roident++) + { + bool nulls[Natts_pg_replication_origin]; + Datum values[Natts_pg_replication_origin]; + bool collides; + + CHECK_FOR_INTERRUPTS(); + + ScanKeyInit(&key, + Anum_pg_replication_origin_roident, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(roident)); + + scan = systable_beginscan(rel, ReplicationOriginIdentIndex, + true /* indexOK */ , + &SnapshotDirty, + 1, &key); + + collides = HeapTupleIsValid(systable_getnext(scan)); + + systable_endscan(scan); + + if (!collides) + { + /* + * Ok, found an unused roident, insert the new row and do a CCI, + * so our callers can look it up if they want to. + */ + memset(&nulls, 0, sizeof(nulls)); + + values[Anum_pg_replication_origin_roident - 1] = ObjectIdGetDatum(roident); + values[Anum_pg_replication_origin_roname - 1] = roname_d; + + tuple = heap_form_tuple(RelationGetDescr(rel), values, nulls); + CatalogTupleInsert(rel, tuple); + CommandCounterIncrement(); + break; + } + } + + /* now release lock again, */ + table_close(rel, ExclusiveLock); + + if (tuple == NULL) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("could not find free replication origin ID"))); + + heap_freetuple(tuple); + return roident; +} + +/* + * Helper function to drop a replication origin. + */ +static void +replorigin_state_clear(RepOriginId roident, bool nowait) +{ + int i; + + /* + * Clean up the slot state info, if there is any matching slot. + */ +restart: + LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE); + + for (i = 0; i < max_replication_slots; i++) + { + ReplicationState *state = &replication_states[i]; + + if (state->roident == roident) + { + /* found our slot, is it busy? */ + if (state->acquired_by != 0) + { + ConditionVariable *cv; + + if (nowait) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("could not drop replication origin with ID %d, in use by PID %d", + state->roident, + state->acquired_by))); + + /* + * We must wait and then retry. Since we don't know which CV + * to wait on until here, we can't readily use + * ConditionVariablePrepareToSleep (calling it here would be + * wrong, since we could miss the signal if we did so); just + * use ConditionVariableSleep directly. + */ + cv = &state->origin_cv; + + LWLockRelease(ReplicationOriginLock); + + ConditionVariableSleep(cv, WAIT_EVENT_REPLICATION_ORIGIN_DROP); + goto restart; + } + + /* first make a WAL log entry */ + { + xl_replorigin_drop xlrec; + + xlrec.node_id = roident; + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), sizeof(xlrec)); + XLogInsert(RM_REPLORIGIN_ID, XLOG_REPLORIGIN_DROP); + } + + /* then clear the in-memory slot */ + state->roident = InvalidRepOriginId; + state->remote_lsn = InvalidXLogRecPtr; + state->local_lsn = InvalidXLogRecPtr; + break; + } + } + LWLockRelease(ReplicationOriginLock); + ConditionVariableCancelSleep(); +} + +/* + * Drop replication origin (by name). + * + * Needs to be called in a transaction. + */ +void +replorigin_drop_by_name(const char *name, bool missing_ok, bool nowait) +{ + RepOriginId roident; + Relation rel; + HeapTuple tuple; + + Assert(IsTransactionState()); + + rel = table_open(ReplicationOriginRelationId, RowExclusiveLock); + + roident = replorigin_by_name(name, missing_ok); + + /* Lock the origin to prevent concurrent drops. */ + LockSharedObject(ReplicationOriginRelationId, roident, 0, + AccessExclusiveLock); + + tuple = SearchSysCache1(REPLORIGIDENT, ObjectIdGetDatum(roident)); + if (!HeapTupleIsValid(tuple)) + { + if (!missing_ok) + elog(ERROR, "cache lookup failed for replication origin with ID %d", + roident); + + /* + * We don't need to retain the locks if the origin is already dropped. + */ + UnlockSharedObject(ReplicationOriginRelationId, roident, 0, + AccessExclusiveLock); + table_close(rel, RowExclusiveLock); + return; + } + + replorigin_state_clear(roident, nowait); + + /* + * Now, we can delete the catalog entry. + */ + CatalogTupleDelete(rel, &tuple->t_self); + ReleaseSysCache(tuple); + + CommandCounterIncrement(); + + /* We keep the lock on pg_replication_origin until commit */ + table_close(rel, NoLock); +} + +/* + * Lookup replication origin via its oid and return the name. + * + * The external name is palloc'd in the calling context. + * + * Returns true if the origin is known, false otherwise. + */ +bool +replorigin_by_oid(RepOriginId roident, bool missing_ok, char **roname) +{ + HeapTuple tuple; + Form_pg_replication_origin ric; + + Assert(OidIsValid((Oid) roident)); + Assert(roident != InvalidRepOriginId); + Assert(roident != DoNotReplicateId); + + tuple = SearchSysCache1(REPLORIGIDENT, + ObjectIdGetDatum((Oid) roident)); + + if (HeapTupleIsValid(tuple)) + { + ric = (Form_pg_replication_origin) GETSTRUCT(tuple); + *roname = text_to_cstring(&ric->roname); + ReleaseSysCache(tuple); + + return true; + } + else + { + *roname = NULL; + + if (!missing_ok) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("replication origin with ID %d does not exist", + roident))); + + return false; + } +} + + +/* --------------------------------------------------------------------------- + * Functions for handling replication progress. + * --------------------------------------------------------------------------- + */ + +Size +ReplicationOriginShmemSize(void) +{ + Size size = 0; + + /* + * XXX: max_replication_slots is arguably the wrong thing to use, as here + * we keep the replay state of *remote* transactions. But for now it seems + * sufficient to reuse it, rather than introduce a separate GUC. + */ + if (max_replication_slots == 0) + return size; + + size = add_size(size, offsetof(ReplicationStateCtl, states)); + + size = add_size(size, + mul_size(max_replication_slots, sizeof(ReplicationState))); + return size; +} + +void +ReplicationOriginShmemInit(void) +{ + bool found; + + if (max_replication_slots == 0) + return; + + replication_states_ctl = (ReplicationStateCtl *) + ShmemInitStruct("ReplicationOriginState", + ReplicationOriginShmemSize(), + &found); + replication_states = replication_states_ctl->states; + + if (!found) + { + int i; + + MemSet(replication_states_ctl, 0, ReplicationOriginShmemSize()); + + replication_states_ctl->tranche_id = LWTRANCHE_REPLICATION_ORIGIN_STATE; + + for (i = 0; i < max_replication_slots; i++) + { + LWLockInitialize(&replication_states[i].lock, + replication_states_ctl->tranche_id); + ConditionVariableInit(&replication_states[i].origin_cv); + } + } +} + +/* --------------------------------------------------------------------------- + * Perform a checkpoint of each replication origin's progress with respect to + * the replayed remote_lsn. Make sure that all transactions we refer to in the + * checkpoint (local_lsn) are actually on-disk. This might not yet be the case + * if the transactions were originally committed asynchronously. + * + * We store checkpoints in the following format: + * +-------+------------------------+------------------+-----+--------+ + * | MAGIC | ReplicationStateOnDisk | struct Replic... | ... | CRC32C | EOF + * +-------+------------------------+------------------+-----+--------+ + * + * So its just the magic, followed by the statically sized + * ReplicationStateOnDisk structs. Note that the maximum number of + * ReplicationState is determined by max_replication_slots. + * --------------------------------------------------------------------------- + */ +void +CheckPointReplicationOrigin(void) +{ + const char *tmppath = "pg_logical/replorigin_checkpoint.tmp"; + const char *path = "pg_logical/replorigin_checkpoint"; + int tmpfd; + int i; + uint32 magic = REPLICATION_STATE_MAGIC; + pg_crc32c crc; + + if (max_replication_slots == 0) + return; + + INIT_CRC32C(crc); + + /* make sure no old temp file is remaining */ + if (unlink(tmppath) < 0 && errno != ENOENT) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + tmppath))); + + /* + * no other backend can perform this at the same time; only one checkpoint + * can happen at a time. + */ + tmpfd = OpenTransientFile(tmppath, + O_CREAT | O_EXCL | O_WRONLY | PG_BINARY); + if (tmpfd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + tmppath))); + + /* write magic */ + errno = 0; + if ((write(tmpfd, &magic, sizeof(magic))) != sizeof(magic)) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", + tmppath))); + } + COMP_CRC32C(crc, &magic, sizeof(magic)); + + /* prevent concurrent creations/drops */ + LWLockAcquire(ReplicationOriginLock, LW_SHARED); + + /* write actual data */ + for (i = 0; i < max_replication_slots; i++) + { + ReplicationStateOnDisk disk_state; + ReplicationState *curstate = &replication_states[i]; + XLogRecPtr local_lsn; + + if (curstate->roident == InvalidRepOriginId) + continue; + + /* zero, to avoid uninitialized padding bytes */ + memset(&disk_state, 0, sizeof(disk_state)); + + LWLockAcquire(&curstate->lock, LW_SHARED); + + disk_state.roident = curstate->roident; + + disk_state.remote_lsn = curstate->remote_lsn; + local_lsn = curstate->local_lsn; + + LWLockRelease(&curstate->lock); + + /* make sure we only write out a commit that's persistent */ + XLogFlush(local_lsn); + + errno = 0; + if ((write(tmpfd, &disk_state, sizeof(disk_state))) != + sizeof(disk_state)) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", + tmppath))); + } + + COMP_CRC32C(crc, &disk_state, sizeof(disk_state)); + } + + LWLockRelease(ReplicationOriginLock); + + /* write out the CRC */ + FIN_CRC32C(crc); + errno = 0; + if ((write(tmpfd, &crc, sizeof(crc))) != sizeof(crc)) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", + tmppath))); + } + + if (CloseTransientFile(tmpfd) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", + tmppath))); + + /* fsync, rename to permanent file, fsync file and directory */ + durable_rename(tmppath, path, PANIC); +} + +/* + * Recover replication replay status from checkpoint data saved earlier by + * CheckPointReplicationOrigin. + * + * This only needs to be called at startup and *not* during every checkpoint + * read during recovery (e.g. in HS or PITR from a base backup) afterwards. All + * state thereafter can be recovered by looking at commit records. + */ +void +StartupReplicationOrigin(void) +{ + const char *path = "pg_logical/replorigin_checkpoint"; + int fd; + int readBytes; + uint32 magic = REPLICATION_STATE_MAGIC; + int last_state = 0; + pg_crc32c file_crc; + pg_crc32c crc; + + /* don't want to overwrite already existing state */ +#ifdef USE_ASSERT_CHECKING + static bool already_started = false; + + Assert(!already_started); + already_started = true; +#endif + + if (max_replication_slots == 0) + return; + + INIT_CRC32C(crc); + + elog(DEBUG2, "starting up replication origin progress state"); + + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + + /* + * might have had max_replication_slots == 0 last run, or we just brought + * up a standby. + */ + if (fd < 0 && errno == ENOENT) + return; + else if (fd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); + + /* verify magic, that is written even if nothing was active */ + readBytes = read(fd, &magic, sizeof(magic)); + if (readBytes != sizeof(magic)) + { + if (readBytes < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + path))); + else + ereport(PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %zu", + path, readBytes, sizeof(magic)))); + } + COMP_CRC32C(crc, &magic, sizeof(magic)); + + if (magic != REPLICATION_STATE_MAGIC) + ereport(PANIC, + (errmsg("replication checkpoint has wrong magic %u instead of %u", + magic, REPLICATION_STATE_MAGIC))); + + /* we can skip locking here, no other access is possible */ + + /* recover individual states, until there are no more to be found */ + while (true) + { + ReplicationStateOnDisk disk_state; + + readBytes = read(fd, &disk_state, sizeof(disk_state)); + + /* no further data */ + if (readBytes == sizeof(crc)) + { + /* not pretty, but simple ... */ + file_crc = *(pg_crc32c *) &disk_state; + break; + } + + if (readBytes < 0) + { + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + path))); + } + + if (readBytes != sizeof(disk_state)) + { + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": read %d of %zu", + path, readBytes, sizeof(disk_state)))); + } + + COMP_CRC32C(crc, &disk_state, sizeof(disk_state)); + + if (last_state == max_replication_slots) + ereport(PANIC, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("could not find free replication state, increase max_replication_slots"))); + + /* copy data to shared memory */ + replication_states[last_state].roident = disk_state.roident; + replication_states[last_state].remote_lsn = disk_state.remote_lsn; + last_state++; + + ereport(LOG, + (errmsg("recovered replication state of node %d to %X/%X", + disk_state.roident, + LSN_FORMAT_ARGS(disk_state.remote_lsn)))); + } + + /* now check checksum */ + FIN_CRC32C(crc); + if (file_crc != crc) + ereport(PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("replication slot checkpoint has wrong checksum %u, expected %u", + crc, file_crc))); + + if (CloseTransientFile(fd) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", + path))); +} + +void +replorigin_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_REPLORIGIN_SET: + { + xl_replorigin_set *xlrec = + (xl_replorigin_set *) XLogRecGetData(record); + + replorigin_advance(xlrec->node_id, + xlrec->remote_lsn, record->EndRecPtr, + xlrec->force /* backward */ , + false /* WAL log */ ); + break; + } + case XLOG_REPLORIGIN_DROP: + { + xl_replorigin_drop *xlrec; + int i; + + xlrec = (xl_replorigin_drop *) XLogRecGetData(record); + + for (i = 0; i < max_replication_slots; i++) + { + ReplicationState *state = &replication_states[i]; + + /* found our slot */ + if (state->roident == xlrec->node_id) + { + /* reset entry */ + state->roident = InvalidRepOriginId; + state->remote_lsn = InvalidXLogRecPtr; + state->local_lsn = InvalidXLogRecPtr; + break; + } + } + break; + } + default: + elog(PANIC, "replorigin_redo: unknown op code %u", info); + } +} + + +/* + * Tell the replication origin progress machinery that a commit from 'node' + * that originated at the LSN remote_commit on the remote node was replayed + * successfully and that we don't need to do so again. In combination with + * setting up replorigin_session_origin_lsn and replorigin_session_origin + * that ensures we won't lose knowledge about that after a crash if the + * transaction had a persistent effect (think of asynchronous commits). + * + * local_commit needs to be a local LSN of the commit so that we can make sure + * upon a checkpoint that enough WAL has been persisted to disk. + * + * Needs to be called with a RowExclusiveLock on pg_replication_origin, + * unless running in recovery. + */ +void +replorigin_advance(RepOriginId node, + XLogRecPtr remote_commit, XLogRecPtr local_commit, + bool go_backward, bool wal_log) +{ + int i; + ReplicationState *replication_state = NULL; + ReplicationState *free_state = NULL; + + Assert(node != InvalidRepOriginId); + + /* we don't track DoNotReplicateId */ + if (node == DoNotReplicateId) + return; + + /* + * XXX: For the case where this is called by WAL replay, it'd be more + * efficient to restore into a backend local hashtable and only dump into + * shmem after recovery is finished. Let's wait with implementing that + * till it's shown to be a measurable expense + */ + + /* Lock exclusively, as we may have to create a new table entry. */ + LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE); + + /* + * Search for either an existing slot for the origin, or a free one we can + * use. + */ + for (i = 0; i < max_replication_slots; i++) + { + ReplicationState *curstate = &replication_states[i]; + + /* remember where to insert if necessary */ + if (curstate->roident == InvalidRepOriginId && + free_state == NULL) + { + free_state = curstate; + continue; + } + + /* not our slot */ + if (curstate->roident != node) + { + continue; + } + + /* ok, found slot */ + replication_state = curstate; + + LWLockAcquire(&replication_state->lock, LW_EXCLUSIVE); + + /* Make sure it's not used by somebody else */ + if (replication_state->acquired_by != 0) + { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("replication origin with ID %d is already active for PID %d", + replication_state->roident, + replication_state->acquired_by))); + } + + break; + } + + if (replication_state == NULL && free_state == NULL) + ereport(ERROR, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("could not find free replication state slot for replication origin with ID %d", + node), + errhint("Increase max_replication_slots and try again."))); + + if (replication_state == NULL) + { + /* initialize new slot */ + LWLockAcquire(&free_state->lock, LW_EXCLUSIVE); + replication_state = free_state; + Assert(replication_state->remote_lsn == InvalidXLogRecPtr); + Assert(replication_state->local_lsn == InvalidXLogRecPtr); + replication_state->roident = node; + } + + Assert(replication_state->roident != InvalidRepOriginId); + + /* + * If somebody "forcefully" sets this slot, WAL log it, so it's durable + * and the standby gets the message. Primarily this will be called during + * WAL replay (of commit records) where no WAL logging is necessary. + */ + if (wal_log) + { + xl_replorigin_set xlrec; + + xlrec.remote_lsn = remote_commit; + xlrec.node_id = node; + xlrec.force = go_backward; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), sizeof(xlrec)); + + XLogInsert(RM_REPLORIGIN_ID, XLOG_REPLORIGIN_SET); + } + + /* + * Due to - harmless - race conditions during a checkpoint we could see + * values here that are older than the ones we already have in memory. We + * could also see older values for prepared transactions when the prepare + * is sent at a later point of time along with commit prepared and there + * are other transactions commits between prepare and commit prepared. See + * ReorderBufferFinishPrepared. Don't overwrite those. + */ + if (go_backward || replication_state->remote_lsn < remote_commit) + replication_state->remote_lsn = remote_commit; + if (local_commit != InvalidXLogRecPtr && + (go_backward || replication_state->local_lsn < local_commit)) + replication_state->local_lsn = local_commit; + LWLockRelease(&replication_state->lock); + + /* + * Release *after* changing the LSNs, slot isn't acquired and thus could + * otherwise be dropped anytime. + */ + LWLockRelease(ReplicationOriginLock); +} + + +XLogRecPtr +replorigin_get_progress(RepOriginId node, bool flush) +{ + int i; + XLogRecPtr local_lsn = InvalidXLogRecPtr; + XLogRecPtr remote_lsn = InvalidXLogRecPtr; + + /* prevent slots from being concurrently dropped */ + LWLockAcquire(ReplicationOriginLock, LW_SHARED); + + for (i = 0; i < max_replication_slots; i++) + { + ReplicationState *state; + + state = &replication_states[i]; + + if (state->roident == node) + { + LWLockAcquire(&state->lock, LW_SHARED); + + remote_lsn = state->remote_lsn; + local_lsn = state->local_lsn; + + LWLockRelease(&state->lock); + + break; + } + } + + LWLockRelease(ReplicationOriginLock); + + if (flush && local_lsn != InvalidXLogRecPtr) + XLogFlush(local_lsn); + + return remote_lsn; +} + +/* + * Tear down a (possibly) configured session replication origin during process + * exit. + */ +static void +ReplicationOriginExitCleanup(int code, Datum arg) +{ + ConditionVariable *cv = NULL; + + LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE); + + if (session_replication_state != NULL && + session_replication_state->acquired_by == MyProcPid) + { + cv = &session_replication_state->origin_cv; + + session_replication_state->acquired_by = 0; + session_replication_state = NULL; + } + + LWLockRelease(ReplicationOriginLock); + + if (cv) + ConditionVariableBroadcast(cv); +} + +/* + * Setup a replication origin in the shared memory struct if it doesn't + * already exist and cache access to the specific ReplicationSlot so the + * array doesn't have to be searched when calling + * replorigin_session_advance(). + * + * Normally only one such cached origin can exist per process so the cached + * value can only be set again after the previous value is torn down with + * replorigin_session_reset(). For this normal case pass acquired_by = 0 + * (meaning the slot is not allowed to be already acquired by another process). + * + * However, sometimes multiple processes can safely re-use the same origin slot + * (for example, multiple parallel apply processes can safely use the same + * origin, provided they maintain commit order by allowing only one process to + * commit at a time). For this case the first process must pass acquired_by = + * 0, and then the other processes sharing that same origin can pass + * acquired_by = PID of the first process. + */ +void +replorigin_session_setup(RepOriginId node, int acquired_by) +{ + static bool registered_cleanup; + int i; + int free_slot = -1; + + if (!registered_cleanup) + { + on_shmem_exit(ReplicationOriginExitCleanup, 0); + registered_cleanup = true; + } + + Assert(max_replication_slots > 0); + + if (session_replication_state != NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot setup replication origin when one is already setup"))); + + /* Lock exclusively, as we may have to create a new table entry. */ + LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE); + + /* + * Search for either an existing slot for the origin, or a free one we can + * use. + */ + for (i = 0; i < max_replication_slots; i++) + { + ReplicationState *curstate = &replication_states[i]; + + /* remember where to insert if necessary */ + if (curstate->roident == InvalidRepOriginId && + free_slot == -1) + { + free_slot = i; + continue; + } + + /* not our slot */ + if (curstate->roident != node) + continue; + + else if (curstate->acquired_by != 0 && acquired_by == 0) + { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("replication origin with ID %d is already active for PID %d", + curstate->roident, curstate->acquired_by))); + } + + /* ok, found slot */ + session_replication_state = curstate; + } + + + if (session_replication_state == NULL && free_slot == -1) + ereport(ERROR, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("could not find free replication state slot for replication origin with ID %d", + node), + errhint("Increase max_replication_slots and try again."))); + else if (session_replication_state == NULL) + { + /* initialize new slot */ + session_replication_state = &replication_states[free_slot]; + Assert(session_replication_state->remote_lsn == InvalidXLogRecPtr); + Assert(session_replication_state->local_lsn == InvalidXLogRecPtr); + session_replication_state->roident = node; + } + + + Assert(session_replication_state->roident != InvalidRepOriginId); + + if (acquired_by == 0) + session_replication_state->acquired_by = MyProcPid; + else if (session_replication_state->acquired_by != acquired_by) + elog(ERROR, "could not find replication state slot for replication origin with OID %u which was acquired by %d", + node, acquired_by); + + LWLockRelease(ReplicationOriginLock); + + /* probably this one is pointless */ + ConditionVariableBroadcast(&session_replication_state->origin_cv); +} + +/* + * Reset replay state previously setup in this session. + * + * This function may only be called if an origin was setup with + * replorigin_session_setup(). + */ +void +replorigin_session_reset(void) +{ + ConditionVariable *cv; + + Assert(max_replication_slots != 0); + + if (session_replication_state == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("no replication origin is configured"))); + + LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE); + + session_replication_state->acquired_by = 0; + cv = &session_replication_state->origin_cv; + session_replication_state = NULL; + + LWLockRelease(ReplicationOriginLock); + + ConditionVariableBroadcast(cv); +} + +/* + * Do the same work replorigin_advance() does, just on the session's + * configured origin. + * + * This is noticeably cheaper than using replorigin_advance(). + */ +void +replorigin_session_advance(XLogRecPtr remote_commit, XLogRecPtr local_commit) +{ + Assert(session_replication_state != NULL); + Assert(session_replication_state->roident != InvalidRepOriginId); + + LWLockAcquire(&session_replication_state->lock, LW_EXCLUSIVE); + if (session_replication_state->local_lsn < local_commit) + session_replication_state->local_lsn = local_commit; + if (session_replication_state->remote_lsn < remote_commit) + session_replication_state->remote_lsn = remote_commit; + LWLockRelease(&session_replication_state->lock); +} + +/* + * Ask the machinery about the point up to which we successfully replayed + * changes from an already setup replication origin. + */ +XLogRecPtr +replorigin_session_get_progress(bool flush) +{ + XLogRecPtr remote_lsn; + XLogRecPtr local_lsn; + + Assert(session_replication_state != NULL); + + LWLockAcquire(&session_replication_state->lock, LW_SHARED); + remote_lsn = session_replication_state->remote_lsn; + local_lsn = session_replication_state->local_lsn; + LWLockRelease(&session_replication_state->lock); + + if (flush && local_lsn != InvalidXLogRecPtr) + XLogFlush(local_lsn); + + return remote_lsn; +} + + + +/* --------------------------------------------------------------------------- + * SQL functions for working with replication origin. + * + * These mostly should be fairly short wrappers around more generic functions. + * --------------------------------------------------------------------------- + */ + +/* + * Create replication origin for the passed in name, and return the assigned + * oid. + */ +Datum +pg_replication_origin_create(PG_FUNCTION_ARGS) +{ + char *name; + RepOriginId roident; + + replorigin_check_prerequisites(false, false); + + name = text_to_cstring((text *) DatumGetPointer(PG_GETARG_DATUM(0))); + + /* + * Replication origins "any and "none" are reserved for system options. + * The origins "pg_xxx" are reserved for internal use. + */ + if (IsReservedName(name) || IsReservedOriginName(name)) + ereport(ERROR, + (errcode(ERRCODE_RESERVED_NAME), + errmsg("replication origin name \"%s\" is reserved", + name), + errdetail("Origin names \"%s\", \"%s\", and names starting with \"pg_\" are reserved.", + LOGICALREP_ORIGIN_ANY, LOGICALREP_ORIGIN_NONE))); + + /* + * If built with appropriate switch, whine when regression-testing + * conventions for replication origin names are violated. + */ +#ifdef ENFORCE_REGRESSION_TEST_NAME_RESTRICTIONS + if (strncmp(name, "regress_", 8) != 0) + elog(WARNING, "replication origins created by regression test cases should have names starting with \"regress_\""); +#endif + + roident = replorigin_create(name); + + pfree(name); + + PG_RETURN_OID(roident); +} + +/* + * Drop replication origin. + */ +Datum +pg_replication_origin_drop(PG_FUNCTION_ARGS) +{ + char *name; + + replorigin_check_prerequisites(false, false); + + name = text_to_cstring((text *) DatumGetPointer(PG_GETARG_DATUM(0))); + + replorigin_drop_by_name(name, false, true); + + pfree(name); + + PG_RETURN_VOID(); +} + +/* + * Return oid of a replication origin. + */ +Datum +pg_replication_origin_oid(PG_FUNCTION_ARGS) +{ + char *name; + RepOriginId roident; + + replorigin_check_prerequisites(false, false); + + name = text_to_cstring((text *) DatumGetPointer(PG_GETARG_DATUM(0))); + roident = replorigin_by_name(name, true); + + pfree(name); + + if (OidIsValid(roident)) + PG_RETURN_OID(roident); + PG_RETURN_NULL(); +} + +/* + * Setup a replication origin for this session. + */ +Datum +pg_replication_origin_session_setup(PG_FUNCTION_ARGS) +{ + char *name; + RepOriginId origin; + + replorigin_check_prerequisites(true, false); + + name = text_to_cstring((text *) DatumGetPointer(PG_GETARG_DATUM(0))); + origin = replorigin_by_name(name, false); + replorigin_session_setup(origin, 0); + + replorigin_session_origin = origin; + + pfree(name); + + PG_RETURN_VOID(); +} + +/* + * Reset previously setup origin in this session + */ +Datum +pg_replication_origin_session_reset(PG_FUNCTION_ARGS) +{ + replorigin_check_prerequisites(true, false); + + replorigin_session_reset(); + + replorigin_session_origin = InvalidRepOriginId; + replorigin_session_origin_lsn = InvalidXLogRecPtr; + replorigin_session_origin_timestamp = 0; + + PG_RETURN_VOID(); +} + +/* + * Has a replication origin been setup for this session. + */ +Datum +pg_replication_origin_session_is_setup(PG_FUNCTION_ARGS) +{ + replorigin_check_prerequisites(false, false); + + PG_RETURN_BOOL(replorigin_session_origin != InvalidRepOriginId); +} + + +/* + * Return the replication progress for origin setup in the current session. + * + * If 'flush' is set to true it is ensured that the returned value corresponds + * to a local transaction that has been flushed. This is useful if asynchronous + * commits are used when replaying replicated transactions. + */ +Datum +pg_replication_origin_session_progress(PG_FUNCTION_ARGS) +{ + XLogRecPtr remote_lsn = InvalidXLogRecPtr; + bool flush = PG_GETARG_BOOL(0); + + replorigin_check_prerequisites(true, false); + + if (session_replication_state == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("no replication origin is configured"))); + + remote_lsn = replorigin_session_get_progress(flush); + + if (remote_lsn == InvalidXLogRecPtr) + PG_RETURN_NULL(); + + PG_RETURN_LSN(remote_lsn); +} + +Datum +pg_replication_origin_xact_setup(PG_FUNCTION_ARGS) +{ + XLogRecPtr location = PG_GETARG_LSN(0); + + replorigin_check_prerequisites(true, false); + + if (session_replication_state == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("no replication origin is configured"))); + + replorigin_session_origin_lsn = location; + replorigin_session_origin_timestamp = PG_GETARG_TIMESTAMPTZ(1); + + PG_RETURN_VOID(); +} + +Datum +pg_replication_origin_xact_reset(PG_FUNCTION_ARGS) +{ + replorigin_check_prerequisites(true, false); + + replorigin_session_origin_lsn = InvalidXLogRecPtr; + replorigin_session_origin_timestamp = 0; + + PG_RETURN_VOID(); +} + + +Datum +pg_replication_origin_advance(PG_FUNCTION_ARGS) +{ + text *name = PG_GETARG_TEXT_PP(0); + XLogRecPtr remote_commit = PG_GETARG_LSN(1); + RepOriginId node; + + replorigin_check_prerequisites(true, false); + + /* lock to prevent the replication origin from vanishing */ + LockRelationOid(ReplicationOriginRelationId, RowExclusiveLock); + + node = replorigin_by_name(text_to_cstring(name), false); + + /* + * Can't sensibly pass a local commit to be flushed at checkpoint - this + * xact hasn't committed yet. This is why this function should be used to + * set up the initial replication state, but not for replay. + */ + replorigin_advance(node, remote_commit, InvalidXLogRecPtr, + true /* go backward */ , true /* WAL log */ ); + + UnlockRelationOid(ReplicationOriginRelationId, RowExclusiveLock); + + PG_RETURN_VOID(); +} + + +/* + * Return the replication progress for an individual replication origin. + * + * If 'flush' is set to true it is ensured that the returned value corresponds + * to a local transaction that has been flushed. This is useful if asynchronous + * commits are used when replaying replicated transactions. + */ +Datum +pg_replication_origin_progress(PG_FUNCTION_ARGS) +{ + char *name; + bool flush; + RepOriginId roident; + XLogRecPtr remote_lsn = InvalidXLogRecPtr; + + replorigin_check_prerequisites(true, true); + + name = text_to_cstring((text *) DatumGetPointer(PG_GETARG_DATUM(0))); + flush = PG_GETARG_BOOL(1); + + roident = replorigin_by_name(name, false); + Assert(OidIsValid(roident)); + + remote_lsn = replorigin_get_progress(roident, flush); + + if (remote_lsn == InvalidXLogRecPtr) + PG_RETURN_NULL(); + + PG_RETURN_LSN(remote_lsn); +} + + +Datum +pg_show_replication_origin_status(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + int i; +#define REPLICATION_ORIGIN_PROGRESS_COLS 4 + + /* we want to return 0 rows if slot is set to zero */ + replorigin_check_prerequisites(false, true); + + InitMaterializedSRF(fcinfo, 0); + + /* prevent slots from being concurrently dropped */ + LWLockAcquire(ReplicationOriginLock, LW_SHARED); + + /* + * Iterate through all possible replication_states, display if they are + * filled. Note that we do not take any locks, so slightly corrupted/out + * of date values are a possibility. + */ + for (i = 0; i < max_replication_slots; i++) + { + ReplicationState *state; + Datum values[REPLICATION_ORIGIN_PROGRESS_COLS]; + bool nulls[REPLICATION_ORIGIN_PROGRESS_COLS]; + char *roname; + + state = &replication_states[i]; + + /* unused slot, nothing to display */ + if (state->roident == InvalidRepOriginId) + continue; + + memset(values, 0, sizeof(values)); + memset(nulls, 1, sizeof(nulls)); + + values[0] = ObjectIdGetDatum(state->roident); + nulls[0] = false; + + /* + * We're not preventing the origin to be dropped concurrently, so + * silently accept that it might be gone. + */ + if (replorigin_by_oid(state->roident, true, + &roname)) + { + values[1] = CStringGetTextDatum(roname); + nulls[1] = false; + } + + LWLockAcquire(&state->lock, LW_SHARED); + + values[2] = LSNGetDatum(state->remote_lsn); + nulls[2] = false; + + values[3] = LSNGetDatum(state->local_lsn); + nulls[3] = false; + + LWLockRelease(&state->lock); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + + LWLockRelease(ReplicationOriginLock); + +#undef REPLICATION_ORIGIN_PROGRESS_COLS + + return (Datum) 0; +} diff --git a/src/backend/replication/logical/proto.c b/src/backend/replication/logical/proto.c new file mode 100644 index 0000000..504f94d --- /dev/null +++ b/src/backend/replication/logical/proto.c @@ -0,0 +1,1271 @@ +/*------------------------------------------------------------------------- + * + * proto.c + * logical replication protocol functions + * + * Copyright (c) 2015-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/proto.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/sysattr.h" +#include "catalog/pg_namespace.h" +#include "catalog/pg_type.h" +#include "libpq/pqformat.h" +#include "replication/logicalproto.h" +#include "utils/lsyscache.h" +#include "utils/syscache.h" + +/* + * Protocol message flags. + */ +#define LOGICALREP_IS_REPLICA_IDENTITY 1 + +#define MESSAGE_TRANSACTIONAL (1<<0) +#define TRUNCATE_CASCADE (1<<0) +#define TRUNCATE_RESTART_SEQS (1<<1) + +static void logicalrep_write_attrs(StringInfo out, Relation rel, + Bitmapset *columns); +static void logicalrep_write_tuple(StringInfo out, Relation rel, + TupleTableSlot *slot, + bool binary, Bitmapset *columns); +static void logicalrep_read_attrs(StringInfo in, LogicalRepRelation *rel); +static void logicalrep_read_tuple(StringInfo in, LogicalRepTupleData *tuple); + +static void logicalrep_write_namespace(StringInfo out, Oid nspid); +static const char *logicalrep_read_namespace(StringInfo in); + +/* + * Check if a column is covered by a column list. + * + * Need to be careful about NULL, which is treated as a column list covering + * all columns. + */ +static bool +column_in_column_list(int attnum, Bitmapset *columns) +{ + return (columns == NULL || bms_is_member(attnum, columns)); +} + + +/* + * Write BEGIN to the output stream. + */ +void +logicalrep_write_begin(StringInfo out, ReorderBufferTXN *txn) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_BEGIN); + + /* fixed fields */ + pq_sendint64(out, txn->final_lsn); + pq_sendint64(out, txn->xact_time.commit_time); + pq_sendint32(out, txn->xid); +} + +/* + * Read transaction BEGIN from the stream. + */ +void +logicalrep_read_begin(StringInfo in, LogicalRepBeginData *begin_data) +{ + /* read fields */ + begin_data->final_lsn = pq_getmsgint64(in); + if (begin_data->final_lsn == InvalidXLogRecPtr) + elog(ERROR, "final_lsn not set in begin message"); + begin_data->committime = pq_getmsgint64(in); + begin_data->xid = pq_getmsgint(in, 4); +} + + +/* + * Write COMMIT to the output stream. + */ +void +logicalrep_write_commit(StringInfo out, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + uint8 flags = 0; + + pq_sendbyte(out, LOGICAL_REP_MSG_COMMIT); + + /* send the flags field (unused for now) */ + pq_sendbyte(out, flags); + + /* send fields */ + pq_sendint64(out, commit_lsn); + pq_sendint64(out, txn->end_lsn); + pq_sendint64(out, txn->xact_time.commit_time); +} + +/* + * Read transaction COMMIT from the stream. + */ +void +logicalrep_read_commit(StringInfo in, LogicalRepCommitData *commit_data) +{ + /* read flags (unused for now) */ + uint8 flags = pq_getmsgbyte(in); + + if (flags != 0) + elog(ERROR, "unrecognized flags %u in commit message", flags); + + /* read fields */ + commit_data->commit_lsn = pq_getmsgint64(in); + commit_data->end_lsn = pq_getmsgint64(in); + commit_data->committime = pq_getmsgint64(in); +} + +/* + * Write BEGIN PREPARE to the output stream. + */ +void +logicalrep_write_begin_prepare(StringInfo out, ReorderBufferTXN *txn) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_BEGIN_PREPARE); + + /* fixed fields */ + pq_sendint64(out, txn->final_lsn); + pq_sendint64(out, txn->end_lsn); + pq_sendint64(out, txn->xact_time.prepare_time); + pq_sendint32(out, txn->xid); + + /* send gid */ + pq_sendstring(out, txn->gid); +} + +/* + * Read transaction BEGIN PREPARE from the stream. + */ +void +logicalrep_read_begin_prepare(StringInfo in, LogicalRepPreparedTxnData *begin_data) +{ + /* read fields */ + begin_data->prepare_lsn = pq_getmsgint64(in); + if (begin_data->prepare_lsn == InvalidXLogRecPtr) + elog(ERROR, "prepare_lsn not set in begin prepare message"); + begin_data->end_lsn = pq_getmsgint64(in); + if (begin_data->end_lsn == InvalidXLogRecPtr) + elog(ERROR, "end_lsn not set in begin prepare message"); + begin_data->prepare_time = pq_getmsgint64(in); + begin_data->xid = pq_getmsgint(in, 4); + + /* read gid (copy it into a pre-allocated buffer) */ + strlcpy(begin_data->gid, pq_getmsgstring(in), sizeof(begin_data->gid)); +} + +/* + * The core functionality for logicalrep_write_prepare and + * logicalrep_write_stream_prepare. + */ +static void +logicalrep_write_prepare_common(StringInfo out, LogicalRepMsgType type, + ReorderBufferTXN *txn, XLogRecPtr prepare_lsn) +{ + uint8 flags = 0; + + pq_sendbyte(out, type); + + /* + * This should only ever happen for two-phase commit transactions, in + * which case we expect to have a valid GID. + */ + Assert(txn->gid != NULL); + Assert(rbtxn_prepared(txn)); + Assert(TransactionIdIsValid(txn->xid)); + + /* send the flags field */ + pq_sendbyte(out, flags); + + /* send fields */ + pq_sendint64(out, prepare_lsn); + pq_sendint64(out, txn->end_lsn); + pq_sendint64(out, txn->xact_time.prepare_time); + pq_sendint32(out, txn->xid); + + /* send gid */ + pq_sendstring(out, txn->gid); +} + +/* + * Write PREPARE to the output stream. + */ +void +logicalrep_write_prepare(StringInfo out, ReorderBufferTXN *txn, + XLogRecPtr prepare_lsn) +{ + logicalrep_write_prepare_common(out, LOGICAL_REP_MSG_PREPARE, + txn, prepare_lsn); +} + +/* + * The core functionality for logicalrep_read_prepare and + * logicalrep_read_stream_prepare. + */ +static void +logicalrep_read_prepare_common(StringInfo in, char *msgtype, + LogicalRepPreparedTxnData *prepare_data) +{ + /* read flags */ + uint8 flags = pq_getmsgbyte(in); + + if (flags != 0) + elog(ERROR, "unrecognized flags %u in %s message", flags, msgtype); + + /* read fields */ + prepare_data->prepare_lsn = pq_getmsgint64(in); + if (prepare_data->prepare_lsn == InvalidXLogRecPtr) + elog(ERROR, "prepare_lsn is not set in %s message", msgtype); + prepare_data->end_lsn = pq_getmsgint64(in); + if (prepare_data->end_lsn == InvalidXLogRecPtr) + elog(ERROR, "end_lsn is not set in %s message", msgtype); + prepare_data->prepare_time = pq_getmsgint64(in); + prepare_data->xid = pq_getmsgint(in, 4); + if (prepare_data->xid == InvalidTransactionId) + elog(ERROR, "invalid two-phase transaction ID in %s message", msgtype); + + /* read gid (copy it into a pre-allocated buffer) */ + strlcpy(prepare_data->gid, pq_getmsgstring(in), sizeof(prepare_data->gid)); +} + +/* + * Read transaction PREPARE from the stream. + */ +void +logicalrep_read_prepare(StringInfo in, LogicalRepPreparedTxnData *prepare_data) +{ + logicalrep_read_prepare_common(in, "prepare", prepare_data); +} + +/* + * Write COMMIT PREPARED to the output stream. + */ +void +logicalrep_write_commit_prepared(StringInfo out, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + uint8 flags = 0; + + pq_sendbyte(out, LOGICAL_REP_MSG_COMMIT_PREPARED); + + /* + * This should only ever happen for two-phase commit transactions, in + * which case we expect to have a valid GID. + */ + Assert(txn->gid != NULL); + + /* send the flags field */ + pq_sendbyte(out, flags); + + /* send fields */ + pq_sendint64(out, commit_lsn); + pq_sendint64(out, txn->end_lsn); + pq_sendint64(out, txn->xact_time.commit_time); + pq_sendint32(out, txn->xid); + + /* send gid */ + pq_sendstring(out, txn->gid); +} + +/* + * Read transaction COMMIT PREPARED from the stream. + */ +void +logicalrep_read_commit_prepared(StringInfo in, LogicalRepCommitPreparedTxnData *prepare_data) +{ + /* read flags */ + uint8 flags = pq_getmsgbyte(in); + + if (flags != 0) + elog(ERROR, "unrecognized flags %u in commit prepared message", flags); + + /* read fields */ + prepare_data->commit_lsn = pq_getmsgint64(in); + if (prepare_data->commit_lsn == InvalidXLogRecPtr) + elog(ERROR, "commit_lsn is not set in commit prepared message"); + prepare_data->end_lsn = pq_getmsgint64(in); + if (prepare_data->end_lsn == InvalidXLogRecPtr) + elog(ERROR, "end_lsn is not set in commit prepared message"); + prepare_data->commit_time = pq_getmsgint64(in); + prepare_data->xid = pq_getmsgint(in, 4); + + /* read gid (copy it into a pre-allocated buffer) */ + strlcpy(prepare_data->gid, pq_getmsgstring(in), sizeof(prepare_data->gid)); +} + +/* + * Write ROLLBACK PREPARED to the output stream. + */ +void +logicalrep_write_rollback_prepared(StringInfo out, ReorderBufferTXN *txn, + XLogRecPtr prepare_end_lsn, + TimestampTz prepare_time) +{ + uint8 flags = 0; + + pq_sendbyte(out, LOGICAL_REP_MSG_ROLLBACK_PREPARED); + + /* + * This should only ever happen for two-phase commit transactions, in + * which case we expect to have a valid GID. + */ + Assert(txn->gid != NULL); + + /* send the flags field */ + pq_sendbyte(out, flags); + + /* send fields */ + pq_sendint64(out, prepare_end_lsn); + pq_sendint64(out, txn->end_lsn); + pq_sendint64(out, prepare_time); + pq_sendint64(out, txn->xact_time.commit_time); + pq_sendint32(out, txn->xid); + + /* send gid */ + pq_sendstring(out, txn->gid); +} + +/* + * Read transaction ROLLBACK PREPARED from the stream. + */ +void +logicalrep_read_rollback_prepared(StringInfo in, + LogicalRepRollbackPreparedTxnData *rollback_data) +{ + /* read flags */ + uint8 flags = pq_getmsgbyte(in); + + if (flags != 0) + elog(ERROR, "unrecognized flags %u in rollback prepared message", flags); + + /* read fields */ + rollback_data->prepare_end_lsn = pq_getmsgint64(in); + if (rollback_data->prepare_end_lsn == InvalidXLogRecPtr) + elog(ERROR, "prepare_end_lsn is not set in rollback prepared message"); + rollback_data->rollback_end_lsn = pq_getmsgint64(in); + if (rollback_data->rollback_end_lsn == InvalidXLogRecPtr) + elog(ERROR, "rollback_end_lsn is not set in rollback prepared message"); + rollback_data->prepare_time = pq_getmsgint64(in); + rollback_data->rollback_time = pq_getmsgint64(in); + rollback_data->xid = pq_getmsgint(in, 4); + + /* read gid (copy it into a pre-allocated buffer) */ + strlcpy(rollback_data->gid, pq_getmsgstring(in), sizeof(rollback_data->gid)); +} + +/* + * Write STREAM PREPARE to the output stream. + */ +void +logicalrep_write_stream_prepare(StringInfo out, + ReorderBufferTXN *txn, + XLogRecPtr prepare_lsn) +{ + logicalrep_write_prepare_common(out, LOGICAL_REP_MSG_STREAM_PREPARE, + txn, prepare_lsn); +} + +/* + * Read STREAM PREPARE from the stream. + */ +void +logicalrep_read_stream_prepare(StringInfo in, LogicalRepPreparedTxnData *prepare_data) +{ + logicalrep_read_prepare_common(in, "stream prepare", prepare_data); +} + +/* + * Write ORIGIN to the output stream. + */ +void +logicalrep_write_origin(StringInfo out, const char *origin, + XLogRecPtr origin_lsn) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_ORIGIN); + + /* fixed fields */ + pq_sendint64(out, origin_lsn); + + /* origin string */ + pq_sendstring(out, origin); +} + +/* + * Read ORIGIN from the output stream. + */ +char * +logicalrep_read_origin(StringInfo in, XLogRecPtr *origin_lsn) +{ + /* fixed fields */ + *origin_lsn = pq_getmsgint64(in); + + /* return origin */ + return pstrdup(pq_getmsgstring(in)); +} + +/* + * Write INSERT to the output stream. + */ +void +logicalrep_write_insert(StringInfo out, TransactionId xid, Relation rel, + TupleTableSlot *newslot, bool binary, Bitmapset *columns) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_INSERT); + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + /* use Oid as relation identifier */ + pq_sendint32(out, RelationGetRelid(rel)); + + pq_sendbyte(out, 'N'); /* new tuple follows */ + logicalrep_write_tuple(out, rel, newslot, binary, columns); +} + +/* + * Read INSERT from stream. + * + * Fills the new tuple. + */ +LogicalRepRelId +logicalrep_read_insert(StringInfo in, LogicalRepTupleData *newtup) +{ + char action; + LogicalRepRelId relid; + + /* read the relation id */ + relid = pq_getmsgint(in, 4); + + action = pq_getmsgbyte(in); + if (action != 'N') + elog(ERROR, "expected new tuple but got %d", + action); + + logicalrep_read_tuple(in, newtup); + + return relid; +} + +/* + * Write UPDATE to the output stream. + */ +void +logicalrep_write_update(StringInfo out, TransactionId xid, Relation rel, + TupleTableSlot *oldslot, TupleTableSlot *newslot, + bool binary, Bitmapset *columns) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_UPDATE); + + Assert(rel->rd_rel->relreplident == REPLICA_IDENTITY_DEFAULT || + rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL || + rel->rd_rel->relreplident == REPLICA_IDENTITY_INDEX); + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + /* use Oid as relation identifier */ + pq_sendint32(out, RelationGetRelid(rel)); + + if (oldslot != NULL) + { + if (rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL) + pq_sendbyte(out, 'O'); /* old tuple follows */ + else + pq_sendbyte(out, 'K'); /* old key follows */ + logicalrep_write_tuple(out, rel, oldslot, binary, columns); + } + + pq_sendbyte(out, 'N'); /* new tuple follows */ + logicalrep_write_tuple(out, rel, newslot, binary, columns); +} + +/* + * Read UPDATE from stream. + */ +LogicalRepRelId +logicalrep_read_update(StringInfo in, bool *has_oldtuple, + LogicalRepTupleData *oldtup, + LogicalRepTupleData *newtup) +{ + char action; + LogicalRepRelId relid; + + /* read the relation id */ + relid = pq_getmsgint(in, 4); + + /* read and verify action */ + action = pq_getmsgbyte(in); + if (action != 'K' && action != 'O' && action != 'N') + elog(ERROR, "expected action 'N', 'O' or 'K', got %c", + action); + + /* check for old tuple */ + if (action == 'K' || action == 'O') + { + logicalrep_read_tuple(in, oldtup); + *has_oldtuple = true; + + action = pq_getmsgbyte(in); + } + else + *has_oldtuple = false; + + /* check for new tuple */ + if (action != 'N') + elog(ERROR, "expected action 'N', got %c", + action); + + logicalrep_read_tuple(in, newtup); + + return relid; +} + +/* + * Write DELETE to the output stream. + */ +void +logicalrep_write_delete(StringInfo out, TransactionId xid, Relation rel, + TupleTableSlot *oldslot, bool binary, + Bitmapset *columns) +{ + Assert(rel->rd_rel->relreplident == REPLICA_IDENTITY_DEFAULT || + rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL || + rel->rd_rel->relreplident == REPLICA_IDENTITY_INDEX); + + pq_sendbyte(out, LOGICAL_REP_MSG_DELETE); + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + /* use Oid as relation identifier */ + pq_sendint32(out, RelationGetRelid(rel)); + + if (rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL) + pq_sendbyte(out, 'O'); /* old tuple follows */ + else + pq_sendbyte(out, 'K'); /* old key follows */ + + logicalrep_write_tuple(out, rel, oldslot, binary, columns); +} + +/* + * Read DELETE from stream. + * + * Fills the old tuple. + */ +LogicalRepRelId +logicalrep_read_delete(StringInfo in, LogicalRepTupleData *oldtup) +{ + char action; + LogicalRepRelId relid; + + /* read the relation id */ + relid = pq_getmsgint(in, 4); + + /* read and verify action */ + action = pq_getmsgbyte(in); + if (action != 'K' && action != 'O') + elog(ERROR, "expected action 'O' or 'K', got %c", action); + + logicalrep_read_tuple(in, oldtup); + + return relid; +} + +/* + * Write TRUNCATE to the output stream. + */ +void +logicalrep_write_truncate(StringInfo out, + TransactionId xid, + int nrelids, + Oid relids[], + bool cascade, bool restart_seqs) +{ + int i; + uint8 flags = 0; + + pq_sendbyte(out, LOGICAL_REP_MSG_TRUNCATE); + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + pq_sendint32(out, nrelids); + + /* encode and send truncate flags */ + if (cascade) + flags |= TRUNCATE_CASCADE; + if (restart_seqs) + flags |= TRUNCATE_RESTART_SEQS; + pq_sendint8(out, flags); + + for (i = 0; i < nrelids; i++) + pq_sendint32(out, relids[i]); +} + +/* + * Read TRUNCATE from stream. + */ +List * +logicalrep_read_truncate(StringInfo in, + bool *cascade, bool *restart_seqs) +{ + int i; + int nrelids; + List *relids = NIL; + uint8 flags; + + nrelids = pq_getmsgint(in, 4); + + /* read and decode truncate flags */ + flags = pq_getmsgint(in, 1); + *cascade = (flags & TRUNCATE_CASCADE) > 0; + *restart_seqs = (flags & TRUNCATE_RESTART_SEQS) > 0; + + for (i = 0; i < nrelids; i++) + relids = lappend_oid(relids, pq_getmsgint(in, 4)); + + return relids; +} + +/* + * Write MESSAGE to stream + */ +void +logicalrep_write_message(StringInfo out, TransactionId xid, XLogRecPtr lsn, + bool transactional, const char *prefix, Size sz, + const char *message) +{ + uint8 flags = 0; + + pq_sendbyte(out, LOGICAL_REP_MSG_MESSAGE); + + /* encode and send message flags */ + if (transactional) + flags |= MESSAGE_TRANSACTIONAL; + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + pq_sendint8(out, flags); + pq_sendint64(out, lsn); + pq_sendstring(out, prefix); + pq_sendint32(out, sz); + pq_sendbytes(out, message, sz); +} + +/* + * Write relation description to the output stream. + */ +void +logicalrep_write_rel(StringInfo out, TransactionId xid, Relation rel, + Bitmapset *columns) +{ + char *relname; + + pq_sendbyte(out, LOGICAL_REP_MSG_RELATION); + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + /* use Oid as relation identifier */ + pq_sendint32(out, RelationGetRelid(rel)); + + /* send qualified relation name */ + logicalrep_write_namespace(out, RelationGetNamespace(rel)); + relname = RelationGetRelationName(rel); + pq_sendstring(out, relname); + + /* send replica identity */ + pq_sendbyte(out, rel->rd_rel->relreplident); + + /* send the attribute info */ + logicalrep_write_attrs(out, rel, columns); +} + +/* + * Read the relation info from stream and return as LogicalRepRelation. + */ +LogicalRepRelation * +logicalrep_read_rel(StringInfo in) +{ + LogicalRepRelation *rel = palloc(sizeof(LogicalRepRelation)); + + rel->remoteid = pq_getmsgint(in, 4); + + /* Read relation name from stream */ + rel->nspname = pstrdup(logicalrep_read_namespace(in)); + rel->relname = pstrdup(pq_getmsgstring(in)); + + /* Read the replica identity. */ + rel->replident = pq_getmsgbyte(in); + + /* Get attribute description */ + logicalrep_read_attrs(in, rel); + + return rel; +} + +/* + * Write type info to the output stream. + * + * This function will always write base type info. + */ +void +logicalrep_write_typ(StringInfo out, TransactionId xid, Oid typoid) +{ + Oid basetypoid = getBaseType(typoid); + HeapTuple tup; + Form_pg_type typtup; + + pq_sendbyte(out, LOGICAL_REP_MSG_TYPE); + + /* transaction ID (if not valid, we're not streaming) */ + if (TransactionIdIsValid(xid)) + pq_sendint32(out, xid); + + tup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(basetypoid)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for type %u", basetypoid); + typtup = (Form_pg_type) GETSTRUCT(tup); + + /* use Oid as relation identifier */ + pq_sendint32(out, typoid); + + /* send qualified type name */ + logicalrep_write_namespace(out, typtup->typnamespace); + pq_sendstring(out, NameStr(typtup->typname)); + + ReleaseSysCache(tup); +} + +/* + * Read type info from the output stream. + */ +void +logicalrep_read_typ(StringInfo in, LogicalRepTyp *ltyp) +{ + ltyp->remoteid = pq_getmsgint(in, 4); + + /* Read type name from stream */ + ltyp->nspname = pstrdup(logicalrep_read_namespace(in)); + ltyp->typname = pstrdup(pq_getmsgstring(in)); +} + +/* + * Write a tuple to the outputstream, in the most efficient format possible. + */ +static void +logicalrep_write_tuple(StringInfo out, Relation rel, TupleTableSlot *slot, + bool binary, Bitmapset *columns) +{ + TupleDesc desc; + Datum *values; + bool *isnull; + int i; + uint16 nliveatts = 0; + + desc = RelationGetDescr(rel); + + for (i = 0; i < desc->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(desc, i); + + if (att->attisdropped || att->attgenerated) + continue; + + if (!column_in_column_list(att->attnum, columns)) + continue; + + nliveatts++; + } + pq_sendint16(out, nliveatts); + + slot_getallattrs(slot); + values = slot->tts_values; + isnull = slot->tts_isnull; + + /* Write the values */ + for (i = 0; i < desc->natts; i++) + { + HeapTuple typtup; + Form_pg_type typclass; + Form_pg_attribute att = TupleDescAttr(desc, i); + + if (att->attisdropped || att->attgenerated) + continue; + + if (!column_in_column_list(att->attnum, columns)) + continue; + + if (isnull[i]) + { + pq_sendbyte(out, LOGICALREP_COLUMN_NULL); + continue; + } + + if (att->attlen == -1 && VARATT_IS_EXTERNAL_ONDISK(values[i])) + { + /* + * Unchanged toasted datum. (Note that we don't promise to detect + * unchanged data in general; this is just a cheap check to avoid + * sending large values unnecessarily.) + */ + pq_sendbyte(out, LOGICALREP_COLUMN_UNCHANGED); + continue; + } + + typtup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(att->atttypid)); + if (!HeapTupleIsValid(typtup)) + elog(ERROR, "cache lookup failed for type %u", att->atttypid); + typclass = (Form_pg_type) GETSTRUCT(typtup); + + /* + * Send in binary if requested and type has suitable send function. + */ + if (binary && OidIsValid(typclass->typsend)) + { + bytea *outputbytes; + int len; + + pq_sendbyte(out, LOGICALREP_COLUMN_BINARY); + outputbytes = OidSendFunctionCall(typclass->typsend, values[i]); + len = VARSIZE(outputbytes) - VARHDRSZ; + pq_sendint(out, len, 4); /* length */ + pq_sendbytes(out, VARDATA(outputbytes), len); /* data */ + pfree(outputbytes); + } + else + { + char *outputstr; + + pq_sendbyte(out, LOGICALREP_COLUMN_TEXT); + outputstr = OidOutputFunctionCall(typclass->typoutput, values[i]); + pq_sendcountedtext(out, outputstr, strlen(outputstr), false); + pfree(outputstr); + } + + ReleaseSysCache(typtup); + } +} + +/* + * Read tuple in logical replication format from stream. + */ +static void +logicalrep_read_tuple(StringInfo in, LogicalRepTupleData *tuple) +{ + int i; + int natts; + + /* Get number of attributes */ + natts = pq_getmsgint(in, 2); + + /* Allocate space for per-column values; zero out unused StringInfoDatas */ + tuple->colvalues = (StringInfoData *) palloc0(natts * sizeof(StringInfoData)); + tuple->colstatus = (char *) palloc(natts * sizeof(char)); + tuple->ncols = natts; + + /* Read the data */ + for (i = 0; i < natts; i++) + { + char kind; + int len; + StringInfo value = &tuple->colvalues[i]; + + kind = pq_getmsgbyte(in); + tuple->colstatus[i] = kind; + + switch (kind) + { + case LOGICALREP_COLUMN_NULL: + /* nothing more to do */ + break; + case LOGICALREP_COLUMN_UNCHANGED: + /* we don't receive the value of an unchanged column */ + break; + case LOGICALREP_COLUMN_TEXT: + case LOGICALREP_COLUMN_BINARY: + len = pq_getmsgint(in, 4); /* read length */ + + /* and data */ + value->data = palloc(len + 1); + pq_copymsgbytes(in, value->data, len); + + /* + * Not strictly necessary for LOGICALREP_COLUMN_BINARY, but + * per StringInfo practice. + */ + value->data[len] = '\0'; + + /* make StringInfo fully valid */ + value->len = len; + value->cursor = 0; + value->maxlen = len; + break; + default: + elog(ERROR, "unrecognized data representation type '%c'", kind); + } + } +} + +/* + * Write relation attribute metadata to the stream. + */ +static void +logicalrep_write_attrs(StringInfo out, Relation rel, Bitmapset *columns) +{ + TupleDesc desc; + int i; + uint16 nliveatts = 0; + Bitmapset *idattrs = NULL; + bool replidentfull; + + desc = RelationGetDescr(rel); + + /* send number of live attributes */ + for (i = 0; i < desc->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(desc, i); + + if (att->attisdropped || att->attgenerated) + continue; + + if (!column_in_column_list(att->attnum, columns)) + continue; + + nliveatts++; + } + pq_sendint16(out, nliveatts); + + /* fetch bitmap of REPLICATION IDENTITY attributes */ + replidentfull = (rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL); + if (!replidentfull) + idattrs = RelationGetIdentityKeyBitmap(rel); + + /* send the attributes */ + for (i = 0; i < desc->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(desc, i); + uint8 flags = 0; + + if (att->attisdropped || att->attgenerated) + continue; + + if (!column_in_column_list(att->attnum, columns)) + continue; + + /* REPLICA IDENTITY FULL means all columns are sent as part of key. */ + if (replidentfull || + bms_is_member(att->attnum - FirstLowInvalidHeapAttributeNumber, + idattrs)) + flags |= LOGICALREP_IS_REPLICA_IDENTITY; + + pq_sendbyte(out, flags); + + /* attribute name */ + pq_sendstring(out, NameStr(att->attname)); + + /* attribute type id */ + pq_sendint32(out, (int) att->atttypid); + + /* attribute mode */ + pq_sendint32(out, att->atttypmod); + } + + bms_free(idattrs); +} + +/* + * Read relation attribute metadata from the stream. + */ +static void +logicalrep_read_attrs(StringInfo in, LogicalRepRelation *rel) +{ + int i; + int natts; + char **attnames; + Oid *atttyps; + Bitmapset *attkeys = NULL; + + natts = pq_getmsgint(in, 2); + attnames = palloc(natts * sizeof(char *)); + atttyps = palloc(natts * sizeof(Oid)); + + /* read the attributes */ + for (i = 0; i < natts; i++) + { + uint8 flags; + + /* Check for replica identity column */ + flags = pq_getmsgbyte(in); + if (flags & LOGICALREP_IS_REPLICA_IDENTITY) + attkeys = bms_add_member(attkeys, i); + + /* attribute name */ + attnames[i] = pstrdup(pq_getmsgstring(in)); + + /* attribute type id */ + atttyps[i] = (Oid) pq_getmsgint(in, 4); + + /* we ignore attribute mode for now */ + (void) pq_getmsgint(in, 4); + } + + rel->attnames = attnames; + rel->atttyps = atttyps; + rel->attkeys = attkeys; + rel->natts = natts; +} + +/* + * Write the namespace name or empty string for pg_catalog (to save space). + */ +static void +logicalrep_write_namespace(StringInfo out, Oid nspid) +{ + if (nspid == PG_CATALOG_NAMESPACE) + pq_sendbyte(out, '\0'); + else + { + char *nspname = get_namespace_name(nspid); + + if (nspname == NULL) + elog(ERROR, "cache lookup failed for namespace %u", + nspid); + + pq_sendstring(out, nspname); + } +} + +/* + * Read the namespace name while treating empty string as pg_catalog. + */ +static const char * +logicalrep_read_namespace(StringInfo in) +{ + const char *nspname = pq_getmsgstring(in); + + if (nspname[0] == '\0') + nspname = "pg_catalog"; + + return nspname; +} + +/* + * Write the information for the start stream message to the output stream. + */ +void +logicalrep_write_stream_start(StringInfo out, + TransactionId xid, bool first_segment) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_STREAM_START); + + Assert(TransactionIdIsValid(xid)); + + /* transaction ID (we're starting to stream, so must be valid) */ + pq_sendint32(out, xid); + + /* 1 if this is the first streaming segment for this xid */ + pq_sendbyte(out, first_segment ? 1 : 0); +} + +/* + * Read the information about the start stream message from output stream. + */ +TransactionId +logicalrep_read_stream_start(StringInfo in, bool *first_segment) +{ + TransactionId xid; + + Assert(first_segment); + + xid = pq_getmsgint(in, 4); + *first_segment = (pq_getmsgbyte(in) == 1); + + return xid; +} + +/* + * Write the stop stream message to the output stream. + */ +void +logicalrep_write_stream_stop(StringInfo out) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_STREAM_STOP); +} + +/* + * Write STREAM COMMIT to the output stream. + */ +void +logicalrep_write_stream_commit(StringInfo out, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + uint8 flags = 0; + + pq_sendbyte(out, LOGICAL_REP_MSG_STREAM_COMMIT); + + Assert(TransactionIdIsValid(txn->xid)); + + /* transaction ID */ + pq_sendint32(out, txn->xid); + + /* send the flags field (unused for now) */ + pq_sendbyte(out, flags); + + /* send fields */ + pq_sendint64(out, commit_lsn); + pq_sendint64(out, txn->end_lsn); + pq_sendint64(out, txn->xact_time.commit_time); +} + +/* + * Read STREAM COMMIT from the output stream. + */ +TransactionId +logicalrep_read_stream_commit(StringInfo in, LogicalRepCommitData *commit_data) +{ + TransactionId xid; + uint8 flags; + + xid = pq_getmsgint(in, 4); + + /* read flags (unused for now) */ + flags = pq_getmsgbyte(in); + + if (flags != 0) + elog(ERROR, "unrecognized flags %u in commit message", flags); + + /* read fields */ + commit_data->commit_lsn = pq_getmsgint64(in); + commit_data->end_lsn = pq_getmsgint64(in); + commit_data->committime = pq_getmsgint64(in); + + return xid; +} + +/* + * Write STREAM ABORT to the output stream. Note that xid and subxid will be + * same for the top-level transaction abort. + * + * If write_abort_info is true, send the abort_lsn and abort_time fields, + * otherwise don't. + */ +void +logicalrep_write_stream_abort(StringInfo out, TransactionId xid, + TransactionId subxid, XLogRecPtr abort_lsn, + TimestampTz abort_time, bool write_abort_info) +{ + pq_sendbyte(out, LOGICAL_REP_MSG_STREAM_ABORT); + + Assert(TransactionIdIsValid(xid) && TransactionIdIsValid(subxid)); + + /* transaction ID */ + pq_sendint32(out, xid); + pq_sendint32(out, subxid); + + if (write_abort_info) + { + pq_sendint64(out, abort_lsn); + pq_sendint64(out, abort_time); + } +} + +/* + * Read STREAM ABORT from the output stream. + * + * If read_abort_info is true, read the abort_lsn and abort_time fields, + * otherwise don't. + */ +void +logicalrep_read_stream_abort(StringInfo in, + LogicalRepStreamAbortData *abort_data, + bool read_abort_info) +{ + Assert(abort_data); + + abort_data->xid = pq_getmsgint(in, 4); + abort_data->subxid = pq_getmsgint(in, 4); + + if (read_abort_info) + { + abort_data->abort_lsn = pq_getmsgint64(in); + abort_data->abort_time = pq_getmsgint64(in); + } + else + { + abort_data->abort_lsn = InvalidXLogRecPtr; + abort_data->abort_time = 0; + } +} + +/* + * Get string representing LogicalRepMsgType. + */ +const char * +logicalrep_message_type(LogicalRepMsgType action) +{ + static char err_unknown[20]; + + switch (action) + { + case LOGICAL_REP_MSG_BEGIN: + return "BEGIN"; + case LOGICAL_REP_MSG_COMMIT: + return "COMMIT"; + case LOGICAL_REP_MSG_ORIGIN: + return "ORIGIN"; + case LOGICAL_REP_MSG_INSERT: + return "INSERT"; + case LOGICAL_REP_MSG_UPDATE: + return "UPDATE"; + case LOGICAL_REP_MSG_DELETE: + return "DELETE"; + case LOGICAL_REP_MSG_TRUNCATE: + return "TRUNCATE"; + case LOGICAL_REP_MSG_RELATION: + return "RELATION"; + case LOGICAL_REP_MSG_TYPE: + return "TYPE"; + case LOGICAL_REP_MSG_MESSAGE: + return "MESSAGE"; + case LOGICAL_REP_MSG_BEGIN_PREPARE: + return "BEGIN PREPARE"; + case LOGICAL_REP_MSG_PREPARE: + return "PREPARE"; + case LOGICAL_REP_MSG_COMMIT_PREPARED: + return "COMMIT PREPARED"; + case LOGICAL_REP_MSG_ROLLBACK_PREPARED: + return "ROLLBACK PREPARED"; + case LOGICAL_REP_MSG_STREAM_START: + return "STREAM START"; + case LOGICAL_REP_MSG_STREAM_STOP: + return "STREAM STOP"; + case LOGICAL_REP_MSG_STREAM_COMMIT: + return "STREAM COMMIT"; + case LOGICAL_REP_MSG_STREAM_ABORT: + return "STREAM ABORT"; + case LOGICAL_REP_MSG_STREAM_PREPARE: + return "STREAM PREPARE"; + } + + /* + * This message provides context in the error raised when applying a + * logical message. So we can't throw an error here. Return an unknown + * indicator value so that the original error is still reported. + */ + snprintf(err_unknown, sizeof(err_unknown), "??? (%d)", action); + + return err_unknown; +} diff --git a/src/backend/replication/logical/relation.c b/src/backend/replication/logical/relation.c new file mode 100644 index 0000000..6edbd56 --- /dev/null +++ b/src/backend/replication/logical/relation.c @@ -0,0 +1,889 @@ +/*------------------------------------------------------------------------- + * relation.c + * PostgreSQL logical replication relation mapping cache + * + * Copyright (c) 2016-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/relation.c + * + * NOTES + * Routines in this file mainly have to do with mapping the properties + * of local replication target relations to the properties of their + * remote counterpart. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/genam.h" +#include "access/table.h" +#include "catalog/namespace.h" +#include "catalog/pg_am_d.h" +#include "catalog/pg_subscription_rel.h" +#include "executor/executor.h" +#include "nodes/makefuncs.h" +#include "replication/logicalrelation.h" +#include "replication/worker_internal.h" +#include "utils/inval.h" + + +static MemoryContext LogicalRepRelMapContext = NULL; + +static HTAB *LogicalRepRelMap = NULL; + +/* + * Partition map (LogicalRepPartMap) + * + * When a partitioned table is used as replication target, replicated + * operations are actually performed on its leaf partitions, which requires + * the partitions to also be mapped to the remote relation. Parent's entry + * (LogicalRepRelMapEntry) cannot be used as-is for all partitions, because + * individual partitions may have different attribute numbers, which means + * attribute mappings to remote relation's attributes must be maintained + * separately for each partition. + */ +static MemoryContext LogicalRepPartMapContext = NULL; +static HTAB *LogicalRepPartMap = NULL; +typedef struct LogicalRepPartMapEntry +{ + Oid partoid; /* LogicalRepPartMap's key */ + LogicalRepRelMapEntry relmapentry; +} LogicalRepPartMapEntry; + +static Oid FindLogicalRepLocalIndex(Relation localrel, LogicalRepRelation *remoterel, + AttrMap *attrMap); + +/* + * Relcache invalidation callback for our relation map cache. + */ +static void +logicalrep_relmap_invalidate_cb(Datum arg, Oid reloid) +{ + LogicalRepRelMapEntry *entry; + + /* Just to be sure. */ + if (LogicalRepRelMap == NULL) + return; + + if (reloid != InvalidOid) + { + HASH_SEQ_STATUS status; + + hash_seq_init(&status, LogicalRepRelMap); + + /* TODO, use inverse lookup hashtable? */ + while ((entry = (LogicalRepRelMapEntry *) hash_seq_search(&status)) != NULL) + { + if (entry->localreloid == reloid) + { + entry->localrelvalid = false; + hash_seq_term(&status); + break; + } + } + } + else + { + /* invalidate all cache entries */ + HASH_SEQ_STATUS status; + + hash_seq_init(&status, LogicalRepRelMap); + + while ((entry = (LogicalRepRelMapEntry *) hash_seq_search(&status)) != NULL) + entry->localrelvalid = false; + } +} + +/* + * Initialize the relation map cache. + */ +static void +logicalrep_relmap_init(void) +{ + HASHCTL ctl; + + if (!LogicalRepRelMapContext) + LogicalRepRelMapContext = + AllocSetContextCreate(CacheMemoryContext, + "LogicalRepRelMapContext", + ALLOCSET_DEFAULT_SIZES); + + /* Initialize the relation hash table. */ + ctl.keysize = sizeof(LogicalRepRelId); + ctl.entrysize = sizeof(LogicalRepRelMapEntry); + ctl.hcxt = LogicalRepRelMapContext; + + LogicalRepRelMap = hash_create("logicalrep relation map cache", 128, &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + /* Watch for invalidation events. */ + CacheRegisterRelcacheCallback(logicalrep_relmap_invalidate_cb, + (Datum) 0); +} + +/* + * Free the entry of a relation map cache. + */ +static void +logicalrep_relmap_free_entry(LogicalRepRelMapEntry *entry) +{ + LogicalRepRelation *remoterel; + + remoterel = &entry->remoterel; + + pfree(remoterel->nspname); + pfree(remoterel->relname); + + if (remoterel->natts > 0) + { + int i; + + for (i = 0; i < remoterel->natts; i++) + pfree(remoterel->attnames[i]); + + pfree(remoterel->attnames); + pfree(remoterel->atttyps); + } + bms_free(remoterel->attkeys); + + if (entry->attrmap) + free_attrmap(entry->attrmap); +} + +/* + * Add new entry or update existing entry in the relation map cache. + * + * Called when new relation mapping is sent by the publisher to update + * our expected view of incoming data from said publisher. + */ +void +logicalrep_relmap_update(LogicalRepRelation *remoterel) +{ + MemoryContext oldctx; + LogicalRepRelMapEntry *entry; + bool found; + int i; + + if (LogicalRepRelMap == NULL) + logicalrep_relmap_init(); + + /* + * HASH_ENTER returns the existing entry if present or creates a new one. + */ + entry = hash_search(LogicalRepRelMap, &remoterel->remoteid, + HASH_ENTER, &found); + + if (found) + logicalrep_relmap_free_entry(entry); + + memset(entry, 0, sizeof(LogicalRepRelMapEntry)); + + /* Make cached copy of the data */ + oldctx = MemoryContextSwitchTo(LogicalRepRelMapContext); + entry->remoterel.remoteid = remoterel->remoteid; + entry->remoterel.nspname = pstrdup(remoterel->nspname); + entry->remoterel.relname = pstrdup(remoterel->relname); + entry->remoterel.natts = remoterel->natts; + entry->remoterel.attnames = palloc(remoterel->natts * sizeof(char *)); + entry->remoterel.atttyps = palloc(remoterel->natts * sizeof(Oid)); + for (i = 0; i < remoterel->natts; i++) + { + entry->remoterel.attnames[i] = pstrdup(remoterel->attnames[i]); + entry->remoterel.atttyps[i] = remoterel->atttyps[i]; + } + entry->remoterel.replident = remoterel->replident; + entry->remoterel.attkeys = bms_copy(remoterel->attkeys); + MemoryContextSwitchTo(oldctx); +} + +/* + * Find attribute index in TupleDesc struct by attribute name. + * + * Returns -1 if not found. + */ +static int +logicalrep_rel_att_by_name(LogicalRepRelation *remoterel, const char *attname) +{ + int i; + + for (i = 0; i < remoterel->natts; i++) + { + if (strcmp(remoterel->attnames[i], attname) == 0) + return i; + } + + return -1; +} + +/* + * Report error with names of the missing local relation column(s), if any. + */ +static void +logicalrep_report_missing_attrs(LogicalRepRelation *remoterel, + Bitmapset *missingatts) +{ + if (!bms_is_empty(missingatts)) + { + StringInfoData missingattsbuf; + int missingattcnt = 0; + int i; + + initStringInfo(&missingattsbuf); + + i = -1; + while ((i = bms_next_member(missingatts, i)) >= 0) + { + missingattcnt++; + if (missingattcnt == 1) + appendStringInfo(&missingattsbuf, _("\"%s\""), + remoterel->attnames[i]); + else + appendStringInfo(&missingattsbuf, _(", \"%s\""), + remoterel->attnames[i]); + } + + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg_plural("logical replication target relation \"%s.%s\" is missing replicated column: %s", + "logical replication target relation \"%s.%s\" is missing replicated columns: %s", + missingattcnt, + remoterel->nspname, + remoterel->relname, + missingattsbuf.data))); + } +} + +/* + * Check if replica identity matches and mark the updatable flag. + * + * We allow for stricter replica identity (fewer columns) on subscriber as + * that will not stop us from finding unique tuple. IE, if publisher has + * identity (id,timestamp) and subscriber just (id) this will not be a + * problem, but in the opposite scenario it will. + * + * We just mark the relation entry as not updatable here if the local + * replica identity is found to be insufficient for applying + * updates/deletes (inserts don't care!) and leave it to + * check_relation_updatable() to throw the actual error if needed. + */ +static void +logicalrep_rel_mark_updatable(LogicalRepRelMapEntry *entry) +{ + Bitmapset *idkey; + LogicalRepRelation *remoterel = &entry->remoterel; + int i; + + entry->updatable = true; + + idkey = RelationGetIndexAttrBitmap(entry->localrel, + INDEX_ATTR_BITMAP_IDENTITY_KEY); + /* fallback to PK if no replica identity */ + if (idkey == NULL) + { + idkey = RelationGetIndexAttrBitmap(entry->localrel, + INDEX_ATTR_BITMAP_PRIMARY_KEY); + + /* + * If no replica identity index and no PK, the published table must + * have replica identity FULL. + */ + if (idkey == NULL && remoterel->replident != REPLICA_IDENTITY_FULL) + entry->updatable = false; + } + + i = -1; + while ((i = bms_next_member(idkey, i)) >= 0) + { + int attnum = i + FirstLowInvalidHeapAttributeNumber; + + if (!AttrNumberIsForUserDefinedAttr(attnum)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication target relation \"%s.%s\" uses " + "system columns in REPLICA IDENTITY index", + remoterel->nspname, remoterel->relname))); + + attnum = AttrNumberGetAttrOffset(attnum); + + if (entry->attrmap->attnums[attnum] < 0 || + !bms_is_member(entry->attrmap->attnums[attnum], remoterel->attkeys)) + { + entry->updatable = false; + break; + } + } +} + +/* + * Open the local relation associated with the remote one. + * + * Rebuilds the Relcache mapping if it was invalidated by local DDL. + */ +LogicalRepRelMapEntry * +logicalrep_rel_open(LogicalRepRelId remoteid, LOCKMODE lockmode) +{ + LogicalRepRelMapEntry *entry; + bool found; + LogicalRepRelation *remoterel; + + if (LogicalRepRelMap == NULL) + logicalrep_relmap_init(); + + /* Search for existing entry. */ + entry = hash_search(LogicalRepRelMap, &remoteid, + HASH_FIND, &found); + + if (!found) + elog(ERROR, "no relation map entry for remote relation ID %u", + remoteid); + + remoterel = &entry->remoterel; + + /* Ensure we don't leak a relcache refcount. */ + if (entry->localrel) + elog(ERROR, "remote relation ID %u is already open", remoteid); + + /* + * When opening and locking a relation, pending invalidation messages are + * processed which can invalidate the relation. Hence, if the entry is + * currently considered valid, try to open the local relation by OID and + * see if invalidation ensues. + */ + if (entry->localrelvalid) + { + entry->localrel = try_table_open(entry->localreloid, lockmode); + if (!entry->localrel) + { + /* Table was renamed or dropped. */ + entry->localrelvalid = false; + } + else if (!entry->localrelvalid) + { + /* Note we release the no-longer-useful lock here. */ + table_close(entry->localrel, lockmode); + entry->localrel = NULL; + } + } + + /* + * If the entry has been marked invalid since we last had lock on it, + * re-open the local relation by name and rebuild all derived data. + */ + if (!entry->localrelvalid) + { + Oid relid; + TupleDesc desc; + MemoryContext oldctx; + int i; + Bitmapset *missingatts; + + /* Release the no-longer-useful attrmap, if any. */ + if (entry->attrmap) + { + free_attrmap(entry->attrmap); + entry->attrmap = NULL; + } + + /* Try to find and lock the relation by name. */ + relid = RangeVarGetRelid(makeRangeVar(remoterel->nspname, + remoterel->relname, -1), + lockmode, true); + if (!OidIsValid(relid)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication target relation \"%s.%s\" does not exist", + remoterel->nspname, remoterel->relname))); + entry->localrel = table_open(relid, NoLock); + entry->localreloid = relid; + + /* Check for supported relkind. */ + CheckSubscriptionRelkind(entry->localrel->rd_rel->relkind, + remoterel->nspname, remoterel->relname); + + /* + * Build the mapping of local attribute numbers to remote attribute + * numbers and validate that we don't miss any replicated columns as + * that would result in potentially unwanted data loss. + */ + desc = RelationGetDescr(entry->localrel); + oldctx = MemoryContextSwitchTo(LogicalRepRelMapContext); + entry->attrmap = make_attrmap(desc->natts); + MemoryContextSwitchTo(oldctx); + + /* check and report missing attrs, if any */ + missingatts = bms_add_range(NULL, 0, remoterel->natts - 1); + for (i = 0; i < desc->natts; i++) + { + int attnum; + Form_pg_attribute attr = TupleDescAttr(desc, i); + + if (attr->attisdropped || attr->attgenerated) + { + entry->attrmap->attnums[i] = -1; + continue; + } + + attnum = logicalrep_rel_att_by_name(remoterel, + NameStr(attr->attname)); + + entry->attrmap->attnums[i] = attnum; + if (attnum >= 0) + missingatts = bms_del_member(missingatts, attnum); + } + + logicalrep_report_missing_attrs(remoterel, missingatts); + + /* be tidy */ + bms_free(missingatts); + + /* + * Set if the table's replica identity is enough to apply + * update/delete. + */ + logicalrep_rel_mark_updatable(entry); + + /* + * Finding a usable index is an infrequent task. It occurs when an + * operation is first performed on the relation, or after invalidation + * of the relation cache entry (such as ANALYZE or CREATE/DROP index + * on the relation). + */ + entry->localindexoid = FindLogicalRepLocalIndex(entry->localrel, remoterel, + entry->attrmap); + + entry->localrelvalid = true; + } + + if (entry->state != SUBREL_STATE_READY) + entry->state = GetSubscriptionRelState(MySubscription->oid, + entry->localreloid, + &entry->statelsn); + + return entry; +} + +/* + * Close the previously opened logical relation. + */ +void +logicalrep_rel_close(LogicalRepRelMapEntry *rel, LOCKMODE lockmode) +{ + table_close(rel->localrel, lockmode); + rel->localrel = NULL; +} + +/* + * Partition cache: look up partition LogicalRepRelMapEntry's + * + * Unlike relation map cache, this is keyed by partition OID, not remote + * relation OID, because we only have to use this cache in the case where + * partitions are not directly mapped to any remote relation, such as when + * replication is occurring with one of their ancestors as target. + */ + +/* + * Relcache invalidation callback + */ +static void +logicalrep_partmap_invalidate_cb(Datum arg, Oid reloid) +{ + LogicalRepPartMapEntry *entry; + + /* Just to be sure. */ + if (LogicalRepPartMap == NULL) + return; + + if (reloid != InvalidOid) + { + HASH_SEQ_STATUS status; + + hash_seq_init(&status, LogicalRepPartMap); + + /* TODO, use inverse lookup hashtable? */ + while ((entry = (LogicalRepPartMapEntry *) hash_seq_search(&status)) != NULL) + { + if (entry->relmapentry.localreloid == reloid) + { + entry->relmapentry.localrelvalid = false; + hash_seq_term(&status); + break; + } + } + } + else + { + /* invalidate all cache entries */ + HASH_SEQ_STATUS status; + + hash_seq_init(&status, LogicalRepPartMap); + + while ((entry = (LogicalRepPartMapEntry *) hash_seq_search(&status)) != NULL) + entry->relmapentry.localrelvalid = false; + } +} + +/* + * Reset the entries in the partition map that refer to remoterel. + * + * Called when new relation mapping is sent by the publisher to update our + * expected view of incoming data from said publisher. + * + * Note that we don't update the remoterel information in the entry here, + * we will update the information in logicalrep_partition_open to avoid + * unnecessary work. + */ +void +logicalrep_partmap_reset_relmap(LogicalRepRelation *remoterel) +{ + HASH_SEQ_STATUS status; + LogicalRepPartMapEntry *part_entry; + LogicalRepRelMapEntry *entry; + + if (LogicalRepPartMap == NULL) + return; + + hash_seq_init(&status, LogicalRepPartMap); + while ((part_entry = (LogicalRepPartMapEntry *) hash_seq_search(&status)) != NULL) + { + entry = &part_entry->relmapentry; + + if (entry->remoterel.remoteid != remoterel->remoteid) + continue; + + logicalrep_relmap_free_entry(entry); + + memset(entry, 0, sizeof(LogicalRepRelMapEntry)); + } +} + +/* + * Initialize the partition map cache. + */ +static void +logicalrep_partmap_init(void) +{ + HASHCTL ctl; + + if (!LogicalRepPartMapContext) + LogicalRepPartMapContext = + AllocSetContextCreate(CacheMemoryContext, + "LogicalRepPartMapContext", + ALLOCSET_DEFAULT_SIZES); + + /* Initialize the relation hash table. */ + ctl.keysize = sizeof(Oid); /* partition OID */ + ctl.entrysize = sizeof(LogicalRepPartMapEntry); + ctl.hcxt = LogicalRepPartMapContext; + + LogicalRepPartMap = hash_create("logicalrep partition map cache", 64, &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + /* Watch for invalidation events. */ + CacheRegisterRelcacheCallback(logicalrep_partmap_invalidate_cb, + (Datum) 0); +} + +/* + * logicalrep_partition_open + * + * Returned entry reuses most of the values of the root table's entry, save + * the attribute map, which can be different for the partition. However, + * we must physically copy all the data, in case the root table's entry + * gets freed/rebuilt. + * + * Note there's no logicalrep_partition_close, because the caller closes the + * component relation. + */ +LogicalRepRelMapEntry * +logicalrep_partition_open(LogicalRepRelMapEntry *root, + Relation partrel, AttrMap *map) +{ + LogicalRepRelMapEntry *entry; + LogicalRepPartMapEntry *part_entry; + LogicalRepRelation *remoterel = &root->remoterel; + Oid partOid = RelationGetRelid(partrel); + AttrMap *attrmap = root->attrmap; + bool found; + MemoryContext oldctx; + + if (LogicalRepPartMap == NULL) + logicalrep_partmap_init(); + + /* Search for existing entry. */ + part_entry = (LogicalRepPartMapEntry *) hash_search(LogicalRepPartMap, + &partOid, + HASH_ENTER, &found); + + entry = &part_entry->relmapentry; + + /* + * We must always overwrite entry->localrel with the latest partition + * Relation pointer, because the Relation pointed to by the old value may + * have been cleared after the caller would have closed the partition + * relation after the last use of this entry. Note that localrelvalid is + * only updated by the relcache invalidation callback, so it may still be + * true irrespective of whether the Relation pointed to by localrel has + * been cleared or not. + */ + if (found && entry->localrelvalid) + { + entry->localrel = partrel; + return entry; + } + + /* Switch to longer-lived context. */ + oldctx = MemoryContextSwitchTo(LogicalRepPartMapContext); + + if (!found) + { + memset(part_entry, 0, sizeof(LogicalRepPartMapEntry)); + part_entry->partoid = partOid; + } + + /* Release the no-longer-useful attrmap, if any. */ + if (entry->attrmap) + { + free_attrmap(entry->attrmap); + entry->attrmap = NULL; + } + + if (!entry->remoterel.remoteid) + { + int i; + + /* Remote relation is copied as-is from the root entry. */ + entry = &part_entry->relmapentry; + entry->remoterel.remoteid = remoterel->remoteid; + entry->remoterel.nspname = pstrdup(remoterel->nspname); + entry->remoterel.relname = pstrdup(remoterel->relname); + entry->remoterel.natts = remoterel->natts; + entry->remoterel.attnames = palloc(remoterel->natts * sizeof(char *)); + entry->remoterel.atttyps = palloc(remoterel->natts * sizeof(Oid)); + for (i = 0; i < remoterel->natts; i++) + { + entry->remoterel.attnames[i] = pstrdup(remoterel->attnames[i]); + entry->remoterel.atttyps[i] = remoterel->atttyps[i]; + } + entry->remoterel.replident = remoterel->replident; + entry->remoterel.attkeys = bms_copy(remoterel->attkeys); + } + + entry->localrel = partrel; + entry->localreloid = partOid; + + /* + * If the partition's attributes don't match the root relation's, we'll + * need to make a new attrmap which maps partition attribute numbers to + * remoterel's, instead of the original which maps root relation's + * attribute numbers to remoterel's. + * + * Note that 'map' which comes from the tuple routing data structure + * contains 1-based attribute numbers (of the parent relation). However, + * the map in 'entry', a logical replication data structure, contains + * 0-based attribute numbers (of the remote relation). + */ + if (map) + { + AttrNumber attno; + + entry->attrmap = make_attrmap(map->maplen); + for (attno = 0; attno < entry->attrmap->maplen; attno++) + { + AttrNumber root_attno = map->attnums[attno]; + + /* 0 means it's a dropped attribute. See comments atop AttrMap. */ + if (root_attno == 0) + entry->attrmap->attnums[attno] = -1; + else + entry->attrmap->attnums[attno] = attrmap->attnums[root_attno - 1]; + } + } + else + { + /* Lacking copy_attmap, do this the hard way. */ + entry->attrmap = make_attrmap(attrmap->maplen); + memcpy(entry->attrmap->attnums, attrmap->attnums, + attrmap->maplen * sizeof(AttrNumber)); + } + + /* Set if the table's replica identity is enough to apply update/delete. */ + logicalrep_rel_mark_updatable(entry); + + /* state and statelsn are left set to 0. */ + MemoryContextSwitchTo(oldctx); + + /* + * Finding a usable index is an infrequent task. It occurs when an + * operation is first performed on the relation, or after invalidation of + * the relation cache entry (such as ANALYZE or CREATE/DROP index on the + * relation). + * + * We also prefer to run this code on the oldctx so that we do not leak + * anything in the LogicalRepPartMapContext (hence CacheMemoryContext). + */ + entry->localindexoid = FindLogicalRepLocalIndex(partrel, remoterel, + entry->attrmap); + + entry->localrelvalid = true; + + return entry; +} + +/* + * Returns the oid of an index that can be used by the apply worker to scan + * the relation. + * + * We expect to call this function when REPLICA IDENTITY FULL is defined for + * the remote relation. + * + * If no suitable index is found, returns InvalidOid. + */ +static Oid +FindUsableIndexForReplicaIdentityFull(Relation localrel, AttrMap *attrmap) +{ + List *idxlist = RelationGetIndexList(localrel); + ListCell *lc; + + foreach(lc, idxlist) + { + Oid idxoid = lfirst_oid(lc); + bool isUsableIdx; + Relation idxRel; + IndexInfo *idxInfo; + + idxRel = index_open(idxoid, AccessShareLock); + idxInfo = BuildIndexInfo(idxRel); + isUsableIdx = IsIndexUsableForReplicaIdentityFull(idxInfo, attrmap); + index_close(idxRel, AccessShareLock); + + /* Return the first eligible index found */ + if (isUsableIdx) + return idxoid; + } + + return InvalidOid; +} + +/* + * Returns true if the index is usable for replica identity full. + * + * The index must be btree, non-partial, and the leftmost field must be a + * column (not an expression) that references the remote relation column. + * These limitations help to keep the index scan similar to PK/RI index + * scans. + * + * attrmap is a map of local attributes to remote ones. We can consult this + * map to check whether the local index attribute has a corresponding remote + * attribute. + * + * Note that the limitations of index scans for replica identity full only + * adheres to a subset of the limitations of PK/RI. For example, we support + * columns that are marked as [NULL] or we are not interested in the [NOT + * DEFERRABLE] aspect of constraints here. It works for us because we always + * compare the tuples for non-PK/RI index scans. See + * RelationFindReplTupleByIndex(). + * + * XXX: There are no fundamental problems for supporting non-btree indexes. + * We mostly need to relax the limitations in RelationFindReplTupleByIndex(). + * For partial indexes, the required changes are likely to be larger. If + * none of the tuples satisfy the expression for the index scan, we fall-back + * to sequential execution, which might not be a good idea in some cases. + */ +bool +IsIndexUsableForReplicaIdentityFull(IndexInfo *indexInfo, AttrMap *attrmap) +{ + AttrNumber keycol; + + /* The index must be a Btree index */ + if (indexInfo->ii_Am != BTREE_AM_OID) + return false; + + /* The index must not be a partial index */ + if (indexInfo->ii_Predicate != NIL) + return false; + + Assert(indexInfo->ii_NumIndexAttrs >= 1); + + /* The leftmost index field must not be an expression */ + keycol = indexInfo->ii_IndexAttrNumbers[0]; + if (!AttributeNumberIsValid(keycol)) + return false; + + /* + * And the leftmost index field must reference the remote relation column. + * This is because if it doesn't, the sequential scan is favorable over + * index scan in most cases. + */ + if (attrmap->maplen <= AttrNumberGetAttrOffset(keycol) || + attrmap->attnums[AttrNumberGetAttrOffset(keycol)] < 0) + return false; + + return true; +} + +/* + * Get replica identity index or if it is not defined a primary key. + * + * If neither is defined, returns InvalidOid + */ +Oid +GetRelationIdentityOrPK(Relation rel) +{ + Oid idxoid; + + idxoid = RelationGetReplicaIndex(rel); + + if (!OidIsValid(idxoid)) + idxoid = RelationGetPrimaryKeyIndex(rel); + + return idxoid; +} + +/* + * Returns the index oid if we can use an index for subscriber. Otherwise, + * returns InvalidOid. + */ +static Oid +FindLogicalRepLocalIndex(Relation localrel, LogicalRepRelation *remoterel, + AttrMap *attrMap) +{ + Oid idxoid; + + /* + * We never need index oid for partitioned tables, always rely on leaf + * partition's index. + */ + if (localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + return InvalidOid; + + /* + * Simple case, we already have a primary key or a replica identity index. + */ + idxoid = GetRelationIdentityOrPK(localrel); + if (OidIsValid(idxoid)) + return idxoid; + + if (remoterel->replident == REPLICA_IDENTITY_FULL) + { + /* + * We are looking for one more opportunity for using an index. If + * there are any indexes defined on the local relation, try to pick a + * suitable index. + * + * The index selection safely assumes that all the columns are going + * to be available for the index scan given that remote relation has + * replica identity full. + * + * Note that we are not using the planner to find the cheapest method + * to scan the relation as that would require us to either use lower + * level planner functions which would be a maintenance burden in the + * long run or use the full-fledged planner which could cause + * overhead. + */ + return FindUsableIndexForReplicaIdentityFull(localrel, attrMap); + } + + return InvalidOid; +} diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c new file mode 100644 index 0000000..0dab0bb --- /dev/null +++ b/src/backend/replication/logical/reorderbuffer.c @@ -0,0 +1,5280 @@ +/*------------------------------------------------------------------------- + * + * reorderbuffer.c + * PostgreSQL logical replay/reorder buffer management + * + * + * Copyright (c) 2012-2023, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/replication/logical/reorderbuffer.c + * + * NOTES + * This module gets handed individual pieces of transactions in the order + * they are written to the WAL and is responsible to reassemble them into + * toplevel transaction sized pieces. When a transaction is completely + * reassembled - signaled by reading the transaction commit record - it + * will then call the output plugin (cf. ReorderBufferCommit()) with the + * individual changes. The output plugins rely on snapshots built by + * snapbuild.c which hands them to us. + * + * Transactions and subtransactions/savepoints in postgres are not + * immediately linked to each other from outside the performing + * backend. Only at commit/abort (or special xact_assignment records) they + * are linked together. Which means that we will have to splice together a + * toplevel transaction from its subtransactions. To do that efficiently we + * build a binary heap indexed by the smallest current lsn of the individual + * subtransactions' changestreams. As the individual streams are inherently + * ordered by LSN - since that is where we build them from - the transaction + * can easily be reassembled by always using the subtransaction with the + * smallest current LSN from the heap. + * + * In order to cope with large transactions - which can be several times as + * big as the available memory - this module supports spooling the contents + * of a large transactions to disk. When the transaction is replayed the + * contents of individual (sub-)transactions will be read from disk in + * chunks. + * + * This module also has to deal with reassembling toast records from the + * individual chunks stored in WAL. When a new (or initial) version of a + * tuple is stored in WAL it will always be preceded by the toast chunks + * emitted for the columns stored out of line. Within a single toplevel + * transaction there will be no other data carrying records between a row's + * toast chunks and the row data itself. See ReorderBufferToast* for + * details. + * + * ReorderBuffer uses two special memory context types - SlabContext for + * allocations of fixed-length structures (changes and transactions), and + * GenerationContext for the variable-length transaction data (allocated + * and freed in groups with similar lifespans). + * + * To limit the amount of memory used by decoded changes, we track memory + * used at the reorder buffer level (i.e. total amount of memory), and for + * each transaction. When the total amount of used memory exceeds the + * limit, the transaction consuming the most memory is then serialized to + * disk. + * + * Only decoded changes are evicted from memory (spilled to disk), not the + * transaction records. The number of toplevel transactions is limited, + * but a transaction with many subtransactions may still consume significant + * amounts of memory. However, the transaction records are fairly small and + * are not included in the memory limit. + * + * The current eviction algorithm is very simple - the transaction is + * picked merely by size, while it might be useful to also consider age + * (LSN) of the changes for example. With the new Generational memory + * allocator, evicting the oldest changes would make it more likely the + * memory gets actually freed. + * + * We still rely on max_changes_in_memory when loading serialized changes + * back into memory. At that point we can't use the memory limit directly + * as we load the subxacts independently. One option to deal with this + * would be to count the subxacts, and allow each to allocate 1/N of the + * memory limit. That however does not seem very appealing, because with + * many subtransactions it may easily cause thrashing (short cycles of + * deserializing and applying very few changes). We probably should give + * a bit more memory to the oldest subtransactions, because it's likely + * they are the source for the next sequence of changes. + * + * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <unistd.h> +#include <sys/stat.h> + +#include "access/detoast.h" +#include "access/heapam.h" +#include "access/rewriteheap.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "catalog/catalog.h" +#include "lib/binaryheap.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "replication/logical.h" +#include "replication/reorderbuffer.h" +#include "replication/slot.h" +#include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */ +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "storage/sinval.h" +#include "utils/builtins.h" +#include "utils/combocid.h" +#include "utils/memdebug.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/relfilenumbermap.h" + + +/* entry for a hash table we use to map from xid to our transaction state */ +typedef struct ReorderBufferTXNByIdEnt +{ + TransactionId xid; + ReorderBufferTXN *txn; +} ReorderBufferTXNByIdEnt; + +/* data structures for (relfilelocator, ctid) => (cmin, cmax) mapping */ +typedef struct ReorderBufferTupleCidKey +{ + RelFileLocator rlocator; + ItemPointerData tid; +} ReorderBufferTupleCidKey; + +typedef struct ReorderBufferTupleCidEnt +{ + ReorderBufferTupleCidKey key; + CommandId cmin; + CommandId cmax; + CommandId combocid; /* just for debugging */ +} ReorderBufferTupleCidEnt; + +/* Virtual file descriptor with file offset tracking */ +typedef struct TXNEntryFile +{ + File vfd; /* -1 when the file is closed */ + off_t curOffset; /* offset for next write or read. Reset to 0 + * when vfd is opened. */ +} TXNEntryFile; + +/* k-way in-order change iteration support structures */ +typedef struct ReorderBufferIterTXNEntry +{ + XLogRecPtr lsn; + ReorderBufferChange *change; + ReorderBufferTXN *txn; + TXNEntryFile file; + XLogSegNo segno; +} ReorderBufferIterTXNEntry; + +typedef struct ReorderBufferIterTXNState +{ + binaryheap *heap; + Size nr_txns; + dlist_head old_change; + ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER]; +} ReorderBufferIterTXNState; + +/* toast datastructures */ +typedef struct ReorderBufferToastEnt +{ + Oid chunk_id; /* toast_table.chunk_id */ + int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we + * have seen */ + Size num_chunks; /* number of chunks we've already seen */ + Size size; /* combined size of chunks seen */ + dlist_head chunks; /* linked list of chunks */ + struct varlena *reconstructed; /* reconstructed varlena now pointed to in + * main tup */ +} ReorderBufferToastEnt; + +/* Disk serialization support datastructures */ +typedef struct ReorderBufferDiskChange +{ + Size size; + ReorderBufferChange change; + /* data follows */ +} ReorderBufferDiskChange; + +#define IsSpecInsert(action) \ +( \ + ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT) \ +) +#define IsSpecConfirmOrAbort(action) \ +( \ + (((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM) || \ + ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT)) \ +) +#define IsInsertOrUpdate(action) \ +( \ + (((action) == REORDER_BUFFER_CHANGE_INSERT) || \ + ((action) == REORDER_BUFFER_CHANGE_UPDATE) || \ + ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)) \ +) + +/* + * Maximum number of changes kept in memory, per transaction. After that, + * changes are spooled to disk. + * + * The current value should be sufficient to decode the entire transaction + * without hitting disk in OLTP workloads, while starting to spool to disk in + * other workloads reasonably fast. + * + * At some point in the future it probably makes sense to have a more elaborate + * resource management here, but it's not entirely clear what that would look + * like. + */ +int logical_decoding_work_mem; +static const Size max_changes_in_memory = 4096; /* XXX for restore only */ + +/* GUC variable */ +int debug_logical_replication_streaming = DEBUG_LOGICAL_REP_STREAMING_BUFFERED; + +/* --------------------------------------- + * primary reorderbuffer support routines + * --------------------------------------- + */ +static ReorderBufferTXN *ReorderBufferGetTXN(ReorderBuffer *rb); +static void ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); +static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb, + TransactionId xid, bool create, bool *is_new, + XLogRecPtr lsn, bool create_as_top); +static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn, + ReorderBufferTXN *subtxn); + +static void AssertTXNLsnOrder(ReorderBuffer *rb); + +/* --------------------------------------- + * support functions for lsn-order iterating over the ->changes of a + * transaction and its subtransactions + * + * used for iteration over the k-way heap merge of a transaction and its + * subtransactions + * --------------------------------------- + */ +static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn, + ReorderBufferIterTXNState *volatile *iter_state); +static ReorderBufferChange *ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state); +static void ReorderBufferIterTXNFinish(ReorderBuffer *rb, + ReorderBufferIterTXNState *state); +static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs); + +/* + * --------------------------------------- + * Disk serialization support functions + * --------------------------------------- + */ +static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb); +static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + int fd, ReorderBufferChange *change); +static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, + TXNEntryFile *file, XLogSegNo *segno); +static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + char *data); +static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, + bool txn_prepared); +static void ReorderBufferCleanupSerializedTXNs(const char *slotname); +static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, + TransactionId xid, XLogSegNo segno); + +static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap); +static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap, + ReorderBufferTXN *txn, CommandId cid); + +/* + * --------------------------------------- + * Streaming support functions + * --------------------------------------- + */ +static inline bool ReorderBufferCanStream(ReorderBuffer *rb); +static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb); +static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn); + +/* --------------------------------------- + * toast reassembly support + * --------------------------------------- + */ +static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change); +static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change); + +/* + * --------------------------------------- + * memory accounting + * --------------------------------------- + */ +static Size ReorderBufferChangeSize(ReorderBufferChange *change); +static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb, + ReorderBufferChange *change, + bool addition, Size sz); + +/* + * Allocate a new ReorderBuffer and clean out any old serialized state from + * prior ReorderBuffer instances for the same slot. + */ +ReorderBuffer * +ReorderBufferAllocate(void) +{ + ReorderBuffer *buffer; + HASHCTL hash_ctl; + MemoryContext new_ctx; + + Assert(MyReplicationSlot != NULL); + + /* allocate memory in own context, to have better accountability */ + new_ctx = AllocSetContextCreate(CurrentMemoryContext, + "ReorderBuffer", + ALLOCSET_DEFAULT_SIZES); + + buffer = + (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer)); + + memset(&hash_ctl, 0, sizeof(hash_ctl)); + + buffer->context = new_ctx; + + buffer->change_context = SlabContextCreate(new_ctx, + "Change", + SLAB_DEFAULT_BLOCK_SIZE, + sizeof(ReorderBufferChange)); + + buffer->txn_context = SlabContextCreate(new_ctx, + "TXN", + SLAB_DEFAULT_BLOCK_SIZE, + sizeof(ReorderBufferTXN)); + + /* + * XXX the allocation sizes used below pre-date generation context's block + * growing code. These values should likely be benchmarked and set to + * more suitable values. + */ + buffer->tup_context = GenerationContextCreate(new_ctx, + "Tuples", + SLAB_LARGE_BLOCK_SIZE, + SLAB_LARGE_BLOCK_SIZE, + SLAB_LARGE_BLOCK_SIZE); + + hash_ctl.keysize = sizeof(TransactionId); + hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt); + hash_ctl.hcxt = buffer->context; + + buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + buffer->by_txn_last_xid = InvalidTransactionId; + buffer->by_txn_last_txn = NULL; + + buffer->outbuf = NULL; + buffer->outbufsize = 0; + buffer->size = 0; + + buffer->spillTxns = 0; + buffer->spillCount = 0; + buffer->spillBytes = 0; + buffer->streamTxns = 0; + buffer->streamCount = 0; + buffer->streamBytes = 0; + buffer->totalTxns = 0; + buffer->totalBytes = 0; + + buffer->current_restart_decoding_lsn = InvalidXLogRecPtr; + + dlist_init(&buffer->toplevel_by_lsn); + dlist_init(&buffer->txns_by_base_snapshot_lsn); + dclist_init(&buffer->catchange_txns); + + /* + * Ensure there's no stale data from prior uses of this slot, in case some + * prior exit avoided calling ReorderBufferFree. Failure to do this can + * produce duplicated txns, and it's very cheap if there's nothing there. + */ + ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name)); + + return buffer; +} + +/* + * Free a ReorderBuffer + */ +void +ReorderBufferFree(ReorderBuffer *rb) +{ + MemoryContext context = rb->context; + + /* + * We free separately allocated data by entirely scrapping reorderbuffer's + * memory context. + */ + MemoryContextDelete(context); + + /* Free disk space used by unconsumed reorder buffers */ + ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name)); +} + +/* + * Get an unused, possibly preallocated, ReorderBufferTXN. + */ +static ReorderBufferTXN * +ReorderBufferGetTXN(ReorderBuffer *rb) +{ + ReorderBufferTXN *txn; + + txn = (ReorderBufferTXN *) + MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN)); + + memset(txn, 0, sizeof(ReorderBufferTXN)); + + dlist_init(&txn->changes); + dlist_init(&txn->tuplecids); + dlist_init(&txn->subtxns); + + /* InvalidCommandId is not zero, so set it explicitly */ + txn->command_id = InvalidCommandId; + txn->output_plugin_private = NULL; + + return txn; +} + +/* + * Free a ReorderBufferTXN. + */ +static void +ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + /* clean the lookup cache if we were cached (quite likely) */ + if (rb->by_txn_last_xid == txn->xid) + { + rb->by_txn_last_xid = InvalidTransactionId; + rb->by_txn_last_txn = NULL; + } + + /* free data that's contained */ + + if (txn->gid != NULL) + { + pfree(txn->gid); + txn->gid = NULL; + } + + if (txn->tuplecid_hash != NULL) + { + hash_destroy(txn->tuplecid_hash); + txn->tuplecid_hash = NULL; + } + + if (txn->invalidations) + { + pfree(txn->invalidations); + txn->invalidations = NULL; + } + + /* Reset the toast hash */ + ReorderBufferToastReset(rb, txn); + + pfree(txn); +} + +/* + * Get a fresh ReorderBufferChange. + */ +ReorderBufferChange * +ReorderBufferGetChange(ReorderBuffer *rb) +{ + ReorderBufferChange *change; + + change = (ReorderBufferChange *) + MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange)); + + memset(change, 0, sizeof(ReorderBufferChange)); + return change; +} + +/* + * Free a ReorderBufferChange and update memory accounting, if requested. + */ +void +ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change, + bool upd_mem) +{ + /* update memory accounting info */ + if (upd_mem) + ReorderBufferChangeMemoryUpdate(rb, change, false, + ReorderBufferChangeSize(change)); + + /* free contained data */ + switch (change->action) + { + case REORDER_BUFFER_CHANGE_INSERT: + case REORDER_BUFFER_CHANGE_UPDATE: + case REORDER_BUFFER_CHANGE_DELETE: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT: + if (change->data.tp.newtuple) + { + ReorderBufferReturnTupleBuf(rb, change->data.tp.newtuple); + change->data.tp.newtuple = NULL; + } + + if (change->data.tp.oldtuple) + { + ReorderBufferReturnTupleBuf(rb, change->data.tp.oldtuple); + change->data.tp.oldtuple = NULL; + } + break; + case REORDER_BUFFER_CHANGE_MESSAGE: + if (change->data.msg.prefix != NULL) + pfree(change->data.msg.prefix); + change->data.msg.prefix = NULL; + if (change->data.msg.message != NULL) + pfree(change->data.msg.message); + change->data.msg.message = NULL; + break; + case REORDER_BUFFER_CHANGE_INVALIDATION: + if (change->data.inval.invalidations) + pfree(change->data.inval.invalidations); + change->data.inval.invalidations = NULL; + break; + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + if (change->data.snapshot) + { + ReorderBufferFreeSnap(rb, change->data.snapshot); + change->data.snapshot = NULL; + } + break; + /* no data in addition to the struct itself */ + case REORDER_BUFFER_CHANGE_TRUNCATE: + if (change->data.truncate.relids != NULL) + { + ReorderBufferReturnRelids(rb, change->data.truncate.relids); + change->data.truncate.relids = NULL; + } + break; + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT: + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + break; + } + + pfree(change); +} + +/* + * Get a fresh ReorderBufferTupleBuf fitting at least a tuple of size + * tuple_len (excluding header overhead). + */ +ReorderBufferTupleBuf * +ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len) +{ + ReorderBufferTupleBuf *tuple; + Size alloc_len; + + alloc_len = tuple_len + SizeofHeapTupleHeader; + + tuple = (ReorderBufferTupleBuf *) + MemoryContextAlloc(rb->tup_context, + sizeof(ReorderBufferTupleBuf) + + MAXIMUM_ALIGNOF + alloc_len); + tuple->alloc_tuple_size = alloc_len; + tuple->tuple.t_data = ReorderBufferTupleBufData(tuple); + + return tuple; +} + +/* + * Free a ReorderBufferTupleBuf. + */ +void +ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple) +{ + pfree(tuple); +} + +/* + * Get an array for relids of truncated relations. + * + * We use the global memory context (for the whole reorder buffer), because + * none of the existing ones seems like a good match (some are SLAB, so we + * can't use those, and tup_context is meant for tuple data, not relids). We + * could add yet another context, but it seems like an overkill - TRUNCATE is + * not particularly common operation, so it does not seem worth it. + */ +Oid * +ReorderBufferGetRelids(ReorderBuffer *rb, int nrelids) +{ + Oid *relids; + Size alloc_len; + + alloc_len = sizeof(Oid) * nrelids; + + relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len); + + return relids; +} + +/* + * Free an array of relids. + */ +void +ReorderBufferReturnRelids(ReorderBuffer *rb, Oid *relids) +{ + pfree(relids); +} + +/* + * Return the ReorderBufferTXN from the given buffer, specified by Xid. + * If create is true, and a transaction doesn't already exist, create it + * (with the given LSN, and as top transaction if that's specified); + * when this happens, is_new is set to true. + */ +static ReorderBufferTXN * +ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create, + bool *is_new, XLogRecPtr lsn, bool create_as_top) +{ + ReorderBufferTXN *txn; + ReorderBufferTXNByIdEnt *ent; + bool found; + + Assert(TransactionIdIsValid(xid)); + + /* + * Check the one-entry lookup cache first + */ + if (TransactionIdIsValid(rb->by_txn_last_xid) && + rb->by_txn_last_xid == xid) + { + txn = rb->by_txn_last_txn; + + if (txn != NULL) + { + /* found it, and it's valid */ + if (is_new) + *is_new = false; + return txn; + } + + /* + * cached as non-existent, and asked not to create? Then nothing else + * to do. + */ + if (!create) + return NULL; + /* otherwise fall through to create it */ + } + + /* + * If the cache wasn't hit or it yielded a "does-not-exist" and we want to + * create an entry. + */ + + /* search the lookup table */ + ent = (ReorderBufferTXNByIdEnt *) + hash_search(rb->by_txn, + &xid, + create ? HASH_ENTER : HASH_FIND, + &found); + if (found) + txn = ent->txn; + else if (create) + { + /* initialize the new entry, if creation was requested */ + Assert(ent != NULL); + Assert(lsn != InvalidXLogRecPtr); + + ent->txn = ReorderBufferGetTXN(rb); + ent->txn->xid = xid; + txn = ent->txn; + txn->first_lsn = lsn; + txn->restart_decoding_lsn = rb->current_restart_decoding_lsn; + + if (create_as_top) + { + dlist_push_tail(&rb->toplevel_by_lsn, &txn->node); + AssertTXNLsnOrder(rb); + } + } + else + txn = NULL; /* not found and not asked to create */ + + /* update cache */ + rb->by_txn_last_xid = xid; + rb->by_txn_last_txn = txn; + + if (is_new) + *is_new = !found; + + Assert(!create || txn != NULL); + return txn; +} + +/* + * Record the partial change for the streaming of in-progress transactions. We + * can stream only complete changes so if we have a partial change like toast + * table insert or speculative insert then we mark such a 'txn' so that it + * can't be streamed. We also ensure that if the changes in such a 'txn' can + * be streamed and are above logical_decoding_work_mem threshold then we stream + * them as soon as we have a complete change. + */ +static void +ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + ReorderBufferChange *change, + bool toast_insert) +{ + ReorderBufferTXN *toptxn; + + /* + * The partial changes need to be processed only while streaming + * in-progress transactions. + */ + if (!ReorderBufferCanStream(rb)) + return; + + /* Get the top transaction. */ + toptxn = rbtxn_get_toptxn(txn); + + /* + * Indicate a partial change for toast inserts. The change will be + * considered as complete once we get the insert or update on the main + * table and we are sure that the pending toast chunks are not required + * anymore. + * + * If we allow streaming when there are pending toast chunks then such + * chunks won't be released till the insert (multi_insert) is complete and + * we expect the txn to have streamed all changes after streaming. This + * restriction is mainly to ensure the correctness of streamed + * transactions and it doesn't seem worth uplifting such a restriction + * just to allow this case because anyway we will stream the transaction + * once such an insert is complete. + */ + if (toast_insert) + toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE; + else if (rbtxn_has_partial_change(toptxn) && + IsInsertOrUpdate(change->action) && + change->data.tp.clear_toast_afterwards) + toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE; + + /* + * Indicate a partial change for speculative inserts. The change will be + * considered as complete once we get the speculative confirm or abort + * token. + */ + if (IsSpecInsert(change->action)) + toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE; + else if (rbtxn_has_partial_change(toptxn) && + IsSpecConfirmOrAbort(change->action)) + toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE; + + /* + * Stream the transaction if it is serialized before and the changes are + * now complete in the top-level transaction. + * + * The reason for doing the streaming of such a transaction as soon as we + * get the complete change for it is that previously it would have reached + * the memory threshold and wouldn't get streamed because of incomplete + * changes. Delaying such transactions would increase apply lag for them. + */ + if (ReorderBufferCanStartStreaming(rb) && + !(rbtxn_has_partial_change(toptxn)) && + rbtxn_is_serialized(txn) && + rbtxn_has_streamable_change(toptxn)) + ReorderBufferStreamTXN(rb, toptxn); +} + +/* + * Queue a change into a transaction so it can be replayed upon commit or will be + * streamed when we reach logical_decoding_work_mem threshold. + */ +void +ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, + ReorderBufferChange *change, bool toast_insert) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + /* + * While streaming the previous changes we have detected that the + * transaction is aborted. So there is no point in collecting further + * changes for it. + */ + if (txn->concurrent_abort) + { + /* + * We don't need to update memory accounting for this change as we + * have not added it to the queue yet. + */ + ReorderBufferReturnChange(rb, change, false); + return; + } + + /* + * The changes that are sent downstream are considered streamable. We + * remember such transactions so that only those will later be considered + * for streaming. + */ + if (change->action == REORDER_BUFFER_CHANGE_INSERT || + change->action == REORDER_BUFFER_CHANGE_UPDATE || + change->action == REORDER_BUFFER_CHANGE_DELETE || + change->action == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT || + change->action == REORDER_BUFFER_CHANGE_TRUNCATE || + change->action == REORDER_BUFFER_CHANGE_MESSAGE) + { + ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn); + + toptxn->txn_flags |= RBTXN_HAS_STREAMABLE_CHANGE; + } + + change->lsn = lsn; + change->txn = txn; + + Assert(InvalidXLogRecPtr != lsn); + dlist_push_tail(&txn->changes, &change->node); + txn->nentries++; + txn->nentries_mem++; + + /* update memory accounting information */ + ReorderBufferChangeMemoryUpdate(rb, change, true, + ReorderBufferChangeSize(change)); + + /* process partial change */ + ReorderBufferProcessPartialChange(rb, txn, change, toast_insert); + + /* check the memory limits and evict something if needed */ + ReorderBufferCheckMemoryLimit(rb); +} + +/* + * A transactional message is queued to be processed upon commit and a + * non-transactional message gets processed immediately. + */ +void +ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid, + Snapshot snap, XLogRecPtr lsn, + bool transactional, const char *prefix, + Size message_size, const char *message) +{ + if (transactional) + { + MemoryContext oldcontext; + ReorderBufferChange *change; + + Assert(xid != InvalidTransactionId); + + /* + * We don't expect snapshots for transactional changes - we'll use the + * snapshot derived later during apply (unless the change gets + * skipped). + */ + Assert(!snap); + + oldcontext = MemoryContextSwitchTo(rb->context); + + change = ReorderBufferGetChange(rb); + change->action = REORDER_BUFFER_CHANGE_MESSAGE; + change->data.msg.prefix = pstrdup(prefix); + change->data.msg.message_size = message_size; + change->data.msg.message = palloc(message_size); + memcpy(change->data.msg.message, message, message_size); + + ReorderBufferQueueChange(rb, xid, lsn, change, false); + + MemoryContextSwitchTo(oldcontext); + } + else + { + ReorderBufferTXN *txn = NULL; + volatile Snapshot snapshot_now = snap; + + /* Non-transactional changes require a valid snapshot. */ + Assert(snapshot_now); + + if (xid != InvalidTransactionId) + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + /* setup snapshot to allow catalog access */ + SetupHistoricSnapshot(snapshot_now, NULL); + PG_TRY(); + { + rb->message(rb, txn, lsn, false, prefix, message_size, message); + + TeardownHistoricSnapshot(false); + } + PG_CATCH(); + { + TeardownHistoricSnapshot(true); + PG_RE_THROW(); + } + PG_END_TRY(); + } +} + +/* + * AssertTXNLsnOrder + * Verify LSN ordering of transaction lists in the reorderbuffer + * + * Other LSN-related invariants are checked too. + * + * No-op if assertions are not in use. + */ +static void +AssertTXNLsnOrder(ReorderBuffer *rb) +{ +#ifdef USE_ASSERT_CHECKING + LogicalDecodingContext *ctx = rb->private_data; + dlist_iter iter; + XLogRecPtr prev_first_lsn = InvalidXLogRecPtr; + XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr; + + /* + * Skip the verification if we don't reach the LSN at which we start + * decoding the contents of transactions yet because until we reach the + * LSN, we could have transactions that don't have the association between + * the top-level transaction and subtransaction yet and consequently have + * the same LSN. We don't guarantee this association until we try to + * decode the actual contents of transaction. The ordering of the records + * prior to the start_decoding_at LSN should have been checked before the + * restart. + */ + if (SnapBuildXactNeedsSkip(ctx->snapshot_builder, ctx->reader->EndRecPtr)) + return; + + dlist_foreach(iter, &rb->toplevel_by_lsn) + { + ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, node, + iter.cur); + + /* start LSN must be set */ + Assert(cur_txn->first_lsn != InvalidXLogRecPtr); + + /* If there is an end LSN, it must be higher than start LSN */ + if (cur_txn->end_lsn != InvalidXLogRecPtr) + Assert(cur_txn->first_lsn <= cur_txn->end_lsn); + + /* Current initial LSN must be strictly higher than previous */ + if (prev_first_lsn != InvalidXLogRecPtr) + Assert(prev_first_lsn < cur_txn->first_lsn); + + /* known-as-subtxn txns must not be listed */ + Assert(!rbtxn_is_known_subxact(cur_txn)); + + prev_first_lsn = cur_txn->first_lsn; + } + + dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn) + { + ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, + base_snapshot_node, + iter.cur); + + /* base snapshot (and its LSN) must be set */ + Assert(cur_txn->base_snapshot != NULL); + Assert(cur_txn->base_snapshot_lsn != InvalidXLogRecPtr); + + /* current LSN must be strictly higher than previous */ + if (prev_base_snap_lsn != InvalidXLogRecPtr) + Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn); + + /* known-as-subtxn txns must not be listed */ + Assert(!rbtxn_is_known_subxact(cur_txn)); + + prev_base_snap_lsn = cur_txn->base_snapshot_lsn; + } +#endif +} + +/* + * AssertChangeLsnOrder + * + * Check ordering of changes in the (sub)transaction. + */ +static void +AssertChangeLsnOrder(ReorderBufferTXN *txn) +{ +#ifdef USE_ASSERT_CHECKING + dlist_iter iter; + XLogRecPtr prev_lsn = txn->first_lsn; + + dlist_foreach(iter, &txn->changes) + { + ReorderBufferChange *cur_change; + + cur_change = dlist_container(ReorderBufferChange, node, iter.cur); + + Assert(txn->first_lsn != InvalidXLogRecPtr); + Assert(cur_change->lsn != InvalidXLogRecPtr); + Assert(txn->first_lsn <= cur_change->lsn); + + if (txn->end_lsn != InvalidXLogRecPtr) + Assert(cur_change->lsn <= txn->end_lsn); + + Assert(prev_lsn <= cur_change->lsn); + + prev_lsn = cur_change->lsn; + } +#endif +} + +/* + * ReorderBufferGetOldestTXN + * Return oldest transaction in reorderbuffer + */ +ReorderBufferTXN * +ReorderBufferGetOldestTXN(ReorderBuffer *rb) +{ + ReorderBufferTXN *txn; + + AssertTXNLsnOrder(rb); + + if (dlist_is_empty(&rb->toplevel_by_lsn)) + return NULL; + + txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn); + + Assert(!rbtxn_is_known_subxact(txn)); + Assert(txn->first_lsn != InvalidXLogRecPtr); + return txn; +} + +/* + * ReorderBufferGetOldestXmin + * Return oldest Xmin in reorderbuffer + * + * Returns oldest possibly running Xid from the point of view of snapshots + * used in the transactions kept by reorderbuffer, or InvalidTransactionId if + * there are none. + * + * Since snapshots are assigned monotonically, this equals the Xmin of the + * base snapshot with minimal base_snapshot_lsn. + */ +TransactionId +ReorderBufferGetOldestXmin(ReorderBuffer *rb) +{ + ReorderBufferTXN *txn; + + AssertTXNLsnOrder(rb); + + if (dlist_is_empty(&rb->txns_by_base_snapshot_lsn)) + return InvalidTransactionId; + + txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node, + &rb->txns_by_base_snapshot_lsn); + return txn->base_snapshot->xmin; +} + +void +ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr) +{ + rb->current_restart_decoding_lsn = ptr; +} + +/* + * ReorderBufferAssignChild + * + * Make note that we know that subxid is a subtransaction of xid, seen as of + * the given lsn. + */ +void +ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid, + TransactionId subxid, XLogRecPtr lsn) +{ + ReorderBufferTXN *txn; + ReorderBufferTXN *subtxn; + bool new_top; + bool new_sub; + + txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true); + subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false); + + if (!new_sub) + { + if (rbtxn_is_known_subxact(subtxn)) + { + /* already associated, nothing to do */ + return; + } + else + { + /* + * We already saw this transaction, but initially added it to the + * list of top-level txns. Now that we know it's not top-level, + * remove it from there. + */ + dlist_delete(&subtxn->node); + } + } + + subtxn->txn_flags |= RBTXN_IS_SUBXACT; + subtxn->toplevel_xid = xid; + Assert(subtxn->nsubtxns == 0); + + /* set the reference to top-level transaction */ + subtxn->toptxn = txn; + + /* add to subtransaction list */ + dlist_push_tail(&txn->subtxns, &subtxn->node); + txn->nsubtxns++; + + /* Possibly transfer the subtxn's snapshot to its top-level txn. */ + ReorderBufferTransferSnapToParent(txn, subtxn); + + /* Verify LSN-ordering invariant */ + AssertTXNLsnOrder(rb); +} + +/* + * ReorderBufferTransferSnapToParent + * Transfer base snapshot from subtxn to top-level txn, if needed + * + * This is done if the top-level txn doesn't have a base snapshot, or if the + * subtxn's base snapshot has an earlier LSN than the top-level txn's base + * snapshot's LSN. This can happen if there are no changes in the toplevel + * txn but there are some in the subtxn, or the first change in subtxn has + * earlier LSN than first change in the top-level txn and we learned about + * their kinship only now. + * + * The subtransaction's snapshot is cleared regardless of the transfer + * happening, since it's not needed anymore in either case. + * + * We do this as soon as we become aware of their kinship, to avoid queueing + * extra snapshots to txns known-as-subtxns -- only top-level txns will + * receive further snapshots. + */ +static void +ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn, + ReorderBufferTXN *subtxn) +{ + Assert(subtxn->toplevel_xid == txn->xid); + + if (subtxn->base_snapshot != NULL) + { + if (txn->base_snapshot == NULL || + subtxn->base_snapshot_lsn < txn->base_snapshot_lsn) + { + /* + * If the toplevel transaction already has a base snapshot but + * it's newer than the subxact's, purge it. + */ + if (txn->base_snapshot != NULL) + { + SnapBuildSnapDecRefcount(txn->base_snapshot); + dlist_delete(&txn->base_snapshot_node); + } + + /* + * The snapshot is now the top transaction's; transfer it, and + * adjust the list position of the top transaction in the list by + * moving it to where the subtransaction is. + */ + txn->base_snapshot = subtxn->base_snapshot; + txn->base_snapshot_lsn = subtxn->base_snapshot_lsn; + dlist_insert_before(&subtxn->base_snapshot_node, + &txn->base_snapshot_node); + + /* + * The subtransaction doesn't have a snapshot anymore (so it + * mustn't be in the list.) + */ + subtxn->base_snapshot = NULL; + subtxn->base_snapshot_lsn = InvalidXLogRecPtr; + dlist_delete(&subtxn->base_snapshot_node); + } + else + { + /* Base snap of toplevel is fine, so subxact's is not needed */ + SnapBuildSnapDecRefcount(subtxn->base_snapshot); + dlist_delete(&subtxn->base_snapshot_node); + subtxn->base_snapshot = NULL; + subtxn->base_snapshot_lsn = InvalidXLogRecPtr; + } + } +} + +/* + * Associate a subtransaction with its toplevel transaction at commit + * time. There may be no further changes added after this. + */ +void +ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid, + TransactionId subxid, XLogRecPtr commit_lsn, + XLogRecPtr end_lsn) +{ + ReorderBufferTXN *subtxn; + + subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL, + InvalidXLogRecPtr, false); + + /* + * No need to do anything if that subtxn didn't contain any changes + */ + if (!subtxn) + return; + + subtxn->final_lsn = commit_lsn; + subtxn->end_lsn = end_lsn; + + /* + * Assign this subxact as a child of the toplevel xact (no-op if already + * done.) + */ + ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr); +} + + +/* + * Support for efficiently iterating over a transaction's and its + * subtransactions' changes. + * + * We do by doing a k-way merge between transactions/subtransactions. For that + * we model the current heads of the different transactions as a binary heap + * so we easily know which (sub-)transaction has the change with the smallest + * lsn next. + * + * We assume the changes in individual transactions are already sorted by LSN. + */ + +/* + * Binary heap comparison function. + */ +static int +ReorderBufferIterCompare(Datum a, Datum b, void *arg) +{ + ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg; + XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn; + XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn; + + if (pos_a < pos_b) + return 1; + else if (pos_a == pos_b) + return 0; + return -1; +} + +/* + * Allocate & initialize an iterator which iterates in lsn order over a + * transaction and all its subtransactions. + * + * Note: The iterator state is returned through iter_state parameter rather + * than the function's return value. This is because the state gets cleaned up + * in a PG_CATCH block in the caller, so we want to make sure the caller gets + * back the state even if this function throws an exception. + */ +static void +ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn, + ReorderBufferIterTXNState *volatile *iter_state) +{ + Size nr_txns = 0; + ReorderBufferIterTXNState *state; + dlist_iter cur_txn_i; + int32 off; + + *iter_state = NULL; + + /* Check ordering of changes in the toplevel transaction. */ + AssertChangeLsnOrder(txn); + + /* + * Calculate the size of our heap: one element for every transaction that + * contains changes. (Besides the transactions already in the reorder + * buffer, we count the one we were directly passed.) + */ + if (txn->nentries > 0) + nr_txns++; + + dlist_foreach(cur_txn_i, &txn->subtxns) + { + ReorderBufferTXN *cur_txn; + + cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur); + + /* Check ordering of changes in this subtransaction. */ + AssertChangeLsnOrder(cur_txn); + + if (cur_txn->nentries > 0) + nr_txns++; + } + + /* allocate iteration state */ + state = (ReorderBufferIterTXNState *) + MemoryContextAllocZero(rb->context, + sizeof(ReorderBufferIterTXNState) + + sizeof(ReorderBufferIterTXNEntry) * nr_txns); + + state->nr_txns = nr_txns; + dlist_init(&state->old_change); + + for (off = 0; off < state->nr_txns; off++) + { + state->entries[off].file.vfd = -1; + state->entries[off].segno = 0; + } + + /* allocate heap */ + state->heap = binaryheap_allocate(state->nr_txns, + ReorderBufferIterCompare, + state); + + /* Now that the state fields are initialized, it is safe to return it. */ + *iter_state = state; + + /* + * Now insert items into the binary heap, in an unordered fashion. (We + * will run a heap assembly step at the end; this is more efficient.) + */ + + off = 0; + + /* add toplevel transaction if it contains changes */ + if (txn->nentries > 0) + { + ReorderBufferChange *cur_change; + + if (rbtxn_is_serialized(txn)) + { + /* serialize remaining changes */ + ReorderBufferSerializeTXN(rb, txn); + ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file, + &state->entries[off].segno); + } + + cur_change = dlist_head_element(ReorderBufferChange, node, + &txn->changes); + + state->entries[off].lsn = cur_change->lsn; + state->entries[off].change = cur_change; + state->entries[off].txn = txn; + + binaryheap_add_unordered(state->heap, Int32GetDatum(off++)); + } + + /* add subtransactions if they contain changes */ + dlist_foreach(cur_txn_i, &txn->subtxns) + { + ReorderBufferTXN *cur_txn; + + cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur); + + if (cur_txn->nentries > 0) + { + ReorderBufferChange *cur_change; + + if (rbtxn_is_serialized(cur_txn)) + { + /* serialize remaining changes */ + ReorderBufferSerializeTXN(rb, cur_txn); + ReorderBufferRestoreChanges(rb, cur_txn, + &state->entries[off].file, + &state->entries[off].segno); + } + cur_change = dlist_head_element(ReorderBufferChange, node, + &cur_txn->changes); + + state->entries[off].lsn = cur_change->lsn; + state->entries[off].change = cur_change; + state->entries[off].txn = cur_txn; + + binaryheap_add_unordered(state->heap, Int32GetDatum(off++)); + } + } + + /* assemble a valid binary heap */ + binaryheap_build(state->heap); +} + +/* + * Return the next change when iterating over a transaction and its + * subtransactions. + * + * Returns NULL when no further changes exist. + */ +static ReorderBufferChange * +ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state) +{ + ReorderBufferChange *change; + ReorderBufferIterTXNEntry *entry; + int32 off; + + /* nothing there anymore */ + if (state->heap->bh_size == 0) + return NULL; + + off = DatumGetInt32(binaryheap_first(state->heap)); + entry = &state->entries[off]; + + /* free memory we might have "leaked" in the previous *Next call */ + if (!dlist_is_empty(&state->old_change)) + { + change = dlist_container(ReorderBufferChange, node, + dlist_pop_head_node(&state->old_change)); + ReorderBufferReturnChange(rb, change, true); + Assert(dlist_is_empty(&state->old_change)); + } + + change = entry->change; + + /* + * update heap with information about which transaction has the next + * relevant change in LSN order + */ + + /* there are in-memory changes */ + if (dlist_has_next(&entry->txn->changes, &entry->change->node)) + { + dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node); + ReorderBufferChange *next_change = + dlist_container(ReorderBufferChange, node, next); + + /* txn stays the same */ + state->entries[off].lsn = next_change->lsn; + state->entries[off].change = next_change; + + binaryheap_replace_first(state->heap, Int32GetDatum(off)); + return change; + } + + /* try to load changes from disk */ + if (entry->txn->nentries != entry->txn->nentries_mem) + { + /* + * Ugly: restoring changes will reuse *Change records, thus delete the + * current one from the per-tx list and only free in the next call. + */ + dlist_delete(&change->node); + dlist_push_tail(&state->old_change, &change->node); + + /* + * Update the total bytes processed by the txn for which we are + * releasing the current set of changes and restoring the new set of + * changes. + */ + rb->totalBytes += entry->txn->size; + if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file, + &state->entries[off].segno)) + { + /* successfully restored changes from disk */ + ReorderBufferChange *next_change = + dlist_head_element(ReorderBufferChange, node, + &entry->txn->changes); + + elog(DEBUG2, "restored %u/%u changes from disk", + (uint32) entry->txn->nentries_mem, + (uint32) entry->txn->nentries); + + Assert(entry->txn->nentries_mem); + /* txn stays the same */ + state->entries[off].lsn = next_change->lsn; + state->entries[off].change = next_change; + binaryheap_replace_first(state->heap, Int32GetDatum(off)); + + return change; + } + } + + /* ok, no changes there anymore, remove */ + binaryheap_remove_first(state->heap); + + return change; +} + +/* + * Deallocate the iterator + */ +static void +ReorderBufferIterTXNFinish(ReorderBuffer *rb, + ReorderBufferIterTXNState *state) +{ + int32 off; + + for (off = 0; off < state->nr_txns; off++) + { + if (state->entries[off].file.vfd != -1) + FileClose(state->entries[off].file.vfd); + } + + /* free memory we might have "leaked" in the last *Next call */ + if (!dlist_is_empty(&state->old_change)) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, + dlist_pop_head_node(&state->old_change)); + ReorderBufferReturnChange(rb, change, true); + Assert(dlist_is_empty(&state->old_change)); + } + + binaryheap_free(state->heap); + pfree(state); +} + +/* + * Cleanup the contents of a transaction, usually after the transaction + * committed or aborted. + */ +static void +ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + bool found; + dlist_mutable_iter iter; + + /* cleanup subtransactions & their changes */ + dlist_foreach_modify(iter, &txn->subtxns) + { + ReorderBufferTXN *subtxn; + + subtxn = dlist_container(ReorderBufferTXN, node, iter.cur); + + /* + * Subtransactions are always associated to the toplevel TXN, even if + * they originally were happening inside another subtxn, so we won't + * ever recurse more than one level deep here. + */ + Assert(rbtxn_is_known_subxact(subtxn)); + Assert(subtxn->nsubtxns == 0); + + ReorderBufferCleanupTXN(rb, subtxn); + } + + /* cleanup changes in the txn */ + dlist_foreach_modify(iter, &txn->changes) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + /* Check we're not mixing changes from different transactions. */ + Assert(change->txn == txn); + + ReorderBufferReturnChange(rb, change, true); + } + + /* + * Cleanup the tuplecids we stored for decoding catalog snapshot access. + * They are always stored in the toplevel transaction. + */ + dlist_foreach_modify(iter, &txn->tuplecids) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + /* Check we're not mixing changes from different transactions. */ + Assert(change->txn == txn); + Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID); + + ReorderBufferReturnChange(rb, change, true); + } + + /* + * Cleanup the base snapshot, if set. + */ + if (txn->base_snapshot != NULL) + { + SnapBuildSnapDecRefcount(txn->base_snapshot); + dlist_delete(&txn->base_snapshot_node); + } + + /* + * Cleanup the snapshot for the last streamed run. + */ + if (txn->snapshot_now != NULL) + { + Assert(rbtxn_is_streamed(txn)); + ReorderBufferFreeSnap(rb, txn->snapshot_now); + } + + /* + * Remove TXN from its containing lists. + * + * Note: if txn is known as subxact, we are deleting the TXN from its + * parent's list of known subxacts; this leaves the parent's nsubxacts + * count too high, but we don't care. Otherwise, we are deleting the TXN + * from the LSN-ordered list of toplevel TXNs. We remove the TXN from the + * list of catalog modifying transactions as well. + */ + dlist_delete(&txn->node); + if (rbtxn_has_catalog_changes(txn)) + dclist_delete_from(&rb->catchange_txns, &txn->catchange_node); + + /* now remove reference from buffer */ + hash_search(rb->by_txn, &txn->xid, HASH_REMOVE, &found); + Assert(found); + + /* remove entries spilled to disk */ + if (rbtxn_is_serialized(txn)) + ReorderBufferRestoreCleanup(rb, txn); + + /* deallocate */ + ReorderBufferReturnTXN(rb, txn); +} + +/* + * Discard changes from a transaction (and subtransactions), either after + * streaming or decoding them at PREPARE. Keep the remaining info - + * transactions, tuplecids, invalidations and snapshots. + * + * We additionally remove tuplecids after decoding the transaction at prepare + * time as we only need to perform invalidation at rollback or commit prepared. + * + * 'txn_prepared' indicates that we have decoded the transaction at prepare + * time. + */ +static void +ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared) +{ + dlist_mutable_iter iter; + + /* cleanup subtransactions & their changes */ + dlist_foreach_modify(iter, &txn->subtxns) + { + ReorderBufferTXN *subtxn; + + subtxn = dlist_container(ReorderBufferTXN, node, iter.cur); + + /* + * Subtransactions are always associated to the toplevel TXN, even if + * they originally were happening inside another subtxn, so we won't + * ever recurse more than one level deep here. + */ + Assert(rbtxn_is_known_subxact(subtxn)); + Assert(subtxn->nsubtxns == 0); + + ReorderBufferTruncateTXN(rb, subtxn, txn_prepared); + } + + /* cleanup changes in the txn */ + dlist_foreach_modify(iter, &txn->changes) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + /* Check we're not mixing changes from different transactions. */ + Assert(change->txn == txn); + + /* remove the change from it's containing list */ + dlist_delete(&change->node); + + ReorderBufferReturnChange(rb, change, true); + } + + /* + * Mark the transaction as streamed. + * + * The top-level transaction, is marked as streamed always, even if it + * does not contain any changes (that is, when all the changes are in + * subtransactions). + * + * For subtransactions, we only mark them as streamed when there are + * changes in them. + * + * We do it this way because of aborts - we don't want to send aborts for + * XIDs the downstream is not aware of. And of course, it always knows + * about the toplevel xact (we send the XID in all messages), but we never + * stream XIDs of empty subxacts. + */ + if ((!txn_prepared) && (rbtxn_is_toptxn(txn) || (txn->nentries_mem != 0))) + txn->txn_flags |= RBTXN_IS_STREAMED; + + if (txn_prepared) + { + /* + * If this is a prepared txn, cleanup the tuplecids we stored for + * decoding catalog snapshot access. They are always stored in the + * toplevel transaction. + */ + dlist_foreach_modify(iter, &txn->tuplecids) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + /* Check we're not mixing changes from different transactions. */ + Assert(change->txn == txn); + Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID); + + /* Remove the change from its containing list. */ + dlist_delete(&change->node); + + ReorderBufferReturnChange(rb, change, true); + } + } + + /* + * Destroy the (relfilelocator, ctid) hashtable, so that we don't leak any + * memory. We could also keep the hash table and update it with new ctid + * values, but this seems simpler and good enough for now. + */ + if (txn->tuplecid_hash != NULL) + { + hash_destroy(txn->tuplecid_hash); + txn->tuplecid_hash = NULL; + } + + /* If this txn is serialized then clean the disk space. */ + if (rbtxn_is_serialized(txn)) + { + ReorderBufferRestoreCleanup(rb, txn); + txn->txn_flags &= ~RBTXN_IS_SERIALIZED; + + /* + * We set this flag to indicate if the transaction is ever serialized. + * We need this to accurately update the stats as otherwise the same + * transaction can be counted as serialized multiple times. + */ + txn->txn_flags |= RBTXN_IS_SERIALIZED_CLEAR; + } + + /* also reset the number of entries in the transaction */ + txn->nentries_mem = 0; + txn->nentries = 0; +} + +/* + * Build a hash with a (relfilelocator, ctid) -> (cmin, cmax) mapping for use by + * HeapTupleSatisfiesHistoricMVCC. + */ +static void +ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + dlist_iter iter; + HASHCTL hash_ctl; + + if (!rbtxn_has_catalog_changes(txn) || dlist_is_empty(&txn->tuplecids)) + return; + + hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey); + hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt); + hash_ctl.hcxt = rb->context; + + /* + * create the hash with the exact number of to-be-stored tuplecids from + * the start + */ + txn->tuplecid_hash = + hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + dlist_foreach(iter, &txn->tuplecids) + { + ReorderBufferTupleCidKey key; + ReorderBufferTupleCidEnt *ent; + bool found; + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID); + + /* be careful about padding */ + memset(&key, 0, sizeof(ReorderBufferTupleCidKey)); + + key.rlocator = change->data.tuplecid.locator; + + ItemPointerCopy(&change->data.tuplecid.tid, + &key.tid); + + ent = (ReorderBufferTupleCidEnt *) + hash_search(txn->tuplecid_hash, &key, HASH_ENTER, &found); + if (!found) + { + ent->cmin = change->data.tuplecid.cmin; + ent->cmax = change->data.tuplecid.cmax; + ent->combocid = change->data.tuplecid.combocid; + } + else + { + /* + * Maybe we already saw this tuple before in this transaction, but + * if so it must have the same cmin. + */ + Assert(ent->cmin == change->data.tuplecid.cmin); + + /* + * cmax may be initially invalid, but once set it can only grow, + * and never become invalid again. + */ + Assert((ent->cmax == InvalidCommandId) || + ((change->data.tuplecid.cmax != InvalidCommandId) && + (change->data.tuplecid.cmax > ent->cmax))); + ent->cmax = change->data.tuplecid.cmax; + } + } +} + +/* + * Copy a provided snapshot so we can modify it privately. This is needed so + * that catalog modifying transactions can look into intermediate catalog + * states. + */ +static Snapshot +ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap, + ReorderBufferTXN *txn, CommandId cid) +{ + Snapshot snap; + dlist_iter iter; + int i = 0; + Size size; + + size = sizeof(SnapshotData) + + sizeof(TransactionId) * orig_snap->xcnt + + sizeof(TransactionId) * (txn->nsubtxns + 1); + + snap = MemoryContextAllocZero(rb->context, size); + memcpy(snap, orig_snap, sizeof(SnapshotData)); + + snap->copied = true; + snap->active_count = 1; /* mark as active so nobody frees it */ + snap->regd_count = 0; + snap->xip = (TransactionId *) (snap + 1); + + memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt); + + /* + * snap->subxip contains all txids that belong to our transaction which we + * need to check via cmin/cmax. That's why we store the toplevel + * transaction in there as well. + */ + snap->subxip = snap->xip + snap->xcnt; + snap->subxip[i++] = txn->xid; + + /* + * subxcnt isn't decreased when subtransactions abort, so count manually. + * Since it's an upper boundary it is safe to use it for the allocation + * above. + */ + snap->subxcnt = 1; + + dlist_foreach(iter, &txn->subtxns) + { + ReorderBufferTXN *sub_txn; + + sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur); + snap->subxip[i++] = sub_txn->xid; + snap->subxcnt++; + } + + /* sort so we can bsearch() later */ + qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator); + + /* store the specified current CommandId */ + snap->curcid = cid; + + return snap; +} + +/* + * Free a previously ReorderBufferCopySnap'ed snapshot + */ +static void +ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap) +{ + if (snap->copied) + pfree(snap); + else + SnapBuildSnapDecRefcount(snap); +} + +/* + * If the transaction was (partially) streamed, we need to prepare or commit + * it in a 'streamed' way. That is, we first stream the remaining part of the + * transaction, and then invoke stream_prepare or stream_commit message as per + * the case. + */ +static void +ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + /* we should only call this for previously streamed transactions */ + Assert(rbtxn_is_streamed(txn)); + + ReorderBufferStreamTXN(rb, txn); + + if (rbtxn_prepared(txn)) + { + /* + * Note, we send stream prepare even if a concurrent abort is + * detected. See DecodePrepare for more information. + */ + rb->stream_prepare(rb, txn, txn->final_lsn); + + /* + * This is a PREPARED transaction, part of a two-phase commit. The + * full cleanup will happen as part of the COMMIT PREPAREDs, so now + * just truncate txn by removing changes and tuplecids. + */ + ReorderBufferTruncateTXN(rb, txn, true); + /* Reset the CheckXidAlive */ + CheckXidAlive = InvalidTransactionId; + } + else + { + rb->stream_commit(rb, txn, txn->final_lsn); + ReorderBufferCleanupTXN(rb, txn); + } +} + +/* + * Set xid to detect concurrent aborts. + * + * While streaming an in-progress transaction or decoding a prepared + * transaction there is a possibility that the (sub)transaction might get + * aborted concurrently. In such case if the (sub)transaction has catalog + * update then we might decode the tuple using wrong catalog version. For + * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0). Now, + * the transaction 501 updates the catalog tuple and after that we will have + * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0). Now, if 501 is + * aborted and some other transaction say 502 updates the same catalog tuple + * then the first tuple will be changed to (xmin: 500, xmax: 502). So, the + * problem is that when we try to decode the tuple inserted/updated in 501 + * after the catalog update, we will see the catalog tuple with (xmin: 500, + * xmax: 502) as visible because it will consider that the tuple is deleted by + * xid 502 which is not visible to our snapshot. And when we will try to + * decode with that catalog tuple, it can lead to a wrong result or a crash. + * So, it is necessary to detect concurrent aborts to allow streaming of + * in-progress transactions or decoding of prepared transactions. + * + * For detecting the concurrent abort we set CheckXidAlive to the current + * (sub)transaction's xid for which this change belongs to. And, during + * catalog scan we can check the status of the xid and if it is aborted we will + * report a specific error so that we can stop streaming current transaction + * and discard the already streamed changes on such an error. We might have + * already streamed some of the changes for the aborted (sub)transaction, but + * that is fine because when we decode the abort we will stream abort message + * to truncate the changes in the subscriber. Similarly, for prepared + * transactions, we stop decoding if concurrent abort is detected and then + * rollback the changes when rollback prepared is encountered. See + * DecodePrepare. + */ +static inline void +SetupCheckXidLive(TransactionId xid) +{ + /* + * If the input transaction id is already set as a CheckXidAlive then + * nothing to do. + */ + if (TransactionIdEquals(CheckXidAlive, xid)) + return; + + /* + * setup CheckXidAlive if it's not committed yet. We don't check if the + * xid is aborted. That will happen during catalog access. + */ + if (!TransactionIdDidCommit(xid)) + CheckXidAlive = xid; + else + CheckXidAlive = InvalidTransactionId; +} + +/* + * Helper function for ReorderBufferProcessTXN for applying change. + */ +static inline void +ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change, + bool streaming) +{ + if (streaming) + rb->stream_change(rb, txn, relation, change); + else + rb->apply_change(rb, txn, relation, change); +} + +/* + * Helper function for ReorderBufferProcessTXN for applying the truncate. + */ +static inline void +ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn, + int nrelations, Relation *relations, + ReorderBufferChange *change, bool streaming) +{ + if (streaming) + rb->stream_truncate(rb, txn, nrelations, relations, change); + else + rb->apply_truncate(rb, txn, nrelations, relations, change); +} + +/* + * Helper function for ReorderBufferProcessTXN for applying the message. + */ +static inline void +ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn, + ReorderBufferChange *change, bool streaming) +{ + if (streaming) + rb->stream_message(rb, txn, change->lsn, true, + change->data.msg.prefix, + change->data.msg.message_size, + change->data.msg.message); + else + rb->message(rb, txn, change->lsn, true, + change->data.msg.prefix, + change->data.msg.message_size, + change->data.msg.message); +} + +/* + * Function to store the command id and snapshot at the end of the current + * stream so that we can reuse the same while sending the next stream. + */ +static inline void +ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn, + Snapshot snapshot_now, CommandId command_id) +{ + txn->command_id = command_id; + + /* Avoid copying if it's already copied. */ + if (snapshot_now->copied) + txn->snapshot_now = snapshot_now; + else + txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now, + txn, command_id); +} + +/* + * Helper function for ReorderBufferProcessTXN to handle the concurrent + * abort of the streaming transaction. This resets the TXN such that it + * can be used to stream the remaining data of transaction being processed. + * This can happen when the subtransaction is aborted and we still want to + * continue processing the main or other subtransactions data. + */ +static void +ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, + Snapshot snapshot_now, + CommandId command_id, + XLogRecPtr last_lsn, + ReorderBufferChange *specinsert) +{ + /* Discard the changes that we just streamed */ + ReorderBufferTruncateTXN(rb, txn, rbtxn_prepared(txn)); + + /* Free all resources allocated for toast reconstruction */ + ReorderBufferToastReset(rb, txn); + + /* Return the spec insert change if it is not NULL */ + if (specinsert != NULL) + { + ReorderBufferReturnChange(rb, specinsert, true); + specinsert = NULL; + } + + /* + * For the streaming case, stop the stream and remember the command ID and + * snapshot for the streaming run. + */ + if (rbtxn_is_streamed(txn)) + { + rb->stream_stop(rb, txn, last_lsn); + ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id); + } +} + +/* + * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN. + * + * Send data of a transaction (and its subtransactions) to the + * output plugin. We iterate over the top and subtransactions (using a k-way + * merge) and replay the changes in lsn order. + * + * If streaming is true then data will be sent using stream API. + * + * Note: "volatile" markers on some parameters are to avoid trouble with + * PG_TRY inside the function. + */ +static void +ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn, + volatile Snapshot snapshot_now, + volatile CommandId command_id, + bool streaming) +{ + bool using_subtxn; + MemoryContext ccxt = CurrentMemoryContext; + ReorderBufferIterTXNState *volatile iterstate = NULL; + volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr; + ReorderBufferChange *volatile specinsert = NULL; + volatile bool stream_started = false; + ReorderBufferTXN *volatile curtxn = NULL; + + /* build data to be able to lookup the CommandIds of catalog tuples */ + ReorderBufferBuildTupleCidHash(rb, txn); + + /* setup the initial snapshot */ + SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash); + + /* + * Decoding needs access to syscaches et al., which in turn use + * heavyweight locks and such. Thus we need to have enough state around to + * keep track of those. The easiest way is to simply use a transaction + * internally. That also allows us to easily enforce that nothing writes + * to the database by checking for xid assignments. + * + * When we're called via the SQL SRF there's already a transaction + * started, so start an explicit subtransaction there. + */ + using_subtxn = IsTransactionOrTransactionBlock(); + + PG_TRY(); + { + ReorderBufferChange *change; + int changes_count = 0; /* used to accumulate the number of + * changes */ + + if (using_subtxn) + BeginInternalSubTransaction(streaming ? "stream" : "replay"); + else + StartTransactionCommand(); + + /* + * We only need to send begin/begin-prepare for non-streamed + * transactions. + */ + if (!streaming) + { + if (rbtxn_prepared(txn)) + rb->begin_prepare(rb, txn); + else + rb->begin(rb, txn); + } + + ReorderBufferIterTXNInit(rb, txn, &iterstate); + while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL) + { + Relation relation = NULL; + Oid reloid; + + CHECK_FOR_INTERRUPTS(); + + /* + * We can't call start stream callback before processing first + * change. + */ + if (prev_lsn == InvalidXLogRecPtr) + { + if (streaming) + { + txn->origin_id = change->origin_id; + rb->stream_start(rb, txn, change->lsn); + stream_started = true; + } + } + + /* + * Enforce correct ordering of changes, merged from multiple + * subtransactions. The changes may have the same LSN due to + * MULTI_INSERT xlog records. + */ + Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn); + + prev_lsn = change->lsn; + + /* + * Set the current xid to detect concurrent aborts. This is + * required for the cases when we decode the changes before the + * COMMIT record is processed. + */ + if (streaming || rbtxn_prepared(change->txn)) + { + curtxn = change->txn; + SetupCheckXidLive(curtxn->xid); + } + + switch (change->action) + { + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM: + + /* + * Confirmation for speculative insertion arrived. Simply + * use as a normal record. It'll be cleaned up at the end + * of INSERT processing. + */ + if (specinsert == NULL) + elog(ERROR, "invalid ordering of speculative insertion changes"); + Assert(specinsert->data.tp.oldtuple == NULL); + change = specinsert; + change->action = REORDER_BUFFER_CHANGE_INSERT; + + /* intentionally fall through */ + case REORDER_BUFFER_CHANGE_INSERT: + case REORDER_BUFFER_CHANGE_UPDATE: + case REORDER_BUFFER_CHANGE_DELETE: + Assert(snapshot_now); + + reloid = RelidByRelfilenumber(change->data.tp.rlocator.spcOid, + change->data.tp.rlocator.relNumber); + + /* + * Mapped catalog tuple without data, emitted while + * catalog table was in the process of being rewritten. We + * can fail to look up the relfilenumber, because the + * relmapper has no "historic" view, in contrast to the + * normal catalog during decoding. Thus repeated rewrites + * can cause a lookup failure. That's OK because we do not + * decode catalog changes anyway. Normally such tuples + * would be skipped over below, but we can't identify + * whether the table should be logically logged without + * mapping the relfilenumber to the oid. + */ + if (reloid == InvalidOid && + change->data.tp.newtuple == NULL && + change->data.tp.oldtuple == NULL) + goto change_done; + else if (reloid == InvalidOid) + elog(ERROR, "could not map filenumber \"%s\" to relation OID", + relpathperm(change->data.tp.rlocator, + MAIN_FORKNUM)); + + relation = RelationIdGetRelation(reloid); + + if (!RelationIsValid(relation)) + elog(ERROR, "could not open relation with OID %u (for filenumber \"%s\")", + reloid, + relpathperm(change->data.tp.rlocator, + MAIN_FORKNUM)); + + if (!RelationIsLogicallyLogged(relation)) + goto change_done; + + /* + * Ignore temporary heaps created during DDL unless the + * plugin has asked for them. + */ + if (relation->rd_rel->relrewrite && !rb->output_rewrites) + goto change_done; + + /* + * For now ignore sequence changes entirely. Most of the + * time they don't log changes using records we + * understand, so it doesn't make sense to handle the few + * cases we do. + */ + if (relation->rd_rel->relkind == RELKIND_SEQUENCE) + goto change_done; + + /* user-triggered change */ + if (!IsToastRelation(relation)) + { + ReorderBufferToastReplace(rb, txn, relation, change); + ReorderBufferApplyChange(rb, txn, relation, change, + streaming); + + /* + * Only clear reassembled toast chunks if we're sure + * they're not required anymore. The creator of the + * tuple tells us. + */ + if (change->data.tp.clear_toast_afterwards) + ReorderBufferToastReset(rb, txn); + } + /* we're not interested in toast deletions */ + else if (change->action == REORDER_BUFFER_CHANGE_INSERT) + { + /* + * Need to reassemble the full toasted Datum in + * memory, to ensure the chunks don't get reused till + * we're done remove it from the list of this + * transaction's changes. Otherwise it will get + * freed/reused while restoring spooled data from + * disk. + */ + Assert(change->data.tp.newtuple != NULL); + + dlist_delete(&change->node); + ReorderBufferToastAppendChunk(rb, txn, relation, + change); + } + + change_done: + + /* + * If speculative insertion was confirmed, the record + * isn't needed anymore. + */ + if (specinsert != NULL) + { + ReorderBufferReturnChange(rb, specinsert, true); + specinsert = NULL; + } + + if (RelationIsValid(relation)) + { + RelationClose(relation); + relation = NULL; + } + break; + + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT: + + /* + * Speculative insertions are dealt with by delaying the + * processing of the insert until the confirmation record + * arrives. For that we simply unlink the record from the + * chain, so it does not get freed/reused while restoring + * spooled data from disk. + * + * This is safe in the face of concurrent catalog changes + * because the relevant relation can't be changed between + * speculative insertion and confirmation due to + * CheckTableNotInUse() and locking. + */ + + /* clear out a pending (and thus failed) speculation */ + if (specinsert != NULL) + { + ReorderBufferReturnChange(rb, specinsert, true); + specinsert = NULL; + } + + /* and memorize the pending insertion */ + dlist_delete(&change->node); + specinsert = change; + break; + + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT: + + /* + * Abort for speculative insertion arrived. So cleanup the + * specinsert tuple and toast hash. + * + * Note that we get the spec abort change for each toast + * entry but we need to perform the cleanup only the first + * time we get it for the main table. + */ + if (specinsert != NULL) + { + /* + * We must clean the toast hash before processing a + * completely new tuple to avoid confusion about the + * previous tuple's toast chunks. + */ + Assert(change->data.tp.clear_toast_afterwards); + ReorderBufferToastReset(rb, txn); + + /* We don't need this record anymore. */ + ReorderBufferReturnChange(rb, specinsert, true); + specinsert = NULL; + } + break; + + case REORDER_BUFFER_CHANGE_TRUNCATE: + { + int i; + int nrelids = change->data.truncate.nrelids; + int nrelations = 0; + Relation *relations; + + relations = palloc0(nrelids * sizeof(Relation)); + for (i = 0; i < nrelids; i++) + { + Oid relid = change->data.truncate.relids[i]; + Relation rel; + + rel = RelationIdGetRelation(relid); + + if (!RelationIsValid(rel)) + elog(ERROR, "could not open relation with OID %u", relid); + + if (!RelationIsLogicallyLogged(rel)) + continue; + + relations[nrelations++] = rel; + } + + /* Apply the truncate. */ + ReorderBufferApplyTruncate(rb, txn, nrelations, + relations, change, + streaming); + + for (i = 0; i < nrelations; i++) + RelationClose(relations[i]); + + break; + } + + case REORDER_BUFFER_CHANGE_MESSAGE: + ReorderBufferApplyMessage(rb, txn, change, streaming); + break; + + case REORDER_BUFFER_CHANGE_INVALIDATION: + /* Execute the invalidation messages locally */ + ReorderBufferExecuteInvalidations(change->data.inval.ninvalidations, + change->data.inval.invalidations); + break; + + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + /* get rid of the old */ + TeardownHistoricSnapshot(false); + + if (snapshot_now->copied) + { + ReorderBufferFreeSnap(rb, snapshot_now); + snapshot_now = + ReorderBufferCopySnap(rb, change->data.snapshot, + txn, command_id); + } + + /* + * Restored from disk, need to be careful not to double + * free. We could introduce refcounting for that, but for + * now this seems infrequent enough not to care. + */ + else if (change->data.snapshot->copied) + { + snapshot_now = + ReorderBufferCopySnap(rb, change->data.snapshot, + txn, command_id); + } + else + { + snapshot_now = change->data.snapshot; + } + + /* and continue with the new one */ + SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash); + break; + + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + Assert(change->data.command_id != InvalidCommandId); + + if (command_id < change->data.command_id) + { + command_id = change->data.command_id; + + if (!snapshot_now->copied) + { + /* we don't use the global one anymore */ + snapshot_now = ReorderBufferCopySnap(rb, snapshot_now, + txn, command_id); + } + + snapshot_now->curcid = command_id; + + TeardownHistoricSnapshot(false); + SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash); + } + + break; + + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + elog(ERROR, "tuplecid value in changequeue"); + break; + } + + /* + * It is possible that the data is not sent to downstream for a + * long time either because the output plugin filtered it or there + * is a DDL that generates a lot of data that is not processed by + * the plugin. So, in such cases, the downstream can timeout. To + * avoid that we try to send a keepalive message if required. + * Trying to send a keepalive message after every change has some + * overhead, but testing showed there is no noticeable overhead if + * we do it after every ~100 changes. + */ +#define CHANGES_THRESHOLD 100 + + if (++changes_count >= CHANGES_THRESHOLD) + { + rb->update_progress_txn(rb, txn, change->lsn); + changes_count = 0; + } + } + + /* speculative insertion record must be freed by now */ + Assert(!specinsert); + + /* clean up the iterator */ + ReorderBufferIterTXNFinish(rb, iterstate); + iterstate = NULL; + + /* + * Update total transaction count and total bytes processed by the + * transaction and its subtransactions. Ensure to not count the + * streamed transaction multiple times. + * + * Note that the statistics computation has to be done after + * ReorderBufferIterTXNFinish as it releases the serialized change + * which we have already accounted in ReorderBufferIterTXNNext. + */ + if (!rbtxn_is_streamed(txn)) + rb->totalTxns++; + + rb->totalBytes += txn->total_size; + + /* + * Done with current changes, send the last message for this set of + * changes depending upon streaming mode. + */ + if (streaming) + { + if (stream_started) + { + rb->stream_stop(rb, txn, prev_lsn); + stream_started = false; + } + } + else + { + /* + * Call either PREPARE (for two-phase transactions) or COMMIT (for + * regular ones). + */ + if (rbtxn_prepared(txn)) + rb->prepare(rb, txn, commit_lsn); + else + rb->commit(rb, txn, commit_lsn); + } + + /* this is just a sanity check against bad output plugin behaviour */ + if (GetCurrentTransactionIdIfAny() != InvalidTransactionId) + elog(ERROR, "output plugin used XID %u", + GetCurrentTransactionId()); + + /* + * Remember the command ID and snapshot for the next set of changes in + * streaming mode. + */ + if (streaming) + ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id); + else if (snapshot_now->copied) + ReorderBufferFreeSnap(rb, snapshot_now); + + /* cleanup */ + TeardownHistoricSnapshot(false); + + /* + * Aborting the current (sub-)transaction as a whole has the right + * semantics. We want all locks acquired in here to be released, not + * reassigned to the parent and we do not want any database access + * have persistent effects. + */ + AbortCurrentTransaction(); + + /* make sure there's no cache pollution */ + ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations); + + if (using_subtxn) + RollbackAndReleaseCurrentSubTransaction(); + + /* + * We are here due to one of the four reasons: 1. Decoding an + * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a + * prepared txn that was (partially) streamed. 4. Decoding a committed + * txn. + * + * For 1, we allow truncation of txn data by removing the changes + * already streamed but still keeping other things like invalidations, + * snapshot, and tuplecids. For 2 and 3, we indicate + * ReorderBufferTruncateTXN to do more elaborate truncation of txn + * data as the entire transaction has been decoded except for commit. + * For 4, as the entire txn has been decoded, we can fully clean up + * the TXN reorder buffer. + */ + if (streaming || rbtxn_prepared(txn)) + { + ReorderBufferTruncateTXN(rb, txn, rbtxn_prepared(txn)); + /* Reset the CheckXidAlive */ + CheckXidAlive = InvalidTransactionId; + } + else + ReorderBufferCleanupTXN(rb, txn); + } + PG_CATCH(); + { + MemoryContext ecxt = MemoryContextSwitchTo(ccxt); + ErrorData *errdata = CopyErrorData(); + + /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */ + if (iterstate) + ReorderBufferIterTXNFinish(rb, iterstate); + + TeardownHistoricSnapshot(true); + + /* + * Force cache invalidation to happen outside of a valid transaction + * to prevent catalog access as we just caught an error. + */ + AbortCurrentTransaction(); + + /* make sure there's no cache pollution */ + ReorderBufferExecuteInvalidations(txn->ninvalidations, + txn->invalidations); + + if (using_subtxn) + RollbackAndReleaseCurrentSubTransaction(); + + /* + * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent + * abort of the (sub)transaction we are streaming or preparing. We + * need to do the cleanup and return gracefully on this error, see + * SetupCheckXidLive. + * + * This error code can be thrown by one of the callbacks we call + * during decoding so we need to ensure that we return gracefully only + * when we are sending the data in streaming mode and the streaming is + * not finished yet or when we are sending the data out on a PREPARE + * during a two-phase commit. + */ + if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK && + (stream_started || rbtxn_prepared(txn))) + { + /* curtxn must be set for streaming or prepared transactions */ + Assert(curtxn); + + /* Cleanup the temporary error state. */ + FlushErrorState(); + FreeErrorData(errdata); + errdata = NULL; + curtxn->concurrent_abort = true; + + /* Reset the TXN so that it is allowed to stream remaining data. */ + ReorderBufferResetTXN(rb, txn, snapshot_now, + command_id, prev_lsn, + specinsert); + } + else + { + ReorderBufferCleanupTXN(rb, txn); + MemoryContextSwitchTo(ecxt); + PG_RE_THROW(); + } + } + PG_END_TRY(); +} + +/* + * Perform the replay of a transaction and its non-aborted subtransactions. + * + * Subtransactions previously have to be processed by + * ReorderBufferCommitChild(), even if previously assigned to the toplevel + * transaction with ReorderBufferAssignChild. + * + * This interface is called once a prepare or toplevel commit is read for both + * streamed as well as non-streamed transactions. + */ +static void +ReorderBufferReplay(ReorderBufferTXN *txn, + ReorderBuffer *rb, TransactionId xid, + XLogRecPtr commit_lsn, XLogRecPtr end_lsn, + TimestampTz commit_time, + RepOriginId origin_id, XLogRecPtr origin_lsn) +{ + Snapshot snapshot_now; + CommandId command_id = FirstCommandId; + + txn->final_lsn = commit_lsn; + txn->end_lsn = end_lsn; + txn->xact_time.commit_time = commit_time; + txn->origin_id = origin_id; + txn->origin_lsn = origin_lsn; + + /* + * If the transaction was (partially) streamed, we need to commit it in a + * 'streamed' way. That is, we first stream the remaining part of the + * transaction, and then invoke stream_commit message. + * + * Called after everything (origin ID, LSN, ...) is stored in the + * transaction to avoid passing that information directly. + */ + if (rbtxn_is_streamed(txn)) + { + ReorderBufferStreamCommit(rb, txn); + return; + } + + /* + * If this transaction has no snapshot, it didn't make any changes to the + * database, so there's nothing to decode. Note that + * ReorderBufferCommitChild will have transferred any snapshots from + * subtransactions if there were any. + */ + if (txn->base_snapshot == NULL) + { + Assert(txn->ninvalidations == 0); + + /* + * Removing this txn before a commit might result in the computation + * of an incorrect restart_lsn. See SnapBuildProcessRunningXacts. + */ + if (!rbtxn_prepared(txn)) + ReorderBufferCleanupTXN(rb, txn); + return; + } + + snapshot_now = txn->base_snapshot; + + /* Process and send the changes to output plugin. */ + ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now, + command_id, false); +} + +/* + * Commit a transaction. + * + * See comments for ReorderBufferReplay(). + */ +void +ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr commit_lsn, XLogRecPtr end_lsn, + TimestampTz commit_time, + RepOriginId origin_id, XLogRecPtr origin_lsn) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + + /* unknown transaction, nothing to replay */ + if (txn == NULL) + return; + + ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time, + origin_id, origin_lsn); +} + +/* + * Record the prepare information for a transaction. + */ +bool +ReorderBufferRememberPrepareInfo(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr prepare_lsn, XLogRecPtr end_lsn, + TimestampTz prepare_time, + RepOriginId origin_id, XLogRecPtr origin_lsn) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false); + + /* unknown transaction, nothing to do */ + if (txn == NULL) + return false; + + /* + * Remember the prepare information to be later used by commit prepared in + * case we skip doing prepare. + */ + txn->final_lsn = prepare_lsn; + txn->end_lsn = end_lsn; + txn->xact_time.prepare_time = prepare_time; + txn->origin_id = origin_id; + txn->origin_lsn = origin_lsn; + + return true; +} + +/* Remember that we have skipped prepare */ +void +ReorderBufferSkipPrepare(ReorderBuffer *rb, TransactionId xid) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false); + + /* unknown transaction, nothing to do */ + if (txn == NULL) + return; + + txn->txn_flags |= RBTXN_SKIPPED_PREPARE; +} + +/* + * Prepare a two-phase transaction. + * + * See comments for ReorderBufferReplay(). + */ +void +ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid, + char *gid) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + + /* unknown transaction, nothing to replay */ + if (txn == NULL) + return; + + txn->txn_flags |= RBTXN_PREPARE; + txn->gid = pstrdup(gid); + + /* The prepare info must have been updated in txn by now. */ + Assert(txn->final_lsn != InvalidXLogRecPtr); + + ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn, + txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn); + + /* + * We send the prepare for the concurrently aborted xacts so that later + * when rollback prepared is decoded and sent, the downstream should be + * able to rollback such a xact. See comments atop DecodePrepare. + * + * Note, for the concurrent_abort + streaming case a stream_prepare was + * already sent within the ReorderBufferReplay call above. + */ + if (txn->concurrent_abort && !rbtxn_is_streamed(txn)) + rb->prepare(rb, txn, txn->final_lsn); +} + +/* + * This is used to handle COMMIT/ROLLBACK PREPARED. + */ +void +ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr commit_lsn, XLogRecPtr end_lsn, + XLogRecPtr two_phase_at, + TimestampTz commit_time, RepOriginId origin_id, + XLogRecPtr origin_lsn, char *gid, bool is_commit) +{ + ReorderBufferTXN *txn; + XLogRecPtr prepare_end_lsn; + TimestampTz prepare_time; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false); + + /* unknown transaction, nothing to do */ + if (txn == NULL) + return; + + /* + * By this time the txn has the prepare record information, remember it to + * be later used for rollback. + */ + prepare_end_lsn = txn->end_lsn; + prepare_time = txn->xact_time.prepare_time; + + /* add the gid in the txn */ + txn->gid = pstrdup(gid); + + /* + * It is possible that this transaction is not decoded at prepare time + * either because by that time we didn't have a consistent snapshot, or + * two_phase was not enabled, or it was decoded earlier but we have + * restarted. We only need to send the prepare if it was not decoded + * earlier. We don't need to decode the xact for aborts if it is not done + * already. + */ + if ((txn->final_lsn < two_phase_at) && is_commit) + { + txn->txn_flags |= RBTXN_PREPARE; + + /* + * The prepare info must have been updated in txn even if we skip + * prepare. + */ + Assert(txn->final_lsn != InvalidXLogRecPtr); + + /* + * By this time the txn has the prepare record information and it is + * important to use that so that downstream gets the accurate + * information. If instead, we have passed commit information here + * then downstream can behave as it has already replayed commit + * prepared after the restart. + */ + ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn, + txn->xact_time.prepare_time, txn->origin_id, txn->origin_lsn); + } + + txn->final_lsn = commit_lsn; + txn->end_lsn = end_lsn; + txn->xact_time.commit_time = commit_time; + txn->origin_id = origin_id; + txn->origin_lsn = origin_lsn; + + if (is_commit) + rb->commit_prepared(rb, txn, commit_lsn); + else + rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time); + + /* cleanup: make sure there's no cache pollution */ + ReorderBufferExecuteInvalidations(txn->ninvalidations, + txn->invalidations); + ReorderBufferCleanupTXN(rb, txn); +} + +/* + * Abort a transaction that possibly has previous changes. Needs to be first + * called for subtransactions and then for the toplevel xid. + * + * NB: Transactions handled here have to have actively aborted (i.e. have + * produced an abort record). Implicitly aborted transactions are handled via + * ReorderBufferAbortOld(); transactions we're just not interested in, but + * which have committed are handled in ReorderBufferForget(). + * + * This function purges this transaction and its contents from memory and + * disk. + */ +void +ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, + TimestampTz abort_time) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + + /* unknown, nothing to remove */ + if (txn == NULL) + return; + + txn->xact_time.abort_time = abort_time; + + /* For streamed transactions notify the remote node about the abort. */ + if (rbtxn_is_streamed(txn)) + { + rb->stream_abort(rb, txn, lsn); + + /* + * We might have decoded changes for this transaction that could load + * the cache as per the current transaction's view (consider DDL's + * happened in this transaction). We don't want the decoding of future + * transactions to use those cache entries so execute invalidations. + */ + if (txn->ninvalidations > 0) + ReorderBufferImmediateInvalidation(rb, txn->ninvalidations, + txn->invalidations); + } + + /* cosmetic... */ + txn->final_lsn = lsn; + + /* remove potential on-disk data, and deallocate */ + ReorderBufferCleanupTXN(rb, txn); +} + +/* + * Abort all transactions that aren't actually running anymore because the + * server restarted. + * + * NB: These really have to be transactions that have aborted due to a server + * crash/immediate restart, as we don't deal with invalidations here. + */ +void +ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid) +{ + dlist_mutable_iter it; + + /* + * Iterate through all (potential) toplevel TXNs and abort all that are + * older than what possibly can be running. Once we've found the first + * that is alive we stop, there might be some that acquired an xid earlier + * but started writing later, but it's unlikely and they will be cleaned + * up in a later call to this function. + */ + dlist_foreach_modify(it, &rb->toplevel_by_lsn) + { + ReorderBufferTXN *txn; + + txn = dlist_container(ReorderBufferTXN, node, it.cur); + + if (TransactionIdPrecedes(txn->xid, oldestRunningXid)) + { + elog(DEBUG2, "aborting old transaction %u", txn->xid); + + /* Notify the remote node about the crash/immediate restart. */ + if (rbtxn_is_streamed(txn)) + rb->stream_abort(rb, txn, InvalidXLogRecPtr); + + /* remove potential on-disk data, and deallocate this tx */ + ReorderBufferCleanupTXN(rb, txn); + } + else + return; + } +} + +/* + * Forget the contents of a transaction if we aren't interested in its + * contents. Needs to be first called for subtransactions and then for the + * toplevel xid. + * + * This is significantly different to ReorderBufferAbort() because + * transactions that have committed need to be treated differently from aborted + * ones since they may have modified the catalog. + * + * Note that this is only allowed to be called in the moment a transaction + * commit has just been read, not earlier; otherwise later records referring + * to this xid might re-create the transaction incompletely. + */ +void +ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + + /* unknown, nothing to forget */ + if (txn == NULL) + return; + + /* this transaction mustn't be streamed */ + Assert(!rbtxn_is_streamed(txn)); + + /* cosmetic... */ + txn->final_lsn = lsn; + + /* + * Process cache invalidation messages if there are any. Even if we're not + * interested in the transaction's contents, it could have manipulated the + * catalog and we need to update the caches according to that. + */ + if (txn->base_snapshot != NULL && txn->ninvalidations > 0) + ReorderBufferImmediateInvalidation(rb, txn->ninvalidations, + txn->invalidations); + else + Assert(txn->ninvalidations == 0); + + /* remove potential on-disk data, and deallocate */ + ReorderBufferCleanupTXN(rb, txn); +} + +/* + * Invalidate cache for those transactions that need to be skipped just in case + * catalogs were manipulated as part of the transaction. + * + * Note that this is a special-purpose function for prepared transactions where + * we don't want to clean up the TXN even when we decide to skip it. See + * DecodePrepare. + */ +void +ReorderBufferInvalidate(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + + /* unknown, nothing to do */ + if (txn == NULL) + return; + + /* + * Process cache invalidation messages if there are any. Even if we're not + * interested in the transaction's contents, it could have manipulated the + * catalog and we need to update the caches according to that. + */ + if (txn->base_snapshot != NULL && txn->ninvalidations > 0) + ReorderBufferImmediateInvalidation(rb, txn->ninvalidations, + txn->invalidations); + else + Assert(txn->ninvalidations == 0); +} + + +/* + * Execute invalidations happening outside the context of a decoded + * transaction. That currently happens either for xid-less commits + * (cf. RecordTransactionCommit()) or for invalidations in uninteresting + * transactions (via ReorderBufferForget()). + */ +void +ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations, + SharedInvalidationMessage *invalidations) +{ + bool use_subtxn = IsTransactionOrTransactionBlock(); + int i; + + if (use_subtxn) + BeginInternalSubTransaction("replay"); + + /* + * Force invalidations to happen outside of a valid transaction - that way + * entries will just be marked as invalid without accessing the catalog. + * That's advantageous because we don't need to setup the full state + * necessary for catalog access. + */ + if (use_subtxn) + AbortCurrentTransaction(); + + for (i = 0; i < ninvalidations; i++) + LocalExecuteInvalidationMessage(&invalidations[i]); + + if (use_subtxn) + RollbackAndReleaseCurrentSubTransaction(); +} + +/* + * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at + * least once for every xid in XLogRecord->xl_xid (other places in records + * may, but do not have to be passed through here). + * + * Reorderbuffer keeps some datastructures about transactions in LSN order, + * for efficiency. To do that it has to know about when transactions are seen + * first in the WAL. As many types of records are not actually interesting for + * logical decoding, they do not necessarily pass though here. + */ +void +ReorderBufferProcessXid(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn) +{ + /* many records won't have an xid assigned, centralize check here */ + if (xid != InvalidTransactionId) + ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); +} + +/* + * Add a new snapshot to this transaction that may only used after lsn 'lsn' + * because the previous snapshot doesn't describe the catalog correctly for + * following rows. + */ +void +ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Snapshot snap) +{ + ReorderBufferChange *change = ReorderBufferGetChange(rb); + + change->data.snapshot = snap; + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT; + + ReorderBufferQueueChange(rb, xid, lsn, change, false); +} + +/* + * Set up the transaction's base snapshot. + * + * If we know that xid is a subtransaction, set the base snapshot on the + * top-level transaction instead. + */ +void +ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Snapshot snap) +{ + ReorderBufferTXN *txn; + bool is_new; + + Assert(snap != NULL); + + /* + * Fetch the transaction to operate on. If we know it's a subtransaction, + * operate on its top-level transaction instead. + */ + txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true); + if (rbtxn_is_known_subxact(txn)) + txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false, + NULL, InvalidXLogRecPtr, false); + Assert(txn->base_snapshot == NULL); + + txn->base_snapshot = snap; + txn->base_snapshot_lsn = lsn; + dlist_push_tail(&rb->txns_by_base_snapshot_lsn, &txn->base_snapshot_node); + + AssertTXNLsnOrder(rb); +} + +/* + * Access the catalog with this CommandId at this point in the changestream. + * + * May only be called for command ids > 1 + */ +void +ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, CommandId cid) +{ + ReorderBufferChange *change = ReorderBufferGetChange(rb); + + change->data.command_id = cid; + change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID; + + ReorderBufferQueueChange(rb, xid, lsn, change, false); +} + +/* + * Update memory counters to account for the new or removed change. + * + * We update two counters - in the reorder buffer, and in the transaction + * containing the change. The reorder buffer counter allows us to quickly + * decide if we reached the memory limit, the transaction counter allows + * us to quickly pick the largest transaction for eviction. + * + * When streaming is enabled, we need to update the toplevel transaction + * counters instead - we don't really care about subtransactions as we + * can't stream them individually anyway, and we only pick toplevel + * transactions for eviction. So only toplevel transactions matter. + */ +static void +ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb, + ReorderBufferChange *change, + bool addition, Size sz) +{ + ReorderBufferTXN *txn; + ReorderBufferTXN *toptxn; + + Assert(change->txn); + + /* + * Ignore tuple CID changes, because those are not evicted when reaching + * memory limit. So we just don't count them, because it might easily + * trigger a pointless attempt to spill. + */ + if (change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID) + return; + + txn = change->txn; + + /* + * Update the total size in top level as well. This is later used to + * compute the decoding stats. + */ + toptxn = rbtxn_get_toptxn(txn); + + if (addition) + { + txn->size += sz; + rb->size += sz; + + /* Update the total size in the top transaction. */ + toptxn->total_size += sz; + } + else + { + Assert((rb->size >= sz) && (txn->size >= sz)); + txn->size -= sz; + rb->size -= sz; + + /* Update the total size in the top transaction. */ + toptxn->total_size -= sz; + } + + Assert(txn->size <= rb->size); +} + +/* + * Add new (relfilelocator, tid) -> (cmin, cmax) mappings. + * + * We do not include this change type in memory accounting, because we + * keep CIDs in a separate list and do not evict them when reaching + * the memory limit. + */ +void +ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, RelFileLocator locator, + ItemPointerData tid, CommandId cmin, + CommandId cmax, CommandId combocid) +{ + ReorderBufferChange *change = ReorderBufferGetChange(rb); + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + change->data.tuplecid.locator = locator; + change->data.tuplecid.tid = tid; + change->data.tuplecid.cmin = cmin; + change->data.tuplecid.cmax = cmax; + change->data.tuplecid.combocid = combocid; + change->lsn = lsn; + change->txn = txn; + change->action = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID; + + dlist_push_tail(&txn->tuplecids, &change->node); + txn->ntuplecids++; +} + +/* + * Accumulate the invalidations for executing them later. + * + * This needs to be called for each XLOG_XACT_INVALIDATIONS message and + * accumulates all the invalidation messages in the toplevel transaction, if + * available, otherwise in the current transaction, as well as in the form of + * change in reorder buffer. We require to record it in form of the change + * so that we can execute only the required invalidations instead of executing + * all the invalidations on each CommandId increment. We also need to + * accumulate these in the txn buffer because in some cases where we skip + * processing the transaction (see ReorderBufferForget), we need to execute + * all the invalidations together. + */ +void +ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Size nmsgs, + SharedInvalidationMessage *msgs) +{ + ReorderBufferTXN *txn; + MemoryContext oldcontext; + ReorderBufferChange *change; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + oldcontext = MemoryContextSwitchTo(rb->context); + + /* + * Collect all the invalidations under the top transaction, if available, + * so that we can execute them all together. See comments atop this + * function. + */ + txn = rbtxn_get_toptxn(txn); + + Assert(nmsgs > 0); + + /* Accumulate invalidations. */ + if (txn->ninvalidations == 0) + { + txn->ninvalidations = nmsgs; + txn->invalidations = (SharedInvalidationMessage *) + palloc(sizeof(SharedInvalidationMessage) * nmsgs); + memcpy(txn->invalidations, msgs, + sizeof(SharedInvalidationMessage) * nmsgs); + } + else + { + txn->invalidations = (SharedInvalidationMessage *) + repalloc(txn->invalidations, sizeof(SharedInvalidationMessage) * + (txn->ninvalidations + nmsgs)); + + memcpy(txn->invalidations + txn->ninvalidations, msgs, + nmsgs * sizeof(SharedInvalidationMessage)); + txn->ninvalidations += nmsgs; + } + + change = ReorderBufferGetChange(rb); + change->action = REORDER_BUFFER_CHANGE_INVALIDATION; + change->data.inval.ninvalidations = nmsgs; + change->data.inval.invalidations = (SharedInvalidationMessage *) + palloc(sizeof(SharedInvalidationMessage) * nmsgs); + memcpy(change->data.inval.invalidations, msgs, + sizeof(SharedInvalidationMessage) * nmsgs); + + ReorderBufferQueueChange(rb, xid, lsn, change, false); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Apply all invalidations we know. Possibly we only need parts at this point + * in the changestream but we don't know which those are. + */ +static void +ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs) +{ + int i; + + for (i = 0; i < nmsgs; i++) + LocalExecuteInvalidationMessage(&msgs[i]); +} + +/* + * Mark a transaction as containing catalog changes + */ +void +ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + if (!rbtxn_has_catalog_changes(txn)) + { + txn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES; + dclist_push_tail(&rb->catchange_txns, &txn->catchange_node); + } + + /* + * Mark top-level transaction as having catalog changes too if one of its + * children has so that the ReorderBufferBuildTupleCidHash can + * conveniently check just top-level transaction and decide whether to + * build the hash table or not. + */ + if (rbtxn_is_subtxn(txn)) + { + ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn); + + if (!rbtxn_has_catalog_changes(toptxn)) + { + toptxn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES; + dclist_push_tail(&rb->catchange_txns, &toptxn->catchange_node); + } + } +} + +/* + * Return palloc'ed array of the transactions that have changed catalogs. + * The returned array is sorted in xidComparator order. + * + * The caller must free the returned array when done with it. + */ +TransactionId * +ReorderBufferGetCatalogChangesXacts(ReorderBuffer *rb) +{ + dlist_iter iter; + TransactionId *xids = NULL; + size_t xcnt = 0; + + /* Quick return if the list is empty */ + if (dclist_count(&rb->catchange_txns) == 0) + return NULL; + + /* Initialize XID array */ + xids = (TransactionId *) palloc(sizeof(TransactionId) * + dclist_count(&rb->catchange_txns)); + dclist_foreach(iter, &rb->catchange_txns) + { + ReorderBufferTXN *txn = dclist_container(ReorderBufferTXN, + catchange_node, + iter.cur); + + Assert(rbtxn_has_catalog_changes(txn)); + + xids[xcnt++] = txn->xid; + } + + qsort(xids, xcnt, sizeof(TransactionId), xidComparator); + + Assert(xcnt == dclist_count(&rb->catchange_txns)); + return xids; +} + +/* + * Query whether a transaction is already *known* to contain catalog + * changes. This can be wrong until directly before the commit! + */ +bool +ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); + if (txn == NULL) + return false; + + return rbtxn_has_catalog_changes(txn); +} + +/* + * ReorderBufferXidHasBaseSnapshot + * Have we already set the base snapshot for the given txn/subtxn? + */ +bool +ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid) +{ + ReorderBufferTXN *txn; + + txn = ReorderBufferTXNByXid(rb, xid, false, + NULL, InvalidXLogRecPtr, false); + + /* transaction isn't known yet, ergo no snapshot */ + if (txn == NULL) + return false; + + /* a known subtxn? operate on top-level txn instead */ + if (rbtxn_is_known_subxact(txn)) + txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false, + NULL, InvalidXLogRecPtr, false); + + return txn->base_snapshot != NULL; +} + + +/* + * --------------------------------------- + * Disk serialization support + * --------------------------------------- + */ + +/* + * Ensure the IO buffer is >= sz. + */ +static void +ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz) +{ + if (!rb->outbufsize) + { + rb->outbuf = MemoryContextAlloc(rb->context, sz); + rb->outbufsize = sz; + } + else if (rb->outbufsize < sz) + { + rb->outbuf = repalloc(rb->outbuf, sz); + rb->outbufsize = sz; + } +} + +/* + * Find the largest transaction (toplevel or subxact) to evict (spill to disk). + * + * XXX With many subtransactions this might be quite slow, because we'll have + * to walk through all of them. There are some options how we could improve + * that: (a) maintain some secondary structure with transactions sorted by + * amount of changes, (b) not looking for the entirely largest transaction, + * but e.g. for transaction using at least some fraction of the memory limit, + * and (c) evicting multiple transactions at once, e.g. to free a given portion + * of the memory limit (e.g. 50%). + */ +static ReorderBufferTXN * +ReorderBufferLargestTXN(ReorderBuffer *rb) +{ + HASH_SEQ_STATUS hash_seq; + ReorderBufferTXNByIdEnt *ent; + ReorderBufferTXN *largest = NULL; + + hash_seq_init(&hash_seq, rb->by_txn); + while ((ent = hash_seq_search(&hash_seq)) != NULL) + { + ReorderBufferTXN *txn = ent->txn; + + /* if the current transaction is larger, remember it */ + if ((!largest) || (txn->size > largest->size)) + largest = txn; + } + + Assert(largest); + Assert(largest->size > 0); + Assert(largest->size <= rb->size); + + return largest; +} + +/* + * Find the largest streamable toplevel transaction to evict (by streaming). + * + * This can be seen as an optimized version of ReorderBufferLargestTXN, which + * should give us the same transaction (because we don't update memory account + * for subtransaction with streaming, so it's always 0). But we can simply + * iterate over the limited number of toplevel transactions that have a base + * snapshot. There is no use of selecting a transaction that doesn't have base + * snapshot because we don't decode such transactions. Also, we do not select + * the transaction which doesn't have any streamable change. + * + * Note that, we skip transactions that contains incomplete changes. There + * is a scope of optimization here such that we can select the largest + * transaction which has incomplete changes. But that will make the code and + * design quite complex and that might not be worth the benefit. If we plan to + * stream the transactions that contains incomplete changes then we need to + * find a way to partially stream/truncate the transaction changes in-memory + * and build a mechanism to partially truncate the spilled files. + * Additionally, whenever we partially stream the transaction we need to + * maintain the last streamed lsn and next time we need to restore from that + * segment and the offset in WAL. As we stream the changes from the top + * transaction and restore them subtransaction wise, we need to even remember + * the subxact from where we streamed the last change. + */ +static ReorderBufferTXN * +ReorderBufferLargestStreamableTopTXN(ReorderBuffer *rb) +{ + dlist_iter iter; + Size largest_size = 0; + ReorderBufferTXN *largest = NULL; + + /* Find the largest top-level transaction having a base snapshot. */ + dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn) + { + ReorderBufferTXN *txn; + + txn = dlist_container(ReorderBufferTXN, base_snapshot_node, iter.cur); + + /* must not be a subtxn */ + Assert(!rbtxn_is_known_subxact(txn)); + /* base_snapshot must be set */ + Assert(txn->base_snapshot != NULL); + + if ((largest == NULL || txn->total_size > largest_size) && + (txn->total_size > 0) && !(rbtxn_has_partial_change(txn)) && + rbtxn_has_streamable_change(txn)) + { + largest = txn; + largest_size = txn->total_size; + } + } + + return largest; +} + +/* + * Check whether the logical_decoding_work_mem limit was reached, and if yes + * pick the largest (sub)transaction at-a-time to evict and spill its changes to + * disk or send to the output plugin until we reach under the memory limit. + * + * If debug_logical_replication_streaming is set to "immediate", stream or + * serialize the changes immediately. + * + * XXX At this point we select the transactions until we reach under the memory + * limit, but we might also adapt a more elaborate eviction strategy - for example + * evicting enough transactions to free certain fraction (e.g. 50%) of the memory + * limit. + */ +static void +ReorderBufferCheckMemoryLimit(ReorderBuffer *rb) +{ + ReorderBufferTXN *txn; + + /* + * Bail out if debug_logical_replication_streaming is buffered and we + * haven't exceeded the memory limit. + */ + if (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_BUFFERED && + rb->size < logical_decoding_work_mem * 1024L) + return; + + /* + * If debug_logical_replication_streaming is immediate, loop until there's + * no change. Otherwise, loop until we reach under the memory limit. One + * might think that just by evicting the largest (sub)transaction we will + * come under the memory limit based on assumption that the selected + * transaction is at least as large as the most recent change (which + * caused us to go over the memory limit). However, that is not true + * because a user can reduce the logical_decoding_work_mem to a smaller + * value before the most recent change. + */ + while (rb->size >= logical_decoding_work_mem * 1024L || + (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE && + rb->size > 0)) + { + /* + * Pick the largest transaction and evict it from memory by streaming, + * if possible. Otherwise, spill to disk. + */ + if (ReorderBufferCanStartStreaming(rb) && + (txn = ReorderBufferLargestStreamableTopTXN(rb)) != NULL) + { + /* we know there has to be one, because the size is not zero */ + Assert(txn && rbtxn_is_toptxn(txn)); + Assert(txn->total_size > 0); + Assert(rb->size >= txn->total_size); + + ReorderBufferStreamTXN(rb, txn); + } + else + { + /* + * Pick the largest transaction (or subtransaction) and evict it + * from memory by serializing it to disk. + */ + txn = ReorderBufferLargestTXN(rb); + + /* we know there has to be one, because the size is not zero */ + Assert(txn); + Assert(txn->size > 0); + Assert(rb->size >= txn->size); + + ReorderBufferSerializeTXN(rb, txn); + } + + /* + * After eviction, the transaction should have no entries in memory, + * and should use 0 bytes for changes. + */ + Assert(txn->size == 0); + Assert(txn->nentries_mem == 0); + } + + /* We must be under the memory limit now. */ + Assert(rb->size < logical_decoding_work_mem * 1024L); +} + +/* + * Spill data of a large transaction (and its subtransactions) to disk. + */ +static void +ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + dlist_iter subtxn_i; + dlist_mutable_iter change_i; + int fd = -1; + XLogSegNo curOpenSegNo = 0; + Size spilled = 0; + Size size = txn->size; + + elog(DEBUG2, "spill %u changes in XID %u to disk", + (uint32) txn->nentries_mem, txn->xid); + + /* do the same to all child TXs */ + dlist_foreach(subtxn_i, &txn->subtxns) + { + ReorderBufferTXN *subtxn; + + subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur); + ReorderBufferSerializeTXN(rb, subtxn); + } + + /* serialize changestream */ + dlist_foreach_modify(change_i, &txn->changes) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, change_i.cur); + + /* + * store in segment in which it belongs by start lsn, don't split over + * multiple segments tho + */ + if (fd == -1 || + !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size)) + { + char path[MAXPGPATH]; + + if (fd != -1) + CloseTransientFile(fd); + + XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size); + + /* + * No need to care about TLIs here, only used during a single run, + * so each LSN only maps to a specific WAL record. + */ + ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, + curOpenSegNo); + + /* open segment, create it if necessary */ + fd = OpenTransientFile(path, + O_CREAT | O_WRONLY | O_APPEND | PG_BINARY); + + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + } + + ReorderBufferSerializeChange(rb, txn, fd, change); + dlist_delete(&change->node); + ReorderBufferReturnChange(rb, change, true); + + spilled++; + } + + /* update the statistics iff we have spilled anything */ + if (spilled) + { + rb->spillCount += 1; + rb->spillBytes += size; + + /* don't consider already serialized transactions */ + rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1; + + /* update the decoding stats */ + UpdateDecodingStats((LogicalDecodingContext *) rb->private_data); + } + + Assert(spilled == txn->nentries_mem); + Assert(dlist_is_empty(&txn->changes)); + txn->nentries_mem = 0; + txn->txn_flags |= RBTXN_IS_SERIALIZED; + + if (fd != -1) + CloseTransientFile(fd); +} + +/* + * Serialize individual change to disk. + */ +static void +ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + int fd, ReorderBufferChange *change) +{ + ReorderBufferDiskChange *ondisk; + Size sz = sizeof(ReorderBufferDiskChange); + + ReorderBufferSerializeReserve(rb, sz); + + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + memcpy(&ondisk->change, change, sizeof(ReorderBufferChange)); + + switch (change->action) + { + /* fall through these, they're all similar enough */ + case REORDER_BUFFER_CHANGE_INSERT: + case REORDER_BUFFER_CHANGE_UPDATE: + case REORDER_BUFFER_CHANGE_DELETE: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT: + { + char *data; + ReorderBufferTupleBuf *oldtup, + *newtup; + Size oldlen = 0; + Size newlen = 0; + + oldtup = change->data.tp.oldtuple; + newtup = change->data.tp.newtuple; + + if (oldtup) + { + sz += sizeof(HeapTupleData); + oldlen = oldtup->tuple.t_len; + sz += oldlen; + } + + if (newtup) + { + sz += sizeof(HeapTupleData); + newlen = newtup->tuple.t_len; + sz += newlen; + } + + /* make sure we have enough space */ + ReorderBufferSerializeReserve(rb, sz); + + data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange); + /* might have been reallocated above */ + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + if (oldlen) + { + memcpy(data, &oldtup->tuple, sizeof(HeapTupleData)); + data += sizeof(HeapTupleData); + + memcpy(data, oldtup->tuple.t_data, oldlen); + data += oldlen; + } + + if (newlen) + { + memcpy(data, &newtup->tuple, sizeof(HeapTupleData)); + data += sizeof(HeapTupleData); + + memcpy(data, newtup->tuple.t_data, newlen); + data += newlen; + } + break; + } + case REORDER_BUFFER_CHANGE_MESSAGE: + { + char *data; + Size prefix_size = strlen(change->data.msg.prefix) + 1; + + sz += prefix_size + change->data.msg.message_size + + sizeof(Size) + sizeof(Size); + ReorderBufferSerializeReserve(rb, sz); + + data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange); + + /* might have been reallocated above */ + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + /* write the prefix including the size */ + memcpy(data, &prefix_size, sizeof(Size)); + data += sizeof(Size); + memcpy(data, change->data.msg.prefix, + prefix_size); + data += prefix_size; + + /* write the message including the size */ + memcpy(data, &change->data.msg.message_size, sizeof(Size)); + data += sizeof(Size); + memcpy(data, change->data.msg.message, + change->data.msg.message_size); + data += change->data.msg.message_size; + + break; + } + case REORDER_BUFFER_CHANGE_INVALIDATION: + { + char *data; + Size inval_size = sizeof(SharedInvalidationMessage) * + change->data.inval.ninvalidations; + + sz += inval_size; + + ReorderBufferSerializeReserve(rb, sz); + data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange); + + /* might have been reallocated above */ + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + memcpy(data, change->data.inval.invalidations, inval_size); + data += inval_size; + + break; + } + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + { + Snapshot snap; + char *data; + + snap = change->data.snapshot; + + sz += sizeof(SnapshotData) + + sizeof(TransactionId) * snap->xcnt + + sizeof(TransactionId) * snap->subxcnt; + + /* make sure we have enough space */ + ReorderBufferSerializeReserve(rb, sz); + data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange); + /* might have been reallocated above */ + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + memcpy(data, snap, sizeof(SnapshotData)); + data += sizeof(SnapshotData); + + if (snap->xcnt) + { + memcpy(data, snap->xip, + sizeof(TransactionId) * snap->xcnt); + data += sizeof(TransactionId) * snap->xcnt; + } + + if (snap->subxcnt) + { + memcpy(data, snap->subxip, + sizeof(TransactionId) * snap->subxcnt); + data += sizeof(TransactionId) * snap->subxcnt; + } + break; + } + case REORDER_BUFFER_CHANGE_TRUNCATE: + { + Size size; + char *data; + + /* account for the OIDs of truncated relations */ + size = sizeof(Oid) * change->data.truncate.nrelids; + sz += size; + + /* make sure we have enough space */ + ReorderBufferSerializeReserve(rb, sz); + + data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange); + /* might have been reallocated above */ + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + memcpy(data, change->data.truncate.relids, size); + data += size; + + break; + } + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT: + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + /* ReorderBufferChange contains everything important */ + break; + } + + ondisk->size = sz; + + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE); + if (write(fd, rb->outbuf, ondisk->size) != ondisk->size) + { + int save_errno = errno; + + CloseTransientFile(fd); + + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? save_errno : ENOSPC; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to data file for XID %u: %m", + txn->xid))); + } + pgstat_report_wait_end(); + + /* + * Keep the transaction's final_lsn up to date with each change we send to + * disk, so that ReorderBufferRestoreCleanup works correctly. (We used to + * only do this on commit and abort records, but that doesn't work if a + * system crash leaves a transaction without its abort record). + * + * Make sure not to move it backwards. + */ + if (txn->final_lsn < change->lsn) + txn->final_lsn = change->lsn; + + Assert(ondisk->change.action == change->action); +} + +/* Returns true, if the output plugin supports streaming, false, otherwise. */ +static inline bool +ReorderBufferCanStream(ReorderBuffer *rb) +{ + LogicalDecodingContext *ctx = rb->private_data; + + return ctx->streaming; +} + +/* Returns true, if the streaming can be started now, false, otherwise. */ +static inline bool +ReorderBufferCanStartStreaming(ReorderBuffer *rb) +{ + LogicalDecodingContext *ctx = rb->private_data; + SnapBuild *builder = ctx->snapshot_builder; + + /* We can't start streaming unless a consistent state is reached. */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT) + return false; + + /* + * We can't start streaming immediately even if the streaming is enabled + * because we previously decoded this transaction and now just are + * restarting. + */ + if (ReorderBufferCanStream(rb) && + !SnapBuildXactNeedsSkip(builder, ctx->reader->ReadRecPtr)) + return true; + + return false; +} + +/* + * Send data of a large transaction (and its subtransactions) to the + * output plugin, but using the stream API. + */ +static void +ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + Snapshot snapshot_now; + CommandId command_id; + Size stream_bytes; + bool txn_is_streamed; + + /* We can never reach here for a subtransaction. */ + Assert(rbtxn_is_toptxn(txn)); + + /* + * We can't make any assumptions about base snapshot here, similar to what + * ReorderBufferCommit() does. That relies on base_snapshot getting + * transferred from subxact in ReorderBufferCommitChild(), but that was + * not yet called as the transaction is in-progress. + * + * So just walk the subxacts and use the same logic here. But we only need + * to do that once, when the transaction is streamed for the first time. + * After that we need to reuse the snapshot from the previous run. + * + * Unlike DecodeCommit which adds xids of all the subtransactions in + * snapshot's xip array via SnapBuildCommitTxn, we can't do that here but + * we do add them to subxip array instead via ReorderBufferCopySnap. This + * allows the catalog changes made in subtransactions decoded till now to + * be visible. + */ + if (txn->snapshot_now == NULL) + { + dlist_iter subxact_i; + + /* make sure this transaction is streamed for the first time */ + Assert(!rbtxn_is_streamed(txn)); + + /* at the beginning we should have invalid command ID */ + Assert(txn->command_id == InvalidCommandId); + + dlist_foreach(subxact_i, &txn->subtxns) + { + ReorderBufferTXN *subtxn; + + subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur); + ReorderBufferTransferSnapToParent(txn, subtxn); + } + + /* + * If this transaction has no snapshot, it didn't make any changes to + * the database till now, so there's nothing to decode. + */ + if (txn->base_snapshot == NULL) + { + Assert(txn->ninvalidations == 0); + return; + } + + command_id = FirstCommandId; + snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot, + txn, command_id); + } + else + { + /* the transaction must have been already streamed */ + Assert(rbtxn_is_streamed(txn)); + + /* + * Nah, we already have snapshot from the previous streaming run. We + * assume new subxacts can't move the LSN backwards, and so can't beat + * the LSN condition in the previous branch (so no need to walk + * through subxacts again). In fact, we must not do that as we may be + * using snapshot half-way through the subxact. + */ + command_id = txn->command_id; + + /* + * We can't use txn->snapshot_now directly because after the last + * streaming run, we might have got some new sub-transactions. So we + * need to add them to the snapshot. + */ + snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now, + txn, command_id); + + /* Free the previously copied snapshot. */ + Assert(txn->snapshot_now->copied); + ReorderBufferFreeSnap(rb, txn->snapshot_now); + txn->snapshot_now = NULL; + } + + /* + * Remember this information to be used later to update stats. We can't + * update the stats here as an error while processing the changes would + * lead to the accumulation of stats even though we haven't streamed all + * the changes. + */ + txn_is_streamed = rbtxn_is_streamed(txn); + stream_bytes = txn->total_size; + + /* Process and send the changes to output plugin. */ + ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now, + command_id, true); + + rb->streamCount += 1; + rb->streamBytes += stream_bytes; + + /* Don't consider already streamed transaction. */ + rb->streamTxns += (txn_is_streamed) ? 0 : 1; + + /* update the decoding stats */ + UpdateDecodingStats((LogicalDecodingContext *) rb->private_data); + + Assert(dlist_is_empty(&txn->changes)); + Assert(txn->nentries == 0); + Assert(txn->nentries_mem == 0); +} + +/* + * Size of a change in memory. + */ +static Size +ReorderBufferChangeSize(ReorderBufferChange *change) +{ + Size sz = sizeof(ReorderBufferChange); + + switch (change->action) + { + /* fall through these, they're all similar enough */ + case REORDER_BUFFER_CHANGE_INSERT: + case REORDER_BUFFER_CHANGE_UPDATE: + case REORDER_BUFFER_CHANGE_DELETE: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT: + { + ReorderBufferTupleBuf *oldtup, + *newtup; + Size oldlen = 0; + Size newlen = 0; + + oldtup = change->data.tp.oldtuple; + newtup = change->data.tp.newtuple; + + if (oldtup) + { + sz += sizeof(HeapTupleData); + oldlen = oldtup->tuple.t_len; + sz += oldlen; + } + + if (newtup) + { + sz += sizeof(HeapTupleData); + newlen = newtup->tuple.t_len; + sz += newlen; + } + + break; + } + case REORDER_BUFFER_CHANGE_MESSAGE: + { + Size prefix_size = strlen(change->data.msg.prefix) + 1; + + sz += prefix_size + change->data.msg.message_size + + sizeof(Size) + sizeof(Size); + + break; + } + case REORDER_BUFFER_CHANGE_INVALIDATION: + { + sz += sizeof(SharedInvalidationMessage) * + change->data.inval.ninvalidations; + break; + } + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + { + Snapshot snap; + + snap = change->data.snapshot; + + sz += sizeof(SnapshotData) + + sizeof(TransactionId) * snap->xcnt + + sizeof(TransactionId) * snap->subxcnt; + + break; + } + case REORDER_BUFFER_CHANGE_TRUNCATE: + { + sz += sizeof(Oid) * change->data.truncate.nrelids; + + break; + } + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT: + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + /* ReorderBufferChange contains everything important */ + break; + } + + return sz; +} + + +/* + * Restore a number of changes spilled to disk back into memory. + */ +static Size +ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, + TXNEntryFile *file, XLogSegNo *segno) +{ + Size restored = 0; + XLogSegNo last_segno; + dlist_mutable_iter cleanup_iter; + File *fd = &file->vfd; + + Assert(txn->first_lsn != InvalidXLogRecPtr); + Assert(txn->final_lsn != InvalidXLogRecPtr); + + /* free current entries, so we have memory for more */ + dlist_foreach_modify(cleanup_iter, &txn->changes) + { + ReorderBufferChange *cleanup = + dlist_container(ReorderBufferChange, node, cleanup_iter.cur); + + dlist_delete(&cleanup->node); + ReorderBufferReturnChange(rb, cleanup, true); + } + txn->nentries_mem = 0; + Assert(dlist_is_empty(&txn->changes)); + + XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size); + + while (restored < max_changes_in_memory && *segno <= last_segno) + { + int readBytes; + ReorderBufferDiskChange *ondisk; + + CHECK_FOR_INTERRUPTS(); + + if (*fd == -1) + { + char path[MAXPGPATH]; + + /* first time in */ + if (*segno == 0) + XLByteToSeg(txn->first_lsn, *segno, wal_segment_size); + + Assert(*segno != 0 || dlist_is_empty(&txn->changes)); + + /* + * No need to care about TLIs here, only used during a single run, + * so each LSN only maps to a specific WAL record. + */ + ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, + *segno); + + *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY); + + /* No harm in resetting the offset even in case of failure */ + file->curOffset = 0; + + if (*fd < 0 && errno == ENOENT) + { + *fd = -1; + (*segno)++; + continue; + } + else if (*fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); + } + + /* + * Read the statically sized part of a change which has information + * about the total size. If we couldn't read a record, we're at the + * end of this file. + */ + ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange)); + readBytes = FileRead(file->vfd, rb->outbuf, + sizeof(ReorderBufferDiskChange), + file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ); + + /* eof */ + if (readBytes == 0) + { + FileClose(*fd); + *fd = -1; + (*segno)++; + continue; + } + else if (readBytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from reorderbuffer spill file: %m"))); + else if (readBytes != sizeof(ReorderBufferDiskChange)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes", + readBytes, + (uint32) sizeof(ReorderBufferDiskChange)))); + + file->curOffset += readBytes; + + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + ReorderBufferSerializeReserve(rb, + sizeof(ReorderBufferDiskChange) + ondisk->size); + ondisk = (ReorderBufferDiskChange *) rb->outbuf; + + readBytes = FileRead(file->vfd, + rb->outbuf + sizeof(ReorderBufferDiskChange), + ondisk->size - sizeof(ReorderBufferDiskChange), + file->curOffset, + WAIT_EVENT_REORDER_BUFFER_READ); + + if (readBytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from reorderbuffer spill file: %m"))); + else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes", + readBytes, + (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange))))); + + file->curOffset += readBytes; + + /* + * ok, read a full change from disk, now restore it into proper + * in-memory format + */ + ReorderBufferRestoreChange(rb, txn, rb->outbuf); + restored++; + } + + return restored; +} + +/* + * Convert change from its on-disk format to in-memory format and queue it onto + * the TXN's ->changes list. + * + * Note: although "data" is declared char*, at entry it points to a + * maxalign'd buffer, making it safe in most of this function to assume + * that the pointed-to data is suitably aligned for direct access. + */ +static void +ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + char *data) +{ + ReorderBufferDiskChange *ondisk; + ReorderBufferChange *change; + + ondisk = (ReorderBufferDiskChange *) data; + + change = ReorderBufferGetChange(rb); + + /* copy static part */ + memcpy(change, &ondisk->change, sizeof(ReorderBufferChange)); + + data += sizeof(ReorderBufferDiskChange); + + /* restore individual stuff */ + switch (change->action) + { + /* fall through these, they're all similar enough */ + case REORDER_BUFFER_CHANGE_INSERT: + case REORDER_BUFFER_CHANGE_UPDATE: + case REORDER_BUFFER_CHANGE_DELETE: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT: + if (change->data.tp.oldtuple) + { + uint32 tuplelen = ((HeapTuple) data)->t_len; + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader); + + /* restore ->tuple */ + memcpy(&change->data.tp.oldtuple->tuple, data, + sizeof(HeapTupleData)); + data += sizeof(HeapTupleData); + + /* reset t_data pointer into the new tuplebuf */ + change->data.tp.oldtuple->tuple.t_data = + ReorderBufferTupleBufData(change->data.tp.oldtuple); + + /* restore tuple data itself */ + memcpy(change->data.tp.oldtuple->tuple.t_data, data, tuplelen); + data += tuplelen; + } + + if (change->data.tp.newtuple) + { + /* here, data might not be suitably aligned! */ + uint32 tuplelen; + + memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len), + sizeof(uint32)); + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader); + + /* restore ->tuple */ + memcpy(&change->data.tp.newtuple->tuple, data, + sizeof(HeapTupleData)); + data += sizeof(HeapTupleData); + + /* reset t_data pointer into the new tuplebuf */ + change->data.tp.newtuple->tuple.t_data = + ReorderBufferTupleBufData(change->data.tp.newtuple); + + /* restore tuple data itself */ + memcpy(change->data.tp.newtuple->tuple.t_data, data, tuplelen); + data += tuplelen; + } + + break; + case REORDER_BUFFER_CHANGE_MESSAGE: + { + Size prefix_size; + + /* read prefix */ + memcpy(&prefix_size, data, sizeof(Size)); + data += sizeof(Size); + change->data.msg.prefix = MemoryContextAlloc(rb->context, + prefix_size); + memcpy(change->data.msg.prefix, data, prefix_size); + Assert(change->data.msg.prefix[prefix_size - 1] == '\0'); + data += prefix_size; + + /* read the message */ + memcpy(&change->data.msg.message_size, data, sizeof(Size)); + data += sizeof(Size); + change->data.msg.message = MemoryContextAlloc(rb->context, + change->data.msg.message_size); + memcpy(change->data.msg.message, data, + change->data.msg.message_size); + data += change->data.msg.message_size; + + break; + } + case REORDER_BUFFER_CHANGE_INVALIDATION: + { + Size inval_size = sizeof(SharedInvalidationMessage) * + change->data.inval.ninvalidations; + + change->data.inval.invalidations = + MemoryContextAlloc(rb->context, inval_size); + + /* read the message */ + memcpy(change->data.inval.invalidations, data, inval_size); + + break; + } + case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: + { + Snapshot oldsnap; + Snapshot newsnap; + Size size; + + oldsnap = (Snapshot) data; + + size = sizeof(SnapshotData) + + sizeof(TransactionId) * oldsnap->xcnt + + sizeof(TransactionId) * (oldsnap->subxcnt + 0); + + change->data.snapshot = MemoryContextAllocZero(rb->context, size); + + newsnap = change->data.snapshot; + + memcpy(newsnap, data, size); + newsnap->xip = (TransactionId *) + (((char *) newsnap) + sizeof(SnapshotData)); + newsnap->subxip = newsnap->xip + newsnap->xcnt; + newsnap->copied = true; + break; + } + /* the base struct contains all the data, easy peasy */ + case REORDER_BUFFER_CHANGE_TRUNCATE: + { + Oid *relids; + + relids = ReorderBufferGetRelids(rb, + change->data.truncate.nrelids); + memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid)); + change->data.truncate.relids = relids; + + break; + } + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM: + case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT: + case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID: + case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID: + break; + } + + dlist_push_tail(&txn->changes, &change->node); + txn->nentries_mem++; + + /* + * Update memory accounting for the restored change. We need to do this + * although we don't check the memory limit when restoring the changes in + * this branch (we only do that when initially queueing the changes after + * decoding), because we will release the changes later, and that will + * update the accounting too (subtracting the size from the counters). And + * we don't want to underflow there. + */ + ReorderBufferChangeMemoryUpdate(rb, change, true, + ReorderBufferChangeSize(change)); +} + +/* + * Remove all on-disk stored for the passed in transaction. + */ +static void +ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + XLogSegNo first; + XLogSegNo cur; + XLogSegNo last; + + Assert(txn->first_lsn != InvalidXLogRecPtr); + Assert(txn->final_lsn != InvalidXLogRecPtr); + + XLByteToSeg(txn->first_lsn, first, wal_segment_size); + XLByteToSeg(txn->final_lsn, last, wal_segment_size); + + /* iterate over all possible filenames, and delete them */ + for (cur = first; cur <= last; cur++) + { + char path[MAXPGPATH]; + + ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, cur); + if (unlink(path) != 0 && errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", path))); + } +} + +/* + * Remove any leftover serialized reorder buffers from a slot directory after a + * prior crash or decoding session exit. + */ +static void +ReorderBufferCleanupSerializedTXNs(const char *slotname) +{ + DIR *spill_dir; + struct dirent *spill_de; + struct stat statbuf; + char path[MAXPGPATH * 2 + 12]; + + sprintf(path, "pg_replslot/%s", slotname); + + /* we're only handling directories here, skip if it's not ours */ + if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode)) + return; + + spill_dir = AllocateDir(path); + while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL) + { + /* only look at names that can be ours */ + if (strncmp(spill_de->d_name, "xid", 3) == 0) + { + snprintf(path, sizeof(path), + "pg_replslot/%s/%s", slotname, + spill_de->d_name); + + if (unlink(path) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\" during removal of pg_replslot/%s/xid*: %m", + path, slotname))); + } + } + FreeDir(spill_dir); +} + +/* + * Given a replication slot, transaction ID and segment number, fill in the + * corresponding spill file into 'path', which is a caller-owned buffer of size + * at least MAXPGPATH. + */ +static void +ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid, + XLogSegNo segno) +{ + XLogRecPtr recptr; + + XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr); + + snprintf(path, MAXPGPATH, "pg_replslot/%s/xid-%u-lsn-%X-%X.spill", + NameStr(MyReplicationSlot->data.name), + xid, LSN_FORMAT_ARGS(recptr)); +} + +/* + * Delete all data spilled to disk after we've restarted/crashed. It will be + * recreated when the respective slots are reused. + */ +void +StartupReorderBuffer(void) +{ + DIR *logical_dir; + struct dirent *logical_de; + + logical_dir = AllocateDir("pg_replslot"); + while ((logical_de = ReadDir(logical_dir, "pg_replslot")) != NULL) + { + if (strcmp(logical_de->d_name, ".") == 0 || + strcmp(logical_de->d_name, "..") == 0) + continue; + + /* if it cannot be a slot, skip the directory */ + if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2)) + continue; + + /* + * ok, has to be a surviving logical slot, iterate and delete + * everything starting with xid-* + */ + ReorderBufferCleanupSerializedTXNs(logical_de->d_name); + } + FreeDir(logical_dir); +} + +/* --------------------------------------- + * toast reassembly support + * --------------------------------------- + */ + +/* + * Initialize per tuple toast reconstruction support. + */ +static void +ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + HASHCTL hash_ctl; + + Assert(txn->toast_hash == NULL); + + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(ReorderBufferToastEnt); + hash_ctl.hcxt = rb->context; + txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); +} + +/* + * Per toast-chunk handling for toast reconstruction + * + * Appends a toast chunk so we can reconstruct it when the tuple "owning" the + * toasted Datum comes along. + */ +static void +ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + ReorderBufferToastEnt *ent; + ReorderBufferTupleBuf *newtup; + bool found; + int32 chunksize; + bool isnull; + Pointer chunk; + TupleDesc desc = RelationGetDescr(relation); + Oid chunk_id; + int32 chunk_seq; + + if (txn->toast_hash == NULL) + ReorderBufferToastInitHash(rb, txn); + + Assert(IsToastRelation(relation)); + + newtup = change->data.tp.newtuple; + chunk_id = DatumGetObjectId(fastgetattr(&newtup->tuple, 1, desc, &isnull)); + Assert(!isnull); + chunk_seq = DatumGetInt32(fastgetattr(&newtup->tuple, 2, desc, &isnull)); + Assert(!isnull); + + ent = (ReorderBufferToastEnt *) + hash_search(txn->toast_hash, &chunk_id, HASH_ENTER, &found); + + if (!found) + { + Assert(ent->chunk_id == chunk_id); + ent->num_chunks = 0; + ent->last_chunk_seq = 0; + ent->size = 0; + ent->reconstructed = NULL; + dlist_init(&ent->chunks); + + if (chunk_seq != 0) + elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0", + chunk_seq, chunk_id); + } + else if (found && chunk_seq != ent->last_chunk_seq + 1) + elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d", + chunk_seq, chunk_id, ent->last_chunk_seq + 1); + + chunk = DatumGetPointer(fastgetattr(&newtup->tuple, 3, desc, &isnull)); + Assert(!isnull); + + /* calculate size so we can allocate the right size at once later */ + if (!VARATT_IS_EXTENDED(chunk)) + chunksize = VARSIZE(chunk) - VARHDRSZ; + else if (VARATT_IS_SHORT(chunk)) + /* could happen due to heap_form_tuple doing its thing */ + chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT; + else + elog(ERROR, "unexpected type of toast chunk"); + + ent->size += chunksize; + ent->last_chunk_seq = chunk_seq; + ent->num_chunks++; + dlist_push_tail(&ent->chunks, &change->node); +} + +/* + * Rejigger change->newtuple to point to in-memory toast tuples instead to + * on-disk toast tuples that may not longer exist (think DROP TABLE or VACUUM). + * + * We cannot replace unchanged toast tuples though, so those will still point + * to on-disk toast data. + * + * While updating the existing change with detoasted tuple data, we need to + * update the memory accounting info, because the change size will differ. + * Otherwise the accounting may get out of sync, triggering serialization + * at unexpected times. + * + * We simply subtract size of the change before rejiggering the tuple, and + * then adding the new size. This makes it look like the change was removed + * and then added back, except it only tweaks the accounting info. + * + * In particular it can't trigger serialization, which would be pointless + * anyway as it happens during commit processing right before handing + * the change to the output plugin. + */ +static void +ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + TupleDesc desc; + int natt; + Datum *attrs; + bool *isnull; + bool *free; + HeapTuple tmphtup; + Relation toast_rel; + TupleDesc toast_desc; + MemoryContext oldcontext; + ReorderBufferTupleBuf *newtup; + Size old_size; + + /* no toast tuples changed */ + if (txn->toast_hash == NULL) + return; + + /* + * We're going to modify the size of the change. So, to make sure the + * accounting is correct we record the current change size and then after + * re-computing the change we'll subtract the recorded size and then + * re-add the new change size at the end. We don't immediately subtract + * the old size because if there is any error before we add the new size, + * we will release the changes and that will update the accounting info + * (subtracting the size from the counters). And we don't want to + * underflow there. + */ + old_size = ReorderBufferChangeSize(change); + + oldcontext = MemoryContextSwitchTo(rb->context); + + /* we should only have toast tuples in an INSERT or UPDATE */ + Assert(change->data.tp.newtuple); + + desc = RelationGetDescr(relation); + + toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid); + if (!RelationIsValid(toast_rel)) + elog(ERROR, "could not open toast relation with OID %u (base relation \"%s\")", + relation->rd_rel->reltoastrelid, RelationGetRelationName(relation)); + + toast_desc = RelationGetDescr(toast_rel); + + /* should we allocate from stack instead? */ + attrs = palloc0(sizeof(Datum) * desc->natts); + isnull = palloc0(sizeof(bool) * desc->natts); + free = palloc0(sizeof(bool) * desc->natts); + + newtup = change->data.tp.newtuple; + + heap_deform_tuple(&newtup->tuple, desc, attrs, isnull); + + for (natt = 0; natt < desc->natts; natt++) + { + Form_pg_attribute attr = TupleDescAttr(desc, natt); + ReorderBufferToastEnt *ent; + struct varlena *varlena; + + /* va_rawsize is the size of the original datum -- including header */ + struct varatt_external toast_pointer; + struct varatt_indirect redirect_pointer; + struct varlena *new_datum = NULL; + struct varlena *reconstructed; + dlist_iter it; + Size data_done = 0; + + /* system columns aren't toasted */ + if (attr->attnum < 0) + continue; + + if (attr->attisdropped) + continue; + + /* not a varlena datatype */ + if (attr->attlen != -1) + continue; + + /* no data */ + if (isnull[natt]) + continue; + + /* ok, we know we have a toast datum */ + varlena = (struct varlena *) DatumGetPointer(attrs[natt]); + + /* no need to do anything if the tuple isn't external */ + if (!VARATT_IS_EXTERNAL(varlena)) + continue; + + VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena); + + /* + * Check whether the toast tuple changed, replace if so. + */ + ent = (ReorderBufferToastEnt *) + hash_search(txn->toast_hash, + &toast_pointer.va_valueid, + HASH_FIND, + NULL); + if (ent == NULL) + continue; + + new_datum = + (struct varlena *) palloc0(INDIRECT_POINTER_SIZE); + + free[natt] = true; + + reconstructed = palloc0(toast_pointer.va_rawsize); + + ent->reconstructed = reconstructed; + + /* stitch toast tuple back together from its parts */ + dlist_foreach(it, &ent->chunks) + { + bool isnull; + ReorderBufferChange *cchange; + ReorderBufferTupleBuf *ctup; + Pointer chunk; + + cchange = dlist_container(ReorderBufferChange, node, it.cur); + ctup = cchange->data.tp.newtuple; + chunk = DatumGetPointer(fastgetattr(&ctup->tuple, 3, toast_desc, &isnull)); + + Assert(!isnull); + Assert(!VARATT_IS_EXTERNAL(chunk)); + Assert(!VARATT_IS_SHORT(chunk)); + + memcpy(VARDATA(reconstructed) + data_done, + VARDATA(chunk), + VARSIZE(chunk) - VARHDRSZ); + data_done += VARSIZE(chunk) - VARHDRSZ; + } + Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer)); + + /* make sure its marked as compressed or not */ + if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)) + SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ); + else + SET_VARSIZE(reconstructed, data_done + VARHDRSZ); + + memset(&redirect_pointer, 0, sizeof(redirect_pointer)); + redirect_pointer.pointer = reconstructed; + + SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT); + memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer, + sizeof(redirect_pointer)); + + attrs[natt] = PointerGetDatum(new_datum); + } + + /* + * Build tuple in separate memory & copy tuple back into the tuplebuf + * passed to the output plugin. We can't directly heap_fill_tuple() into + * the tuplebuf because attrs[] will point back into the current content. + */ + tmphtup = heap_form_tuple(desc, attrs, isnull); + Assert(newtup->tuple.t_len <= MaxHeapTupleSize); + Assert(ReorderBufferTupleBufData(newtup) == newtup->tuple.t_data); + + memcpy(newtup->tuple.t_data, tmphtup->t_data, tmphtup->t_len); + newtup->tuple.t_len = tmphtup->t_len; + + /* + * free resources we won't further need, more persistent stuff will be + * free'd in ReorderBufferToastReset(). + */ + RelationClose(toast_rel); + pfree(tmphtup); + for (natt = 0; natt < desc->natts; natt++) + { + if (free[natt]) + pfree(DatumGetPointer(attrs[natt])); + } + pfree(attrs); + pfree(free); + pfree(isnull); + + MemoryContextSwitchTo(oldcontext); + + /* subtract the old change size */ + ReorderBufferChangeMemoryUpdate(rb, change, false, old_size); + /* now add the change back, with the correct size */ + ReorderBufferChangeMemoryUpdate(rb, change, true, + ReorderBufferChangeSize(change)); +} + +/* + * Free all resources allocated for toast reconstruction. + */ +static void +ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + HASH_SEQ_STATUS hstat; + ReorderBufferToastEnt *ent; + + if (txn->toast_hash == NULL) + return; + + /* sequentially walk over the hash and free everything */ + hash_seq_init(&hstat, txn->toast_hash); + while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL) + { + dlist_mutable_iter it; + + if (ent->reconstructed != NULL) + pfree(ent->reconstructed); + + dlist_foreach_modify(it, &ent->chunks) + { + ReorderBufferChange *change = + dlist_container(ReorderBufferChange, node, it.cur); + + dlist_delete(&change->node); + ReorderBufferReturnChange(rb, change, true); + } + } + + hash_destroy(txn->toast_hash); + txn->toast_hash = NULL; +} + + +/* --------------------------------------- + * Visibility support for logical decoding + * + * + * Lookup actual cmin/cmax values when using decoding snapshot. We can't + * always rely on stored cmin/cmax values because of two scenarios: + * + * * A tuple got changed multiple times during a single transaction and thus + * has got a combo CID. Combo CIDs are only valid for the duration of a + * single transaction. + * * A tuple with a cmin but no cmax (and thus no combo CID) got + * deleted/updated in another transaction than the one which created it + * which we are looking at right now. As only one of cmin, cmax or combo CID + * is actually stored in the heap we don't have access to the value we + * need anymore. + * + * To resolve those problems we have a per-transaction hash of (cmin, + * cmax) tuples keyed by (relfilelocator, ctid) which contains the actual + * (cmin, cmax) values. That also takes care of combo CIDs by simply + * not caring about them at all. As we have the real cmin/cmax values + * combo CIDs aren't interesting. + * + * As we only care about catalog tuples here the overhead of this + * hashtable should be acceptable. + * + * Heap rewrites complicate this a bit, check rewriteheap.c for + * details. + * ------------------------------------------------------------------------- + */ + +/* struct for sorting mapping files by LSN efficiently */ +typedef struct RewriteMappingFile +{ + XLogRecPtr lsn; + char fname[MAXPGPATH]; +} RewriteMappingFile; + +#ifdef NOT_USED +static void +DisplayMapping(HTAB *tuplecid_data) +{ + HASH_SEQ_STATUS hstat; + ReorderBufferTupleCidEnt *ent; + + hash_seq_init(&hstat, tuplecid_data); + while ((ent = (ReorderBufferTupleCidEnt *) hash_seq_search(&hstat)) != NULL) + { + elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u", + ent->key.rlocator.dbOid, + ent->key.rlocator.spcOid, + ent->key.rlocator.relNumber, + ItemPointerGetBlockNumber(&ent->key.tid), + ItemPointerGetOffsetNumber(&ent->key.tid), + ent->cmin, + ent->cmax + ); + } +} +#endif + +/* + * Apply a single mapping file to tuplecid_data. + * + * The mapping file has to have been verified to be a) committed b) for our + * transaction c) applied in LSN order. + */ +static void +ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname) +{ + char path[MAXPGPATH]; + int fd; + int readBytes; + LogicalRewriteMappingData map; + + sprintf(path, "pg_logical/mappings/%s", fname); + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + + while (true) + { + ReorderBufferTupleCidKey key; + ReorderBufferTupleCidEnt *ent; + ReorderBufferTupleCidEnt *new_ent; + bool found; + + /* be careful about padding */ + memset(&key, 0, sizeof(ReorderBufferTupleCidKey)); + + /* read all mappings till the end of the file */ + pgstat_report_wait_start(WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ); + readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData)); + pgstat_report_wait_end(); + + if (readBytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + path))); + else if (readBytes == 0) /* EOF */ + break; + else if (readBytes != sizeof(LogicalRewriteMappingData)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from file \"%s\": read %d instead of %d bytes", + path, readBytes, + (int32) sizeof(LogicalRewriteMappingData)))); + + key.rlocator = map.old_locator; + ItemPointerCopy(&map.old_tid, + &key.tid); + + + ent = (ReorderBufferTupleCidEnt *) + hash_search(tuplecid_data, &key, HASH_FIND, NULL); + + /* no existing mapping, no need to update */ + if (!ent) + continue; + + key.rlocator = map.new_locator; + ItemPointerCopy(&map.new_tid, + &key.tid); + + new_ent = (ReorderBufferTupleCidEnt *) + hash_search(tuplecid_data, &key, HASH_ENTER, &found); + + if (found) + { + /* + * Make sure the existing mapping makes sense. We sometime update + * old records that did not yet have a cmax (e.g. pg_class' own + * entry while rewriting it) during rewrites, so allow that. + */ + Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin); + Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax); + } + else + { + /* update mapping */ + new_ent->cmin = ent->cmin; + new_ent->cmax = ent->cmax; + new_ent->combocid = ent->combocid; + } + } + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); +} + + +/* + * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'. + */ +static bool +TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num) +{ + return bsearch(&xid, xip, num, + sizeof(TransactionId), xidComparator) != NULL; +} + +/* + * list_sort() comparator for sorting RewriteMappingFiles in LSN order. + */ +static int +file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p) +{ + RewriteMappingFile *a = (RewriteMappingFile *) lfirst(a_p); + RewriteMappingFile *b = (RewriteMappingFile *) lfirst(b_p); + + if (a->lsn < b->lsn) + return -1; + else if (a->lsn > b->lsn) + return 1; + return 0; +} + +/* + * Apply any existing logical remapping files if there are any targeted at our + * transaction for relid. + */ +static void +UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot) +{ + DIR *mapping_dir; + struct dirent *mapping_de; + List *files = NIL; + ListCell *file; + Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId; + + mapping_dir = AllocateDir("pg_logical/mappings"); + while ((mapping_de = ReadDir(mapping_dir, "pg_logical/mappings")) != NULL) + { + Oid f_dboid; + Oid f_relid; + TransactionId f_mapped_xid; + TransactionId f_create_xid; + XLogRecPtr f_lsn; + uint32 f_hi, + f_lo; + RewriteMappingFile *f; + + if (strcmp(mapping_de->d_name, ".") == 0 || + strcmp(mapping_de->d_name, "..") == 0) + continue; + + /* Ignore files that aren't ours */ + if (strncmp(mapping_de->d_name, "map-", 4) != 0) + continue; + + if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT, + &f_dboid, &f_relid, &f_hi, &f_lo, + &f_mapped_xid, &f_create_xid) != 6) + elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name); + + f_lsn = ((uint64) f_hi) << 32 | f_lo; + + /* mapping for another database */ + if (f_dboid != dboid) + continue; + + /* mapping for another relation */ + if (f_relid != relid) + continue; + + /* did the creating transaction abort? */ + if (!TransactionIdDidCommit(f_create_xid)) + continue; + + /* not for our transaction */ + if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt)) + continue; + + /* ok, relevant, queue for apply */ + f = palloc(sizeof(RewriteMappingFile)); + f->lsn = f_lsn; + strcpy(f->fname, mapping_de->d_name); + files = lappend(files, f); + } + FreeDir(mapping_dir); + + /* sort files so we apply them in LSN order */ + list_sort(files, file_sort_by_lsn); + + foreach(file, files) + { + RewriteMappingFile *f = (RewriteMappingFile *) lfirst(file); + + elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname, + snapshot->subxip[0]); + ApplyLogicalMappingFile(tuplecid_data, relid, f->fname); + pfree(f); + } +} + +/* + * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on + * combo CIDs. + */ +bool +ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data, + Snapshot snapshot, + HeapTuple htup, Buffer buffer, + CommandId *cmin, CommandId *cmax) +{ + ReorderBufferTupleCidKey key; + ReorderBufferTupleCidEnt *ent; + ForkNumber forkno; + BlockNumber blockno; + bool updated_mapping = false; + + /* + * Return unresolved if tuplecid_data is not valid. That's because when + * streaming in-progress transactions we may run into tuples with the CID + * before actually decoding them. Think e.g. about INSERT followed by + * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the + * INSERT. So in such cases, we assume the CID is from the future + * command. + */ + if (tuplecid_data == NULL) + return false; + + /* be careful about padding */ + memset(&key, 0, sizeof(key)); + + Assert(!BufferIsLocal(buffer)); + + /* + * get relfilelocator from the buffer, no convenient way to access it + * other than that. + */ + BufferGetTag(buffer, &key.rlocator, &forkno, &blockno); + + /* tuples can only be in the main fork */ + Assert(forkno == MAIN_FORKNUM); + Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self)); + + ItemPointerCopy(&htup->t_self, + &key.tid); + +restart: + ent = (ReorderBufferTupleCidEnt *) + hash_search(tuplecid_data, &key, HASH_FIND, NULL); + + /* + * failed to find a mapping, check whether the table was rewritten and + * apply mapping if so, but only do that once - there can be no new + * mappings while we are in here since we have to hold a lock on the + * relation. + */ + if (ent == NULL && !updated_mapping) + { + UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot); + /* now check but don't update for a mapping again */ + updated_mapping = true; + goto restart; + } + else if (ent == NULL) + return false; + + if (cmin) + *cmin = ent->cmin; + if (cmax) + *cmax = ent->cmax; + return true; +} diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c new file mode 100644 index 0000000..7a7aba3 --- /dev/null +++ b/src/backend/replication/logical/snapbuild.c @@ -0,0 +1,2132 @@ +/*------------------------------------------------------------------------- + * + * snapbuild.c + * + * Infrastructure for building historic catalog snapshots based on contents + * of the WAL, for the purpose of decoding heapam.c style values in the + * WAL. + * + * NOTES: + * + * We build snapshots which can *only* be used to read catalog contents and we + * do so by reading and interpreting the WAL stream. The aim is to build a + * snapshot that behaves the same as a freshly taken MVCC snapshot would have + * at the time the XLogRecord was generated. + * + * To build the snapshots we reuse the infrastructure built for Hot + * Standby. The in-memory snapshots we build look different than HS' because + * we have different needs. To successfully decode data from the WAL we only + * need to access catalog tables and (sys|rel|cat)cache, not the actual user + * tables since the data we decode is wholly contained in the WAL + * records. Also, our snapshots need to be different in comparison to normal + * MVCC ones because in contrast to those we cannot fully rely on the clog and + * pg_subtrans for information about committed transactions because they might + * commit in the future from the POV of the WAL entry we're currently + * decoding. This definition has the advantage that we only need to prevent + * removal of catalog rows, while normal table's rows can still be + * removed. This is achieved by using the replication slot mechanism. + * + * As the percentage of transactions modifying the catalog normally is fairly + * small in comparisons to ones only manipulating user data, we keep track of + * the committed catalog modifying ones inside [xmin, xmax) instead of keeping + * track of all running transactions like it's done in a normal snapshot. Note + * that we're generally only looking at transactions that have acquired an + * xid. That is we keep a list of transactions between snapshot->(xmin, xmax) + * that we consider committed, everything else is considered aborted/in + * progress. That also allows us not to care about subtransactions before they + * have committed which means this module, in contrast to HS, doesn't have to + * care about suboverflowed subtransactions and similar. + * + * One complexity of doing this is that to e.g. handle mixed DDL/DML + * transactions we need Snapshots that see intermediate versions of the + * catalog in a transaction. During normal operation this is achieved by using + * CommandIds/cmin/cmax. The problem with that however is that for space + * efficiency reasons only one value of that is stored + * (cf. combocid.c). Since combo CIDs are only available in memory we log + * additional information which allows us to get the original (cmin, cmax) + * pair during visibility checks. Check the reorderbuffer.c's comment above + * ResolveCminCmaxDuringDecoding() for details. + * + * To facilitate all this we need our own visibility routine, as the normal + * ones are optimized for different usecases. + * + * To replace the normal catalog snapshots with decoding ones use the + * SetupHistoricSnapshot() and TeardownHistoricSnapshot() functions. + * + * + * + * The snapbuild machinery is starting up in several stages, as illustrated + * by the following graph describing the SnapBuild->state transitions: + * + * +-------------------------+ + * +----| START |-------------+ + * | +-------------------------+ | + * | | | + * | | | + * | running_xacts #1 | + * | | | + * | | | + * | v | + * | +-------------------------+ v + * | | BUILDING_SNAPSHOT |------------>| + * | +-------------------------+ | + * | | | + * | | | + * | running_xacts #2, xacts from #1 finished | + * | | | + * | | | + * | v | + * | +-------------------------+ v + * | | FULL_SNAPSHOT |------------>| + * | +-------------------------+ | + * | | | + * running_xacts | saved snapshot + * with zero xacts | at running_xacts's lsn + * | | | + * | running_xacts with xacts from #2 finished | + * | | | + * | v | + * | +-------------------------+ | + * +--->|SNAPBUILD_CONSISTENT |<------------+ + * +-------------------------+ + * + * Initially the machinery is in the START stage. When an xl_running_xacts + * record is read that is sufficiently new (above the safe xmin horizon), + * there's a state transition. If there were no running xacts when the + * xl_running_xacts record was generated, we'll directly go into CONSISTENT + * state, otherwise we'll switch to the BUILDING_SNAPSHOT state. Having a full + * snapshot means that all transactions that start henceforth can be decoded + * in their entirety, but transactions that started previously can't. In + * FULL_SNAPSHOT we'll switch into CONSISTENT once all those previously + * running transactions have committed or aborted. + * + * Only transactions that commit after CONSISTENT state has been reached will + * be replayed, even though they might have started while still in + * FULL_SNAPSHOT. That ensures that we'll reach a point where no previous + * changes has been exported, but all the following ones will be. That point + * is a convenient point to initialize replication from, which is why we + * export a snapshot at that point, which *can* be used to read normal data. + * + * Copyright (c) 2012-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/snapbuild.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <sys/stat.h> +#include <unistd.h> + +#include "access/heapam_xlog.h" +#include "access/transam.h" +#include "access/xact.h" +#include "common/file_utils.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "replication/logical.h" +#include "replication/reorderbuffer.h" +#include "replication/snapbuild.h" +#include "storage/block.h" /* debugging output */ +#include "storage/fd.h" +#include "storage/lmgr.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/standby.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/snapmgr.h" +#include "utils/snapshot.h" + +/* + * This struct contains the current state of the snapshot building + * machinery. Besides a forward declaration in the header, it is not exposed + * to the public, so we can easily change its contents. + */ +struct SnapBuild +{ + /* how far are we along building our first full snapshot */ + SnapBuildState state; + + /* private memory context used to allocate memory for this module. */ + MemoryContext context; + + /* all transactions < than this have committed/aborted */ + TransactionId xmin; + + /* all transactions >= than this are uncommitted */ + TransactionId xmax; + + /* + * Don't replay commits from an LSN < this LSN. This can be set externally + * but it will also be advanced (never retreat) from within snapbuild.c. + */ + XLogRecPtr start_decoding_at; + + /* + * LSN at which two-phase decoding was enabled or LSN at which we found a + * consistent point at the time of slot creation. + * + * The prepared transactions, that were skipped because previously + * two-phase was not enabled or are not covered by initial snapshot, need + * to be sent later along with commit prepared and they must be before + * this point. + */ + XLogRecPtr two_phase_at; + + /* + * Don't start decoding WAL until the "xl_running_xacts" information + * indicates there are no running xids with an xid smaller than this. + */ + TransactionId initial_xmin_horizon; + + /* Indicates if we are building full snapshot or just catalog one. */ + bool building_full_snapshot; + + /* + * Snapshot that's valid to see the catalog state seen at this moment. + */ + Snapshot snapshot; + + /* + * LSN of the last location we are sure a snapshot has been serialized to. + */ + XLogRecPtr last_serialized_snapshot; + + /* + * The reorderbuffer we need to update with usable snapshots et al. + */ + ReorderBuffer *reorder; + + /* + * TransactionId at which the next phase of initial snapshot building will + * happen. InvalidTransactionId if not known (i.e. SNAPBUILD_START), or + * when no next phase necessary (SNAPBUILD_CONSISTENT). + */ + TransactionId next_phase_at; + + /* + * Array of transactions which could have catalog changes that committed + * between xmin and xmax. + */ + struct + { + /* number of committed transactions */ + size_t xcnt; + + /* available space for committed transactions */ + size_t xcnt_space; + + /* + * Until we reach a CONSISTENT state, we record commits of all + * transactions, not just the catalog changing ones. Record when that + * changes so we know we cannot export a snapshot safely anymore. + */ + bool includes_all_transactions; + + /* + * Array of committed transactions that have modified the catalog. + * + * As this array is frequently modified we do *not* keep it in + * xidComparator order. Instead we sort the array when building & + * distributing a snapshot. + * + * TODO: It's unclear whether that reasoning has much merit. Every + * time we add something here after becoming consistent will also + * require distributing a snapshot. Storing them sorted would + * potentially also make it easier to purge (but more complicated wrt + * wraparound?). Should be improved if sorting while building the + * snapshot shows up in profiles. + */ + TransactionId *xip; + } committed; + + /* + * Array of transactions and subtransactions that had modified catalogs + * and were running when the snapshot was serialized. + * + * We normally rely on some WAL record types such as HEAP2_NEW_CID to know + * if the transaction has changed the catalog. But it could happen that + * the logical decoding decodes only the commit record of the transaction + * after restoring the previously serialized snapshot in which case we + * will miss adding the xid to the snapshot and end up looking at the + * catalogs with the wrong snapshot. + * + * Now to avoid the above problem, we serialize the transactions that had + * modified the catalogs and are still running at the time of snapshot + * serialization. We fill this array while restoring the snapshot and then + * refer it while decoding commit to ensure if the xact has modified the + * catalog. We discard this array when all the xids in the list become old + * enough to matter. See SnapBuildPurgeOlderTxn for details. + */ + struct + { + /* number of transactions */ + size_t xcnt; + + /* This array must be sorted in xidComparator order */ + TransactionId *xip; + } catchange; +}; + +/* + * Starting a transaction -- which we need to do while exporting a snapshot -- + * removes knowledge about the previously used resowner, so we save it here. + */ +static ResourceOwner SavedResourceOwnerDuringExport = NULL; +static bool ExportInProgress = false; + +/* ->committed and ->catchange manipulation */ +static void SnapBuildPurgeOlderTxn(SnapBuild *builder); + +/* snapshot building/manipulation/distribution functions */ +static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder); + +static void SnapBuildFreeSnapshot(Snapshot snap); + +static void SnapBuildSnapIncRefcount(Snapshot snap); + +static void SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn); + +static inline bool SnapBuildXidHasCatalogChanges(SnapBuild *builder, TransactionId xid, + uint32 xinfo); + +/* xlog reading helper functions for SnapBuildProcessRunningXacts */ +static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running); +static void SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff); + +/* serialization functions */ +static void SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn); +static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn); +static void SnapBuildRestoreContents(int fd, char *dest, Size size, const char *path); + +/* + * Allocate a new snapshot builder. + * + * xmin_horizon is the xid >= which we can be sure no catalog rows have been + * removed, start_lsn is the LSN >= we want to replay commits. + */ +SnapBuild * +AllocateSnapshotBuilder(ReorderBuffer *reorder, + TransactionId xmin_horizon, + XLogRecPtr start_lsn, + bool need_full_snapshot, + XLogRecPtr two_phase_at) +{ + MemoryContext context; + MemoryContext oldcontext; + SnapBuild *builder; + + /* allocate memory in own context, to have better accountability */ + context = AllocSetContextCreate(CurrentMemoryContext, + "snapshot builder context", + ALLOCSET_DEFAULT_SIZES); + oldcontext = MemoryContextSwitchTo(context); + + builder = palloc0(sizeof(SnapBuild)); + + builder->state = SNAPBUILD_START; + builder->context = context; + builder->reorder = reorder; + /* Other struct members initialized by zeroing via palloc0 above */ + + builder->committed.xcnt = 0; + builder->committed.xcnt_space = 128; /* arbitrary number */ + builder->committed.xip = + palloc0(builder->committed.xcnt_space * sizeof(TransactionId)); + builder->committed.includes_all_transactions = true; + + builder->catchange.xcnt = 0; + builder->catchange.xip = NULL; + + builder->initial_xmin_horizon = xmin_horizon; + builder->start_decoding_at = start_lsn; + builder->building_full_snapshot = need_full_snapshot; + builder->two_phase_at = two_phase_at; + + MemoryContextSwitchTo(oldcontext); + + return builder; +} + +/* + * Free a snapshot builder. + */ +void +FreeSnapshotBuilder(SnapBuild *builder) +{ + MemoryContext context = builder->context; + + /* free snapshot explicitly, that contains some error checking */ + if (builder->snapshot != NULL) + { + SnapBuildSnapDecRefcount(builder->snapshot); + builder->snapshot = NULL; + } + + /* other resources are deallocated via memory context reset */ + MemoryContextDelete(context); +} + +/* + * Free an unreferenced snapshot that has previously been built by us. + */ +static void +SnapBuildFreeSnapshot(Snapshot snap) +{ + /* make sure we don't get passed an external snapshot */ + Assert(snap->snapshot_type == SNAPSHOT_HISTORIC_MVCC); + + /* make sure nobody modified our snapshot */ + Assert(snap->curcid == FirstCommandId); + Assert(!snap->suboverflowed); + Assert(!snap->takenDuringRecovery); + Assert(snap->regd_count == 0); + + /* slightly more likely, so it's checked even without c-asserts */ + if (snap->copied) + elog(ERROR, "cannot free a copied snapshot"); + + if (snap->active_count) + elog(ERROR, "cannot free an active snapshot"); + + pfree(snap); +} + +/* + * In which state of snapshot building are we? + */ +SnapBuildState +SnapBuildCurrentState(SnapBuild *builder) +{ + return builder->state; +} + +/* + * Return the LSN at which the two-phase decoding was first enabled. + */ +XLogRecPtr +SnapBuildGetTwoPhaseAt(SnapBuild *builder) +{ + return builder->two_phase_at; +} + +/* + * Set the LSN at which two-phase decoding is enabled. + */ +void +SnapBuildSetTwoPhaseAt(SnapBuild *builder, XLogRecPtr ptr) +{ + builder->two_phase_at = ptr; +} + +/* + * Should the contents of transaction ending at 'ptr' be decoded? + */ +bool +SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr) +{ + return ptr < builder->start_decoding_at; +} + +/* + * Increase refcount of a snapshot. + * + * This is used when handing out a snapshot to some external resource or when + * adding a Snapshot as builder->snapshot. + */ +static void +SnapBuildSnapIncRefcount(Snapshot snap) +{ + snap->active_count++; +} + +/* + * Decrease refcount of a snapshot and free if the refcount reaches zero. + * + * Externally visible, so that external resources that have been handed an + * IncRef'ed Snapshot can adjust its refcount easily. + */ +void +SnapBuildSnapDecRefcount(Snapshot snap) +{ + /* make sure we don't get passed an external snapshot */ + Assert(snap->snapshot_type == SNAPSHOT_HISTORIC_MVCC); + + /* make sure nobody modified our snapshot */ + Assert(snap->curcid == FirstCommandId); + Assert(!snap->suboverflowed); + Assert(!snap->takenDuringRecovery); + + Assert(snap->regd_count == 0); + + Assert(snap->active_count > 0); + + /* slightly more likely, so it's checked even without casserts */ + if (snap->copied) + elog(ERROR, "cannot free a copied snapshot"); + + snap->active_count--; + if (snap->active_count == 0) + SnapBuildFreeSnapshot(snap); +} + +/* + * Build a new snapshot, based on currently committed catalog-modifying + * transactions. + * + * In-progress transactions with catalog access are *not* allowed to modify + * these snapshots; they have to copy them and fill in appropriate ->curcid + * and ->subxip/subxcnt values. + */ +static Snapshot +SnapBuildBuildSnapshot(SnapBuild *builder) +{ + Snapshot snapshot; + Size ssize; + + Assert(builder->state >= SNAPBUILD_FULL_SNAPSHOT); + + ssize = sizeof(SnapshotData) + + sizeof(TransactionId) * builder->committed.xcnt + + sizeof(TransactionId) * 1 /* toplevel xid */ ; + + snapshot = MemoryContextAllocZero(builder->context, ssize); + + snapshot->snapshot_type = SNAPSHOT_HISTORIC_MVCC; + + /* + * We misuse the original meaning of SnapshotData's xip and subxip fields + * to make the more fitting for our needs. + * + * In the 'xip' array we store transactions that have to be treated as + * committed. Since we will only ever look at tuples from transactions + * that have modified the catalog it's more efficient to store those few + * that exist between xmin and xmax (frequently there are none). + * + * Snapshots that are used in transactions that have modified the catalog + * also use the 'subxip' array to store their toplevel xid and all the + * subtransaction xids so we can recognize when we need to treat rows as + * visible that are not in xip but still need to be visible. Subxip only + * gets filled when the transaction is copied into the context of a + * catalog modifying transaction since we otherwise share a snapshot + * between transactions. As long as a txn hasn't modified the catalog it + * doesn't need to treat any uncommitted rows as visible, so there is no + * need for those xids. + * + * Both arrays are qsort'ed so that we can use bsearch() on them. + */ + Assert(TransactionIdIsNormal(builder->xmin)); + Assert(TransactionIdIsNormal(builder->xmax)); + + snapshot->xmin = builder->xmin; + snapshot->xmax = builder->xmax; + + /* store all transactions to be treated as committed by this snapshot */ + snapshot->xip = + (TransactionId *) ((char *) snapshot + sizeof(SnapshotData)); + snapshot->xcnt = builder->committed.xcnt; + memcpy(snapshot->xip, + builder->committed.xip, + builder->committed.xcnt * sizeof(TransactionId)); + + /* sort so we can bsearch() */ + qsort(snapshot->xip, snapshot->xcnt, sizeof(TransactionId), xidComparator); + + /* + * Initially, subxip is empty, i.e. it's a snapshot to be used by + * transactions that don't modify the catalog. Will be filled by + * ReorderBufferCopySnap() if necessary. + */ + snapshot->subxcnt = 0; + snapshot->subxip = NULL; + + snapshot->suboverflowed = false; + snapshot->takenDuringRecovery = false; + snapshot->copied = false; + snapshot->curcid = FirstCommandId; + snapshot->active_count = 0; + snapshot->regd_count = 0; + snapshot->snapXactCompletionCount = 0; + + return snapshot; +} + +/* + * Build the initial slot snapshot and convert it to a normal snapshot that + * is understood by HeapTupleSatisfiesMVCC. + * + * The snapshot will be usable directly in current transaction or exported + * for loading in different transaction. + */ +Snapshot +SnapBuildInitialSnapshot(SnapBuild *builder) +{ + Snapshot snap; + TransactionId xid; + TransactionId safeXid; + TransactionId *newxip; + int newxcnt = 0; + + Assert(XactIsoLevel == XACT_REPEATABLE_READ); + Assert(builder->building_full_snapshot); + + /* don't allow older snapshots */ + InvalidateCatalogSnapshot(); /* about to overwrite MyProc->xmin */ + if (HaveRegisteredOrActiveSnapshot()) + elog(ERROR, "cannot build an initial slot snapshot when snapshots exist"); + Assert(!HistoricSnapshotActive()); + + if (builder->state != SNAPBUILD_CONSISTENT) + elog(ERROR, "cannot build an initial slot snapshot before reaching a consistent state"); + + if (!builder->committed.includes_all_transactions) + elog(ERROR, "cannot build an initial slot snapshot, not all transactions are monitored anymore"); + + /* so we don't overwrite the existing value */ + if (TransactionIdIsValid(MyProc->xmin)) + elog(ERROR, "cannot build an initial slot snapshot when MyProc->xmin already is valid"); + + snap = SnapBuildBuildSnapshot(builder); + + /* + * We know that snap->xmin is alive, enforced by the logical xmin + * mechanism. Due to that we can do this without locks, we're only + * changing our own value. + * + * Building an initial snapshot is expensive and an unenforced xmin + * horizon would have bad consequences, therefore always double-check that + * the horizon is enforced. + */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + safeXid = GetOldestSafeDecodingTransactionId(false); + LWLockRelease(ProcArrayLock); + + if (TransactionIdFollows(safeXid, snap->xmin)) + elog(ERROR, "cannot build an initial slot snapshot as oldest safe xid %u follows snapshot's xmin %u", + safeXid, snap->xmin); + + MyProc->xmin = snap->xmin; + + /* allocate in transaction context */ + newxip = (TransactionId *) + palloc(sizeof(TransactionId) * GetMaxSnapshotXidCount()); + + /* + * snapbuild.c builds transactions in an "inverted" manner, which means it + * stores committed transactions in ->xip, not ones in progress. Build a + * classical snapshot by marking all non-committed transactions as + * in-progress. This can be expensive. + */ + for (xid = snap->xmin; NormalTransactionIdPrecedes(xid, snap->xmax);) + { + void *test; + + /* + * Check whether transaction committed using the decoding snapshot + * meaning of ->xip. + */ + test = bsearch(&xid, snap->xip, snap->xcnt, + sizeof(TransactionId), xidComparator); + + if (test == NULL) + { + if (newxcnt >= GetMaxSnapshotXidCount()) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("initial slot snapshot too large"))); + + newxip[newxcnt++] = xid; + } + + TransactionIdAdvance(xid); + } + + /* adjust remaining snapshot fields as needed */ + snap->snapshot_type = SNAPSHOT_MVCC; + snap->xcnt = newxcnt; + snap->xip = newxip; + + return snap; +} + +/* + * Export a snapshot so it can be set in another session with SET TRANSACTION + * SNAPSHOT. + * + * For that we need to start a transaction in the current backend as the + * importing side checks whether the source transaction is still open to make + * sure the xmin horizon hasn't advanced since then. + */ +const char * +SnapBuildExportSnapshot(SnapBuild *builder) +{ + Snapshot snap; + char *snapname; + + if (IsTransactionOrTransactionBlock()) + elog(ERROR, "cannot export a snapshot from within a transaction"); + + if (SavedResourceOwnerDuringExport) + elog(ERROR, "can only export one snapshot at a time"); + + SavedResourceOwnerDuringExport = CurrentResourceOwner; + ExportInProgress = true; + + StartTransactionCommand(); + + /* There doesn't seem to a nice API to set these */ + XactIsoLevel = XACT_REPEATABLE_READ; + XactReadOnly = true; + + snap = SnapBuildInitialSnapshot(builder); + + /* + * now that we've built a plain snapshot, make it active and use the + * normal mechanisms for exporting it + */ + snapname = ExportSnapshot(snap); + + ereport(LOG, + (errmsg_plural("exported logical decoding snapshot: \"%s\" with %u transaction ID", + "exported logical decoding snapshot: \"%s\" with %u transaction IDs", + snap->xcnt, + snapname, snap->xcnt))); + return snapname; +} + +/* + * Ensure there is a snapshot and if not build one for current transaction. + */ +Snapshot +SnapBuildGetOrBuildSnapshot(SnapBuild *builder) +{ + Assert(builder->state == SNAPBUILD_CONSISTENT); + + /* only build a new snapshot if we don't have a prebuilt one */ + if (builder->snapshot == NULL) + { + builder->snapshot = SnapBuildBuildSnapshot(builder); + /* increase refcount for the snapshot builder */ + SnapBuildSnapIncRefcount(builder->snapshot); + } + + return builder->snapshot; +} + +/* + * Reset a previously SnapBuildExportSnapshot()'ed snapshot if there is + * any. Aborts the previously started transaction and resets the resource + * owner back to its original value. + */ +void +SnapBuildClearExportedSnapshot(void) +{ + ResourceOwner tmpResOwner; + + /* nothing exported, that is the usual case */ + if (!ExportInProgress) + return; + + if (!IsTransactionState()) + elog(ERROR, "clearing exported snapshot in wrong transaction state"); + + /* + * AbortCurrentTransaction() takes care of resetting the snapshot state, + * so remember SavedResourceOwnerDuringExport. + */ + tmpResOwner = SavedResourceOwnerDuringExport; + + /* make sure nothing could have ever happened */ + AbortCurrentTransaction(); + + CurrentResourceOwner = tmpResOwner; +} + +/* + * Clear snapshot export state during transaction abort. + */ +void +SnapBuildResetExportedSnapshotState(void) +{ + SavedResourceOwnerDuringExport = NULL; + ExportInProgress = false; +} + +/* + * Handle the effects of a single heap change, appropriate to the current state + * of the snapshot builder and returns whether changes made at (xid, lsn) can + * be decoded. + */ +bool +SnapBuildProcessChange(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn) +{ + /* + * We can't handle data in transactions if we haven't built a snapshot + * yet, so don't store them. + */ + if (builder->state < SNAPBUILD_FULL_SNAPSHOT) + return false; + + /* + * No point in keeping track of changes in transactions that we don't have + * enough information about to decode. This means that they started before + * we got into the SNAPBUILD_FULL_SNAPSHOT state. + */ + if (builder->state < SNAPBUILD_CONSISTENT && + TransactionIdPrecedes(xid, builder->next_phase_at)) + return false; + + /* + * If the reorderbuffer doesn't yet have a snapshot, add one now, it will + * be needed to decode the change we're currently processing. + */ + if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid)) + { + /* only build a new snapshot if we don't have a prebuilt one */ + if (builder->snapshot == NULL) + { + builder->snapshot = SnapBuildBuildSnapshot(builder); + /* increase refcount for the snapshot builder */ + SnapBuildSnapIncRefcount(builder->snapshot); + } + + /* + * Increase refcount for the transaction we're handing the snapshot + * out to. + */ + SnapBuildSnapIncRefcount(builder->snapshot); + ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn, + builder->snapshot); + } + + return true; +} + +/* + * Do CommandId/combo CID handling after reading an xl_heap_new_cid record. + * This implies that a transaction has done some form of write to system + * catalogs. + */ +void +SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid, + XLogRecPtr lsn, xl_heap_new_cid *xlrec) +{ + CommandId cid; + + /* + * we only log new_cid's if a catalog tuple was modified, so mark the + * transaction as containing catalog modifications + */ + ReorderBufferXidSetCatalogChanges(builder->reorder, xid, lsn); + + ReorderBufferAddNewTupleCids(builder->reorder, xlrec->top_xid, lsn, + xlrec->target_locator, xlrec->target_tid, + xlrec->cmin, xlrec->cmax, + xlrec->combocid); + + /* figure out new command id */ + if (xlrec->cmin != InvalidCommandId && + xlrec->cmax != InvalidCommandId) + cid = Max(xlrec->cmin, xlrec->cmax); + else if (xlrec->cmax != InvalidCommandId) + cid = xlrec->cmax; + else if (xlrec->cmin != InvalidCommandId) + cid = xlrec->cmin; + else + { + cid = InvalidCommandId; /* silence compiler */ + elog(ERROR, "xl_heap_new_cid record without a valid CommandId"); + } + + ReorderBufferAddNewCommandId(builder->reorder, xid, lsn, cid + 1); +} + +/* + * Add a new Snapshot to all transactions we're decoding that currently are + * in-progress so they can see new catalog contents made by the transaction + * that just committed. This is necessary because those in-progress + * transactions will use the new catalog's contents from here on (at the very + * least everything they do needs to be compatible with newer catalog + * contents). + */ +static void +SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn) +{ + dlist_iter txn_i; + ReorderBufferTXN *txn; + + /* + * Iterate through all toplevel transactions. This can include + * subtransactions which we just don't yet know to be that, but that's + * fine, they will just get an unnecessary snapshot queued. + */ + dlist_foreach(txn_i, &builder->reorder->toplevel_by_lsn) + { + txn = dlist_container(ReorderBufferTXN, node, txn_i.cur); + + Assert(TransactionIdIsValid(txn->xid)); + + /* + * If we don't have a base snapshot yet, there are no changes in this + * transaction which in turn implies we don't yet need a snapshot at + * all. We'll add a snapshot when the first change gets queued. + * + * NB: This works correctly even for subtransactions because + * ReorderBufferAssignChild() takes care to transfer the base snapshot + * to the top-level transaction, and while iterating the changequeue + * we'll get the change from the subtxn. + */ + if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, txn->xid)) + continue; + + /* + * We don't need to add snapshot to prepared transactions as they + * should not see the new catalog contents. + */ + if (rbtxn_prepared(txn) || rbtxn_skip_prepared(txn)) + continue; + + elog(DEBUG2, "adding a new snapshot to %u at %X/%X", + txn->xid, LSN_FORMAT_ARGS(lsn)); + + /* + * increase the snapshot's refcount for the transaction we are handing + * it out to + */ + SnapBuildSnapIncRefcount(builder->snapshot); + ReorderBufferAddSnapshot(builder->reorder, txn->xid, lsn, + builder->snapshot); + } +} + +/* + * Keep track of a new catalog changing transaction that has committed. + */ +static void +SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid) +{ + Assert(TransactionIdIsValid(xid)); + + if (builder->committed.xcnt == builder->committed.xcnt_space) + { + builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1; + + elog(DEBUG1, "increasing space for committed transactions to %u", + (uint32) builder->committed.xcnt_space); + + builder->committed.xip = repalloc(builder->committed.xip, + builder->committed.xcnt_space * sizeof(TransactionId)); + } + + /* + * TODO: It might make sense to keep the array sorted here instead of + * doing it every time we build a new snapshot. On the other hand this + * gets called repeatedly when a transaction with subtransactions commits. + */ + builder->committed.xip[builder->committed.xcnt++] = xid; +} + +/* + * Remove knowledge about transactions we treat as committed or containing catalog + * changes that are smaller than ->xmin. Those won't ever get checked via + * the ->committed or ->catchange array, respectively. The committed xids will + * get checked via the clog machinery. + * + * We can ideally remove the transaction from catchange array once it is + * finished (committed/aborted) but that could be costly as we need to maintain + * the xids order in the array. + */ +static void +SnapBuildPurgeOlderTxn(SnapBuild *builder) +{ + int off; + TransactionId *workspace; + int surviving_xids = 0; + + /* not ready yet */ + if (!TransactionIdIsNormal(builder->xmin)) + return; + + /* TODO: Neater algorithm than just copying and iterating? */ + workspace = + MemoryContextAlloc(builder->context, + builder->committed.xcnt * sizeof(TransactionId)); + + /* copy xids that still are interesting to workspace */ + for (off = 0; off < builder->committed.xcnt; off++) + { + if (NormalTransactionIdPrecedes(builder->committed.xip[off], + builder->xmin)) + ; /* remove */ + else + workspace[surviving_xids++] = builder->committed.xip[off]; + } + + /* copy workspace back to persistent state */ + memcpy(builder->committed.xip, workspace, + surviving_xids * sizeof(TransactionId)); + + elog(DEBUG3, "purged committed transactions from %u to %u, xmin: %u, xmax: %u", + (uint32) builder->committed.xcnt, (uint32) surviving_xids, + builder->xmin, builder->xmax); + builder->committed.xcnt = surviving_xids; + + pfree(workspace); + + /* + * Purge xids in ->catchange as well. The purged array must also be sorted + * in xidComparator order. + */ + if (builder->catchange.xcnt > 0) + { + /* + * Since catchange.xip is sorted, we find the lower bound of xids that + * are still interesting. + */ + for (off = 0; off < builder->catchange.xcnt; off++) + { + if (TransactionIdFollowsOrEquals(builder->catchange.xip[off], + builder->xmin)) + break; + } + + surviving_xids = builder->catchange.xcnt - off; + + if (surviving_xids > 0) + { + memmove(builder->catchange.xip, &(builder->catchange.xip[off]), + surviving_xids * sizeof(TransactionId)); + } + else + { + pfree(builder->catchange.xip); + builder->catchange.xip = NULL; + } + + elog(DEBUG3, "purged catalog modifying transactions from %u to %u, xmin: %u, xmax: %u", + (uint32) builder->catchange.xcnt, (uint32) surviving_xids, + builder->xmin, builder->xmax); + builder->catchange.xcnt = surviving_xids; + } +} + +/* + * Handle everything that needs to be done when a transaction commits + */ +void +SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid, + int nsubxacts, TransactionId *subxacts, uint32 xinfo) +{ + int nxact; + + bool needs_snapshot = false; + bool needs_timetravel = false; + bool sub_needs_timetravel = false; + + TransactionId xmax = xid; + + /* + * Transactions preceding BUILDING_SNAPSHOT will neither be decoded, nor + * will they be part of a snapshot. So we don't need to record anything. + */ + if (builder->state == SNAPBUILD_START || + (builder->state == SNAPBUILD_BUILDING_SNAPSHOT && + TransactionIdPrecedes(xid, builder->next_phase_at))) + { + /* ensure that only commits after this are getting replayed */ + if (builder->start_decoding_at <= lsn) + builder->start_decoding_at = lsn + 1; + return; + } + + if (builder->state < SNAPBUILD_CONSISTENT) + { + /* ensure that only commits after this are getting replayed */ + if (builder->start_decoding_at <= lsn) + builder->start_decoding_at = lsn + 1; + + /* + * If building an exportable snapshot, force xid to be tracked, even + * if the transaction didn't modify the catalog. + */ + if (builder->building_full_snapshot) + { + needs_timetravel = true; + } + } + + for (nxact = 0; nxact < nsubxacts; nxact++) + { + TransactionId subxid = subxacts[nxact]; + + /* + * Add subtransaction to base snapshot if catalog modifying, we don't + * distinguish to toplevel transactions there. + */ + if (SnapBuildXidHasCatalogChanges(builder, subxid, xinfo)) + { + sub_needs_timetravel = true; + needs_snapshot = true; + + elog(DEBUG1, "found subtransaction %u:%u with catalog changes", + xid, subxid); + + SnapBuildAddCommittedTxn(builder, subxid); + + if (NormalTransactionIdFollows(subxid, xmax)) + xmax = subxid; + } + + /* + * If we're forcing timetravel we also need visibility information + * about subtransaction, so keep track of subtransaction's state, even + * if not catalog modifying. Don't need to distribute a snapshot in + * that case. + */ + else if (needs_timetravel) + { + SnapBuildAddCommittedTxn(builder, subxid); + if (NormalTransactionIdFollows(subxid, xmax)) + xmax = subxid; + } + } + + /* if top-level modified catalog, it'll need a snapshot */ + if (SnapBuildXidHasCatalogChanges(builder, xid, xinfo)) + { + elog(DEBUG2, "found top level transaction %u, with catalog changes", + xid); + needs_snapshot = true; + needs_timetravel = true; + SnapBuildAddCommittedTxn(builder, xid); + } + else if (sub_needs_timetravel) + { + /* track toplevel txn as well, subxact alone isn't meaningful */ + elog(DEBUG2, "forced transaction %u to do timetravel due to one of its subtransactions", + xid); + needs_timetravel = true; + SnapBuildAddCommittedTxn(builder, xid); + } + else if (needs_timetravel) + { + elog(DEBUG2, "forced transaction %u to do timetravel", xid); + + SnapBuildAddCommittedTxn(builder, xid); + } + + if (!needs_timetravel) + { + /* record that we cannot export a general snapshot anymore */ + builder->committed.includes_all_transactions = false; + } + + Assert(!needs_snapshot || needs_timetravel); + + /* + * Adjust xmax of the snapshot builder, we only do that for committed, + * catalog modifying, transactions, everything else isn't interesting for + * us since we'll never look at the respective rows. + */ + if (needs_timetravel && + (!TransactionIdIsValid(builder->xmax) || + TransactionIdFollowsOrEquals(xmax, builder->xmax))) + { + builder->xmax = xmax; + TransactionIdAdvance(builder->xmax); + } + + /* if there's any reason to build a historic snapshot, do so now */ + if (needs_snapshot) + { + /* + * If we haven't built a complete snapshot yet there's no need to hand + * it out, it wouldn't (and couldn't) be used anyway. + */ + if (builder->state < SNAPBUILD_FULL_SNAPSHOT) + return; + + /* + * Decrease the snapshot builder's refcount of the old snapshot, note + * that it still will be used if it has been handed out to the + * reorderbuffer earlier. + */ + if (builder->snapshot) + SnapBuildSnapDecRefcount(builder->snapshot); + + builder->snapshot = SnapBuildBuildSnapshot(builder); + + /* we might need to execute invalidations, add snapshot */ + if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid)) + { + SnapBuildSnapIncRefcount(builder->snapshot); + ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn, + builder->snapshot); + } + + /* refcount of the snapshot builder for the new snapshot */ + SnapBuildSnapIncRefcount(builder->snapshot); + + /* add a new catalog snapshot to all currently running transactions */ + SnapBuildDistributeNewCatalogSnapshot(builder, lsn); + } +} + +/* + * Check the reorder buffer and the snapshot to see if the given transaction has + * modified catalogs. + */ +static inline bool +SnapBuildXidHasCatalogChanges(SnapBuild *builder, TransactionId xid, + uint32 xinfo) +{ + if (ReorderBufferXidHasCatalogChanges(builder->reorder, xid)) + return true; + + /* + * The transactions that have changed catalogs must have invalidation + * info. + */ + if (!(xinfo & XACT_XINFO_HAS_INVALS)) + return false; + + /* Check the catchange XID array */ + return ((builder->catchange.xcnt > 0) && + (bsearch(&xid, builder->catchange.xip, builder->catchange.xcnt, + sizeof(TransactionId), xidComparator) != NULL)); +} + +/* ----------------------------------- + * Snapshot building functions dealing with xlog records + * ----------------------------------- + */ + +/* + * Process a running xacts record, and use its information to first build a + * historic snapshot and later to release resources that aren't needed + * anymore. + */ +void +SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running) +{ + ReorderBufferTXN *txn; + TransactionId xmin; + + /* + * If we're not consistent yet, inspect the record to see whether it + * allows to get closer to being consistent. If we are consistent, dump + * our snapshot so others or we, after a restart, can use it. + */ + if (builder->state < SNAPBUILD_CONSISTENT) + { + /* returns false if there's no point in performing cleanup just yet */ + if (!SnapBuildFindSnapshot(builder, lsn, running)) + return; + } + else + SnapBuildSerialize(builder, lsn); + + /* + * Update range of interesting xids based on the running xacts + * information. We don't increase ->xmax using it, because once we are in + * a consistent state we can do that ourselves and much more efficiently + * so, because we only need to do it for catalog transactions since we + * only ever look at those. + * + * NB: We only increase xmax when a catalog modifying transaction commits + * (see SnapBuildCommitTxn). Because of this, xmax can be lower than + * xmin, which looks odd but is correct and actually more efficient, since + * we hit fast paths in heapam_visibility.c. + */ + builder->xmin = running->oldestRunningXid; + + /* Remove transactions we don't need to keep track off anymore */ + SnapBuildPurgeOlderTxn(builder); + + /* + * Advance the xmin limit for the current replication slot, to allow + * vacuum to clean up the tuples this slot has been protecting. + * + * The reorderbuffer might have an xmin among the currently running + * snapshots; use it if so. If not, we need only consider the snapshots + * we'll produce later, which can't be less than the oldest running xid in + * the record we're reading now. + */ + xmin = ReorderBufferGetOldestXmin(builder->reorder); + if (xmin == InvalidTransactionId) + xmin = running->oldestRunningXid; + elog(DEBUG3, "xmin: %u, xmax: %u, oldest running: %u, oldest xmin: %u", + builder->xmin, builder->xmax, running->oldestRunningXid, xmin); + LogicalIncreaseXminForSlot(lsn, xmin); + + /* + * Also tell the slot where we can restart decoding from. We don't want to + * do that after every commit because changing that implies an fsync of + * the logical slot's state file, so we only do it every time we see a + * running xacts record. + * + * Do so by looking for the oldest in progress transaction (determined by + * the first LSN of any of its relevant records). Every transaction + * remembers the last location we stored the snapshot to disk before its + * beginning. That point is where we can restart from. + */ + + /* + * Can't know about a serialized snapshot's location if we're not + * consistent. + */ + if (builder->state < SNAPBUILD_CONSISTENT) + return; + + txn = ReorderBufferGetOldestTXN(builder->reorder); + + /* + * oldest ongoing txn might have started when we didn't yet serialize + * anything because we hadn't reached a consistent state yet. + */ + if (txn != NULL && txn->restart_decoding_lsn != InvalidXLogRecPtr) + LogicalIncreaseRestartDecodingForSlot(lsn, txn->restart_decoding_lsn); + + /* + * No in-progress transaction, can reuse the last serialized snapshot if + * we have one. + */ + else if (txn == NULL && + builder->reorder->current_restart_decoding_lsn != InvalidXLogRecPtr && + builder->last_serialized_snapshot != InvalidXLogRecPtr) + LogicalIncreaseRestartDecodingForSlot(lsn, + builder->last_serialized_snapshot); +} + + +/* + * Build the start of a snapshot that's capable of decoding the catalog. + * + * Helper function for SnapBuildProcessRunningXacts() while we're not yet + * consistent. + * + * Returns true if there is a point in performing internal maintenance/cleanup + * using the xl_running_xacts record. + */ +static bool +SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running) +{ + /* --- + * Build catalog decoding snapshot incrementally using information about + * the currently running transactions. There are several ways to do that: + * + * a) There were no running transactions when the xl_running_xacts record + * was inserted, jump to CONSISTENT immediately. We might find such a + * state while waiting on c)'s sub-states. + * + * b) This (in a previous run) or another decoding slot serialized a + * snapshot to disk that we can use. Can't use this method for the + * initial snapshot when slot is being created and needs full snapshot + * for export or direct use, as that snapshot will only contain catalog + * modifying transactions. + * + * c) First incrementally build a snapshot for catalog tuples + * (BUILDING_SNAPSHOT), that requires all, already in-progress, + * transactions to finish. Every transaction starting after that + * (FULL_SNAPSHOT state), has enough information to be decoded. But + * for older running transactions no viable snapshot exists yet, so + * CONSISTENT will only be reached once all of those have finished. + * --- + */ + + /* + * xl_running_xacts record is older than what we can use, we might not + * have all necessary catalog rows anymore. + */ + if (TransactionIdIsNormal(builder->initial_xmin_horizon) && + NormalTransactionIdPrecedes(running->oldestRunningXid, + builder->initial_xmin_horizon)) + { + ereport(DEBUG1, + (errmsg_internal("skipping snapshot at %X/%X while building logical decoding snapshot, xmin horizon too low", + LSN_FORMAT_ARGS(lsn)), + errdetail_internal("initial xmin horizon of %u vs the snapshot's %u", + builder->initial_xmin_horizon, running->oldestRunningXid))); + + + SnapBuildWaitSnapshot(running, builder->initial_xmin_horizon); + + return true; + } + + /* + * a) No transaction were running, we can jump to consistent. + * + * This is not affected by races around xl_running_xacts, because we can + * miss transaction commits, but currently not transactions starting. + * + * NB: We might have already started to incrementally assemble a snapshot, + * so we need to be careful to deal with that. + */ + if (running->oldestRunningXid == running->nextXid) + { + if (builder->start_decoding_at == InvalidXLogRecPtr || + builder->start_decoding_at <= lsn) + /* can decode everything after this */ + builder->start_decoding_at = lsn + 1; + + /* As no transactions were running xmin/xmax can be trivially set. */ + builder->xmin = running->nextXid; /* < are finished */ + builder->xmax = running->nextXid; /* >= are running */ + + /* so we can safely use the faster comparisons */ + Assert(TransactionIdIsNormal(builder->xmin)); + Assert(TransactionIdIsNormal(builder->xmax)); + + builder->state = SNAPBUILD_CONSISTENT; + builder->next_phase_at = InvalidTransactionId; + + ereport(LOG, + (errmsg("logical decoding found consistent point at %X/%X", + LSN_FORMAT_ARGS(lsn)), + errdetail("There are no running transactions."))); + + return false; + } + /* b) valid on disk state and not building full snapshot */ + else if (!builder->building_full_snapshot && + SnapBuildRestore(builder, lsn)) + { + /* there won't be any state to cleanup */ + return false; + } + + /* + * c) transition from START to BUILDING_SNAPSHOT. + * + * In START state, and a xl_running_xacts record with running xacts is + * encountered. In that case, switch to BUILDING_SNAPSHOT state, and + * record xl_running_xacts->nextXid. Once all running xacts have finished + * (i.e. they're all >= nextXid), we have a complete catalog snapshot. It + * might look that we could use xl_running_xacts's ->xids information to + * get there quicker, but that is problematic because transactions marked + * as running, might already have inserted their commit record - it's + * infeasible to change that with locking. + */ + else if (builder->state == SNAPBUILD_START) + { + builder->state = SNAPBUILD_BUILDING_SNAPSHOT; + builder->next_phase_at = running->nextXid; + + /* + * Start with an xmin/xmax that's correct for future, when all the + * currently running transactions have finished. We'll update both + * while waiting for the pending transactions to finish. + */ + builder->xmin = running->nextXid; /* < are finished */ + builder->xmax = running->nextXid; /* >= are running */ + + /* so we can safely use the faster comparisons */ + Assert(TransactionIdIsNormal(builder->xmin)); + Assert(TransactionIdIsNormal(builder->xmax)); + + ereport(LOG, + (errmsg("logical decoding found initial starting point at %X/%X", + LSN_FORMAT_ARGS(lsn)), + errdetail("Waiting for transactions (approximately %d) older than %u to end.", + running->xcnt, running->nextXid))); + + SnapBuildWaitSnapshot(running, running->nextXid); + } + + /* + * c) transition from BUILDING_SNAPSHOT to FULL_SNAPSHOT. + * + * In BUILDING_SNAPSHOT state, and this xl_running_xacts' oldestRunningXid + * is >= than nextXid from when we switched to BUILDING_SNAPSHOT. This + * means all transactions starting afterwards have enough information to + * be decoded. Switch to FULL_SNAPSHOT. + */ + else if (builder->state == SNAPBUILD_BUILDING_SNAPSHOT && + TransactionIdPrecedesOrEquals(builder->next_phase_at, + running->oldestRunningXid)) + { + builder->state = SNAPBUILD_FULL_SNAPSHOT; + builder->next_phase_at = running->nextXid; + + ereport(LOG, + (errmsg("logical decoding found initial consistent point at %X/%X", + LSN_FORMAT_ARGS(lsn)), + errdetail("Waiting for transactions (approximately %d) older than %u to end.", + running->xcnt, running->nextXid))); + + SnapBuildWaitSnapshot(running, running->nextXid); + } + + /* + * c) transition from FULL_SNAPSHOT to CONSISTENT. + * + * In FULL_SNAPSHOT state, and this xl_running_xacts' oldestRunningXid is + * >= than nextXid from when we switched to FULL_SNAPSHOT. This means all + * transactions that are currently in progress have a catalog snapshot, + * and all their changes have been collected. Switch to CONSISTENT. + */ + else if (builder->state == SNAPBUILD_FULL_SNAPSHOT && + TransactionIdPrecedesOrEquals(builder->next_phase_at, + running->oldestRunningXid)) + { + builder->state = SNAPBUILD_CONSISTENT; + builder->next_phase_at = InvalidTransactionId; + + ereport(LOG, + (errmsg("logical decoding found consistent point at %X/%X", + LSN_FORMAT_ARGS(lsn)), + errdetail("There are no old transactions anymore."))); + } + + /* + * We already started to track running xacts and need to wait for all + * in-progress ones to finish. We fall through to the normal processing of + * records so incremental cleanup can be performed. + */ + return true; +} + +/* --- + * Iterate through xids in record, wait for all older than the cutoff to + * finish. Then, if possible, log a new xl_running_xacts record. + * + * This isn't required for the correctness of decoding, but to: + * a) allow isolationtester to notice that we're currently waiting for + * something. + * b) log a new xl_running_xacts record where it'd be helpful, without having + * to wait for bgwriter or checkpointer. + * --- + */ +static void +SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff) +{ + int off; + + for (off = 0; off < running->xcnt; off++) + { + TransactionId xid = running->xids[off]; + + /* + * Upper layers should prevent that we ever need to wait on ourselves. + * Check anyway, since failing to do so would either result in an + * endless wait or an Assert() failure. + */ + if (TransactionIdIsCurrentTransactionId(xid)) + elog(ERROR, "waiting for ourselves"); + + if (TransactionIdFollows(xid, cutoff)) + continue; + + XactLockTableWait(xid, NULL, NULL, XLTW_None); + } + + /* + * All transactions we needed to finish finished - try to ensure there is + * another xl_running_xacts record in a timely manner, without having to + * wait for bgwriter or checkpointer to log one. During recovery we can't + * enforce that, so we'll have to wait. + */ + if (!RecoveryInProgress()) + { + LogStandbySnapshot(); + } +} + +/* ----------------------------------- + * Snapshot serialization support + * ----------------------------------- + */ + +/* + * We store current state of struct SnapBuild on disk in the following manner: + * + * struct SnapBuildOnDisk; + * TransactionId * committed.xcnt; (*not xcnt_space*) + * TransactionId * catchange.xcnt; + * + */ +typedef struct SnapBuildOnDisk +{ + /* first part of this struct needs to be version independent */ + + /* data not covered by checksum */ + uint32 magic; + pg_crc32c checksum; + + /* data covered by checksum */ + + /* version, in case we want to support pg_upgrade */ + uint32 version; + /* how large is the on disk data, excluding the constant sized part */ + uint32 length; + + /* version dependent part */ + SnapBuild builder; + + /* variable amount of TransactionIds follows */ +} SnapBuildOnDisk; + +#define SnapBuildOnDiskConstantSize \ + offsetof(SnapBuildOnDisk, builder) +#define SnapBuildOnDiskNotChecksummedSize \ + offsetof(SnapBuildOnDisk, version) + +#define SNAPBUILD_MAGIC 0x51A1E001 +#define SNAPBUILD_VERSION 5 + +/* + * Store/Load a snapshot from disk, depending on the snapshot builder's state. + * + * Supposed to be used by external (i.e. not snapbuild.c) code that just read + * a record that's a potential location for a serialized snapshot. + */ +void +SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn) +{ + if (builder->state < SNAPBUILD_CONSISTENT) + SnapBuildRestore(builder, lsn); + else + SnapBuildSerialize(builder, lsn); +} + +/* + * Serialize the snapshot 'builder' at the location 'lsn' if it hasn't already + * been done by another decoding process. + */ +static void +SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) +{ + Size needed_length; + SnapBuildOnDisk *ondisk = NULL; + TransactionId *catchange_xip = NULL; + MemoryContext old_ctx; + size_t catchange_xcnt; + char *ondisk_c; + int fd; + char tmppath[MAXPGPATH]; + char path[MAXPGPATH]; + int ret; + struct stat stat_buf; + Size sz; + + Assert(lsn != InvalidXLogRecPtr); + Assert(builder->last_serialized_snapshot == InvalidXLogRecPtr || + builder->last_serialized_snapshot <= lsn); + + /* + * no point in serializing if we cannot continue to work immediately after + * restoring the snapshot + */ + if (builder->state < SNAPBUILD_CONSISTENT) + return; + + /* consistent snapshots have no next phase */ + Assert(builder->next_phase_at == InvalidTransactionId); + + /* + * We identify snapshots by the LSN they are valid for. We don't need to + * include timelines in the name as each LSN maps to exactly one timeline + * unless the user used pg_resetwal or similar. If a user did so, there's + * no hope continuing to decode anyway. + */ + sprintf(path, "pg_logical/snapshots/%X-%X.snap", + LSN_FORMAT_ARGS(lsn)); + + /* + * first check whether some other backend already has written the snapshot + * for this LSN. It's perfectly fine if there's none, so we accept ENOENT + * as a valid state. Everything else is an unexpected error. + */ + ret = stat(path, &stat_buf); + + if (ret != 0 && errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", path))); + + else if (ret == 0) + { + /* + * somebody else has already serialized to this point, don't overwrite + * but remember location, so we don't need to read old data again. + * + * To be sure it has been synced to disk after the rename() from the + * tempfile filename to the real filename, we just repeat the fsync. + * That ought to be cheap because in most scenarios it should already + * be safely on disk. + */ + fsync_fname(path, false); + fsync_fname("pg_logical/snapshots", true); + + builder->last_serialized_snapshot = lsn; + goto out; + } + + /* + * there is an obvious race condition here between the time we stat(2) the + * file and us writing the file. But we rename the file into place + * atomically and all files created need to contain the same data anyway, + * so this is perfectly fine, although a bit of a resource waste. Locking + * seems like pointless complication. + */ + elog(DEBUG1, "serializing snapshot to %s", path); + + /* to make sure only we will write to this tempfile, include pid */ + sprintf(tmppath, "pg_logical/snapshots/%X-%X.snap.%d.tmp", + LSN_FORMAT_ARGS(lsn), MyProcPid); + + /* + * Unlink temporary file if it already exists, needs to have been before a + * crash/error since we won't enter this function twice from within a + * single decoding slot/backend and the temporary file contains the pid of + * the current process. + */ + if (unlink(tmppath) != 0 && errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", tmppath))); + + old_ctx = MemoryContextSwitchTo(builder->context); + + /* Get the catalog modifying transactions that are yet not committed */ + catchange_xip = ReorderBufferGetCatalogChangesXacts(builder->reorder); + catchange_xcnt = dclist_count(&builder->reorder->catchange_txns); + + needed_length = sizeof(SnapBuildOnDisk) + + sizeof(TransactionId) * (builder->committed.xcnt + catchange_xcnt); + + ondisk_c = palloc0(needed_length); + ondisk = (SnapBuildOnDisk *) ondisk_c; + ondisk->magic = SNAPBUILD_MAGIC; + ondisk->version = SNAPBUILD_VERSION; + ondisk->length = needed_length; + INIT_CRC32C(ondisk->checksum); + COMP_CRC32C(ondisk->checksum, + ((char *) ondisk) + SnapBuildOnDiskNotChecksummedSize, + SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize); + ondisk_c += sizeof(SnapBuildOnDisk); + + memcpy(&ondisk->builder, builder, sizeof(SnapBuild)); + /* NULL-ify memory-only data */ + ondisk->builder.context = NULL; + ondisk->builder.snapshot = NULL; + ondisk->builder.reorder = NULL; + ondisk->builder.committed.xip = NULL; + ondisk->builder.catchange.xip = NULL; + /* update catchange only on disk data */ + ondisk->builder.catchange.xcnt = catchange_xcnt; + + COMP_CRC32C(ondisk->checksum, + &ondisk->builder, + sizeof(SnapBuild)); + + /* copy committed xacts */ + if (builder->committed.xcnt > 0) + { + sz = sizeof(TransactionId) * builder->committed.xcnt; + memcpy(ondisk_c, builder->committed.xip, sz); + COMP_CRC32C(ondisk->checksum, ondisk_c, sz); + ondisk_c += sz; + } + + /* copy catalog modifying xacts */ + if (catchange_xcnt > 0) + { + sz = sizeof(TransactionId) * catchange_xcnt; + memcpy(ondisk_c, catchange_xip, sz); + COMP_CRC32C(ondisk->checksum, ondisk_c, sz); + ondisk_c += sz; + } + + FIN_CRC32C(ondisk->checksum); + + /* we have valid data now, open tempfile and write it there */ + fd = OpenTransientFile(tmppath, + O_CREAT | O_EXCL | O_WRONLY | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", tmppath))); + + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_WRITE); + if ((write(fd, ondisk, needed_length)) != needed_length) + { + int save_errno = errno; + + CloseTransientFile(fd); + + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? save_errno : ENOSPC; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + + /* + * fsync the file before renaming so that even if we crash after this we + * have either a fully valid file or nothing. + * + * It's safe to just ERROR on fsync() here because we'll retry the whole + * operation including the writes. + * + * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has + * some noticeable overhead since it's performed synchronously during + * decoding? + */ + pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_SYNC); + if (pg_fsync(fd) != 0) + { + int save_errno = errno; + + CloseTransientFile(fd); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", tmppath))); + } + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", tmppath))); + + fsync_fname("pg_logical/snapshots", true); + + /* + * We may overwrite the work from some other backend, but that's ok, our + * snapshot is valid as well, we'll just have done some superfluous work. + */ + if (rename(tmppath, path) != 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + tmppath, path))); + } + + /* make sure we persist */ + fsync_fname(path, false); + fsync_fname("pg_logical/snapshots", true); + + /* + * Now there's no way we can lose the dumped state anymore, remember this + * as a serialization point. + */ + builder->last_serialized_snapshot = lsn; + + MemoryContextSwitchTo(old_ctx); + +out: + ReorderBufferSetRestartPoint(builder->reorder, + builder->last_serialized_snapshot); + /* be tidy */ + if (ondisk) + pfree(ondisk); + if (catchange_xip) + pfree(catchange_xip); +} + +/* + * Restore a snapshot into 'builder' if previously one has been stored at the + * location indicated by 'lsn'. Returns true if successful, false otherwise. + */ +static bool +SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn) +{ + SnapBuildOnDisk ondisk; + int fd; + char path[MAXPGPATH]; + Size sz; + pg_crc32c checksum; + + /* no point in loading a snapshot if we're already there */ + if (builder->state == SNAPBUILD_CONSISTENT) + return false; + + sprintf(path, "pg_logical/snapshots/%X-%X.snap", + LSN_FORMAT_ARGS(lsn)); + + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + + if (fd < 0 && errno == ENOENT) + return false; + else if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + + /* ---- + * Make sure the snapshot had been stored safely to disk, that's normally + * cheap. + * Note that we do not need PANIC here, nobody will be able to use the + * slot without fsyncing, and saving it won't succeed without an fsync() + * either... + * ---- + */ + fsync_fname(path, false); + fsync_fname("pg_logical/snapshots", true); + + + /* read statically sized portion of snapshot */ + SnapBuildRestoreContents(fd, (char *) &ondisk, SnapBuildOnDiskConstantSize, path); + + if (ondisk.magic != SNAPBUILD_MAGIC) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("snapbuild state file \"%s\" has wrong magic number: %u instead of %u", + path, ondisk.magic, SNAPBUILD_MAGIC))); + + if (ondisk.version != SNAPBUILD_VERSION) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("snapbuild state file \"%s\" has unsupported version: %u instead of %u", + path, ondisk.version, SNAPBUILD_VERSION))); + + INIT_CRC32C(checksum); + COMP_CRC32C(checksum, + ((char *) &ondisk) + SnapBuildOnDiskNotChecksummedSize, + SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize); + + /* read SnapBuild */ + SnapBuildRestoreContents(fd, (char *) &ondisk.builder, sizeof(SnapBuild), path); + COMP_CRC32C(checksum, &ondisk.builder, sizeof(SnapBuild)); + + /* restore committed xacts information */ + if (ondisk.builder.committed.xcnt > 0) + { + sz = sizeof(TransactionId) * ondisk.builder.committed.xcnt; + ondisk.builder.committed.xip = MemoryContextAllocZero(builder->context, sz); + SnapBuildRestoreContents(fd, (char *) ondisk.builder.committed.xip, sz, path); + COMP_CRC32C(checksum, ondisk.builder.committed.xip, sz); + } + + /* restore catalog modifying xacts information */ + if (ondisk.builder.catchange.xcnt > 0) + { + sz = sizeof(TransactionId) * ondisk.builder.catchange.xcnt; + ondisk.builder.catchange.xip = MemoryContextAllocZero(builder->context, sz); + SnapBuildRestoreContents(fd, (char *) ondisk.builder.catchange.xip, sz, path); + COMP_CRC32C(checksum, ondisk.builder.catchange.xip, sz); + } + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + + FIN_CRC32C(checksum); + + /* verify checksum of what we've read */ + if (!EQ_CRC32C(checksum, ondisk.checksum)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("checksum mismatch for snapbuild state file \"%s\": is %u, should be %u", + path, checksum, ondisk.checksum))); + + /* + * ok, we now have a sensible snapshot here, figure out if it has more + * information than we have. + */ + + /* + * We are only interested in consistent snapshots for now, comparing + * whether one incomplete snapshot is more "advanced" seems to be + * unnecessarily complex. + */ + if (ondisk.builder.state < SNAPBUILD_CONSISTENT) + goto snapshot_not_interesting; + + /* + * Don't use a snapshot that requires an xmin that we cannot guarantee to + * be available. + */ + if (TransactionIdPrecedes(ondisk.builder.xmin, builder->initial_xmin_horizon)) + goto snapshot_not_interesting; + + /* + * Consistent snapshots have no next phase. Reset next_phase_at as it is + * possible that an old value may remain. + */ + Assert(ondisk.builder.next_phase_at == InvalidTransactionId); + builder->next_phase_at = InvalidTransactionId; + + /* ok, we think the snapshot is sensible, copy over everything important */ + builder->xmin = ondisk.builder.xmin; + builder->xmax = ondisk.builder.xmax; + builder->state = ondisk.builder.state; + + builder->committed.xcnt = ondisk.builder.committed.xcnt; + /* We only allocated/stored xcnt, not xcnt_space xids ! */ + /* don't overwrite preallocated xip, if we don't have anything here */ + if (builder->committed.xcnt > 0) + { + pfree(builder->committed.xip); + builder->committed.xcnt_space = ondisk.builder.committed.xcnt; + builder->committed.xip = ondisk.builder.committed.xip; + } + ondisk.builder.committed.xip = NULL; + + /* set catalog modifying transactions */ + if (builder->catchange.xip) + pfree(builder->catchange.xip); + builder->catchange.xcnt = ondisk.builder.catchange.xcnt; + builder->catchange.xip = ondisk.builder.catchange.xip; + ondisk.builder.catchange.xip = NULL; + + /* our snapshot is not interesting anymore, build a new one */ + if (builder->snapshot != NULL) + { + SnapBuildSnapDecRefcount(builder->snapshot); + } + builder->snapshot = SnapBuildBuildSnapshot(builder); + SnapBuildSnapIncRefcount(builder->snapshot); + + ReorderBufferSetRestartPoint(builder->reorder, lsn); + + Assert(builder->state == SNAPBUILD_CONSISTENT); + + ereport(LOG, + (errmsg("logical decoding found consistent point at %X/%X", + LSN_FORMAT_ARGS(lsn)), + errdetail("Logical decoding will begin using saved snapshot."))); + return true; + +snapshot_not_interesting: + if (ondisk.builder.committed.xip != NULL) + pfree(ondisk.builder.committed.xip); + if (ondisk.builder.catchange.xip != NULL) + pfree(ondisk.builder.catchange.xip); + return false; +} + +/* + * Read the contents of the serialized snapshot to 'dest'. + */ +static void +SnapBuildRestoreContents(int fd, char *dest, Size size, const char *path) +{ + int readBytes; + + pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ); + readBytes = read(fd, dest, size); + pgstat_report_wait_end(); + if (readBytes != size) + { + int save_errno = errno; + + CloseTransientFile(fd); + + if (readBytes < 0) + { + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", path))); + } + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %zu", + path, readBytes, size))); + } +} + +/* + * Remove all serialized snapshots that are not required anymore because no + * slot can need them. This doesn't actually have to run during a checkpoint, + * but it's a convenient point to schedule this. + * + * NB: We run this during checkpoints even if logical decoding is disabled so + * we cleanup old slots at some point after it got disabled. + */ +void +CheckPointSnapBuild(void) +{ + XLogRecPtr cutoff; + XLogRecPtr redo; + DIR *snap_dir; + struct dirent *snap_de; + char path[MAXPGPATH + 21]; + + /* + * We start off with a minimum of the last redo pointer. No new + * replication slot will start before that, so that's a safe upper bound + * for removal. + */ + redo = GetRedoRecPtr(); + + /* now check for the restart ptrs from existing slots */ + cutoff = ReplicationSlotsComputeLogicalRestartLSN(); + + /* don't start earlier than the restart lsn */ + if (redo < cutoff) + cutoff = redo; + + snap_dir = AllocateDir("pg_logical/snapshots"); + while ((snap_de = ReadDir(snap_dir, "pg_logical/snapshots")) != NULL) + { + uint32 hi; + uint32 lo; + XLogRecPtr lsn; + PGFileType de_type; + + if (strcmp(snap_de->d_name, ".") == 0 || + strcmp(snap_de->d_name, "..") == 0) + continue; + + snprintf(path, sizeof(path), "pg_logical/snapshots/%s", snap_de->d_name); + de_type = get_dirent_type(path, snap_de, false, DEBUG1); + + if (de_type != PGFILETYPE_ERROR && de_type != PGFILETYPE_REG) + { + elog(DEBUG1, "only regular files expected: %s", path); + continue; + } + + /* + * temporary filenames from SnapBuildSerialize() include the LSN and + * everything but are postfixed by .$pid.tmp. We can just remove them + * the same as other files because there can be none that are + * currently being written that are older than cutoff. + * + * We just log a message if a file doesn't fit the pattern, it's + * probably some editors lock/state file or similar... + */ + if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2) + { + ereport(LOG, + (errmsg("could not parse file name \"%s\"", path))); + continue; + } + + lsn = ((uint64) hi) << 32 | lo; + + /* check whether we still need it */ + if (lsn < cutoff || cutoff == InvalidXLogRecPtr) + { + elog(DEBUG1, "removing snapbuild snapshot %s", path); + + /* + * It's not particularly harmful, though strange, if we can't + * remove the file here. Don't prevent the checkpoint from + * completing, that'd be a cure worse than the disease. + */ + if (unlink(path) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + path))); + continue; + } + } + } + FreeDir(snap_dir); +} diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c new file mode 100644 index 0000000..81fff19 --- /dev/null +++ b/src/backend/replication/logical/tablesync.c @@ -0,0 +1,1673 @@ +/*------------------------------------------------------------------------- + * tablesync.c + * PostgreSQL logical replication: initial table data synchronization + * + * Copyright (c) 2012-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/tablesync.c + * + * NOTES + * This file contains code for initial table data synchronization for + * logical replication. + * + * The initial data synchronization is done separately for each table, + * in a separate apply worker that only fetches the initial snapshot data + * from the publisher and then synchronizes the position in the stream with + * the leader apply worker. + * + * There are several reasons for doing the synchronization this way: + * - It allows us to parallelize the initial data synchronization + * which lowers the time needed for it to happen. + * - The initial synchronization does not have to hold the xid and LSN + * for the time it takes to copy data of all tables, causing less + * bloat and lower disk consumption compared to doing the + * synchronization in a single process for the whole database. + * - It allows us to synchronize any tables added after the initial + * synchronization has finished. + * + * The stream position synchronization works in multiple steps: + * - Apply worker requests a tablesync worker to start, setting the new + * table state to INIT. + * - Tablesync worker starts; changes table state from INIT to DATASYNC while + * copying. + * - Tablesync worker does initial table copy; there is a FINISHEDCOPY (sync + * worker specific) state to indicate when the copy phase has completed, so + * if the worker crashes with this (non-memory) state then the copy will not + * be re-attempted. + * - Tablesync worker then sets table state to SYNCWAIT; waits for state change. + * - Apply worker periodically checks for tables in SYNCWAIT state. When + * any appear, it sets the table state to CATCHUP and starts loop-waiting + * until either the table state is set to SYNCDONE or the sync worker + * exits. + * - After the sync worker has seen the state change to CATCHUP, it will + * read the stream and apply changes (acting like an apply worker) until + * it catches up to the specified stream position. Then it sets the + * state to SYNCDONE. There might be zero changes applied between + * CATCHUP and SYNCDONE, because the sync worker might be ahead of the + * apply worker. + * - Once the state is set to SYNCDONE, the apply will continue tracking + * the table until it reaches the SYNCDONE stream position, at which + * point it sets state to READY and stops tracking. Again, there might + * be zero changes in between. + * + * So the state progression is always: INIT -> DATASYNC -> FINISHEDCOPY + * -> SYNCWAIT -> CATCHUP -> SYNCDONE -> READY. + * + * The catalog pg_subscription_rel is used to keep information about + * subscribed tables and their state. The catalog holds all states + * except SYNCWAIT and CATCHUP which are only in shared memory. + * + * Example flows look like this: + * - Apply is in front: + * sync:8 + * -> set in catalog FINISHEDCOPY + * -> set in memory SYNCWAIT + * apply:10 + * -> set in memory CATCHUP + * -> enter wait-loop + * sync:10 + * -> set in catalog SYNCDONE + * -> exit + * apply:10 + * -> exit wait-loop + * -> continue rep + * apply:11 + * -> set in catalog READY + * + * - Sync is in front: + * sync:10 + * -> set in catalog FINISHEDCOPY + * -> set in memory SYNCWAIT + * apply:8 + * -> set in memory CATCHUP + * -> continue per-table filtering + * sync:10 + * -> set in catalog SYNCDONE + * -> exit + * apply:10 + * -> set in catalog READY + * -> stop per-table filtering + * -> continue rep + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/table.h" +#include "access/xact.h" +#include "catalog/indexing.h" +#include "catalog/pg_subscription_rel.h" +#include "catalog/pg_type.h" +#include "commands/copy.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "parser/parse_relation.h" +#include "pgstat.h" +#include "replication/logicallauncher.h" +#include "replication/logicalrelation.h" +#include "replication/walreceiver.h" +#include "replication/worker_internal.h" +#include "replication/slot.h" +#include "replication/origin.h" +#include "storage/ipc.h" +#include "storage/lmgr.h" +#include "utils/acl.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rls.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" +#include "utils/usercontext.h" + +static bool table_states_valid = false; +static List *table_states_not_ready = NIL; +static bool FetchTableStates(bool *started_tx); + +static StringInfo copybuf = NULL; + +/* + * Exit routine for synchronization worker. + */ +static void +pg_attribute_noreturn() +finish_sync_worker(void) +{ + /* + * Commit any outstanding transaction. This is the usual case, unless + * there was nothing to do for the table. + */ + if (IsTransactionState()) + { + CommitTransactionCommand(); + pgstat_report_stat(true); + } + + /* And flush all writes. */ + XLogFlush(GetXLogWriteRecPtr()); + + StartTransactionCommand(); + ereport(LOG, + (errmsg("logical replication table synchronization worker for subscription \"%s\", table \"%s\" has finished", + MySubscription->name, + get_rel_name(MyLogicalRepWorker->relid)))); + CommitTransactionCommand(); + + /* Find the leader apply worker and signal it. */ + logicalrep_worker_wakeup(MyLogicalRepWorker->subid, InvalidOid); + + /* Stop gracefully */ + proc_exit(0); +} + +/* + * Wait until the relation sync state is set in the catalog to the expected + * one; return true when it happens. + * + * Returns false if the table sync worker or the table itself have + * disappeared, or the table state has been reset. + * + * Currently, this is used in the apply worker when transitioning from + * CATCHUP state to SYNCDONE. + */ +static bool +wait_for_relation_state_change(Oid relid, char expected_state) +{ + char state; + + for (;;) + { + LogicalRepWorker *worker; + XLogRecPtr statelsn; + + CHECK_FOR_INTERRUPTS(); + + InvalidateCatalogSnapshot(); + state = GetSubscriptionRelState(MyLogicalRepWorker->subid, + relid, &statelsn); + + if (state == SUBREL_STATE_UNKNOWN) + break; + + if (state == expected_state) + return true; + + /* Check if the sync worker is still running and bail if not. */ + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + worker = logicalrep_worker_find(MyLogicalRepWorker->subid, relid, + false); + LWLockRelease(LogicalRepWorkerLock); + if (!worker) + break; + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 1000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE); + + ResetLatch(MyLatch); + } + + return false; +} + +/* + * Wait until the apply worker changes the state of our synchronization + * worker to the expected one. + * + * Used when transitioning from SYNCWAIT state to CATCHUP. + * + * Returns false if the apply worker has disappeared. + */ +static bool +wait_for_worker_state_change(char expected_state) +{ + int rc; + + for (;;) + { + LogicalRepWorker *worker; + + CHECK_FOR_INTERRUPTS(); + + /* + * Done if already in correct state. (We assume this fetch is atomic + * enough to not give a misleading answer if we do it with no lock.) + */ + if (MyLogicalRepWorker->relstate == expected_state) + return true; + + /* + * Bail out if the apply worker has died, else signal it we're + * waiting. + */ + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + worker = logicalrep_worker_find(MyLogicalRepWorker->subid, + InvalidOid, false); + if (worker && worker->proc) + logicalrep_worker_wakeup_ptr(worker); + LWLockRelease(LogicalRepWorkerLock); + if (!worker) + break; + + /* + * Wait. We expect to get a latch signal back from the apply worker, + * but use a timeout in case it dies without sending one. + */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 1000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE); + + if (rc & WL_LATCH_SET) + ResetLatch(MyLatch); + } + + return false; +} + +/* + * Callback from syscache invalidation. + */ +void +invalidate_syncing_table_states(Datum arg, int cacheid, uint32 hashvalue) +{ + table_states_valid = false; +} + +/* + * Handle table synchronization cooperation from the synchronization + * worker. + * + * If the sync worker is in CATCHUP state and reached (or passed) the + * predetermined synchronization point in the WAL stream, mark the table as + * SYNCDONE and finish. + */ +static void +process_syncing_tables_for_sync(XLogRecPtr current_lsn) +{ + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + + if (MyLogicalRepWorker->relstate == SUBREL_STATE_CATCHUP && + current_lsn >= MyLogicalRepWorker->relstate_lsn) + { + TimeLineID tli; + char syncslotname[NAMEDATALEN] = {0}; + char originname[NAMEDATALEN] = {0}; + + MyLogicalRepWorker->relstate = SUBREL_STATE_SYNCDONE; + MyLogicalRepWorker->relstate_lsn = current_lsn; + + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + /* + * UpdateSubscriptionRelState must be called within a transaction. + */ + if (!IsTransactionState()) + StartTransactionCommand(); + + UpdateSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + MyLogicalRepWorker->relstate, + MyLogicalRepWorker->relstate_lsn); + + /* + * End streaming so that LogRepWorkerWalRcvConn can be used to drop + * the slot. + */ + walrcv_endstreaming(LogRepWorkerWalRcvConn, &tli); + + /* + * Cleanup the tablesync slot. + * + * This has to be done after updating the state because otherwise if + * there is an error while doing the database operations we won't be + * able to rollback dropped slot. + */ + ReplicationSlotNameForTablesync(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + syncslotname, + sizeof(syncslotname)); + + /* + * It is important to give an error if we are unable to drop the slot, + * otherwise, it won't be dropped till the corresponding subscription + * is dropped. So passing missing_ok = false. + */ + ReplicationSlotDropAtPubNode(LogRepWorkerWalRcvConn, syncslotname, false); + + CommitTransactionCommand(); + pgstat_report_stat(false); + + /* + * Start a new transaction to clean up the tablesync origin tracking. + * This transaction will be ended within the finish_sync_worker(). + * Now, even, if we fail to remove this here, the apply worker will + * ensure to clean it up afterward. + * + * We need to do this after the table state is set to SYNCDONE. + * Otherwise, if an error occurs while performing the database + * operation, the worker will be restarted and the in-memory state of + * replication progress (remote_lsn) won't be rolled-back which would + * have been cleared before restart. So, the restarted worker will use + * invalid replication progress state resulting in replay of + * transactions that have already been applied. + */ + StartTransactionCommand(); + + ReplicationOriginNameForLogicalRep(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + originname, + sizeof(originname)); + + /* + * Resetting the origin session removes the ownership of the slot. + * This is needed to allow the origin to be dropped. + */ + replorigin_session_reset(); + replorigin_session_origin = InvalidRepOriginId; + replorigin_session_origin_lsn = InvalidXLogRecPtr; + replorigin_session_origin_timestamp = 0; + + /* + * Drop the tablesync's origin tracking if exists. + * + * There is a chance that the user is concurrently performing refresh + * for the subscription where we remove the table state and its origin + * or the apply worker would have removed this origin. So passing + * missing_ok = true. + */ + replorigin_drop_by_name(originname, true, false); + + finish_sync_worker(); + } + else + SpinLockRelease(&MyLogicalRepWorker->relmutex); +} + +/* + * Handle table synchronization cooperation from the apply worker. + * + * Walk over all subscription tables that are individually tracked by the + * apply process (currently, all that have state other than + * SUBREL_STATE_READY) and manage synchronization for them. + * + * If there are tables that need synchronizing and are not being synchronized + * yet, start sync workers for them (if there are free slots for sync + * workers). To prevent starting the sync worker for the same relation at a + * high frequency after a failure, we store its last start time with each sync + * state info. We start the sync worker for the same relation after waiting + * at least wal_retrieve_retry_interval. + * + * For tables that are being synchronized already, check if sync workers + * either need action from the apply worker or have finished. This is the + * SYNCWAIT to CATCHUP transition. + * + * If the synchronization position is reached (SYNCDONE), then the table can + * be marked as READY and is no longer tracked. + */ +static void +process_syncing_tables_for_apply(XLogRecPtr current_lsn) +{ + struct tablesync_start_time_mapping + { + Oid relid; + TimestampTz last_start_time; + }; + static HTAB *last_start_times = NULL; + ListCell *lc; + bool started_tx = false; + bool should_exit = false; + + Assert(!IsTransactionState()); + + /* We need up-to-date sync state info for subscription tables here. */ + FetchTableStates(&started_tx); + + /* + * Prepare a hash table for tracking last start times of workers, to avoid + * immediate restarts. We don't need it if there are no tables that need + * syncing. + */ + if (table_states_not_ready != NIL && !last_start_times) + { + HASHCTL ctl; + + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(struct tablesync_start_time_mapping); + last_start_times = hash_create("Logical replication table sync worker start times", + 256, &ctl, HASH_ELEM | HASH_BLOBS); + } + + /* + * Clean up the hash table when we're done with all tables (just to + * release the bit of memory). + */ + else if (table_states_not_ready == NIL && last_start_times) + { + hash_destroy(last_start_times); + last_start_times = NULL; + } + + /* + * Process all tables that are being synchronized. + */ + foreach(lc, table_states_not_ready) + { + SubscriptionRelState *rstate = (SubscriptionRelState *) lfirst(lc); + + if (rstate->state == SUBREL_STATE_SYNCDONE) + { + /* + * Apply has caught up to the position where the table sync has + * finished. Mark the table as ready so that the apply will just + * continue to replicate it normally. + */ + if (current_lsn >= rstate->lsn) + { + char originname[NAMEDATALEN]; + + rstate->state = SUBREL_STATE_READY; + rstate->lsn = current_lsn; + if (!started_tx) + { + StartTransactionCommand(); + started_tx = true; + } + + /* + * Remove the tablesync origin tracking if exists. + * + * There is a chance that the user is concurrently performing + * refresh for the subscription where we remove the table + * state and its origin or the tablesync worker would have + * already removed this origin. We can't rely on tablesync + * worker to remove the origin tracking as if there is any + * error while dropping we won't restart it to drop the + * origin. So passing missing_ok = true. + */ + ReplicationOriginNameForLogicalRep(MyLogicalRepWorker->subid, + rstate->relid, + originname, + sizeof(originname)); + replorigin_drop_by_name(originname, true, false); + + /* + * Update the state to READY only after the origin cleanup. + */ + UpdateSubscriptionRelState(MyLogicalRepWorker->subid, + rstate->relid, rstate->state, + rstate->lsn); + } + } + else + { + LogicalRepWorker *syncworker; + + /* + * Look for a sync worker for this relation. + */ + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + syncworker = logicalrep_worker_find(MyLogicalRepWorker->subid, + rstate->relid, false); + + if (syncworker) + { + /* Found one, update our copy of its state */ + SpinLockAcquire(&syncworker->relmutex); + rstate->state = syncworker->relstate; + rstate->lsn = syncworker->relstate_lsn; + if (rstate->state == SUBREL_STATE_SYNCWAIT) + { + /* + * Sync worker is waiting for apply. Tell sync worker it + * can catchup now. + */ + syncworker->relstate = SUBREL_STATE_CATCHUP; + syncworker->relstate_lsn = + Max(syncworker->relstate_lsn, current_lsn); + } + SpinLockRelease(&syncworker->relmutex); + + /* If we told worker to catch up, wait for it. */ + if (rstate->state == SUBREL_STATE_SYNCWAIT) + { + /* Signal the sync worker, as it may be waiting for us. */ + if (syncworker->proc) + logicalrep_worker_wakeup_ptr(syncworker); + + /* Now safe to release the LWLock */ + LWLockRelease(LogicalRepWorkerLock); + + if (started_tx) + { + /* + * We must commit the existing transaction to release + * the existing locks before entering a busy loop. + * This is required to avoid any undetected deadlocks + * due to any existing lock as deadlock detector won't + * be able to detect the waits on the latch. + */ + CommitTransactionCommand(); + pgstat_report_stat(false); + } + + /* + * Enter busy loop and wait for synchronization worker to + * reach expected state (or die trying). + */ + StartTransactionCommand(); + started_tx = true; + + wait_for_relation_state_change(rstate->relid, + SUBREL_STATE_SYNCDONE); + } + else + LWLockRelease(LogicalRepWorkerLock); + } + else + { + /* + * If there is no sync worker for this table yet, count + * running sync workers for this subscription, while we have + * the lock. + */ + int nsyncworkers = + logicalrep_sync_worker_count(MyLogicalRepWorker->subid); + + /* Now safe to release the LWLock */ + LWLockRelease(LogicalRepWorkerLock); + + /* + * If there are free sync worker slot(s), start a new sync + * worker for the table. + */ + if (nsyncworkers < max_sync_workers_per_subscription) + { + TimestampTz now = GetCurrentTimestamp(); + struct tablesync_start_time_mapping *hentry; + bool found; + + hentry = hash_search(last_start_times, &rstate->relid, + HASH_ENTER, &found); + + if (!found || + TimestampDifferenceExceeds(hentry->last_start_time, now, + wal_retrieve_retry_interval)) + { + logicalrep_worker_launch(MyLogicalRepWorker->dbid, + MySubscription->oid, + MySubscription->name, + MyLogicalRepWorker->userid, + rstate->relid, + DSM_HANDLE_INVALID); + hentry->last_start_time = now; + } + } + } + } + } + + if (started_tx) + { + /* + * Even when the two_phase mode is requested by the user, it remains + * as 'pending' until all tablesyncs have reached READY state. + * + * When this happens, we restart the apply worker and (if the + * conditions are still ok) then the two_phase tri-state will become + * 'enabled' at that time. + * + * Note: If the subscription has no tables then leave the state as + * PENDING, which allows ALTER SUBSCRIPTION ... REFRESH PUBLICATION to + * work. + */ + if (MySubscription->twophasestate == LOGICALREP_TWOPHASE_STATE_PENDING) + { + CommandCounterIncrement(); /* make updates visible */ + if (AllTablesyncsReady()) + { + ereport(LOG, + (errmsg("logical replication apply worker for subscription \"%s\" will restart so that two_phase can be enabled", + MySubscription->name))); + should_exit = true; + } + } + + CommitTransactionCommand(); + pgstat_report_stat(true); + } + + if (should_exit) + { + /* + * Reset the last-start time for this worker so that the launcher will + * restart it without waiting for wal_retrieve_retry_interval. + */ + ApplyLauncherForgetWorkerStartTime(MySubscription->oid); + + proc_exit(0); + } +} + +/* + * Process possible state change(s) of tables that are being synchronized. + */ +void +process_syncing_tables(XLogRecPtr current_lsn) +{ + /* + * Skip for parallel apply workers because they only operate on tables + * that are in a READY state. See pa_can_start() and + * should_apply_changes_for_rel(). + */ + if (am_parallel_apply_worker()) + return; + + if (am_tablesync_worker()) + process_syncing_tables_for_sync(current_lsn); + else + process_syncing_tables_for_apply(current_lsn); +} + +/* + * Create list of columns for COPY based on logical relation mapping. + */ +static List * +make_copy_attnamelist(LogicalRepRelMapEntry *rel) +{ + List *attnamelist = NIL; + int i; + + for (i = 0; i < rel->remoterel.natts; i++) + { + attnamelist = lappend(attnamelist, + makeString(rel->remoterel.attnames[i])); + } + + + return attnamelist; +} + +/* + * Data source callback for the COPY FROM, which reads from the remote + * connection and passes the data back to our local COPY. + */ +static int +copy_read_data(void *outbuf, int minread, int maxread) +{ + int bytesread = 0; + int avail; + + /* If there are some leftover data from previous read, use it. */ + avail = copybuf->len - copybuf->cursor; + if (avail) + { + if (avail > maxread) + avail = maxread; + memcpy(outbuf, ©buf->data[copybuf->cursor], avail); + copybuf->cursor += avail; + maxread -= avail; + bytesread += avail; + } + + while (maxread > 0 && bytesread < minread) + { + pgsocket fd = PGINVALID_SOCKET; + int len; + char *buf = NULL; + + for (;;) + { + /* Try read the data. */ + len = walrcv_receive(LogRepWorkerWalRcvConn, &buf, &fd); + + CHECK_FOR_INTERRUPTS(); + + if (len == 0) + break; + else if (len < 0) + return bytesread; + else + { + /* Process the data */ + copybuf->data = buf; + copybuf->len = len; + copybuf->cursor = 0; + + avail = copybuf->len - copybuf->cursor; + if (avail > maxread) + avail = maxread; + memcpy(outbuf, ©buf->data[copybuf->cursor], avail); + outbuf = (void *) ((char *) outbuf + avail); + copybuf->cursor += avail; + maxread -= avail; + bytesread += avail; + } + + if (maxread <= 0 || bytesread >= minread) + return bytesread; + } + + /* + * Wait for more data or latch. + */ + (void) WaitLatchOrSocket(MyLatch, + WL_SOCKET_READABLE | WL_LATCH_SET | + WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + fd, 1000L, WAIT_EVENT_LOGICAL_SYNC_DATA); + + ResetLatch(MyLatch); + } + + return bytesread; +} + + +/* + * Get information about remote relation in similar fashion the RELATION + * message provides during replication. This function also returns the relation + * qualifications to be used in the COPY command. + */ +static void +fetch_remote_table_info(char *nspname, char *relname, + LogicalRepRelation *lrel, List **qual) +{ + WalRcvExecResult *res; + StringInfoData cmd; + TupleTableSlot *slot; + Oid tableRow[] = {OIDOID, CHAROID, CHAROID}; + Oid attrRow[] = {INT2OID, TEXTOID, OIDOID, BOOLOID}; + Oid qualRow[] = {TEXTOID}; + bool isnull; + int natt; + ListCell *lc; + Bitmapset *included_cols = NULL; + + lrel->nspname = nspname; + lrel->relname = relname; + + /* First fetch Oid and replica identity. */ + initStringInfo(&cmd); + appendStringInfo(&cmd, "SELECT c.oid, c.relreplident, c.relkind" + " FROM pg_catalog.pg_class c" + " INNER JOIN pg_catalog.pg_namespace n" + " ON (c.relnamespace = n.oid)" + " WHERE n.nspname = %s" + " AND c.relname = %s", + quote_literal_cstr(nspname), + quote_literal_cstr(relname)); + res = walrcv_exec(LogRepWorkerWalRcvConn, cmd.data, + lengthof(tableRow), tableRow); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not fetch table info for table \"%s.%s\" from publisher: %s", + nspname, relname, res->err))); + + slot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple); + if (!tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("table \"%s.%s\" not found on publisher", + nspname, relname))); + + lrel->remoteid = DatumGetObjectId(slot_getattr(slot, 1, &isnull)); + Assert(!isnull); + lrel->replident = DatumGetChar(slot_getattr(slot, 2, &isnull)); + Assert(!isnull); + lrel->relkind = DatumGetChar(slot_getattr(slot, 3, &isnull)); + Assert(!isnull); + + ExecDropSingleTupleTableSlot(slot); + walrcv_clear_result(res); + + + /* + * Get column lists for each relation. + * + * We need to do this before fetching info about column names and types, + * so that we can skip columns that should not be replicated. + */ + if (walrcv_server_version(LogRepWorkerWalRcvConn) >= 150000) + { + WalRcvExecResult *pubres; + TupleTableSlot *tslot; + Oid attrsRow[] = {INT2VECTOROID}; + StringInfoData pub_names; + + initStringInfo(&pub_names); + foreach(lc, MySubscription->publications) + { + if (foreach_current_index(lc) > 0) + appendStringInfoString(&pub_names, ", "); + appendStringInfoString(&pub_names, quote_literal_cstr(strVal(lfirst(lc)))); + } + + /* + * Fetch info about column lists for the relation (from all the + * publications). + */ + resetStringInfo(&cmd); + appendStringInfo(&cmd, + "SELECT DISTINCT" + " (CASE WHEN (array_length(gpt.attrs, 1) = c.relnatts)" + " THEN NULL ELSE gpt.attrs END)" + " FROM pg_publication p," + " LATERAL pg_get_publication_tables(p.pubname) gpt," + " pg_class c" + " WHERE gpt.relid = %u AND c.oid = gpt.relid" + " AND p.pubname IN ( %s )", + lrel->remoteid, + pub_names.data); + + pubres = walrcv_exec(LogRepWorkerWalRcvConn, cmd.data, + lengthof(attrsRow), attrsRow); + + if (pubres->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not fetch column list info for table \"%s.%s\" from publisher: %s", + nspname, relname, pubres->err))); + + /* + * We don't support the case where the column list is different for + * the same table when combining publications. See comments atop + * fetch_table_list. So there should be only one row returned. + * Although we already checked this when creating the subscription, we + * still need to check here in case the column list was changed after + * creating the subscription and before the sync worker is started. + */ + if (tuplestore_tuple_count(pubres->tuplestore) > 1) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot use different column lists for table \"%s.%s\" in different publications", + nspname, relname)); + + /* + * Get the column list and build a single bitmap with the attnums. + * + * If we find a NULL value, it means all the columns should be + * replicated. + */ + tslot = MakeSingleTupleTableSlot(pubres->tupledesc, &TTSOpsMinimalTuple); + if (tuplestore_gettupleslot(pubres->tuplestore, true, false, tslot)) + { + Datum cfval = slot_getattr(tslot, 1, &isnull); + + if (!isnull) + { + ArrayType *arr; + int nelems; + int16 *elems; + + arr = DatumGetArrayTypeP(cfval); + nelems = ARR_DIMS(arr)[0]; + elems = (int16 *) ARR_DATA_PTR(arr); + + for (natt = 0; natt < nelems; natt++) + included_cols = bms_add_member(included_cols, elems[natt]); + } + + ExecClearTuple(tslot); + } + ExecDropSingleTupleTableSlot(tslot); + + walrcv_clear_result(pubres); + + pfree(pub_names.data); + } + + /* + * Now fetch column names and types. + */ + resetStringInfo(&cmd); + appendStringInfo(&cmd, + "SELECT a.attnum," + " a.attname," + " a.atttypid," + " a.attnum = ANY(i.indkey)" + " FROM pg_catalog.pg_attribute a" + " LEFT JOIN pg_catalog.pg_index i" + " ON (i.indexrelid = pg_get_replica_identity_index(%u))" + " WHERE a.attnum > 0::pg_catalog.int2" + " AND NOT a.attisdropped %s" + " AND a.attrelid = %u" + " ORDER BY a.attnum", + lrel->remoteid, + (walrcv_server_version(LogRepWorkerWalRcvConn) >= 120000 ? + "AND a.attgenerated = ''" : ""), + lrel->remoteid); + res = walrcv_exec(LogRepWorkerWalRcvConn, cmd.data, + lengthof(attrRow), attrRow); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not fetch table info for table \"%s.%s\" from publisher: %s", + nspname, relname, res->err))); + + /* We don't know the number of rows coming, so allocate enough space. */ + lrel->attnames = palloc0(MaxTupleAttributeNumber * sizeof(char *)); + lrel->atttyps = palloc0(MaxTupleAttributeNumber * sizeof(Oid)); + lrel->attkeys = NULL; + + /* + * Store the columns as a list of names. Ignore those that are not + * present in the column list, if there is one. + */ + natt = 0; + slot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple); + while (tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + { + char *rel_colname; + AttrNumber attnum; + + attnum = DatumGetInt16(slot_getattr(slot, 1, &isnull)); + Assert(!isnull); + + /* If the column is not in the column list, skip it. */ + if (included_cols != NULL && !bms_is_member(attnum, included_cols)) + { + ExecClearTuple(slot); + continue; + } + + rel_colname = TextDatumGetCString(slot_getattr(slot, 2, &isnull)); + Assert(!isnull); + + lrel->attnames[natt] = rel_colname; + lrel->atttyps[natt] = DatumGetObjectId(slot_getattr(slot, 3, &isnull)); + Assert(!isnull); + + if (DatumGetBool(slot_getattr(slot, 4, &isnull))) + lrel->attkeys = bms_add_member(lrel->attkeys, natt); + + /* Should never happen. */ + if (++natt >= MaxTupleAttributeNumber) + elog(ERROR, "too many columns in remote table \"%s.%s\"", + nspname, relname); + + ExecClearTuple(slot); + } + ExecDropSingleTupleTableSlot(slot); + + lrel->natts = natt; + + walrcv_clear_result(res); + + /* + * Get relation's row filter expressions. DISTINCT avoids the same + * expression of a table in multiple publications from being included + * multiple times in the final expression. + * + * We need to copy the row even if it matches just one of the + * publications, so we later combine all the quals with OR. + * + * For initial synchronization, row filtering can be ignored in following + * cases: + * + * 1) one of the subscribed publications for the table hasn't specified + * any row filter + * + * 2) one of the subscribed publications has puballtables set to true + * + * 3) one of the subscribed publications is declared as TABLES IN SCHEMA + * that includes this relation + */ + if (walrcv_server_version(LogRepWorkerWalRcvConn) >= 150000) + { + StringInfoData pub_names; + + /* Build the pubname list. */ + initStringInfo(&pub_names); + foreach(lc, MySubscription->publications) + { + char *pubname = strVal(lfirst(lc)); + + if (foreach_current_index(lc) > 0) + appendStringInfoString(&pub_names, ", "); + + appendStringInfoString(&pub_names, quote_literal_cstr(pubname)); + } + + /* Check for row filters. */ + resetStringInfo(&cmd); + appendStringInfo(&cmd, + "SELECT DISTINCT pg_get_expr(gpt.qual, gpt.relid)" + " FROM pg_publication p," + " LATERAL pg_get_publication_tables(p.pubname) gpt" + " WHERE gpt.relid = %u" + " AND p.pubname IN ( %s )", + lrel->remoteid, + pub_names.data); + + res = walrcv_exec(LogRepWorkerWalRcvConn, cmd.data, 1, qualRow); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errmsg("could not fetch table WHERE clause info for table \"%s.%s\" from publisher: %s", + nspname, relname, res->err))); + + /* + * Multiple row filter expressions for the same table will be combined + * by COPY using OR. If any of the filter expressions for this table + * are null, it means the whole table will be copied. In this case it + * is not necessary to construct a unified row filter expression at + * all. + */ + slot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple); + while (tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + { + Datum rf = slot_getattr(slot, 1, &isnull); + + if (!isnull) + *qual = lappend(*qual, makeString(TextDatumGetCString(rf))); + else + { + /* Ignore filters and cleanup as necessary. */ + if (*qual) + { + list_free_deep(*qual); + *qual = NIL; + } + break; + } + + ExecClearTuple(slot); + } + ExecDropSingleTupleTableSlot(slot); + + walrcv_clear_result(res); + } + + pfree(cmd.data); +} + +/* + * Copy existing data of a table from publisher. + * + * Caller is responsible for locking the local relation. + */ +static void +copy_table(Relation rel) +{ + LogicalRepRelMapEntry *relmapentry; + LogicalRepRelation lrel; + List *qual = NIL; + WalRcvExecResult *res; + StringInfoData cmd; + CopyFromState cstate; + List *attnamelist; + ParseState *pstate; + List *options = NIL; + + /* Get the publisher relation info. */ + fetch_remote_table_info(get_namespace_name(RelationGetNamespace(rel)), + RelationGetRelationName(rel), &lrel, &qual); + + /* Put the relation into relmap. */ + logicalrep_relmap_update(&lrel); + + /* Map the publisher relation to local one. */ + relmapentry = logicalrep_rel_open(lrel.remoteid, NoLock); + Assert(rel == relmapentry->localrel); + + /* Start copy on the publisher. */ + initStringInfo(&cmd); + + /* Regular table with no row filter */ + if (lrel.relkind == RELKIND_RELATION && qual == NIL) + { + appendStringInfo(&cmd, "COPY %s", + quote_qualified_identifier(lrel.nspname, lrel.relname)); + + /* If the table has columns, then specify the columns */ + if (lrel.natts) + { + appendStringInfoString(&cmd, " ("); + + /* + * XXX Do we need to list the columns in all cases? Maybe we're + * replicating all columns? + */ + for (int i = 0; i < lrel.natts; i++) + { + if (i > 0) + appendStringInfoString(&cmd, ", "); + + appendStringInfoString(&cmd, quote_identifier(lrel.attnames[i])); + } + + appendStringInfoString(&cmd, ")"); + } + + appendStringInfoString(&cmd, " TO STDOUT"); + } + else + { + /* + * For non-tables and tables with row filters, we need to do COPY + * (SELECT ...), but we can't just do SELECT * because we need to not + * copy generated columns. For tables with any row filters, build a + * SELECT query with OR'ed row filters for COPY. + */ + appendStringInfoString(&cmd, "COPY (SELECT "); + for (int i = 0; i < lrel.natts; i++) + { + appendStringInfoString(&cmd, quote_identifier(lrel.attnames[i])); + if (i < lrel.natts - 1) + appendStringInfoString(&cmd, ", "); + } + + appendStringInfoString(&cmd, " FROM "); + + /* + * For regular tables, make sure we don't copy data from a child that + * inherits the named table as those will be copied separately. + */ + if (lrel.relkind == RELKIND_RELATION) + appendStringInfoString(&cmd, "ONLY "); + + appendStringInfoString(&cmd, quote_qualified_identifier(lrel.nspname, lrel.relname)); + /* list of OR'ed filters */ + if (qual != NIL) + { + ListCell *lc; + char *q = strVal(linitial(qual)); + + appendStringInfo(&cmd, " WHERE %s", q); + for_each_from(lc, qual, 1) + { + q = strVal(lfirst(lc)); + appendStringInfo(&cmd, " OR %s", q); + } + list_free_deep(qual); + } + + appendStringInfoString(&cmd, ") TO STDOUT"); + } + + /* + * Prior to v16, initial table synchronization will use text format even + * if the binary option is enabled for a subscription. + */ + if (walrcv_server_version(LogRepWorkerWalRcvConn) >= 160000 && + MySubscription->binary) + { + appendStringInfoString(&cmd, " WITH (FORMAT binary)"); + options = list_make1(makeDefElem("format", + (Node *) makeString("binary"), -1)); + } + + res = walrcv_exec(LogRepWorkerWalRcvConn, cmd.data, 0, NULL); + pfree(cmd.data); + if (res->status != WALRCV_OK_COPY_OUT) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not start initial contents copy for table \"%s.%s\": %s", + lrel.nspname, lrel.relname, res->err))); + walrcv_clear_result(res); + + copybuf = makeStringInfo(); + + pstate = make_parsestate(NULL); + (void) addRangeTableEntryForRelation(pstate, rel, AccessShareLock, + NULL, false, false); + + attnamelist = make_copy_attnamelist(relmapentry); + cstate = BeginCopyFrom(pstate, rel, NULL, NULL, false, copy_read_data, attnamelist, options); + + /* Do the copy */ + (void) CopyFrom(cstate); + + logicalrep_rel_close(relmapentry, NoLock); +} + +/* + * Determine the tablesync slot name. + * + * The name must not exceed NAMEDATALEN - 1 because of remote node constraints + * on slot name length. We append system_identifier to avoid slot_name + * collision with subscriptions in other clusters. With the current scheme + * pg_%u_sync_%u_UINT64_FORMAT (3 + 10 + 6 + 10 + 20 + '\0'), the maximum + * length of slot_name will be 50. + * + * The returned slot name is stored in the supplied buffer (syncslotname) with + * the given size. + * + * Note: We don't use the subscription slot name as part of tablesync slot name + * because we are responsible for cleaning up these slots and it could become + * impossible to recalculate what name to cleanup if the subscription slot name + * had changed. + */ +void +ReplicationSlotNameForTablesync(Oid suboid, Oid relid, + char *syncslotname, Size szslot) +{ + snprintf(syncslotname, szslot, "pg_%u_sync_%u_" UINT64_FORMAT, suboid, + relid, GetSystemIdentifier()); +} + +/* + * Start syncing the table in the sync worker. + * + * If nothing needs to be done to sync the table, we exit the worker without + * any further action. + * + * The returned slot name is palloc'ed in current memory context. + */ +char * +LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) +{ + char *slotname; + char *err; + char relstate; + XLogRecPtr relstate_lsn; + Relation rel; + AclResult aclresult; + WalRcvExecResult *res; + char originname[NAMEDATALEN]; + RepOriginId originid; + UserContext ucxt; + bool must_use_password; + bool run_as_owner; + + /* Check the state of the table synchronization. */ + StartTransactionCommand(); + relstate = GetSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + &relstate_lsn); + + /* Is the use of a password mandatory? */ + must_use_password = MySubscription->passwordrequired && + !superuser_arg(MySubscription->owner); + + /* Note that the superuser_arg call can access the DB */ + CommitTransactionCommand(); + + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->relstate = relstate; + MyLogicalRepWorker->relstate_lsn = relstate_lsn; + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + /* + * If synchronization is already done or no longer necessary, exit now + * that we've updated shared memory state. + */ + switch (relstate) + { + case SUBREL_STATE_SYNCDONE: + case SUBREL_STATE_READY: + case SUBREL_STATE_UNKNOWN: + finish_sync_worker(); /* doesn't return */ + } + + /* Calculate the name of the tablesync slot. */ + slotname = (char *) palloc(NAMEDATALEN); + ReplicationSlotNameForTablesync(MySubscription->oid, + MyLogicalRepWorker->relid, + slotname, + NAMEDATALEN); + + /* + * Here we use the slot name instead of the subscription name as the + * application_name, so that it is different from the leader apply worker, + * so that synchronous replication can distinguish them. + */ + LogRepWorkerWalRcvConn = + walrcv_connect(MySubscription->conninfo, true, + must_use_password, + slotname, &err); + if (LogRepWorkerWalRcvConn == NULL) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not connect to the publisher: %s", err))); + + Assert(MyLogicalRepWorker->relstate == SUBREL_STATE_INIT || + MyLogicalRepWorker->relstate == SUBREL_STATE_DATASYNC || + MyLogicalRepWorker->relstate == SUBREL_STATE_FINISHEDCOPY); + + /* Assign the origin tracking record name. */ + ReplicationOriginNameForLogicalRep(MySubscription->oid, + MyLogicalRepWorker->relid, + originname, + sizeof(originname)); + + if (MyLogicalRepWorker->relstate == SUBREL_STATE_DATASYNC) + { + /* + * We have previously errored out before finishing the copy so the + * replication slot might exist. We want to remove the slot if it + * already exists and proceed. + * + * XXX We could also instead try to drop the slot, last time we failed + * but for that, we might need to clean up the copy state as it might + * be in the middle of fetching the rows. Also, if there is a network + * breakdown then it wouldn't have succeeded so trying it next time + * seems like a better bet. + */ + ReplicationSlotDropAtPubNode(LogRepWorkerWalRcvConn, slotname, true); + } + else if (MyLogicalRepWorker->relstate == SUBREL_STATE_FINISHEDCOPY) + { + /* + * The COPY phase was previously done, but tablesync then crashed + * before it was able to finish normally. + */ + StartTransactionCommand(); + + /* + * The origin tracking name must already exist. It was created first + * time this tablesync was launched. + */ + originid = replorigin_by_name(originname, false); + replorigin_session_setup(originid, 0); + replorigin_session_origin = originid; + *origin_startpos = replorigin_session_get_progress(false); + + CommitTransactionCommand(); + + goto copy_table_done; + } + + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->relstate = SUBREL_STATE_DATASYNC; + MyLogicalRepWorker->relstate_lsn = InvalidXLogRecPtr; + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + /* Update the state and make it visible to others. */ + StartTransactionCommand(); + UpdateSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + MyLogicalRepWorker->relstate, + MyLogicalRepWorker->relstate_lsn); + CommitTransactionCommand(); + pgstat_report_stat(true); + + StartTransactionCommand(); + + /* + * Use a standard write lock here. It might be better to disallow access + * to the table while it's being synchronized. But we don't want to block + * the main apply process from working and it has to open the relation in + * RowExclusiveLock when remapping remote relation id to local one. + */ + rel = table_open(MyLogicalRepWorker->relid, RowExclusiveLock); + + /* + * Start a transaction in the remote node in REPEATABLE READ mode. This + * ensures that both the replication slot we create (see below) and the + * COPY are consistent with each other. + */ + res = walrcv_exec(LogRepWorkerWalRcvConn, + "BEGIN READ ONLY ISOLATION LEVEL REPEATABLE READ", + 0, NULL); + if (res->status != WALRCV_OK_COMMAND) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("table copy could not start transaction on publisher: %s", + res->err))); + walrcv_clear_result(res); + + /* + * Create a new permanent logical decoding slot. This slot will be used + * for the catchup phase after COPY is done, so tell it to use the + * snapshot to make the final data consistent. + */ + walrcv_create_slot(LogRepWorkerWalRcvConn, + slotname, false /* permanent */ , false /* two_phase */ , + CRS_USE_SNAPSHOT, origin_startpos); + + /* + * Setup replication origin tracking. The purpose of doing this before the + * copy is to avoid doing the copy again due to any error in setting up + * origin tracking. + */ + originid = replorigin_by_name(originname, true); + if (!OidIsValid(originid)) + { + /* + * Origin tracking does not exist, so create it now. + * + * Then advance to the LSN got from walrcv_create_slot. This is WAL + * logged for the purpose of recovery. Locks are to prevent the + * replication origin from vanishing while advancing. + */ + originid = replorigin_create(originname); + + LockRelationOid(ReplicationOriginRelationId, RowExclusiveLock); + replorigin_advance(originid, *origin_startpos, InvalidXLogRecPtr, + true /* go backward */ , true /* WAL log */ ); + UnlockRelationOid(ReplicationOriginRelationId, RowExclusiveLock); + + replorigin_session_setup(originid, 0); + replorigin_session_origin = originid; + } + else + { + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("replication origin \"%s\" already exists", + originname))); + } + + /* + * Make sure that the copy command runs as the table owner, unless the + * user has opted out of that behaviour. + */ + run_as_owner = MySubscription->runasowner; + if (!run_as_owner) + SwitchToUntrustedUser(rel->rd_rel->relowner, &ucxt); + + /* + * Check that our table sync worker has permission to insert into the + * target table. + */ + aclresult = pg_class_aclcheck(RelationGetRelid(rel), GetUserId(), + ACL_INSERT); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, + get_relkind_objtype(rel->rd_rel->relkind), + RelationGetRelationName(rel)); + + /* + * COPY FROM does not honor RLS policies. That is not a problem for + * subscriptions owned by roles with BYPASSRLS privilege (or superuser, + * who has it implicitly), but other roles should not be able to + * circumvent RLS. Disallow logical replication into RLS enabled + * relations for such roles. + */ + if (check_enable_rls(RelationGetRelid(rel), InvalidOid, false) == RLS_ENABLED) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("user \"%s\" cannot replicate into relation with row-level security enabled: \"%s\"", + GetUserNameFromId(GetUserId(), true), + RelationGetRelationName(rel)))); + + /* Now do the initial data copy */ + PushActiveSnapshot(GetTransactionSnapshot()); + copy_table(rel); + PopActiveSnapshot(); + + res = walrcv_exec(LogRepWorkerWalRcvConn, "COMMIT", 0, NULL); + if (res->status != WALRCV_OK_COMMAND) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("table copy could not finish transaction on publisher: %s", + res->err))); + walrcv_clear_result(res); + + if (!run_as_owner) + RestoreUserContext(&ucxt); + + table_close(rel, NoLock); + + /* Make the copy visible. */ + CommandCounterIncrement(); + + /* + * Update the persisted state to indicate the COPY phase is done; make it + * visible to others. + */ + UpdateSubscriptionRelState(MyLogicalRepWorker->subid, + MyLogicalRepWorker->relid, + SUBREL_STATE_FINISHEDCOPY, + MyLogicalRepWorker->relstate_lsn); + + CommitTransactionCommand(); + +copy_table_done: + + elog(DEBUG1, + "LogicalRepSyncTableStart: '%s' origin_startpos lsn %X/%X", + originname, LSN_FORMAT_ARGS(*origin_startpos)); + + /* + * We are done with the initial data synchronization, update the state. + */ + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->relstate = SUBREL_STATE_SYNCWAIT; + MyLogicalRepWorker->relstate_lsn = *origin_startpos; + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + /* + * Finally, wait until the leader apply worker tells us to catch up and + * then return to let LogicalRepApplyLoop do it. + */ + wait_for_worker_state_change(SUBREL_STATE_CATCHUP); + return slotname; +} + +/* + * Common code to fetch the up-to-date sync state info into the static lists. + * + * Returns true if subscription has 1 or more tables, else false. + * + * Note: If this function started the transaction (indicated by the parameter) + * then it is the caller's responsibility to commit it. + */ +static bool +FetchTableStates(bool *started_tx) +{ + static bool has_subrels = false; + + *started_tx = false; + + if (!table_states_valid) + { + MemoryContext oldctx; + List *rstates; + ListCell *lc; + SubscriptionRelState *rstate; + + /* Clean the old lists. */ + list_free_deep(table_states_not_ready); + table_states_not_ready = NIL; + + if (!IsTransactionState()) + { + StartTransactionCommand(); + *started_tx = true; + } + + /* Fetch all non-ready tables. */ + rstates = GetSubscriptionRelations(MySubscription->oid, true); + + /* Allocate the tracking info in a permanent memory context. */ + oldctx = MemoryContextSwitchTo(CacheMemoryContext); + foreach(lc, rstates) + { + rstate = palloc(sizeof(SubscriptionRelState)); + memcpy(rstate, lfirst(lc), sizeof(SubscriptionRelState)); + table_states_not_ready = lappend(table_states_not_ready, rstate); + } + MemoryContextSwitchTo(oldctx); + + /* + * Does the subscription have tables? + * + * If there were not-READY relations found then we know it does. But + * if table_states_not_ready was empty we still need to check again to + * see if there are 0 tables. + */ + has_subrels = (table_states_not_ready != NIL) || + HasSubscriptionRelations(MySubscription->oid); + + table_states_valid = true; + } + + return has_subrels; +} + +/* + * If the subscription has no tables then return false. + * + * Otherwise, are all tablesyncs READY? + * + * Note: This function is not suitable to be called from outside of apply or + * tablesync workers because MySubscription needs to be already initialized. + */ +bool +AllTablesyncsReady(void) +{ + bool started_tx = false; + bool has_subrels = false; + + /* We need up-to-date sync state info for subscription tables here. */ + has_subrels = FetchTableStates(&started_tx); + + if (started_tx) + { + CommitTransactionCommand(); + pgstat_report_stat(true); + } + + /* + * Return false when there are no tables in subscription or not all tables + * are in ready state; true otherwise. + */ + return has_subrels && (table_states_not_ready == NIL); +} + +/* + * Update the two_phase state of the specified subscription in pg_subscription. + */ +void +UpdateTwoPhaseState(Oid suboid, char new_state) +{ + Relation rel; + HeapTuple tup; + bool nulls[Natts_pg_subscription]; + bool replaces[Natts_pg_subscription]; + Datum values[Natts_pg_subscription]; + + Assert(new_state == LOGICALREP_TWOPHASE_STATE_DISABLED || + new_state == LOGICALREP_TWOPHASE_STATE_PENDING || + new_state == LOGICALREP_TWOPHASE_STATE_ENABLED); + + rel = table_open(SubscriptionRelationId, RowExclusiveLock); + tup = SearchSysCacheCopy1(SUBSCRIPTIONOID, ObjectIdGetDatum(suboid)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, + "cache lookup failed for subscription oid %u", + suboid); + + /* Form a new tuple. */ + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + memset(replaces, false, sizeof(replaces)); + + /* And update/set two_phase state */ + values[Anum_pg_subscription_subtwophasestate - 1] = CharGetDatum(new_state); + replaces[Anum_pg_subscription_subtwophasestate - 1] = true; + + tup = heap_modify_tuple(tup, RelationGetDescr(rel), + values, nulls, replaces); + CatalogTupleUpdate(rel, &tup->t_self, tup); + + heap_freetuple(tup); + table_close(rel, RowExclusiveLock); +} diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c new file mode 100644 index 0000000..832b1cf --- /dev/null +++ b/src/backend/replication/logical/worker.c @@ -0,0 +1,5113 @@ +/*------------------------------------------------------------------------- + * worker.c + * PostgreSQL logical replication worker (apply) + * + * Copyright (c) 2016-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/worker.c + * + * NOTES + * This file contains the worker which applies logical changes as they come + * from remote logical replication stream. + * + * The main worker (apply) is started by logical replication worker + * launcher for every enabled subscription in a database. It uses + * walsender protocol to communicate with publisher. + * + * This module includes server facing code and shares libpqwalreceiver + * module with walreceiver for providing the libpq specific functionality. + * + * + * STREAMED TRANSACTIONS + * --------------------- + * Streamed transactions (large transactions exceeding a memory limit on the + * upstream) are applied using one of two approaches: + * + * 1) Write to temporary files and apply when the final commit arrives + * + * This approach is used when the user has set the subscription's streaming + * option as on. + * + * Unlike the regular (non-streamed) case, handling streamed transactions has + * to handle aborts of both the toplevel transaction and subtransactions. This + * is achieved by tracking offsets for subtransactions, which is then used + * to truncate the file with serialized changes. + * + * The files are placed in tmp file directory by default, and the filenames + * include both the XID of the toplevel transaction and OID of the + * subscription. This is necessary so that different workers processing a + * remote transaction with the same XID doesn't interfere. + * + * We use BufFiles instead of using normal temporary files because (a) the + * BufFile infrastructure supports temporary files that exceed the OS file size + * limit, (b) provides a way for automatic clean up on the error and (c) provides + * a way to survive these files across local transactions and allow to open and + * close at stream start and close. We decided to use FileSet + * infrastructure as without that it deletes the files on the closure of the + * file and if we decide to keep stream files open across the start/stop stream + * then it will consume a lot of memory (more than 8K for each BufFile and + * there could be multiple such BufFiles as the subscriber could receive + * multiple start/stop streams for different transactions before getting the + * commit). Moreover, if we don't use FileSet then we also need to invent + * a new way to pass filenames to BufFile APIs so that we are allowed to open + * the file we desired across multiple stream-open calls for the same + * transaction. + * + * 2) Parallel apply workers. + * + * This approach is used when the user has set the subscription's streaming + * option as parallel. See logical/applyparallelworker.c for information about + * this approach. + * + * TWO_PHASE TRANSACTIONS + * ---------------------- + * Two phase transactions are replayed at prepare and then committed or + * rolled back at commit prepared and rollback prepared respectively. It is + * possible to have a prepared transaction that arrives at the apply worker + * when the tablesync is busy doing the initial copy. In this case, the apply + * worker skips all the prepared operations [e.g. inserts] while the tablesync + * is still busy (see the condition of should_apply_changes_for_rel). The + * tablesync worker might not get such a prepared transaction because say it + * was prior to the initial consistent point but might have got some later + * commits. Now, the tablesync worker will exit without doing anything for the + * prepared transaction skipped by the apply worker as the sync location for it + * will be already ahead of the apply worker's current location. This would lead + * to an "empty prepare", because later when the apply worker does the commit + * prepare, there is nothing in it (the inserts were skipped earlier). + * + * To avoid this, and similar prepare confusions the subscription's two_phase + * commit is enabled only after the initial sync is over. The two_phase option + * has been implemented as a tri-state with values DISABLED, PENDING, and + * ENABLED. + * + * Even if the user specifies they want a subscription with two_phase = on, + * internally it will start with a tri-state of PENDING which only becomes + * ENABLED after all tablesync initializations are completed - i.e. when all + * tablesync workers have reached their READY state. In other words, the value + * PENDING is only a temporary state for subscription start-up. + * + * Until the two_phase is properly available (ENABLED) the subscription will + * behave as if two_phase = off. When the apply worker detects that all + * tablesyncs have become READY (while the tri-state was PENDING) it will + * restart the apply worker process. This happens in + * process_syncing_tables_for_apply. + * + * When the (re-started) apply worker finds that all tablesyncs are READY for a + * two_phase tri-state of PENDING it start streaming messages with the + * two_phase option which in turn enables the decoding of two-phase commits at + * the publisher. Then, it updates the tri-state value from PENDING to ENABLED. + * Now, it is possible that during the time we have not enabled two_phase, the + * publisher (replication server) would have skipped some prepares but we + * ensure that such prepares are sent along with commit prepare, see + * ReorderBufferFinishPrepared. + * + * If the subscription has no tables then a two_phase tri-state PENDING is + * left unchanged. This lets the user still do an ALTER SUBSCRIPTION REFRESH + * PUBLICATION which might otherwise be disallowed (see below). + * + * If ever a user needs to be aware of the tri-state value, they can fetch it + * from the pg_subscription catalog (see column subtwophasestate). + * + * We don't allow to toggle two_phase option of a subscription because it can + * lead to an inconsistent replica. Consider, initially, it was on and we have + * received some prepare then we turn it off, now at commit time the server + * will send the entire transaction data along with the commit. With some more + * analysis, we can allow changing this option from off to on but not sure if + * that alone would be useful. + * + * Finally, to avoid problems mentioned in previous paragraphs from any + * subsequent (not READY) tablesyncs (need to toggle two_phase option from 'on' + * to 'off' and then again back to 'on') there is a restriction for + * ALTER SUBSCRIPTION REFRESH PUBLICATION. This command is not permitted when + * the two_phase tri-state is ENABLED, except when copy_data = false. + * + * We can get prepare of the same GID more than once for the genuine cases + * where we have defined multiple subscriptions for publications on the same + * server and prepared transaction has operations on tables subscribed to those + * subscriptions. For such cases, if we use the GID sent by publisher one of + * the prepares will be successful and others will fail, in which case the + * server will send them again. Now, this can lead to a deadlock if user has + * set synchronous_standby_names for all the subscriptions on subscriber. To + * avoid such deadlocks, we generate a unique GID (consisting of the + * subscription oid and the xid of the prepared transaction) for each prepare + * transaction on the subscriber. + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <sys/stat.h> +#include <unistd.h> + +#include "access/genam.h" +#include "access/table.h" +#include "access/tableam.h" +#include "access/twophase.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "catalog/catalog.h" +#include "catalog/indexing.h" +#include "catalog/namespace.h" +#include "catalog/partition.h" +#include "catalog/pg_inherits.h" +#include "catalog/pg_subscription.h" +#include "catalog/pg_subscription_rel.h" +#include "catalog/pg_tablespace.h" +#include "commands/tablecmds.h" +#include "commands/tablespace.h" +#include "commands/trigger.h" +#include "executor/executor.h" +#include "executor/execPartition.h" +#include "executor/nodeModifyTable.h" +#include "funcapi.h" +#include "libpq/pqformat.h" +#include "libpq/pqsignal.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "optimizer/optimizer.h" +#include "parser/parse_relation.h" +#include "pgstat.h" +#include "postmaster/bgworker.h" +#include "postmaster/interrupt.h" +#include "postmaster/postmaster.h" +#include "postmaster/walwriter.h" +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/logicallauncher.h" +#include "replication/logicalproto.h" +#include "replication/logicalrelation.h" +#include "replication/logicalworker.h" +#include "replication/origin.h" +#include "replication/reorderbuffer.h" +#include "replication/snapbuild.h" +#include "replication/walreceiver.h" +#include "replication/worker_internal.h" +#include "rewrite/rewriteHandler.h" +#include "storage/buffile.h" +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lmgr.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "tcop/tcopprot.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/catcache.h" +#include "utils/dynahash.h" +#include "utils/datum.h" +#include "utils/fmgroids.h" +#include "utils/guc.h" +#include "utils/inval.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/pg_lsn.h" +#include "utils/rel.h" +#include "utils/rls.h" +#include "utils/syscache.h" +#include "utils/timeout.h" +#include "utils/usercontext.h" + +#define NAPTIME_PER_CYCLE 1000 /* max sleep time between cycles (1s) */ + +typedef struct FlushPosition +{ + dlist_node node; + XLogRecPtr local_end; + XLogRecPtr remote_end; +} FlushPosition; + +static dlist_head lsn_mapping = DLIST_STATIC_INIT(lsn_mapping); + +typedef struct ApplyExecutionData +{ + EState *estate; /* executor state, used to track resources */ + + LogicalRepRelMapEntry *targetRel; /* replication target rel */ + ResultRelInfo *targetRelInfo; /* ResultRelInfo for same */ + + /* These fields are used when the target relation is partitioned: */ + ModifyTableState *mtstate; /* dummy ModifyTable state */ + PartitionTupleRouting *proute; /* partition routing info */ +} ApplyExecutionData; + +/* Struct for saving and restoring apply errcontext information */ +typedef struct ApplyErrorCallbackArg +{ + LogicalRepMsgType command; /* 0 if invalid */ + LogicalRepRelMapEntry *rel; + + /* Remote node information */ + int remote_attnum; /* -1 if invalid */ + TransactionId remote_xid; + XLogRecPtr finish_lsn; + char *origin_name; +} ApplyErrorCallbackArg; + +/* + * The action to be taken for the changes in the transaction. + * + * TRANS_LEADER_APPLY: + * This action means that we are in the leader apply worker or table sync + * worker. The changes of the transaction are either directly applied or + * are read from temporary files (for streaming transactions) and then + * applied by the worker. + * + * TRANS_LEADER_SERIALIZE: + * This action means that we are in the leader apply worker or table sync + * worker. Changes are written to temporary files and then applied when the + * final commit arrives. + * + * TRANS_LEADER_SEND_TO_PARALLEL: + * This action means that we are in the leader apply worker and need to send + * the changes to the parallel apply worker. + * + * TRANS_LEADER_PARTIAL_SERIALIZE: + * This action means that we are in the leader apply worker and have sent some + * changes directly to the parallel apply worker and the remaining changes are + * serialized to a file, due to timeout while sending data. The parallel apply + * worker will apply these serialized changes when the final commit arrives. + * + * We can't use TRANS_LEADER_SERIALIZE for this case because, in addition to + * serializing changes, the leader worker also needs to serialize the + * STREAM_XXX message to a file, and wait for the parallel apply worker to + * finish the transaction when processing the transaction finish command. So + * this new action was introduced to keep the code and logic clear. + * + * TRANS_PARALLEL_APPLY: + * This action means that we are in the parallel apply worker and changes of + * the transaction are applied directly by the worker. + */ +typedef enum +{ + /* The action for non-streaming transactions. */ + TRANS_LEADER_APPLY, + + /* Actions for streaming transactions. */ + TRANS_LEADER_SERIALIZE, + TRANS_LEADER_SEND_TO_PARALLEL, + TRANS_LEADER_PARTIAL_SERIALIZE, + TRANS_PARALLEL_APPLY +} TransApplyAction; + +/* errcontext tracker */ +ApplyErrorCallbackArg apply_error_callback_arg = +{ + .command = 0, + .rel = NULL, + .remote_attnum = -1, + .remote_xid = InvalidTransactionId, + .finish_lsn = InvalidXLogRecPtr, + .origin_name = NULL, +}; + +ErrorContextCallback *apply_error_context_stack = NULL; + +MemoryContext ApplyMessageContext = NULL; +MemoryContext ApplyContext = NULL; + +/* per stream context for streaming transactions */ +static MemoryContext LogicalStreamingContext = NULL; + +WalReceiverConn *LogRepWorkerWalRcvConn = NULL; + +Subscription *MySubscription = NULL; +static bool MySubscriptionValid = false; + +static List *on_commit_wakeup_workers_subids = NIL; + +bool in_remote_transaction = false; +static XLogRecPtr remote_final_lsn = InvalidXLogRecPtr; + +/* fields valid only when processing streamed transaction */ +static bool in_streamed_transaction = false; + +static TransactionId stream_xid = InvalidTransactionId; + +/* + * The number of changes applied by parallel apply worker during one streaming + * block. + */ +static uint32 parallel_stream_nchanges = 0; + +/* Are we initializing a apply worker? */ +bool InitializingApplyWorker = false; + +/* + * We enable skipping all data modification changes (INSERT, UPDATE, etc.) for + * the subscription if the remote transaction's finish LSN matches the subskiplsn. + * Once we start skipping changes, we don't stop it until we skip all changes of + * the transaction even if pg_subscription is updated and MySubscription->skiplsn + * gets changed or reset during that. Also, in streaming transaction cases (streaming = on), + * we don't skip receiving and spooling the changes since we decide whether or not + * to skip applying the changes when starting to apply changes. The subskiplsn is + * cleared after successfully skipping the transaction or applying non-empty + * transaction. The latter prevents the mistakenly specified subskiplsn from + * being left. Note that we cannot skip the streaming transactions when using + * parallel apply workers because we cannot get the finish LSN before applying + * the changes. So, we don't start parallel apply worker when finish LSN is set + * by the user. + */ +static XLogRecPtr skip_xact_finish_lsn = InvalidXLogRecPtr; +#define is_skipping_changes() (unlikely(!XLogRecPtrIsInvalid(skip_xact_finish_lsn))) + +/* BufFile handle of the current streaming file */ +static BufFile *stream_fd = NULL; + +typedef struct SubXactInfo +{ + TransactionId xid; /* XID of the subxact */ + int fileno; /* file number in the buffile */ + off_t offset; /* offset in the file */ +} SubXactInfo; + +/* Sub-transaction data for the current streaming transaction */ +typedef struct ApplySubXactData +{ + uint32 nsubxacts; /* number of sub-transactions */ + uint32 nsubxacts_max; /* current capacity of subxacts */ + TransactionId subxact_last; /* xid of the last sub-transaction */ + SubXactInfo *subxacts; /* sub-xact offset in changes file */ +} ApplySubXactData; + +static ApplySubXactData subxact_data = {0, 0, InvalidTransactionId, NULL}; + +static inline void subxact_filename(char *path, Oid subid, TransactionId xid); +static inline void changes_filename(char *path, Oid subid, TransactionId xid); + +/* + * Information about subtransactions of a given toplevel transaction. + */ +static void subxact_info_write(Oid subid, TransactionId xid); +static void subxact_info_read(Oid subid, TransactionId xid); +static void subxact_info_add(TransactionId xid); +static inline void cleanup_subxact_info(void); + +/* + * Serialize and deserialize changes for a toplevel transaction. + */ +static void stream_open_file(Oid subid, TransactionId xid, + bool first_segment); +static void stream_write_change(char action, StringInfo s); +static void stream_open_and_write_change(TransactionId xid, char action, StringInfo s); +static void stream_close_file(void); + +static void send_feedback(XLogRecPtr recvpos, bool force, bool requestReply); + +static void DisableSubscriptionAndExit(void); + +static void apply_handle_commit_internal(LogicalRepCommitData *commit_data); +static void apply_handle_insert_internal(ApplyExecutionData *edata, + ResultRelInfo *relinfo, + TupleTableSlot *remoteslot); +static void apply_handle_update_internal(ApplyExecutionData *edata, + ResultRelInfo *relinfo, + TupleTableSlot *remoteslot, + LogicalRepTupleData *newtup, + Oid localindexoid); +static void apply_handle_delete_internal(ApplyExecutionData *edata, + ResultRelInfo *relinfo, + TupleTableSlot *remoteslot, + Oid localindexoid); +static bool FindReplTupleInLocalRel(ApplyExecutionData *edata, Relation localrel, + LogicalRepRelation *remoterel, + Oid localidxoid, + TupleTableSlot *remoteslot, + TupleTableSlot **localslot); +static void apply_handle_tuple_routing(ApplyExecutionData *edata, + TupleTableSlot *remoteslot, + LogicalRepTupleData *newtup, + CmdType operation); + +/* Compute GID for two_phase transactions */ +static void TwoPhaseTransactionGid(Oid subid, TransactionId xid, char *gid, int szgid); + +/* Functions for skipping changes */ +static void maybe_start_skipping_changes(XLogRecPtr finish_lsn); +static void stop_skipping_changes(void); +static void clear_subscription_skip_lsn(XLogRecPtr finish_lsn); + +/* Functions for apply error callback */ +static inline void set_apply_error_context_xact(TransactionId xid, XLogRecPtr lsn); +static inline void reset_apply_error_context_info(void); + +static TransApplyAction get_transaction_apply_action(TransactionId xid, + ParallelApplyWorkerInfo **winfo); + +/* + * Form the origin name for the subscription. + * + * This is a common function for tablesync and other workers. Tablesync workers + * must pass a valid relid. Other callers must pass relid = InvalidOid. + * + * Return the name in the supplied buffer. + */ +void +ReplicationOriginNameForLogicalRep(Oid suboid, Oid relid, + char *originname, Size szoriginname) +{ + if (OidIsValid(relid)) + { + /* Replication origin name for tablesync workers. */ + snprintf(originname, szoriginname, "pg_%u_%u", suboid, relid); + } + else + { + /* Replication origin name for non-tablesync workers. */ + snprintf(originname, szoriginname, "pg_%u", suboid); + } +} + +/* + * Should this worker apply changes for given relation. + * + * This is mainly needed for initial relation data sync as that runs in + * separate worker process running in parallel and we need some way to skip + * changes coming to the leader apply worker during the sync of a table. + * + * Note we need to do smaller or equals comparison for SYNCDONE state because + * it might hold position of end of initial slot consistent point WAL + * record + 1 (ie start of next record) and next record can be COMMIT of + * transaction we are now processing (which is what we set remote_final_lsn + * to in apply_handle_begin). + * + * Note that for streaming transactions that are being applied in the parallel + * apply worker, we disallow applying changes if the target table in the + * subscription is not in the READY state, because we cannot decide whether to + * apply the change as we won't know remote_final_lsn by that time. + * + * We already checked this in pa_can_start() before assigning the + * streaming transaction to the parallel worker, but it also needs to be + * checked here because if the user executes ALTER SUBSCRIPTION ... REFRESH + * PUBLICATION in parallel, the new table can be added to pg_subscription_rel + * while applying this transaction. + */ +static bool +should_apply_changes_for_rel(LogicalRepRelMapEntry *rel) +{ + if (am_tablesync_worker()) + return MyLogicalRepWorker->relid == rel->localreloid; + else if (am_parallel_apply_worker()) + { + /* We don't synchronize rel's that are in unknown state. */ + if (rel->state != SUBREL_STATE_READY && + rel->state != SUBREL_STATE_UNKNOWN) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication parallel apply worker for subscription \"%s\" will stop", + MySubscription->name), + errdetail("Cannot handle streamed replication transactions using parallel apply workers until all tables have been synchronized."))); + + return rel->state == SUBREL_STATE_READY; + } + else + return (rel->state == SUBREL_STATE_READY || + (rel->state == SUBREL_STATE_SYNCDONE && + rel->statelsn <= remote_final_lsn)); +} + +/* + * Begin one step (one INSERT, UPDATE, etc) of a replication transaction. + * + * Start a transaction, if this is the first step (else we keep using the + * existing transaction). + * Also provide a global snapshot and ensure we run in ApplyMessageContext. + */ +static void +begin_replication_step(void) +{ + SetCurrentStatementStartTimestamp(); + + if (!IsTransactionState()) + { + StartTransactionCommand(); + maybe_reread_subscription(); + } + + PushActiveSnapshot(GetTransactionSnapshot()); + + MemoryContextSwitchTo(ApplyMessageContext); +} + +/* + * Finish up one step of a replication transaction. + * Callers of begin_replication_step() must also call this. + * + * We don't close out the transaction here, but we should increment + * the command counter to make the effects of this step visible. + */ +static void +end_replication_step(void) +{ + PopActiveSnapshot(); + + CommandCounterIncrement(); +} + +/* + * Handle streamed transactions for both the leader apply worker and the + * parallel apply workers. + * + * In the streaming case (receiving a block of the streamed transaction), for + * serialize mode, simply redirect it to a file for the proper toplevel + * transaction, and for parallel mode, the leader apply worker will send the + * changes to parallel apply workers and the parallel apply worker will define + * savepoints if needed. (LOGICAL_REP_MSG_RELATION or LOGICAL_REP_MSG_TYPE + * messages will be applied by both leader apply worker and parallel apply + * workers). + * + * Returns true for streamed transactions (when the change is either serialized + * to file or sent to parallel apply worker), false otherwise (regular mode or + * needs to be processed by parallel apply worker). + * + * Exception: If the message being processed is LOGICAL_REP_MSG_RELATION + * or LOGICAL_REP_MSG_TYPE, return false even if the message needs to be sent + * to a parallel apply worker. + */ +static bool +handle_streamed_transaction(LogicalRepMsgType action, StringInfo s) +{ + TransactionId current_xid; + ParallelApplyWorkerInfo *winfo; + TransApplyAction apply_action; + StringInfoData original_msg; + + apply_action = get_transaction_apply_action(stream_xid, &winfo); + + /* not in streaming mode */ + if (apply_action == TRANS_LEADER_APPLY) + return false; + + Assert(TransactionIdIsValid(stream_xid)); + + /* + * The parallel apply worker needs the xid in this message to decide + * whether to define a savepoint, so save the original message that has + * not moved the cursor after the xid. We will serialize this message to a + * file in PARTIAL_SERIALIZE mode. + */ + original_msg = *s; + + /* + * We should have received XID of the subxact as the first part of the + * message, so extract it. + */ + current_xid = pq_getmsgint(s, 4); + + if (!TransactionIdIsValid(current_xid)) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("invalid transaction ID in streamed replication transaction"))); + + switch (apply_action) + { + case TRANS_LEADER_SERIALIZE: + Assert(stream_fd); + + /* Add the new subxact to the array (unless already there). */ + subxact_info_add(current_xid); + + /* Write the change to the current file */ + stream_write_change(action, s); + return true; + + case TRANS_LEADER_SEND_TO_PARALLEL: + Assert(winfo); + + /* + * XXX The publisher side doesn't always send relation/type update + * messages after the streaming transaction, so also update the + * relation/type in leader apply worker. See function + * cleanup_rel_sync_cache. + */ + if (pa_send_data(winfo, s->len, s->data)) + return (action != LOGICAL_REP_MSG_RELATION && + action != LOGICAL_REP_MSG_TYPE); + + /* + * Switch to serialize mode when we are not able to send the + * change to parallel apply worker. + */ + pa_switch_to_partial_serialize(winfo, false); + + /* fall through */ + case TRANS_LEADER_PARTIAL_SERIALIZE: + stream_write_change(action, &original_msg); + + /* Same reason as TRANS_LEADER_SEND_TO_PARALLEL case. */ + return (action != LOGICAL_REP_MSG_RELATION && + action != LOGICAL_REP_MSG_TYPE); + + case TRANS_PARALLEL_APPLY: + parallel_stream_nchanges += 1; + + /* Define a savepoint for a subxact if needed. */ + pa_start_subtrans(current_xid, stream_xid); + return false; + + default: + elog(ERROR, "unexpected apply action: %d", (int) apply_action); + return false; /* silence compiler warning */ + } +} + +/* + * Executor state preparation for evaluation of constraint expressions, + * indexes and triggers for the specified relation. + * + * Note that the caller must open and close any indexes to be updated. + */ +static ApplyExecutionData * +create_edata_for_relation(LogicalRepRelMapEntry *rel) +{ + ApplyExecutionData *edata; + EState *estate; + RangeTblEntry *rte; + List *perminfos = NIL; + ResultRelInfo *resultRelInfo; + + edata = (ApplyExecutionData *) palloc0(sizeof(ApplyExecutionData)); + edata->targetRel = rel; + + edata->estate = estate = CreateExecutorState(); + + rte = makeNode(RangeTblEntry); + rte->rtekind = RTE_RELATION; + rte->relid = RelationGetRelid(rel->localrel); + rte->relkind = rel->localrel->rd_rel->relkind; + rte->rellockmode = AccessShareLock; + + addRTEPermissionInfo(&perminfos, rte); + + ExecInitRangeTable(estate, list_make1(rte), perminfos); + + edata->targetRelInfo = resultRelInfo = makeNode(ResultRelInfo); + + /* + * Use Relation opened by logicalrep_rel_open() instead of opening it + * again. + */ + InitResultRelInfo(resultRelInfo, rel->localrel, 1, NULL, 0); + + /* + * We put the ResultRelInfo in the es_opened_result_relations list, even + * though we don't populate the es_result_relations array. That's a bit + * bogus, but it's enough to make ExecGetTriggerResultRel() find them. + * + * ExecOpenIndices() is not called here either, each execution path doing + * an apply operation being responsible for that. + */ + estate->es_opened_result_relations = + lappend(estate->es_opened_result_relations, resultRelInfo); + + estate->es_output_cid = GetCurrentCommandId(true); + + /* Prepare to catch AFTER triggers. */ + AfterTriggerBeginQuery(); + + /* other fields of edata remain NULL for now */ + + return edata; +} + +/* + * Finish any operations related to the executor state created by + * create_edata_for_relation(). + */ +static void +finish_edata(ApplyExecutionData *edata) +{ + EState *estate = edata->estate; + + /* Handle any queued AFTER triggers. */ + AfterTriggerEndQuery(estate); + + /* Shut down tuple routing, if any was done. */ + if (edata->proute) + ExecCleanupTupleRouting(edata->mtstate, edata->proute); + + /* + * Cleanup. It might seem that we should call ExecCloseResultRelations() + * here, but we intentionally don't. It would close the rel we added to + * es_opened_result_relations above, which is wrong because we took no + * corresponding refcount. We rely on ExecCleanupTupleRouting() to close + * any other relations opened during execution. + */ + ExecResetTupleTable(estate->es_tupleTable, false); + FreeExecutorState(estate); + pfree(edata); +} + +/* + * Executes default values for columns for which we can't map to remote + * relation columns. + * + * This allows us to support tables which have more columns on the downstream + * than on the upstream. + */ +static void +slot_fill_defaults(LogicalRepRelMapEntry *rel, EState *estate, + TupleTableSlot *slot) +{ + TupleDesc desc = RelationGetDescr(rel->localrel); + int num_phys_attrs = desc->natts; + int i; + int attnum, + num_defaults = 0; + int *defmap; + ExprState **defexprs; + ExprContext *econtext; + + econtext = GetPerTupleExprContext(estate); + + /* We got all the data via replication, no need to evaluate anything. */ + if (num_phys_attrs == rel->remoterel.natts) + return; + + defmap = (int *) palloc(num_phys_attrs * sizeof(int)); + defexprs = (ExprState **) palloc(num_phys_attrs * sizeof(ExprState *)); + + Assert(rel->attrmap->maplen == num_phys_attrs); + for (attnum = 0; attnum < num_phys_attrs; attnum++) + { + Expr *defexpr; + + if (TupleDescAttr(desc, attnum)->attisdropped || TupleDescAttr(desc, attnum)->attgenerated) + continue; + + if (rel->attrmap->attnums[attnum] >= 0) + continue; + + defexpr = (Expr *) build_column_default(rel->localrel, attnum + 1); + + if (defexpr != NULL) + { + /* Run the expression through planner */ + defexpr = expression_planner(defexpr); + + /* Initialize executable expression in copycontext */ + defexprs[num_defaults] = ExecInitExpr(defexpr, NULL); + defmap[num_defaults] = attnum; + num_defaults++; + } + } + + for (i = 0; i < num_defaults; i++) + slot->tts_values[defmap[i]] = + ExecEvalExpr(defexprs[i], econtext, &slot->tts_isnull[defmap[i]]); +} + +/* + * Store tuple data into slot. + * + * Incoming data can be either text or binary format. + */ +static void +slot_store_data(TupleTableSlot *slot, LogicalRepRelMapEntry *rel, + LogicalRepTupleData *tupleData) +{ + int natts = slot->tts_tupleDescriptor->natts; + int i; + + ExecClearTuple(slot); + + /* Call the "in" function for each non-dropped, non-null attribute */ + Assert(natts == rel->attrmap->maplen); + for (i = 0; i < natts; i++) + { + Form_pg_attribute att = TupleDescAttr(slot->tts_tupleDescriptor, i); + int remoteattnum = rel->attrmap->attnums[i]; + + if (!att->attisdropped && remoteattnum >= 0) + { + StringInfo colvalue = &tupleData->colvalues[remoteattnum]; + + Assert(remoteattnum < tupleData->ncols); + + /* Set attnum for error callback */ + apply_error_callback_arg.remote_attnum = remoteattnum; + + if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_TEXT) + { + Oid typinput; + Oid typioparam; + + getTypeInputInfo(att->atttypid, &typinput, &typioparam); + slot->tts_values[i] = + OidInputFunctionCall(typinput, colvalue->data, + typioparam, att->atttypmod); + slot->tts_isnull[i] = false; + } + else if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_BINARY) + { + Oid typreceive; + Oid typioparam; + + /* + * In some code paths we may be asked to re-parse the same + * tuple data. Reset the StringInfo's cursor so that works. + */ + colvalue->cursor = 0; + + getTypeBinaryInputInfo(att->atttypid, &typreceive, &typioparam); + slot->tts_values[i] = + OidReceiveFunctionCall(typreceive, colvalue, + typioparam, att->atttypmod); + + /* Trouble if it didn't eat the whole buffer */ + if (colvalue->cursor != colvalue->len) + ereport(ERROR, + (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION), + errmsg("incorrect binary data format in logical replication column %d", + remoteattnum + 1))); + slot->tts_isnull[i] = false; + } + else + { + /* + * NULL value from remote. (We don't expect to see + * LOGICALREP_COLUMN_UNCHANGED here, but if we do, treat it as + * NULL.) + */ + slot->tts_values[i] = (Datum) 0; + slot->tts_isnull[i] = true; + } + + /* Reset attnum for error callback */ + apply_error_callback_arg.remote_attnum = -1; + } + else + { + /* + * We assign NULL to dropped attributes and missing values + * (missing values should be later filled using + * slot_fill_defaults). + */ + slot->tts_values[i] = (Datum) 0; + slot->tts_isnull[i] = true; + } + } + + ExecStoreVirtualTuple(slot); +} + +/* + * Replace updated columns with data from the LogicalRepTupleData struct. + * This is somewhat similar to heap_modify_tuple but also calls the type + * input functions on the user data. + * + * "slot" is filled with a copy of the tuple in "srcslot", replacing + * columns provided in "tupleData" and leaving others as-is. + * + * Caution: unreplaced pass-by-ref columns in "slot" will point into the + * storage for "srcslot". This is OK for current usage, but someday we may + * need to materialize "slot" at the end to make it independent of "srcslot". + */ +static void +slot_modify_data(TupleTableSlot *slot, TupleTableSlot *srcslot, + LogicalRepRelMapEntry *rel, + LogicalRepTupleData *tupleData) +{ + int natts = slot->tts_tupleDescriptor->natts; + int i; + + /* We'll fill "slot" with a virtual tuple, so we must start with ... */ + ExecClearTuple(slot); + + /* + * Copy all the column data from srcslot, so that we'll have valid values + * for unreplaced columns. + */ + Assert(natts == srcslot->tts_tupleDescriptor->natts); + slot_getallattrs(srcslot); + memcpy(slot->tts_values, srcslot->tts_values, natts * sizeof(Datum)); + memcpy(slot->tts_isnull, srcslot->tts_isnull, natts * sizeof(bool)); + + /* Call the "in" function for each replaced attribute */ + Assert(natts == rel->attrmap->maplen); + for (i = 0; i < natts; i++) + { + Form_pg_attribute att = TupleDescAttr(slot->tts_tupleDescriptor, i); + int remoteattnum = rel->attrmap->attnums[i]; + + if (remoteattnum < 0) + continue; + + Assert(remoteattnum < tupleData->ncols); + + if (tupleData->colstatus[remoteattnum] != LOGICALREP_COLUMN_UNCHANGED) + { + StringInfo colvalue = &tupleData->colvalues[remoteattnum]; + + /* Set attnum for error callback */ + apply_error_callback_arg.remote_attnum = remoteattnum; + + if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_TEXT) + { + Oid typinput; + Oid typioparam; + + getTypeInputInfo(att->atttypid, &typinput, &typioparam); + slot->tts_values[i] = + OidInputFunctionCall(typinput, colvalue->data, + typioparam, att->atttypmod); + slot->tts_isnull[i] = false; + } + else if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_BINARY) + { + Oid typreceive; + Oid typioparam; + + /* + * In some code paths we may be asked to re-parse the same + * tuple data. Reset the StringInfo's cursor so that works. + */ + colvalue->cursor = 0; + + getTypeBinaryInputInfo(att->atttypid, &typreceive, &typioparam); + slot->tts_values[i] = + OidReceiveFunctionCall(typreceive, colvalue, + typioparam, att->atttypmod); + + /* Trouble if it didn't eat the whole buffer */ + if (colvalue->cursor != colvalue->len) + ereport(ERROR, + (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION), + errmsg("incorrect binary data format in logical replication column %d", + remoteattnum + 1))); + slot->tts_isnull[i] = false; + } + else + { + /* must be LOGICALREP_COLUMN_NULL */ + slot->tts_values[i] = (Datum) 0; + slot->tts_isnull[i] = true; + } + + /* Reset attnum for error callback */ + apply_error_callback_arg.remote_attnum = -1; + } + } + + /* And finally, declare that "slot" contains a valid virtual tuple */ + ExecStoreVirtualTuple(slot); +} + +/* + * Handle BEGIN message. + */ +static void +apply_handle_begin(StringInfo s) +{ + LogicalRepBeginData begin_data; + + /* There must not be an active streaming transaction. */ + Assert(!TransactionIdIsValid(stream_xid)); + + logicalrep_read_begin(s, &begin_data); + set_apply_error_context_xact(begin_data.xid, begin_data.final_lsn); + + remote_final_lsn = begin_data.final_lsn; + + maybe_start_skipping_changes(begin_data.final_lsn); + + in_remote_transaction = true; + + pgstat_report_activity(STATE_RUNNING, NULL); +} + +/* + * Handle COMMIT message. + * + * TODO, support tracking of multiple origins + */ +static void +apply_handle_commit(StringInfo s) +{ + LogicalRepCommitData commit_data; + + logicalrep_read_commit(s, &commit_data); + + if (commit_data.commit_lsn != remote_final_lsn) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("incorrect commit LSN %X/%X in commit message (expected %X/%X)", + LSN_FORMAT_ARGS(commit_data.commit_lsn), + LSN_FORMAT_ARGS(remote_final_lsn)))); + + apply_handle_commit_internal(&commit_data); + + /* Process any tables that are being synchronized in parallel. */ + process_syncing_tables(commit_data.end_lsn); + + pgstat_report_activity(STATE_IDLE, NULL); + reset_apply_error_context_info(); +} + +/* + * Handle BEGIN PREPARE message. + */ +static void +apply_handle_begin_prepare(StringInfo s) +{ + LogicalRepPreparedTxnData begin_data; + + /* Tablesync should never receive prepare. */ + if (am_tablesync_worker()) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("tablesync worker received a BEGIN PREPARE message"))); + + /* There must not be an active streaming transaction. */ + Assert(!TransactionIdIsValid(stream_xid)); + + logicalrep_read_begin_prepare(s, &begin_data); + set_apply_error_context_xact(begin_data.xid, begin_data.prepare_lsn); + + remote_final_lsn = begin_data.prepare_lsn; + + maybe_start_skipping_changes(begin_data.prepare_lsn); + + in_remote_transaction = true; + + pgstat_report_activity(STATE_RUNNING, NULL); +} + +/* + * Common function to prepare the GID. + */ +static void +apply_handle_prepare_internal(LogicalRepPreparedTxnData *prepare_data) +{ + char gid[GIDSIZE]; + + /* + * Compute unique GID for two_phase transactions. We don't use GID of + * prepared transaction sent by server as that can lead to deadlock when + * we have multiple subscriptions from same node point to publications on + * the same node. See comments atop worker.c + */ + TwoPhaseTransactionGid(MySubscription->oid, prepare_data->xid, + gid, sizeof(gid)); + + /* + * BeginTransactionBlock is necessary to balance the EndTransactionBlock + * called within the PrepareTransactionBlock below. + */ + if (!IsTransactionBlock()) + { + BeginTransactionBlock(); + CommitTransactionCommand(); /* Completes the preceding Begin command. */ + } + + /* + * Update origin state so we can restart streaming from correct position + * in case of crash. + */ + replorigin_session_origin_lsn = prepare_data->end_lsn; + replorigin_session_origin_timestamp = prepare_data->prepare_time; + + PrepareTransactionBlock(gid); +} + +/* + * Handle PREPARE message. + */ +static void +apply_handle_prepare(StringInfo s) +{ + LogicalRepPreparedTxnData prepare_data; + + logicalrep_read_prepare(s, &prepare_data); + + if (prepare_data.prepare_lsn != remote_final_lsn) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("incorrect prepare LSN %X/%X in prepare message (expected %X/%X)", + LSN_FORMAT_ARGS(prepare_data.prepare_lsn), + LSN_FORMAT_ARGS(remote_final_lsn)))); + + /* + * Unlike commit, here, we always prepare the transaction even though no + * change has happened in this transaction or all changes are skipped. It + * is done this way because at commit prepared time, we won't know whether + * we have skipped preparing a transaction because of those reasons. + * + * XXX, We can optimize such that at commit prepared time, we first check + * whether we have prepared the transaction or not but that doesn't seem + * worthwhile because such cases shouldn't be common. + */ + begin_replication_step(); + + apply_handle_prepare_internal(&prepare_data); + + end_replication_step(); + CommitTransactionCommand(); + pgstat_report_stat(false); + + store_flush_position(prepare_data.end_lsn, XactLastCommitEnd); + + in_remote_transaction = false; + + /* Process any tables that are being synchronized in parallel. */ + process_syncing_tables(prepare_data.end_lsn); + + /* + * Since we have already prepared the transaction, in a case where the + * server crashes before clearing the subskiplsn, it will be left but the + * transaction won't be resent. But that's okay because it's a rare case + * and the subskiplsn will be cleared when finishing the next transaction. + */ + stop_skipping_changes(); + clear_subscription_skip_lsn(prepare_data.prepare_lsn); + + pgstat_report_activity(STATE_IDLE, NULL); + reset_apply_error_context_info(); +} + +/* + * Handle a COMMIT PREPARED of a previously PREPARED transaction. + * + * Note that we don't need to wait here if the transaction was prepared in a + * parallel apply worker. In that case, we have already waited for the prepare + * to finish in apply_handle_stream_prepare() which will ensure all the + * operations in that transaction have happened in the subscriber, so no + * concurrent transaction can cause deadlock or transaction dependency issues. + */ +static void +apply_handle_commit_prepared(StringInfo s) +{ + LogicalRepCommitPreparedTxnData prepare_data; + char gid[GIDSIZE]; + + logicalrep_read_commit_prepared(s, &prepare_data); + set_apply_error_context_xact(prepare_data.xid, prepare_data.commit_lsn); + + /* Compute GID for two_phase transactions. */ + TwoPhaseTransactionGid(MySubscription->oid, prepare_data.xid, + gid, sizeof(gid)); + + /* There is no transaction when COMMIT PREPARED is called */ + begin_replication_step(); + + /* + * Update origin state so we can restart streaming from correct position + * in case of crash. + */ + replorigin_session_origin_lsn = prepare_data.end_lsn; + replorigin_session_origin_timestamp = prepare_data.commit_time; + + FinishPreparedTransaction(gid, true); + end_replication_step(); + CommitTransactionCommand(); + pgstat_report_stat(false); + + store_flush_position(prepare_data.end_lsn, XactLastCommitEnd); + in_remote_transaction = false; + + /* Process any tables that are being synchronized in parallel. */ + process_syncing_tables(prepare_data.end_lsn); + + clear_subscription_skip_lsn(prepare_data.end_lsn); + + pgstat_report_activity(STATE_IDLE, NULL); + reset_apply_error_context_info(); +} + +/* + * Handle a ROLLBACK PREPARED of a previously PREPARED TRANSACTION. + * + * Note that we don't need to wait here if the transaction was prepared in a + * parallel apply worker. In that case, we have already waited for the prepare + * to finish in apply_handle_stream_prepare() which will ensure all the + * operations in that transaction have happened in the subscriber, so no + * concurrent transaction can cause deadlock or transaction dependency issues. + */ +static void +apply_handle_rollback_prepared(StringInfo s) +{ + LogicalRepRollbackPreparedTxnData rollback_data; + char gid[GIDSIZE]; + + logicalrep_read_rollback_prepared(s, &rollback_data); + set_apply_error_context_xact(rollback_data.xid, rollback_data.rollback_end_lsn); + + /* Compute GID for two_phase transactions. */ + TwoPhaseTransactionGid(MySubscription->oid, rollback_data.xid, + gid, sizeof(gid)); + + /* + * It is possible that we haven't received prepare because it occurred + * before walsender reached a consistent point or the two_phase was still + * not enabled by that time, so in such cases, we need to skip rollback + * prepared. + */ + if (LookupGXact(gid, rollback_data.prepare_end_lsn, + rollback_data.prepare_time)) + { + /* + * Update origin state so we can restart streaming from correct + * position in case of crash. + */ + replorigin_session_origin_lsn = rollback_data.rollback_end_lsn; + replorigin_session_origin_timestamp = rollback_data.rollback_time; + + /* There is no transaction when ABORT/ROLLBACK PREPARED is called */ + begin_replication_step(); + FinishPreparedTransaction(gid, false); + end_replication_step(); + CommitTransactionCommand(); + + clear_subscription_skip_lsn(rollback_data.rollback_end_lsn); + } + + pgstat_report_stat(false); + + store_flush_position(rollback_data.rollback_end_lsn, XactLastCommitEnd); + in_remote_transaction = false; + + /* Process any tables that are being synchronized in parallel. */ + process_syncing_tables(rollback_data.rollback_end_lsn); + + pgstat_report_activity(STATE_IDLE, NULL); + reset_apply_error_context_info(); +} + +/* + * Handle STREAM PREPARE. + */ +static void +apply_handle_stream_prepare(StringInfo s) +{ + LogicalRepPreparedTxnData prepare_data; + ParallelApplyWorkerInfo *winfo; + TransApplyAction apply_action; + + /* Save the message before it is consumed. */ + StringInfoData original_msg = *s; + + if (in_streamed_transaction) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("STREAM PREPARE message without STREAM STOP"))); + + /* Tablesync should never receive prepare. */ + if (am_tablesync_worker()) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("tablesync worker received a STREAM PREPARE message"))); + + logicalrep_read_stream_prepare(s, &prepare_data); + set_apply_error_context_xact(prepare_data.xid, prepare_data.prepare_lsn); + + apply_action = get_transaction_apply_action(prepare_data.xid, &winfo); + + switch (apply_action) + { + case TRANS_LEADER_APPLY: + + /* + * The transaction has been serialized to file, so replay all the + * spooled operations. + */ + apply_spooled_messages(MyLogicalRepWorker->stream_fileset, + prepare_data.xid, prepare_data.prepare_lsn); + + /* Mark the transaction as prepared. */ + apply_handle_prepare_internal(&prepare_data); + + CommitTransactionCommand(); + + store_flush_position(prepare_data.end_lsn, XactLastCommitEnd); + + in_remote_transaction = false; + + /* Unlink the files with serialized changes and subxact info. */ + stream_cleanup_files(MyLogicalRepWorker->subid, prepare_data.xid); + + elog(DEBUG1, "finished processing the STREAM PREPARE command"); + break; + + case TRANS_LEADER_SEND_TO_PARALLEL: + Assert(winfo); + + if (pa_send_data(winfo, s->len, s->data)) + { + /* Finish processing the streaming transaction. */ + pa_xact_finish(winfo, prepare_data.end_lsn); + break; + } + + /* + * Switch to serialize mode when we are not able to send the + * change to parallel apply worker. + */ + pa_switch_to_partial_serialize(winfo, true); + + /* fall through */ + case TRANS_LEADER_PARTIAL_SERIALIZE: + Assert(winfo); + + stream_open_and_write_change(prepare_data.xid, + LOGICAL_REP_MSG_STREAM_PREPARE, + &original_msg); + + pa_set_fileset_state(winfo->shared, FS_SERIALIZE_DONE); + + /* Finish processing the streaming transaction. */ + pa_xact_finish(winfo, prepare_data.end_lsn); + break; + + case TRANS_PARALLEL_APPLY: + + /* + * If the parallel apply worker is applying spooled messages then + * close the file before preparing. + */ + if (stream_fd) + stream_close_file(); + + begin_replication_step(); + + /* Mark the transaction as prepared. */ + apply_handle_prepare_internal(&prepare_data); + + end_replication_step(); + + CommitTransactionCommand(); + + MyParallelShared->last_commit_end = XactLastCommitEnd; + + pa_set_xact_state(MyParallelShared, PARALLEL_TRANS_FINISHED); + pa_unlock_transaction(MyParallelShared->xid, AccessExclusiveLock); + + pa_reset_subtrans(); + + elog(DEBUG1, "finished processing the STREAM PREPARE command"); + break; + + default: + elog(ERROR, "unexpected apply action: %d", (int) apply_action); + break; + } + + pgstat_report_stat(false); + + /* Process any tables that are being synchronized in parallel. */ + process_syncing_tables(prepare_data.end_lsn); + + /* + * Similar to prepare case, the subskiplsn could be left in a case of + * server crash but it's okay. See the comments in apply_handle_prepare(). + */ + stop_skipping_changes(); + clear_subscription_skip_lsn(prepare_data.prepare_lsn); + + pgstat_report_activity(STATE_IDLE, NULL); + + reset_apply_error_context_info(); +} + +/* + * Handle ORIGIN message. + * + * TODO, support tracking of multiple origins + */ +static void +apply_handle_origin(StringInfo s) +{ + /* + * ORIGIN message can only come inside streaming transaction or inside + * remote transaction and before any actual writes. + */ + if (!in_streamed_transaction && + (!in_remote_transaction || + (IsTransactionState() && !am_tablesync_worker()))) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("ORIGIN message sent out of order"))); +} + +/* + * Initialize fileset (if not already done). + * + * Create a new file when first_segment is true, otherwise open the existing + * file. + */ +void +stream_start_internal(TransactionId xid, bool first_segment) +{ + begin_replication_step(); + + /* + * Initialize the worker's stream_fileset if we haven't yet. This will be + * used for the entire duration of the worker so create it in a permanent + * context. We create this on the very first streaming message from any + * transaction and then use it for this and other streaming transactions. + * Now, we could create a fileset at the start of the worker as well but + * then we won't be sure that it will ever be used. + */ + if (!MyLogicalRepWorker->stream_fileset) + { + MemoryContext oldctx; + + oldctx = MemoryContextSwitchTo(ApplyContext); + + MyLogicalRepWorker->stream_fileset = palloc(sizeof(FileSet)); + FileSetInit(MyLogicalRepWorker->stream_fileset); + + MemoryContextSwitchTo(oldctx); + } + + /* Open the spool file for this transaction. */ + stream_open_file(MyLogicalRepWorker->subid, xid, first_segment); + + /* If this is not the first segment, open existing subxact file. */ + if (!first_segment) + subxact_info_read(MyLogicalRepWorker->subid, xid); + + end_replication_step(); +} + +/* + * Handle STREAM START message. + */ +static void +apply_handle_stream_start(StringInfo s) +{ + bool first_segment; + ParallelApplyWorkerInfo *winfo; + TransApplyAction apply_action; + + /* Save the message before it is consumed. */ + StringInfoData original_msg = *s; + + if (in_streamed_transaction) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("duplicate STREAM START message"))); + + /* There must not be an active streaming transaction. */ + Assert(!TransactionIdIsValid(stream_xid)); + + /* notify handle methods we're processing a remote transaction */ + in_streamed_transaction = true; + + /* extract XID of the top-level transaction */ + stream_xid = logicalrep_read_stream_start(s, &first_segment); + + if (!TransactionIdIsValid(stream_xid)) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("invalid transaction ID in streamed replication transaction"))); + + set_apply_error_context_xact(stream_xid, InvalidXLogRecPtr); + + /* Try to allocate a worker for the streaming transaction. */ + if (first_segment) + pa_allocate_worker(stream_xid); + + apply_action = get_transaction_apply_action(stream_xid, &winfo); + + switch (apply_action) + { + case TRANS_LEADER_SERIALIZE: + + /* + * Function stream_start_internal starts a transaction. This + * transaction will be committed on the stream stop unless it is a + * tablesync worker in which case it will be committed after + * processing all the messages. We need this transaction for + * handling the BufFile, used for serializing the streaming data + * and subxact info. + */ + stream_start_internal(stream_xid, first_segment); + break; + + case TRANS_LEADER_SEND_TO_PARALLEL: + Assert(winfo); + + /* + * Once we start serializing the changes, the parallel apply + * worker will wait for the leader to release the stream lock + * until the end of the transaction. So, we don't need to release + * the lock or increment the stream count in that case. + */ + if (pa_send_data(winfo, s->len, s->data)) + { + /* + * Unlock the shared object lock so that the parallel apply + * worker can continue to receive changes. + */ + if (!first_segment) + pa_unlock_stream(winfo->shared->xid, AccessExclusiveLock); + + /* + * Increment the number of streaming blocks waiting to be + * processed by parallel apply worker. + */ + pg_atomic_add_fetch_u32(&winfo->shared->pending_stream_count, 1); + + /* Cache the parallel apply worker for this transaction. */ + pa_set_stream_apply_worker(winfo); + break; + } + + /* + * Switch to serialize mode when we are not able to send the + * change to parallel apply worker. + */ + pa_switch_to_partial_serialize(winfo, !first_segment); + + /* fall through */ + case TRANS_LEADER_PARTIAL_SERIALIZE: + Assert(winfo); + + /* + * Open the spool file unless it was already opened when switching + * to serialize mode. The transaction started in + * stream_start_internal will be committed on the stream stop. + */ + if (apply_action != TRANS_LEADER_SEND_TO_PARALLEL) + stream_start_internal(stream_xid, first_segment); + + stream_write_change(LOGICAL_REP_MSG_STREAM_START, &original_msg); + + /* Cache the parallel apply worker for this transaction. */ + pa_set_stream_apply_worker(winfo); + break; + + case TRANS_PARALLEL_APPLY: + if (first_segment) + { + /* Hold the lock until the end of the transaction. */ + pa_lock_transaction(MyParallelShared->xid, AccessExclusiveLock); + pa_set_xact_state(MyParallelShared, PARALLEL_TRANS_STARTED); + + /* + * Signal the leader apply worker, as it may be waiting for + * us. + */ + logicalrep_worker_wakeup(MyLogicalRepWorker->subid, InvalidOid); + } + + parallel_stream_nchanges = 0; + break; + + default: + elog(ERROR, "unexpected apply action: %d", (int) apply_action); + break; + } + + pgstat_report_activity(STATE_RUNNING, NULL); +} + +/* + * Update the information about subxacts and close the file. + * + * This function should be called when the stream_start_internal function has + * been called. + */ +void +stream_stop_internal(TransactionId xid) +{ + /* + * Serialize information about subxacts for the toplevel transaction, then + * close the stream messages spool file. + */ + subxact_info_write(MyLogicalRepWorker->subid, xid); + stream_close_file(); + + /* We must be in a valid transaction state */ + Assert(IsTransactionState()); + + /* Commit the per-stream transaction */ + CommitTransactionCommand(); + + /* Reset per-stream context */ + MemoryContextReset(LogicalStreamingContext); +} + +/* + * Handle STREAM STOP message. + */ +static void +apply_handle_stream_stop(StringInfo s) +{ + ParallelApplyWorkerInfo *winfo; + TransApplyAction apply_action; + + if (!in_streamed_transaction) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("STREAM STOP message without STREAM START"))); + + apply_action = get_transaction_apply_action(stream_xid, &winfo); + + switch (apply_action) + { + case TRANS_LEADER_SERIALIZE: + stream_stop_internal(stream_xid); + break; + + case TRANS_LEADER_SEND_TO_PARALLEL: + Assert(winfo); + + /* + * Lock before sending the STREAM_STOP message so that the leader + * can hold the lock first and the parallel apply worker will wait + * for leader to release the lock. See Locking Considerations atop + * applyparallelworker.c. + */ + pa_lock_stream(winfo->shared->xid, AccessExclusiveLock); + + if (pa_send_data(winfo, s->len, s->data)) + { + pa_set_stream_apply_worker(NULL); + break; + } + + /* + * Switch to serialize mode when we are not able to send the + * change to parallel apply worker. + */ + pa_switch_to_partial_serialize(winfo, true); + + /* fall through */ + case TRANS_LEADER_PARTIAL_SERIALIZE: + stream_write_change(LOGICAL_REP_MSG_STREAM_STOP, s); + stream_stop_internal(stream_xid); + pa_set_stream_apply_worker(NULL); + break; + + case TRANS_PARALLEL_APPLY: + elog(DEBUG1, "applied %u changes in the streaming chunk", + parallel_stream_nchanges); + + /* + * By the time parallel apply worker is processing the changes in + * the current streaming block, the leader apply worker may have + * sent multiple streaming blocks. This can lead to parallel apply + * worker start waiting even when there are more chunk of streams + * in the queue. So, try to lock only if there is no message left + * in the queue. See Locking Considerations atop + * applyparallelworker.c. + * + * Note that here we have a race condition where we can start + * waiting even when there are pending streaming chunks. This can + * happen if the leader sends another streaming block and acquires + * the stream lock again after the parallel apply worker checks + * that there is no pending streaming block and before it actually + * starts waiting on a lock. We can handle this case by not + * allowing the leader to increment the stream block count during + * the time parallel apply worker acquires the lock but it is not + * clear whether that is worth the complexity. + * + * Now, if this missed chunk contains rollback to savepoint, then + * there is a risk of deadlock which probably shouldn't happen + * after restart. + */ + pa_decr_and_wait_stream_block(); + break; + + default: + elog(ERROR, "unexpected apply action: %d", (int) apply_action); + break; + } + + in_streamed_transaction = false; + stream_xid = InvalidTransactionId; + + /* + * The parallel apply worker could be in a transaction in which case we + * need to report the state as STATE_IDLEINTRANSACTION. + */ + if (IsTransactionOrTransactionBlock()) + pgstat_report_activity(STATE_IDLEINTRANSACTION, NULL); + else + pgstat_report_activity(STATE_IDLE, NULL); + + reset_apply_error_context_info(); +} + +/* + * Helper function to handle STREAM ABORT message when the transaction was + * serialized to file. + */ +static void +stream_abort_internal(TransactionId xid, TransactionId subxid) +{ + /* + * If the two XIDs are the same, it's in fact abort of toplevel xact, so + * just delete the files with serialized info. + */ + if (xid == subxid) + stream_cleanup_files(MyLogicalRepWorker->subid, xid); + else + { + /* + * OK, so it's a subxact. We need to read the subxact file for the + * toplevel transaction, determine the offset tracked for the subxact, + * and truncate the file with changes. We also remove the subxacts + * with higher offsets (or rather higher XIDs). + * + * We intentionally scan the array from the tail, because we're likely + * aborting a change for the most recent subtransactions. + * + * We can't use the binary search here as subxact XIDs won't + * necessarily arrive in sorted order, consider the case where we have + * released the savepoint for multiple subtransactions and then + * performed rollback to savepoint for one of the earlier + * sub-transaction. + */ + int64 i; + int64 subidx; + BufFile *fd; + bool found = false; + char path[MAXPGPATH]; + + subidx = -1; + begin_replication_step(); + subxact_info_read(MyLogicalRepWorker->subid, xid); + + for (i = subxact_data.nsubxacts; i > 0; i--) + { + if (subxact_data.subxacts[i - 1].xid == subxid) + { + subidx = (i - 1); + found = true; + break; + } + } + + /* + * If it's an empty sub-transaction then we will not find the subxid + * here so just cleanup the subxact info and return. + */ + if (!found) + { + /* Cleanup the subxact info */ + cleanup_subxact_info(); + end_replication_step(); + CommitTransactionCommand(); + return; + } + + /* open the changes file */ + changes_filename(path, MyLogicalRepWorker->subid, xid); + fd = BufFileOpenFileSet(MyLogicalRepWorker->stream_fileset, path, + O_RDWR, false); + + /* OK, truncate the file at the right offset */ + BufFileTruncateFileSet(fd, subxact_data.subxacts[subidx].fileno, + subxact_data.subxacts[subidx].offset); + BufFileClose(fd); + + /* discard the subxacts added later */ + subxact_data.nsubxacts = subidx; + + /* write the updated subxact list */ + subxact_info_write(MyLogicalRepWorker->subid, xid); + + end_replication_step(); + CommitTransactionCommand(); + } +} + +/* + * Handle STREAM ABORT message. + */ +static void +apply_handle_stream_abort(StringInfo s) +{ + TransactionId xid; + TransactionId subxid; + LogicalRepStreamAbortData abort_data; + ParallelApplyWorkerInfo *winfo; + TransApplyAction apply_action; + + /* Save the message before it is consumed. */ + StringInfoData original_msg = *s; + bool toplevel_xact; + + if (in_streamed_transaction) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("STREAM ABORT message without STREAM STOP"))); + + /* We receive abort information only when we can apply in parallel. */ + logicalrep_read_stream_abort(s, &abort_data, + MyLogicalRepWorker->parallel_apply); + + xid = abort_data.xid; + subxid = abort_data.subxid; + toplevel_xact = (xid == subxid); + + set_apply_error_context_xact(subxid, abort_data.abort_lsn); + + apply_action = get_transaction_apply_action(xid, &winfo); + + switch (apply_action) + { + case TRANS_LEADER_APPLY: + + /* + * We are in the leader apply worker and the transaction has been + * serialized to file. + */ + stream_abort_internal(xid, subxid); + + elog(DEBUG1, "finished processing the STREAM ABORT command"); + break; + + case TRANS_LEADER_SEND_TO_PARALLEL: + Assert(winfo); + + /* + * For the case of aborting the subtransaction, we increment the + * number of streaming blocks and take the lock again before + * sending the STREAM_ABORT to ensure that the parallel apply + * worker will wait on the lock for the next set of changes after + * processing the STREAM_ABORT message if it is not already + * waiting for STREAM_STOP message. + * + * It is important to perform this locking before sending the + * STREAM_ABORT message so that the leader can hold the lock first + * and the parallel apply worker will wait for the leader to + * release the lock. This is the same as what we do in + * apply_handle_stream_stop. See Locking Considerations atop + * applyparallelworker.c. + */ + if (!toplevel_xact) + { + pa_unlock_stream(xid, AccessExclusiveLock); + pg_atomic_add_fetch_u32(&winfo->shared->pending_stream_count, 1); + pa_lock_stream(xid, AccessExclusiveLock); + } + + if (pa_send_data(winfo, s->len, s->data)) + { + /* + * Unlike STREAM_COMMIT and STREAM_PREPARE, we don't need to + * wait here for the parallel apply worker to finish as that + * is not required to maintain the commit order and won't have + * the risk of failures due to transaction dependencies and + * deadlocks. However, it is possible that before the parallel + * worker finishes and we clear the worker info, the xid + * wraparound happens on the upstream and a new transaction + * with the same xid can appear and that can lead to duplicate + * entries in ParallelApplyTxnHash. Yet another problem could + * be that we may have serialized the changes in partial + * serialize mode and the file containing xact changes may + * already exist, and after xid wraparound trying to create + * the file for the same xid can lead to an error. To avoid + * these problems, we decide to wait for the aborts to finish. + * + * Note, it is okay to not update the flush location position + * for aborts as in worst case that means such a transaction + * won't be sent again after restart. + */ + if (toplevel_xact) + pa_xact_finish(winfo, InvalidXLogRecPtr); + + break; + } + + /* + * Switch to serialize mode when we are not able to send the + * change to parallel apply worker. + */ + pa_switch_to_partial_serialize(winfo, true); + + /* fall through */ + case TRANS_LEADER_PARTIAL_SERIALIZE: + Assert(winfo); + + /* + * Parallel apply worker might have applied some changes, so write + * the STREAM_ABORT message so that it can rollback the + * subtransaction if needed. + */ + stream_open_and_write_change(xid, LOGICAL_REP_MSG_STREAM_ABORT, + &original_msg); + + if (toplevel_xact) + { + pa_set_fileset_state(winfo->shared, FS_SERIALIZE_DONE); + pa_xact_finish(winfo, InvalidXLogRecPtr); + } + break; + + case TRANS_PARALLEL_APPLY: + + /* + * If the parallel apply worker is applying spooled messages then + * close the file before aborting. + */ + if (toplevel_xact && stream_fd) + stream_close_file(); + + pa_stream_abort(&abort_data); + + /* + * We need to wait after processing rollback to savepoint for the + * next set of changes. + * + * We have a race condition here due to which we can start waiting + * here when there are more chunk of streams in the queue. See + * apply_handle_stream_stop. + */ + if (!toplevel_xact) + pa_decr_and_wait_stream_block(); + + elog(DEBUG1, "finished processing the STREAM ABORT command"); + break; + + default: + elog(ERROR, "unexpected apply action: %d", (int) apply_action); + break; + } + + reset_apply_error_context_info(); +} + +/* + * Ensure that the passed location is fileset's end. + */ +static void +ensure_last_message(FileSet *stream_fileset, TransactionId xid, int fileno, + off_t offset) +{ + char path[MAXPGPATH]; + BufFile *fd; + int last_fileno; + off_t last_offset; + + Assert(!IsTransactionState()); + + begin_replication_step(); + + changes_filename(path, MyLogicalRepWorker->subid, xid); + + fd = BufFileOpenFileSet(stream_fileset, path, O_RDONLY, false); + + BufFileSeek(fd, 0, 0, SEEK_END); + BufFileTell(fd, &last_fileno, &last_offset); + + BufFileClose(fd); + + end_replication_step(); + + if (last_fileno != fileno || last_offset != offset) + elog(ERROR, "unexpected message left in streaming transaction's changes file \"%s\"", + path); +} + +/* + * Common spoolfile processing. + */ +void +apply_spooled_messages(FileSet *stream_fileset, TransactionId xid, + XLogRecPtr lsn) +{ + StringInfoData s2; + int nchanges; + char path[MAXPGPATH]; + char *buffer = NULL; + MemoryContext oldcxt; + ResourceOwner oldowner; + int fileno; + off_t offset; + + if (!am_parallel_apply_worker()) + maybe_start_skipping_changes(lsn); + + /* Make sure we have an open transaction */ + begin_replication_step(); + + /* + * Allocate file handle and memory required to process all the messages in + * TopTransactionContext to avoid them getting reset after each message is + * processed. + */ + oldcxt = MemoryContextSwitchTo(TopTransactionContext); + + /* Open the spool file for the committed/prepared transaction */ + changes_filename(path, MyLogicalRepWorker->subid, xid); + elog(DEBUG1, "replaying changes from file \"%s\"", path); + + /* + * Make sure the file is owned by the toplevel transaction so that the + * file will not be accidentally closed when aborting a subtransaction. + */ + oldowner = CurrentResourceOwner; + CurrentResourceOwner = TopTransactionResourceOwner; + + stream_fd = BufFileOpenFileSet(stream_fileset, path, O_RDONLY, false); + + CurrentResourceOwner = oldowner; + + buffer = palloc(BLCKSZ); + initStringInfo(&s2); + + MemoryContextSwitchTo(oldcxt); + + remote_final_lsn = lsn; + + /* + * Make sure the handle apply_dispatch methods are aware we're in a remote + * transaction. + */ + in_remote_transaction = true; + pgstat_report_activity(STATE_RUNNING, NULL); + + end_replication_step(); + + /* + * Read the entries one by one and pass them through the same logic as in + * apply_dispatch. + */ + nchanges = 0; + while (true) + { + size_t nbytes; + int len; + + CHECK_FOR_INTERRUPTS(); + + /* read length of the on-disk record */ + nbytes = BufFileReadMaybeEOF(stream_fd, &len, sizeof(len), true); + + /* have we reached end of the file? */ + if (nbytes == 0) + break; + + /* do we have a correct length? */ + if (len <= 0) + elog(ERROR, "incorrect length %d in streaming transaction's changes file \"%s\"", + len, path); + + /* make sure we have sufficiently large buffer */ + buffer = repalloc(buffer, len); + + /* and finally read the data into the buffer */ + BufFileReadExact(stream_fd, buffer, len); + + BufFileTell(stream_fd, &fileno, &offset); + + /* copy the buffer to the stringinfo and call apply_dispatch */ + resetStringInfo(&s2); + appendBinaryStringInfo(&s2, buffer, len); + + /* Ensure we are reading the data into our memory context. */ + oldcxt = MemoryContextSwitchTo(ApplyMessageContext); + + apply_dispatch(&s2); + + MemoryContextReset(ApplyMessageContext); + + MemoryContextSwitchTo(oldcxt); + + nchanges++; + + /* + * It is possible the file has been closed because we have processed + * the transaction end message like stream_commit in which case that + * must be the last message. + */ + if (!stream_fd) + { + ensure_last_message(stream_fileset, xid, fileno, offset); + break; + } + + if (nchanges % 1000 == 0) + elog(DEBUG1, "replayed %d changes from file \"%s\"", + nchanges, path); + } + + if (stream_fd) + stream_close_file(); + + elog(DEBUG1, "replayed %d (all) changes from file \"%s\"", + nchanges, path); + + return; +} + +/* + * Handle STREAM COMMIT message. + */ +static void +apply_handle_stream_commit(StringInfo s) +{ + TransactionId xid; + LogicalRepCommitData commit_data; + ParallelApplyWorkerInfo *winfo; + TransApplyAction apply_action; + + /* Save the message before it is consumed. */ + StringInfoData original_msg = *s; + + if (in_streamed_transaction) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("STREAM COMMIT message without STREAM STOP"))); + + xid = logicalrep_read_stream_commit(s, &commit_data); + set_apply_error_context_xact(xid, commit_data.commit_lsn); + + apply_action = get_transaction_apply_action(xid, &winfo); + + switch (apply_action) + { + case TRANS_LEADER_APPLY: + + /* + * The transaction has been serialized to file, so replay all the + * spooled operations. + */ + apply_spooled_messages(MyLogicalRepWorker->stream_fileset, xid, + commit_data.commit_lsn); + + apply_handle_commit_internal(&commit_data); + + /* Unlink the files with serialized changes and subxact info. */ + stream_cleanup_files(MyLogicalRepWorker->subid, xid); + + elog(DEBUG1, "finished processing the STREAM COMMIT command"); + break; + + case TRANS_LEADER_SEND_TO_PARALLEL: + Assert(winfo); + + if (pa_send_data(winfo, s->len, s->data)) + { + /* Finish processing the streaming transaction. */ + pa_xact_finish(winfo, commit_data.end_lsn); + break; + } + + /* + * Switch to serialize mode when we are not able to send the + * change to parallel apply worker. + */ + pa_switch_to_partial_serialize(winfo, true); + + /* fall through */ + case TRANS_LEADER_PARTIAL_SERIALIZE: + Assert(winfo); + + stream_open_and_write_change(xid, LOGICAL_REP_MSG_STREAM_COMMIT, + &original_msg); + + pa_set_fileset_state(winfo->shared, FS_SERIALIZE_DONE); + + /* Finish processing the streaming transaction. */ + pa_xact_finish(winfo, commit_data.end_lsn); + break; + + case TRANS_PARALLEL_APPLY: + + /* + * If the parallel apply worker is applying spooled messages then + * close the file before committing. + */ + if (stream_fd) + stream_close_file(); + + apply_handle_commit_internal(&commit_data); + + MyParallelShared->last_commit_end = XactLastCommitEnd; + + /* + * It is important to set the transaction state as finished before + * releasing the lock. See pa_wait_for_xact_finish. + */ + pa_set_xact_state(MyParallelShared, PARALLEL_TRANS_FINISHED); + pa_unlock_transaction(xid, AccessExclusiveLock); + + pa_reset_subtrans(); + + elog(DEBUG1, "finished processing the STREAM COMMIT command"); + break; + + default: + elog(ERROR, "unexpected apply action: %d", (int) apply_action); + break; + } + + /* Process any tables that are being synchronized in parallel. */ + process_syncing_tables(commit_data.end_lsn); + + pgstat_report_activity(STATE_IDLE, NULL); + + reset_apply_error_context_info(); +} + +/* + * Helper function for apply_handle_commit and apply_handle_stream_commit. + */ +static void +apply_handle_commit_internal(LogicalRepCommitData *commit_data) +{ + if (is_skipping_changes()) + { + stop_skipping_changes(); + + /* + * Start a new transaction to clear the subskiplsn, if not started + * yet. + */ + if (!IsTransactionState()) + StartTransactionCommand(); + } + + if (IsTransactionState()) + { + /* + * The transaction is either non-empty or skipped, so we clear the + * subskiplsn. + */ + clear_subscription_skip_lsn(commit_data->commit_lsn); + + /* + * Update origin state so we can restart streaming from correct + * position in case of crash. + */ + replorigin_session_origin_lsn = commit_data->end_lsn; + replorigin_session_origin_timestamp = commit_data->committime; + + CommitTransactionCommand(); + + if (IsTransactionBlock()) + { + EndTransactionBlock(false); + CommitTransactionCommand(); + } + + pgstat_report_stat(false); + + store_flush_position(commit_data->end_lsn, XactLastCommitEnd); + } + else + { + /* Process any invalidation messages that might have accumulated. */ + AcceptInvalidationMessages(); + maybe_reread_subscription(); + } + + in_remote_transaction = false; +} + +/* + * Handle RELATION message. + * + * Note we don't do validation against local schema here. The validation + * against local schema is postponed until first change for given relation + * comes as we only care about it when applying changes for it anyway and we + * do less locking this way. + */ +static void +apply_handle_relation(StringInfo s) +{ + LogicalRepRelation *rel; + + if (handle_streamed_transaction(LOGICAL_REP_MSG_RELATION, s)) + return; + + rel = logicalrep_read_rel(s); + logicalrep_relmap_update(rel); + + /* Also reset all entries in the partition map that refer to remoterel. */ + logicalrep_partmap_reset_relmap(rel); +} + +/* + * Handle TYPE message. + * + * This implementation pays no attention to TYPE messages; we expect the user + * to have set things up so that the incoming data is acceptable to the input + * functions for the locally subscribed tables. Hence, we just read and + * discard the message. + */ +static void +apply_handle_type(StringInfo s) +{ + LogicalRepTyp typ; + + if (handle_streamed_transaction(LOGICAL_REP_MSG_TYPE, s)) + return; + + logicalrep_read_typ(s, &typ); +} + +/* + * Check that we (the subscription owner) have sufficient privileges on the + * target relation to perform the given operation. + */ +static void +TargetPrivilegesCheck(Relation rel, AclMode mode) +{ + Oid relid; + AclResult aclresult; + + relid = RelationGetRelid(rel); + aclresult = pg_class_aclcheck(relid, GetUserId(), mode); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, + get_relkind_objtype(rel->rd_rel->relkind), + get_rel_name(relid)); + + /* + * We lack the infrastructure to honor RLS policies. It might be possible + * to add such infrastructure here, but tablesync workers lack it, too, so + * we don't bother. RLS does not ordinarily apply to TRUNCATE commands, + * but it seems dangerous to replicate a TRUNCATE and then refuse to + * replicate subsequent INSERTs, so we forbid all commands the same. + */ + if (check_enable_rls(relid, InvalidOid, false) == RLS_ENABLED) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("user \"%s\" cannot replicate into relation with row-level security enabled: \"%s\"", + GetUserNameFromId(GetUserId(), true), + RelationGetRelationName(rel)))); +} + +/* + * Handle INSERT message. + */ + +static void +apply_handle_insert(StringInfo s) +{ + LogicalRepRelMapEntry *rel; + LogicalRepTupleData newtup; + LogicalRepRelId relid; + UserContext ucxt; + ApplyExecutionData *edata; + EState *estate; + TupleTableSlot *remoteslot; + MemoryContext oldctx; + bool run_as_owner; + + /* + * Quick return if we are skipping data modification changes or handling + * streamed transactions. + */ + if (is_skipping_changes() || + handle_streamed_transaction(LOGICAL_REP_MSG_INSERT, s)) + return; + + begin_replication_step(); + + relid = logicalrep_read_insert(s, &newtup); + rel = logicalrep_rel_open(relid, RowExclusiveLock); + if (!should_apply_changes_for_rel(rel)) + { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, RowExclusiveLock); + end_replication_step(); + return; + } + + /* + * Make sure that any user-supplied code runs as the table owner, unless + * the user has opted out of that behavior. + */ + run_as_owner = MySubscription->runasowner; + if (!run_as_owner) + SwitchToUntrustedUser(rel->localrel->rd_rel->relowner, &ucxt); + + /* Set relation for error callback */ + apply_error_callback_arg.rel = rel; + + /* Initialize the executor state. */ + edata = create_edata_for_relation(rel); + estate = edata->estate; + remoteslot = ExecInitExtraTupleSlot(estate, + RelationGetDescr(rel->localrel), + &TTSOpsVirtual); + + /* Process and store remote tuple in the slot */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + slot_store_data(remoteslot, rel, &newtup); + slot_fill_defaults(rel, estate, remoteslot); + MemoryContextSwitchTo(oldctx); + + /* For a partitioned table, insert the tuple into a partition. */ + if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + apply_handle_tuple_routing(edata, + remoteslot, NULL, CMD_INSERT); + else + apply_handle_insert_internal(edata, edata->targetRelInfo, + remoteslot); + + finish_edata(edata); + + /* Reset relation for error callback */ + apply_error_callback_arg.rel = NULL; + + if (!run_as_owner) + RestoreUserContext(&ucxt); + + logicalrep_rel_close(rel, NoLock); + + end_replication_step(); +} + +/* + * Workhorse for apply_handle_insert() + * relinfo is for the relation we're actually inserting into + * (could be a child partition of edata->targetRelInfo) + */ +static void +apply_handle_insert_internal(ApplyExecutionData *edata, + ResultRelInfo *relinfo, + TupleTableSlot *remoteslot) +{ + EState *estate = edata->estate; + + /* We must open indexes here. */ + ExecOpenIndices(relinfo, false); + + /* Do the insert. */ + TargetPrivilegesCheck(relinfo->ri_RelationDesc, ACL_INSERT); + ExecSimpleRelationInsert(relinfo, estate, remoteslot); + + /* Cleanup. */ + ExecCloseIndices(relinfo); +} + +/* + * Check if the logical replication relation is updatable and throw + * appropriate error if it isn't. + */ +static void +check_relation_updatable(LogicalRepRelMapEntry *rel) +{ + /* + * For partitioned tables, we only need to care if the target partition is + * updatable (aka has PK or RI defined for it). + */ + if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + return; + + /* Updatable, no error. */ + if (rel->updatable) + return; + + /* + * We are in error mode so it's fine this is somewhat slow. It's better to + * give user correct error. + */ + if (OidIsValid(GetRelationIdentityOrPK(rel->localrel))) + { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("publisher did not send replica identity column " + "expected by the logical replication target relation \"%s.%s\"", + rel->remoterel.nspname, rel->remoterel.relname))); + } + + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication target relation \"%s.%s\" has " + "neither REPLICA IDENTITY index nor PRIMARY " + "KEY and published relation does not have " + "REPLICA IDENTITY FULL", + rel->remoterel.nspname, rel->remoterel.relname))); +} + +/* + * Handle UPDATE message. + * + * TODO: FDW support + */ +static void +apply_handle_update(StringInfo s) +{ + LogicalRepRelMapEntry *rel; + LogicalRepRelId relid; + UserContext ucxt; + ApplyExecutionData *edata; + EState *estate; + LogicalRepTupleData oldtup; + LogicalRepTupleData newtup; + bool has_oldtup; + TupleTableSlot *remoteslot; + RTEPermissionInfo *target_perminfo; + MemoryContext oldctx; + bool run_as_owner; + + /* + * Quick return if we are skipping data modification changes or handling + * streamed transactions. + */ + if (is_skipping_changes() || + handle_streamed_transaction(LOGICAL_REP_MSG_UPDATE, s)) + return; + + begin_replication_step(); + + relid = logicalrep_read_update(s, &has_oldtup, &oldtup, + &newtup); + rel = logicalrep_rel_open(relid, RowExclusiveLock); + if (!should_apply_changes_for_rel(rel)) + { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, RowExclusiveLock); + end_replication_step(); + return; + } + + /* Set relation for error callback */ + apply_error_callback_arg.rel = rel; + + /* Check if we can do the update. */ + check_relation_updatable(rel); + + /* + * Make sure that any user-supplied code runs as the table owner, unless + * the user has opted out of that behavior. + */ + run_as_owner = MySubscription->runasowner; + if (!run_as_owner) + SwitchToUntrustedUser(rel->localrel->rd_rel->relowner, &ucxt); + + /* Initialize the executor state. */ + edata = create_edata_for_relation(rel); + estate = edata->estate; + remoteslot = ExecInitExtraTupleSlot(estate, + RelationGetDescr(rel->localrel), + &TTSOpsVirtual); + + /* + * Populate updatedCols so that per-column triggers can fire, and so + * executor can correctly pass down indexUnchanged hint. This could + * include more columns than were actually changed on the publisher + * because the logical replication protocol doesn't contain that + * information. But it would for example exclude columns that only exist + * on the subscriber, since we are not touching those. + */ + target_perminfo = list_nth(estate->es_rteperminfos, 0); + for (int i = 0; i < remoteslot->tts_tupleDescriptor->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(remoteslot->tts_tupleDescriptor, i); + int remoteattnum = rel->attrmap->attnums[i]; + + if (!att->attisdropped && remoteattnum >= 0) + { + Assert(remoteattnum < newtup.ncols); + if (newtup.colstatus[remoteattnum] != LOGICALREP_COLUMN_UNCHANGED) + target_perminfo->updatedCols = + bms_add_member(target_perminfo->updatedCols, + i + 1 - FirstLowInvalidHeapAttributeNumber); + } + } + + /* Build the search tuple. */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + slot_store_data(remoteslot, rel, + has_oldtup ? &oldtup : &newtup); + MemoryContextSwitchTo(oldctx); + + /* For a partitioned table, apply update to correct partition. */ + if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + apply_handle_tuple_routing(edata, + remoteslot, &newtup, CMD_UPDATE); + else + apply_handle_update_internal(edata, edata->targetRelInfo, + remoteslot, &newtup, rel->localindexoid); + + finish_edata(edata); + + /* Reset relation for error callback */ + apply_error_callback_arg.rel = NULL; + + if (!run_as_owner) + RestoreUserContext(&ucxt); + + logicalrep_rel_close(rel, NoLock); + + end_replication_step(); +} + +/* + * Workhorse for apply_handle_update() + * relinfo is for the relation we're actually updating in + * (could be a child partition of edata->targetRelInfo) + */ +static void +apply_handle_update_internal(ApplyExecutionData *edata, + ResultRelInfo *relinfo, + TupleTableSlot *remoteslot, + LogicalRepTupleData *newtup, + Oid localindexoid) +{ + EState *estate = edata->estate; + LogicalRepRelMapEntry *relmapentry = edata->targetRel; + Relation localrel = relinfo->ri_RelationDesc; + EPQState epqstate; + TupleTableSlot *localslot; + bool found; + MemoryContext oldctx; + + EvalPlanQualInit(&epqstate, estate, NULL, NIL, -1, NIL); + ExecOpenIndices(relinfo, false); + + found = FindReplTupleInLocalRel(edata, localrel, + &relmapentry->remoterel, + localindexoid, + remoteslot, &localslot); + ExecClearTuple(remoteslot); + + /* + * Tuple found. + * + * Note this will fail if there are other conflicting unique indexes. + */ + if (found) + { + /* Process and store remote tuple in the slot */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + slot_modify_data(remoteslot, localslot, relmapentry, newtup); + MemoryContextSwitchTo(oldctx); + + EvalPlanQualSetSlot(&epqstate, remoteslot); + + /* Do the actual update. */ + TargetPrivilegesCheck(relinfo->ri_RelationDesc, ACL_UPDATE); + ExecSimpleRelationUpdate(relinfo, estate, &epqstate, localslot, + remoteslot); + } + else + { + /* + * The tuple to be updated could not be found. Do nothing except for + * emitting a log message. + * + * XXX should this be promoted to ereport(LOG) perhaps? + */ + elog(DEBUG1, + "logical replication did not find row to be updated " + "in replication target relation \"%s\"", + RelationGetRelationName(localrel)); + } + + /* Cleanup. */ + ExecCloseIndices(relinfo); + EvalPlanQualEnd(&epqstate); +} + +/* + * Handle DELETE message. + * + * TODO: FDW support + */ +static void +apply_handle_delete(StringInfo s) +{ + LogicalRepRelMapEntry *rel; + LogicalRepTupleData oldtup; + LogicalRepRelId relid; + UserContext ucxt; + ApplyExecutionData *edata; + EState *estate; + TupleTableSlot *remoteslot; + MemoryContext oldctx; + bool run_as_owner; + + /* + * Quick return if we are skipping data modification changes or handling + * streamed transactions. + */ + if (is_skipping_changes() || + handle_streamed_transaction(LOGICAL_REP_MSG_DELETE, s)) + return; + + begin_replication_step(); + + relid = logicalrep_read_delete(s, &oldtup); + rel = logicalrep_rel_open(relid, RowExclusiveLock); + if (!should_apply_changes_for_rel(rel)) + { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, RowExclusiveLock); + end_replication_step(); + return; + } + + /* Set relation for error callback */ + apply_error_callback_arg.rel = rel; + + /* Check if we can do the delete. */ + check_relation_updatable(rel); + + /* + * Make sure that any user-supplied code runs as the table owner, unless + * the user has opted out of that behavior. + */ + run_as_owner = MySubscription->runasowner; + if (!run_as_owner) + SwitchToUntrustedUser(rel->localrel->rd_rel->relowner, &ucxt); + + /* Initialize the executor state. */ + edata = create_edata_for_relation(rel); + estate = edata->estate; + remoteslot = ExecInitExtraTupleSlot(estate, + RelationGetDescr(rel->localrel), + &TTSOpsVirtual); + + /* Build the search tuple. */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + slot_store_data(remoteslot, rel, &oldtup); + MemoryContextSwitchTo(oldctx); + + /* For a partitioned table, apply delete to correct partition. */ + if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + apply_handle_tuple_routing(edata, + remoteslot, NULL, CMD_DELETE); + else + apply_handle_delete_internal(edata, edata->targetRelInfo, + remoteslot, rel->localindexoid); + + finish_edata(edata); + + /* Reset relation for error callback */ + apply_error_callback_arg.rel = NULL; + + if (!run_as_owner) + RestoreUserContext(&ucxt); + + logicalrep_rel_close(rel, NoLock); + + end_replication_step(); +} + +/* + * Workhorse for apply_handle_delete() + * relinfo is for the relation we're actually deleting from + * (could be a child partition of edata->targetRelInfo) + */ +static void +apply_handle_delete_internal(ApplyExecutionData *edata, + ResultRelInfo *relinfo, + TupleTableSlot *remoteslot, + Oid localindexoid) +{ + EState *estate = edata->estate; + Relation localrel = relinfo->ri_RelationDesc; + LogicalRepRelation *remoterel = &edata->targetRel->remoterel; + EPQState epqstate; + TupleTableSlot *localslot; + bool found; + + EvalPlanQualInit(&epqstate, estate, NULL, NIL, -1, NIL); + ExecOpenIndices(relinfo, false); + + found = FindReplTupleInLocalRel(edata, localrel, remoterel, localindexoid, + remoteslot, &localslot); + + /* If found delete it. */ + if (found) + { + EvalPlanQualSetSlot(&epqstate, localslot); + + /* Do the actual delete. */ + TargetPrivilegesCheck(relinfo->ri_RelationDesc, ACL_DELETE); + ExecSimpleRelationDelete(relinfo, estate, &epqstate, localslot); + } + else + { + /* + * The tuple to be deleted could not be found. Do nothing except for + * emitting a log message. + * + * XXX should this be promoted to ereport(LOG) perhaps? + */ + elog(DEBUG1, + "logical replication did not find row to be deleted " + "in replication target relation \"%s\"", + RelationGetRelationName(localrel)); + } + + /* Cleanup. */ + ExecCloseIndices(relinfo); + EvalPlanQualEnd(&epqstate); +} + +/* + * Try to find a tuple received from the publication side (in 'remoteslot') in + * the corresponding local relation using either replica identity index, + * primary key, index or if needed, sequential scan. + * + * Local tuple, if found, is returned in '*localslot'. + */ +static bool +FindReplTupleInLocalRel(ApplyExecutionData *edata, Relation localrel, + LogicalRepRelation *remoterel, + Oid localidxoid, + TupleTableSlot *remoteslot, + TupleTableSlot **localslot) +{ + EState *estate = edata->estate; + bool found; + + /* + * Regardless of the top-level operation, we're performing a read here, so + * check for SELECT privileges. + */ + TargetPrivilegesCheck(localrel, ACL_SELECT); + + *localslot = table_slot_create(localrel, &estate->es_tupleTable); + + Assert(OidIsValid(localidxoid) || + (remoterel->replident == REPLICA_IDENTITY_FULL)); + + if (OidIsValid(localidxoid)) + { +#ifdef USE_ASSERT_CHECKING + Relation idxrel = index_open(localidxoid, AccessShareLock); + + /* Index must be PK, RI, or usable for REPLICA IDENTITY FULL tables */ + Assert(GetRelationIdentityOrPK(idxrel) == localidxoid || + IsIndexUsableForReplicaIdentityFull(BuildIndexInfo(idxrel), + edata->targetRel->attrmap)); + index_close(idxrel, AccessShareLock); +#endif + + found = RelationFindReplTupleByIndex(localrel, localidxoid, + LockTupleExclusive, + remoteslot, *localslot); + } + else + found = RelationFindReplTupleSeq(localrel, LockTupleExclusive, + remoteslot, *localslot); + + return found; +} + +/* + * This handles insert, update, delete on a partitioned table. + */ +static void +apply_handle_tuple_routing(ApplyExecutionData *edata, + TupleTableSlot *remoteslot, + LogicalRepTupleData *newtup, + CmdType operation) +{ + EState *estate = edata->estate; + LogicalRepRelMapEntry *relmapentry = edata->targetRel; + ResultRelInfo *relinfo = edata->targetRelInfo; + Relation parentrel = relinfo->ri_RelationDesc; + ModifyTableState *mtstate; + PartitionTupleRouting *proute; + ResultRelInfo *partrelinfo; + Relation partrel; + TupleTableSlot *remoteslot_part; + TupleConversionMap *map; + MemoryContext oldctx; + LogicalRepRelMapEntry *part_entry = NULL; + AttrMap *attrmap = NULL; + + /* ModifyTableState is needed for ExecFindPartition(). */ + edata->mtstate = mtstate = makeNode(ModifyTableState); + mtstate->ps.plan = NULL; + mtstate->ps.state = estate; + mtstate->operation = operation; + mtstate->resultRelInfo = relinfo; + + /* ... as is PartitionTupleRouting. */ + edata->proute = proute = ExecSetupPartitionTupleRouting(estate, parentrel); + + /* + * Find the partition to which the "search tuple" belongs. + */ + Assert(remoteslot != NULL); + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + partrelinfo = ExecFindPartition(mtstate, relinfo, proute, + remoteslot, estate); + Assert(partrelinfo != NULL); + partrel = partrelinfo->ri_RelationDesc; + + /* + * Check for supported relkind. We need this since partitions might be of + * unsupported relkinds; and the set of partitions can change, so checking + * at CREATE/ALTER SUBSCRIPTION would be insufficient. + */ + CheckSubscriptionRelkind(partrel->rd_rel->relkind, + get_namespace_name(RelationGetNamespace(partrel)), + RelationGetRelationName(partrel)); + + /* + * To perform any of the operations below, the tuple must match the + * partition's rowtype. Convert if needed or just copy, using a dedicated + * slot to store the tuple in any case. + */ + remoteslot_part = partrelinfo->ri_PartitionTupleSlot; + if (remoteslot_part == NULL) + remoteslot_part = table_slot_create(partrel, &estate->es_tupleTable); + map = ExecGetRootToChildMap(partrelinfo, estate); + if (map != NULL) + { + attrmap = map->attrMap; + remoteslot_part = execute_attr_map_slot(attrmap, remoteslot, + remoteslot_part); + } + else + { + remoteslot_part = ExecCopySlot(remoteslot_part, remoteslot); + slot_getallattrs(remoteslot_part); + } + MemoryContextSwitchTo(oldctx); + + /* Check if we can do the update or delete on the leaf partition. */ + if (operation == CMD_UPDATE || operation == CMD_DELETE) + { + part_entry = logicalrep_partition_open(relmapentry, partrel, + attrmap); + check_relation_updatable(part_entry); + } + + switch (operation) + { + case CMD_INSERT: + apply_handle_insert_internal(edata, partrelinfo, + remoteslot_part); + break; + + case CMD_DELETE: + apply_handle_delete_internal(edata, partrelinfo, + remoteslot_part, + part_entry->localindexoid); + break; + + case CMD_UPDATE: + + /* + * For UPDATE, depending on whether or not the updated tuple + * satisfies the partition's constraint, perform a simple UPDATE + * of the partition or move the updated tuple into a different + * suitable partition. + */ + { + TupleTableSlot *localslot; + ResultRelInfo *partrelinfo_new; + Relation partrel_new; + bool found; + + /* Get the matching local tuple from the partition. */ + found = FindReplTupleInLocalRel(edata, partrel, + &part_entry->remoterel, + part_entry->localindexoid, + remoteslot_part, &localslot); + if (!found) + { + /* + * The tuple to be updated could not be found. Do nothing + * except for emitting a log message. + * + * XXX should this be promoted to ereport(LOG) perhaps? + */ + elog(DEBUG1, + "logical replication did not find row to be updated " + "in replication target relation's partition \"%s\"", + RelationGetRelationName(partrel)); + return; + } + + /* + * Apply the update to the local tuple, putting the result in + * remoteslot_part. + */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + slot_modify_data(remoteslot_part, localslot, part_entry, + newtup); + MemoryContextSwitchTo(oldctx); + + /* + * Does the updated tuple still satisfy the current + * partition's constraint? + */ + if (!partrel->rd_rel->relispartition || + ExecPartitionCheck(partrelinfo, remoteslot_part, estate, + false)) + { + /* + * Yes, so simply UPDATE the partition. We don't call + * apply_handle_update_internal() here, which would + * normally do the following work, to avoid repeating some + * work already done above to find the local tuple in the + * partition. + */ + EPQState epqstate; + + EvalPlanQualInit(&epqstate, estate, NULL, NIL, -1, NIL); + ExecOpenIndices(partrelinfo, false); + + EvalPlanQualSetSlot(&epqstate, remoteslot_part); + TargetPrivilegesCheck(partrelinfo->ri_RelationDesc, + ACL_UPDATE); + ExecSimpleRelationUpdate(partrelinfo, estate, &epqstate, + localslot, remoteslot_part); + ExecCloseIndices(partrelinfo); + EvalPlanQualEnd(&epqstate); + } + else + { + /* Move the tuple into the new partition. */ + + /* + * New partition will be found using tuple routing, which + * can only occur via the parent table. We might need to + * convert the tuple to the parent's rowtype. Note that + * this is the tuple found in the partition, not the + * original search tuple received by this function. + */ + if (map) + { + TupleConversionMap *PartitionToRootMap = + convert_tuples_by_name(RelationGetDescr(partrel), + RelationGetDescr(parentrel)); + + remoteslot = + execute_attr_map_slot(PartitionToRootMap->attrMap, + remoteslot_part, remoteslot); + } + else + { + remoteslot = ExecCopySlot(remoteslot, remoteslot_part); + slot_getallattrs(remoteslot); + } + + /* Find the new partition. */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + partrelinfo_new = ExecFindPartition(mtstate, relinfo, + proute, remoteslot, + estate); + MemoryContextSwitchTo(oldctx); + Assert(partrelinfo_new != partrelinfo); + partrel_new = partrelinfo_new->ri_RelationDesc; + + /* Check that new partition also has supported relkind. */ + CheckSubscriptionRelkind(partrel_new->rd_rel->relkind, + get_namespace_name(RelationGetNamespace(partrel_new)), + RelationGetRelationName(partrel_new)); + + /* DELETE old tuple found in the old partition. */ + apply_handle_delete_internal(edata, partrelinfo, + localslot, + part_entry->localindexoid); + + /* INSERT new tuple into the new partition. */ + + /* + * Convert the replacement tuple to match the destination + * partition rowtype. + */ + oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + remoteslot_part = partrelinfo_new->ri_PartitionTupleSlot; + if (remoteslot_part == NULL) + remoteslot_part = table_slot_create(partrel_new, + &estate->es_tupleTable); + map = ExecGetRootToChildMap(partrelinfo_new, estate); + if (map != NULL) + { + remoteslot_part = execute_attr_map_slot(map->attrMap, + remoteslot, + remoteslot_part); + } + else + { + remoteslot_part = ExecCopySlot(remoteslot_part, + remoteslot); + slot_getallattrs(remoteslot); + } + MemoryContextSwitchTo(oldctx); + apply_handle_insert_internal(edata, partrelinfo_new, + remoteslot_part); + } + } + break; + + default: + elog(ERROR, "unrecognized CmdType: %d", (int) operation); + break; + } +} + +/* + * Handle TRUNCATE message. + * + * TODO: FDW support + */ +static void +apply_handle_truncate(StringInfo s) +{ + bool cascade = false; + bool restart_seqs = false; + List *remote_relids = NIL; + List *remote_rels = NIL; + List *rels = NIL; + List *part_rels = NIL; + List *relids = NIL; + List *relids_logged = NIL; + ListCell *lc; + LOCKMODE lockmode = AccessExclusiveLock; + + /* + * Quick return if we are skipping data modification changes or handling + * streamed transactions. + */ + if (is_skipping_changes() || + handle_streamed_transaction(LOGICAL_REP_MSG_TRUNCATE, s)) + return; + + begin_replication_step(); + + remote_relids = logicalrep_read_truncate(s, &cascade, &restart_seqs); + + foreach(lc, remote_relids) + { + LogicalRepRelId relid = lfirst_oid(lc); + LogicalRepRelMapEntry *rel; + + rel = logicalrep_rel_open(relid, lockmode); + if (!should_apply_changes_for_rel(rel)) + { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, lockmode); + continue; + } + + remote_rels = lappend(remote_rels, rel); + TargetPrivilegesCheck(rel->localrel, ACL_TRUNCATE); + rels = lappend(rels, rel->localrel); + relids = lappend_oid(relids, rel->localreloid); + if (RelationIsLogicallyLogged(rel->localrel)) + relids_logged = lappend_oid(relids_logged, rel->localreloid); + + /* + * Truncate partitions if we got a message to truncate a partitioned + * table. + */ + if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + ListCell *child; + List *children = find_all_inheritors(rel->localreloid, + lockmode, + NULL); + + foreach(child, children) + { + Oid childrelid = lfirst_oid(child); + Relation childrel; + + if (list_member_oid(relids, childrelid)) + continue; + + /* find_all_inheritors already got lock */ + childrel = table_open(childrelid, NoLock); + + /* + * Ignore temp tables of other backends. See similar code in + * ExecuteTruncate(). + */ + if (RELATION_IS_OTHER_TEMP(childrel)) + { + table_close(childrel, lockmode); + continue; + } + + TargetPrivilegesCheck(childrel, ACL_TRUNCATE); + rels = lappend(rels, childrel); + part_rels = lappend(part_rels, childrel); + relids = lappend_oid(relids, childrelid); + /* Log this relation only if needed for logical decoding */ + if (RelationIsLogicallyLogged(childrel)) + relids_logged = lappend_oid(relids_logged, childrelid); + } + } + } + + /* + * Even if we used CASCADE on the upstream primary we explicitly default + * to replaying changes without further cascading. This might be later + * changeable with a user specified option. + * + * MySubscription->runasowner tells us whether we want to execute + * replication actions as the subscription owner; the last argument to + * TruncateGuts tells it whether we want to switch to the table owner. + * Those are exactly opposite conditions. + */ + ExecuteTruncateGuts(rels, + relids, + relids_logged, + DROP_RESTRICT, + restart_seqs, + !MySubscription->runasowner); + foreach(lc, remote_rels) + { + LogicalRepRelMapEntry *rel = lfirst(lc); + + logicalrep_rel_close(rel, NoLock); + } + foreach(lc, part_rels) + { + Relation rel = lfirst(lc); + + table_close(rel, NoLock); + } + + end_replication_step(); +} + + +/* + * Logical replication protocol message dispatcher. + */ +void +apply_dispatch(StringInfo s) +{ + LogicalRepMsgType action = pq_getmsgbyte(s); + LogicalRepMsgType saved_command; + + /* + * Set the current command being applied. Since this function can be + * called recursively when applying spooled changes, save the current + * command. + */ + saved_command = apply_error_callback_arg.command; + apply_error_callback_arg.command = action; + + switch (action) + { + case LOGICAL_REP_MSG_BEGIN: + apply_handle_begin(s); + break; + + case LOGICAL_REP_MSG_COMMIT: + apply_handle_commit(s); + break; + + case LOGICAL_REP_MSG_INSERT: + apply_handle_insert(s); + break; + + case LOGICAL_REP_MSG_UPDATE: + apply_handle_update(s); + break; + + case LOGICAL_REP_MSG_DELETE: + apply_handle_delete(s); + break; + + case LOGICAL_REP_MSG_TRUNCATE: + apply_handle_truncate(s); + break; + + case LOGICAL_REP_MSG_RELATION: + apply_handle_relation(s); + break; + + case LOGICAL_REP_MSG_TYPE: + apply_handle_type(s); + break; + + case LOGICAL_REP_MSG_ORIGIN: + apply_handle_origin(s); + break; + + case LOGICAL_REP_MSG_MESSAGE: + + /* + * Logical replication does not use generic logical messages yet. + * Although, it could be used by other applications that use this + * output plugin. + */ + break; + + case LOGICAL_REP_MSG_STREAM_START: + apply_handle_stream_start(s); + break; + + case LOGICAL_REP_MSG_STREAM_STOP: + apply_handle_stream_stop(s); + break; + + case LOGICAL_REP_MSG_STREAM_ABORT: + apply_handle_stream_abort(s); + break; + + case LOGICAL_REP_MSG_STREAM_COMMIT: + apply_handle_stream_commit(s); + break; + + case LOGICAL_REP_MSG_BEGIN_PREPARE: + apply_handle_begin_prepare(s); + break; + + case LOGICAL_REP_MSG_PREPARE: + apply_handle_prepare(s); + break; + + case LOGICAL_REP_MSG_COMMIT_PREPARED: + apply_handle_commit_prepared(s); + break; + + case LOGICAL_REP_MSG_ROLLBACK_PREPARED: + apply_handle_rollback_prepared(s); + break; + + case LOGICAL_REP_MSG_STREAM_PREPARE: + apply_handle_stream_prepare(s); + break; + + default: + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid logical replication message type \"??? (%d)\"", action))); + } + + /* Reset the current command */ + apply_error_callback_arg.command = saved_command; +} + +/* + * Figure out which write/flush positions to report to the walsender process. + * + * We can't simply report back the last LSN the walsender sent us because the + * local transaction might not yet be flushed to disk locally. Instead we + * build a list that associates local with remote LSNs for every commit. When + * reporting back the flush position to the sender we iterate that list and + * check which entries on it are already locally flushed. Those we can report + * as having been flushed. + * + * The have_pending_txes is true if there are outstanding transactions that + * need to be flushed. + */ +static void +get_flush_position(XLogRecPtr *write, XLogRecPtr *flush, + bool *have_pending_txes) +{ + dlist_mutable_iter iter; + XLogRecPtr local_flush = GetFlushRecPtr(NULL); + + *write = InvalidXLogRecPtr; + *flush = InvalidXLogRecPtr; + + dlist_foreach_modify(iter, &lsn_mapping) + { + FlushPosition *pos = + dlist_container(FlushPosition, node, iter.cur); + + *write = pos->remote_end; + + if (pos->local_end <= local_flush) + { + *flush = pos->remote_end; + dlist_delete(iter.cur); + pfree(pos); + } + else + { + /* + * Don't want to uselessly iterate over the rest of the list which + * could potentially be long. Instead get the last element and + * grab the write position from there. + */ + pos = dlist_tail_element(FlushPosition, node, + &lsn_mapping); + *write = pos->remote_end; + *have_pending_txes = true; + return; + } + } + + *have_pending_txes = !dlist_is_empty(&lsn_mapping); +} + +/* + * Store current remote/local lsn pair in the tracking list. + */ +void +store_flush_position(XLogRecPtr remote_lsn, XLogRecPtr local_lsn) +{ + FlushPosition *flushpos; + + /* + * Skip for parallel apply workers, because the lsn_mapping is maintained + * by the leader apply worker. + */ + if (am_parallel_apply_worker()) + return; + + /* Need to do this in permanent context */ + MemoryContextSwitchTo(ApplyContext); + + /* Track commit lsn */ + flushpos = (FlushPosition *) palloc(sizeof(FlushPosition)); + flushpos->local_end = local_lsn; + flushpos->remote_end = remote_lsn; + + dlist_push_tail(&lsn_mapping, &flushpos->node); + MemoryContextSwitchTo(ApplyMessageContext); +} + + +/* Update statistics of the worker. */ +static void +UpdateWorkerStats(XLogRecPtr last_lsn, TimestampTz send_time, bool reply) +{ + MyLogicalRepWorker->last_lsn = last_lsn; + MyLogicalRepWorker->last_send_time = send_time; + MyLogicalRepWorker->last_recv_time = GetCurrentTimestamp(); + if (reply) + { + MyLogicalRepWorker->reply_lsn = last_lsn; + MyLogicalRepWorker->reply_time = send_time; + } +} + +/* + * Apply main loop. + */ +static void +LogicalRepApplyLoop(XLogRecPtr last_received) +{ + TimestampTz last_recv_timestamp = GetCurrentTimestamp(); + bool ping_sent = false; + TimeLineID tli; + ErrorContextCallback errcallback; + + /* + * Init the ApplyMessageContext which we clean up after each replication + * protocol message. + */ + ApplyMessageContext = AllocSetContextCreate(ApplyContext, + "ApplyMessageContext", + ALLOCSET_DEFAULT_SIZES); + + /* + * This memory context is used for per-stream data when the streaming mode + * is enabled. This context is reset on each stream stop. + */ + LogicalStreamingContext = AllocSetContextCreate(ApplyContext, + "LogicalStreamingContext", + ALLOCSET_DEFAULT_SIZES); + + /* mark as idle, before starting to loop */ + pgstat_report_activity(STATE_IDLE, NULL); + + /* + * Push apply error context callback. Fields will be filled while applying + * a change. + */ + errcallback.callback = apply_error_callback; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + apply_error_context_stack = error_context_stack; + + /* This outer loop iterates once per wait. */ + for (;;) + { + pgsocket fd = PGINVALID_SOCKET; + int rc; + int len; + char *buf = NULL; + bool endofstream = false; + long wait_time; + + CHECK_FOR_INTERRUPTS(); + + MemoryContextSwitchTo(ApplyMessageContext); + + len = walrcv_receive(LogRepWorkerWalRcvConn, &buf, &fd); + + if (len != 0) + { + /* Loop to process all available data (without blocking). */ + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + if (len == 0) + { + break; + } + else if (len < 0) + { + ereport(LOG, + (errmsg("data stream from publisher has ended"))); + endofstream = true; + break; + } + else + { + int c; + StringInfoData s; + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + + /* Reset timeout. */ + last_recv_timestamp = GetCurrentTimestamp(); + ping_sent = false; + + /* Ensure we are reading the data into our memory context. */ + MemoryContextSwitchTo(ApplyMessageContext); + + s.data = buf; + s.len = len; + s.cursor = 0; + s.maxlen = -1; + + c = pq_getmsgbyte(&s); + + if (c == 'w') + { + XLogRecPtr start_lsn; + XLogRecPtr end_lsn; + TimestampTz send_time; + + start_lsn = pq_getmsgint64(&s); + end_lsn = pq_getmsgint64(&s); + send_time = pq_getmsgint64(&s); + + if (last_received < start_lsn) + last_received = start_lsn; + + if (last_received < end_lsn) + last_received = end_lsn; + + UpdateWorkerStats(last_received, send_time, false); + + apply_dispatch(&s); + } + else if (c == 'k') + { + XLogRecPtr end_lsn; + TimestampTz timestamp; + bool reply_requested; + + end_lsn = pq_getmsgint64(&s); + timestamp = pq_getmsgint64(&s); + reply_requested = pq_getmsgbyte(&s); + + if (last_received < end_lsn) + last_received = end_lsn; + + send_feedback(last_received, reply_requested, false); + UpdateWorkerStats(last_received, timestamp, true); + } + /* other message types are purposefully ignored */ + + MemoryContextReset(ApplyMessageContext); + } + + len = walrcv_receive(LogRepWorkerWalRcvConn, &buf, &fd); + } + } + + /* confirm all writes so far */ + send_feedback(last_received, false, false); + + if (!in_remote_transaction && !in_streamed_transaction) + { + /* + * If we didn't get any transactions for a while there might be + * unconsumed invalidation messages in the queue, consume them + * now. + */ + AcceptInvalidationMessages(); + maybe_reread_subscription(); + + /* Process any table synchronization changes. */ + process_syncing_tables(last_received); + } + + /* Cleanup the memory. */ + MemoryContextResetAndDeleteChildren(ApplyMessageContext); + MemoryContextSwitchTo(TopMemoryContext); + + /* Check if we need to exit the streaming loop. */ + if (endofstream) + break; + + /* + * Wait for more data or latch. If we have unflushed transactions, + * wake up after WalWriterDelay to see if they've been flushed yet (in + * which case we should send a feedback message). Otherwise, there's + * no particular urgency about waking up unless we get data or a + * signal. + */ + if (!dlist_is_empty(&lsn_mapping)) + wait_time = WalWriterDelay; + else + wait_time = NAPTIME_PER_CYCLE; + + rc = WaitLatchOrSocket(MyLatch, + WL_SOCKET_READABLE | WL_LATCH_SET | + WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + fd, wait_time, + WAIT_EVENT_LOGICAL_APPLY_MAIN); + + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + } + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + + if (rc & WL_TIMEOUT) + { + /* + * We didn't receive anything new. If we haven't heard anything + * from the server for more than wal_receiver_timeout / 2, ping + * the server. Also, if it's been longer than + * wal_receiver_status_interval since the last update we sent, + * send a status update to the primary anyway, to report any + * progress in applying WAL. + */ + bool requestReply = false; + + /* + * Check if time since last receive from primary has reached the + * configured limit. + */ + if (wal_receiver_timeout > 0) + { + TimestampTz now = GetCurrentTimestamp(); + TimestampTz timeout; + + timeout = + TimestampTzPlusMilliseconds(last_recv_timestamp, + wal_receiver_timeout); + + if (now >= timeout) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("terminating logical replication worker due to timeout"))); + + /* Check to see if it's time for a ping. */ + if (!ping_sent) + { + timeout = TimestampTzPlusMilliseconds(last_recv_timestamp, + (wal_receiver_timeout / 2)); + if (now >= timeout) + { + requestReply = true; + ping_sent = true; + } + } + } + + send_feedback(last_received, requestReply, requestReply); + + /* + * Force reporting to ensure long idle periods don't lead to + * arbitrarily delayed stats. Stats can only be reported outside + * of (implicit or explicit) transactions. That shouldn't lead to + * stats being delayed for long, because transactions are either + * sent as a whole on commit or streamed. Streamed transactions + * are spilled to disk and applied on commit. + */ + if (!IsTransactionState()) + pgstat_report_stat(true); + } + } + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + apply_error_context_stack = error_context_stack; + + /* All done */ + walrcv_endstreaming(LogRepWorkerWalRcvConn, &tli); +} + +/* + * Send a Standby Status Update message to server. + * + * 'recvpos' is the latest LSN we've received data to, force is set if we need + * to send a response to avoid timeouts. + */ +static void +send_feedback(XLogRecPtr recvpos, bool force, bool requestReply) +{ + static StringInfo reply_message = NULL; + static TimestampTz send_time = 0; + + static XLogRecPtr last_recvpos = InvalidXLogRecPtr; + static XLogRecPtr last_writepos = InvalidXLogRecPtr; + static XLogRecPtr last_flushpos = InvalidXLogRecPtr; + + XLogRecPtr writepos; + XLogRecPtr flushpos; + TimestampTz now; + bool have_pending_txes; + + /* + * If the user doesn't want status to be reported to the publisher, be + * sure to exit before doing anything at all. + */ + if (!force && wal_receiver_status_interval <= 0) + return; + + /* It's legal to not pass a recvpos */ + if (recvpos < last_recvpos) + recvpos = last_recvpos; + + get_flush_position(&writepos, &flushpos, &have_pending_txes); + + /* + * No outstanding transactions to flush, we can report the latest received + * position. This is important for synchronous replication. + */ + if (!have_pending_txes) + flushpos = writepos = recvpos; + + if (writepos < last_writepos) + writepos = last_writepos; + + if (flushpos < last_flushpos) + flushpos = last_flushpos; + + now = GetCurrentTimestamp(); + + /* if we've already reported everything we're good */ + if (!force && + writepos == last_writepos && + flushpos == last_flushpos && + !TimestampDifferenceExceeds(send_time, now, + wal_receiver_status_interval * 1000)) + return; + send_time = now; + + if (!reply_message) + { + MemoryContext oldctx = MemoryContextSwitchTo(ApplyContext); + + reply_message = makeStringInfo(); + MemoryContextSwitchTo(oldctx); + } + else + resetStringInfo(reply_message); + + pq_sendbyte(reply_message, 'r'); + pq_sendint64(reply_message, recvpos); /* write */ + pq_sendint64(reply_message, flushpos); /* flush */ + pq_sendint64(reply_message, writepos); /* apply */ + pq_sendint64(reply_message, now); /* sendTime */ + pq_sendbyte(reply_message, requestReply); /* replyRequested */ + + elog(DEBUG2, "sending feedback (force %d) to recv %X/%X, write %X/%X, flush %X/%X", + force, + LSN_FORMAT_ARGS(recvpos), + LSN_FORMAT_ARGS(writepos), + LSN_FORMAT_ARGS(flushpos)); + + walrcv_send(LogRepWorkerWalRcvConn, + reply_message->data, reply_message->len); + + if (recvpos > last_recvpos) + last_recvpos = recvpos; + if (writepos > last_writepos) + last_writepos = writepos; + if (flushpos > last_flushpos) + last_flushpos = flushpos; +} + +/* + * Exit routine for apply workers due to subscription parameter changes. + */ +static void +apply_worker_exit(void) +{ + if (am_parallel_apply_worker()) + { + /* + * Don't stop the parallel apply worker as the leader will detect the + * subscription parameter change and restart logical replication later + * anyway. This also prevents the leader from reporting errors when + * trying to communicate with a stopped parallel apply worker, which + * would accidentally disable subscriptions if disable_on_error was + * set. + */ + return; + } + + /* + * Reset the last-start time for this apply worker so that the launcher + * will restart it without waiting for wal_retrieve_retry_interval if the + * subscription is still active, and so that we won't leak that hash table + * entry if it isn't. + */ + if (!am_tablesync_worker()) + ApplyLauncherForgetWorkerStartTime(MyLogicalRepWorker->subid); + + proc_exit(0); +} + +/* + * Reread subscription info if needed. Most changes will be exit. + */ +void +maybe_reread_subscription(void) +{ + MemoryContext oldctx; + Subscription *newsub; + bool started_tx = false; + + /* When cache state is valid there is nothing to do here. */ + if (MySubscriptionValid) + return; + + /* This function might be called inside or outside of transaction. */ + if (!IsTransactionState()) + { + StartTransactionCommand(); + started_tx = true; + } + + /* Ensure allocations in permanent context. */ + oldctx = MemoryContextSwitchTo(ApplyContext); + + newsub = GetSubscription(MyLogicalRepWorker->subid, true); + + /* + * Exit if the subscription was removed. This normally should not happen + * as the worker gets killed during DROP SUBSCRIPTION. + */ + if (!newsub) + { + ereport(LOG, + (errmsg("logical replication worker for subscription \"%s\" will stop because the subscription was removed", + MySubscription->name))); + + /* Ensure we remove no-longer-useful entry for worker's start time */ + if (!am_tablesync_worker() && !am_parallel_apply_worker()) + ApplyLauncherForgetWorkerStartTime(MyLogicalRepWorker->subid); + proc_exit(0); + } + + /* Exit if the subscription was disabled. */ + if (!newsub->enabled) + { + ereport(LOG, + (errmsg("logical replication worker for subscription \"%s\" will stop because the subscription was disabled", + MySubscription->name))); + + apply_worker_exit(); + } + + /* !slotname should never happen when enabled is true. */ + Assert(newsub->slotname); + + /* two-phase should not be altered */ + Assert(newsub->twophasestate == MySubscription->twophasestate); + + /* + * Exit if any parameter that affects the remote connection was changed. + * The launcher will start a new worker but note that the parallel apply + * worker won't restart if the streaming option's value is changed from + * 'parallel' to any other value or the server decides not to stream the + * in-progress transaction. + */ + if (strcmp(newsub->conninfo, MySubscription->conninfo) != 0 || + strcmp(newsub->name, MySubscription->name) != 0 || + strcmp(newsub->slotname, MySubscription->slotname) != 0 || + newsub->binary != MySubscription->binary || + newsub->stream != MySubscription->stream || + newsub->passwordrequired != MySubscription->passwordrequired || + strcmp(newsub->origin, MySubscription->origin) != 0 || + newsub->owner != MySubscription->owner || + !equal(newsub->publications, MySubscription->publications)) + { + if (am_parallel_apply_worker()) + ereport(LOG, + (errmsg("logical replication parallel apply worker for subscription \"%s\" will stop because of a parameter change", + MySubscription->name))); + else + ereport(LOG, + (errmsg("logical replication worker for subscription \"%s\" will restart because of a parameter change", + MySubscription->name))); + + apply_worker_exit(); + } + + /* Check for other changes that should never happen too. */ + if (newsub->dbid != MySubscription->dbid) + { + elog(ERROR, "subscription %u changed unexpectedly", + MyLogicalRepWorker->subid); + } + + /* Clean old subscription info and switch to new one. */ + FreeSubscription(MySubscription); + MySubscription = newsub; + + MemoryContextSwitchTo(oldctx); + + /* Change synchronous commit according to the user's wishes */ + SetConfigOption("synchronous_commit", MySubscription->synccommit, + PGC_BACKEND, PGC_S_OVERRIDE); + + if (started_tx) + CommitTransactionCommand(); + + MySubscriptionValid = true; +} + +/* + * Callback from subscription syscache invalidation. + */ +static void +subscription_change_cb(Datum arg, int cacheid, uint32 hashvalue) +{ + MySubscriptionValid = false; +} + +/* + * subxact_info_write + * Store information about subxacts for a toplevel transaction. + * + * For each subxact we store offset of it's first change in the main file. + * The file is always over-written as a whole. + * + * XXX We should only store subxacts that were not aborted yet. + */ +static void +subxact_info_write(Oid subid, TransactionId xid) +{ + char path[MAXPGPATH]; + Size len; + BufFile *fd; + + Assert(TransactionIdIsValid(xid)); + + /* construct the subxact filename */ + subxact_filename(path, subid, xid); + + /* Delete the subxacts file, if exists. */ + if (subxact_data.nsubxacts == 0) + { + cleanup_subxact_info(); + BufFileDeleteFileSet(MyLogicalRepWorker->stream_fileset, path, true); + + return; + } + + /* + * Create the subxact file if it not already created, otherwise open the + * existing file. + */ + fd = BufFileOpenFileSet(MyLogicalRepWorker->stream_fileset, path, O_RDWR, + true); + if (fd == NULL) + fd = BufFileCreateFileSet(MyLogicalRepWorker->stream_fileset, path); + + len = sizeof(SubXactInfo) * subxact_data.nsubxacts; + + /* Write the subxact count and subxact info */ + BufFileWrite(fd, &subxact_data.nsubxacts, sizeof(subxact_data.nsubxacts)); + BufFileWrite(fd, subxact_data.subxacts, len); + + BufFileClose(fd); + + /* free the memory allocated for subxact info */ + cleanup_subxact_info(); +} + +/* + * subxact_info_read + * Restore information about subxacts of a streamed transaction. + * + * Read information about subxacts into the structure subxact_data that can be + * used later. + */ +static void +subxact_info_read(Oid subid, TransactionId xid) +{ + char path[MAXPGPATH]; + Size len; + BufFile *fd; + MemoryContext oldctx; + + Assert(!subxact_data.subxacts); + Assert(subxact_data.nsubxacts == 0); + Assert(subxact_data.nsubxacts_max == 0); + + /* + * If the subxact file doesn't exist that means we don't have any subxact + * info. + */ + subxact_filename(path, subid, xid); + fd = BufFileOpenFileSet(MyLogicalRepWorker->stream_fileset, path, O_RDONLY, + true); + if (fd == NULL) + return; + + /* read number of subxact items */ + BufFileReadExact(fd, &subxact_data.nsubxacts, sizeof(subxact_data.nsubxacts)); + + len = sizeof(SubXactInfo) * subxact_data.nsubxacts; + + /* we keep the maximum as a power of 2 */ + subxact_data.nsubxacts_max = 1 << my_log2(subxact_data.nsubxacts); + + /* + * Allocate subxact information in the logical streaming context. We need + * this information during the complete stream so that we can add the sub + * transaction info to this. On stream stop we will flush this information + * to the subxact file and reset the logical streaming context. + */ + oldctx = MemoryContextSwitchTo(LogicalStreamingContext); + subxact_data.subxacts = palloc(subxact_data.nsubxacts_max * + sizeof(SubXactInfo)); + MemoryContextSwitchTo(oldctx); + + if (len > 0) + BufFileReadExact(fd, subxact_data.subxacts, len); + + BufFileClose(fd); +} + +/* + * subxact_info_add + * Add information about a subxact (offset in the main file). + */ +static void +subxact_info_add(TransactionId xid) +{ + SubXactInfo *subxacts = subxact_data.subxacts; + int64 i; + + /* We must have a valid top level stream xid and a stream fd. */ + Assert(TransactionIdIsValid(stream_xid)); + Assert(stream_fd != NULL); + + /* + * If the XID matches the toplevel transaction, we don't want to add it. + */ + if (stream_xid == xid) + return; + + /* + * In most cases we're checking the same subxact as we've already seen in + * the last call, so make sure to ignore it (this change comes later). + */ + if (subxact_data.subxact_last == xid) + return; + + /* OK, remember we're processing this XID. */ + subxact_data.subxact_last = xid; + + /* + * Check if the transaction is already present in the array of subxact. We + * intentionally scan the array from the tail, because we're likely adding + * a change for the most recent subtransactions. + * + * XXX Can we rely on the subxact XIDs arriving in sorted order? That + * would allow us to use binary search here. + */ + for (i = subxact_data.nsubxacts; i > 0; i--) + { + /* found, so we're done */ + if (subxacts[i - 1].xid == xid) + return; + } + + /* This is a new subxact, so we need to add it to the array. */ + if (subxact_data.nsubxacts == 0) + { + MemoryContext oldctx; + + subxact_data.nsubxacts_max = 128; + + /* + * Allocate this memory for subxacts in per-stream context, see + * subxact_info_read. + */ + oldctx = MemoryContextSwitchTo(LogicalStreamingContext); + subxacts = palloc(subxact_data.nsubxacts_max * sizeof(SubXactInfo)); + MemoryContextSwitchTo(oldctx); + } + else if (subxact_data.nsubxacts == subxact_data.nsubxacts_max) + { + subxact_data.nsubxacts_max *= 2; + subxacts = repalloc(subxacts, + subxact_data.nsubxacts_max * sizeof(SubXactInfo)); + } + + subxacts[subxact_data.nsubxacts].xid = xid; + + /* + * Get the current offset of the stream file and store it as offset of + * this subxact. + */ + BufFileTell(stream_fd, + &subxacts[subxact_data.nsubxacts].fileno, + &subxacts[subxact_data.nsubxacts].offset); + + subxact_data.nsubxacts++; + subxact_data.subxacts = subxacts; +} + +/* format filename for file containing the info about subxacts */ +static inline void +subxact_filename(char *path, Oid subid, TransactionId xid) +{ + snprintf(path, MAXPGPATH, "%u-%u.subxacts", subid, xid); +} + +/* format filename for file containing serialized changes */ +static inline void +changes_filename(char *path, Oid subid, TransactionId xid) +{ + snprintf(path, MAXPGPATH, "%u-%u.changes", subid, xid); +} + +/* + * stream_cleanup_files + * Cleanup files for a subscription / toplevel transaction. + * + * Remove files with serialized changes and subxact info for a particular + * toplevel transaction. Each subscription has a separate set of files + * for any toplevel transaction. + */ +void +stream_cleanup_files(Oid subid, TransactionId xid) +{ + char path[MAXPGPATH]; + + /* Delete the changes file. */ + changes_filename(path, subid, xid); + BufFileDeleteFileSet(MyLogicalRepWorker->stream_fileset, path, false); + + /* Delete the subxact file, if it exists. */ + subxact_filename(path, subid, xid); + BufFileDeleteFileSet(MyLogicalRepWorker->stream_fileset, path, true); +} + +/* + * stream_open_file + * Open a file that we'll use to serialize changes for a toplevel + * transaction. + * + * Open a file for streamed changes from a toplevel transaction identified + * by stream_xid (global variable). If it's the first chunk of streamed + * changes for this transaction, create the buffile, otherwise open the + * previously created file. + */ +static void +stream_open_file(Oid subid, TransactionId xid, bool first_segment) +{ + char path[MAXPGPATH]; + MemoryContext oldcxt; + + Assert(OidIsValid(subid)); + Assert(TransactionIdIsValid(xid)); + Assert(stream_fd == NULL); + + + changes_filename(path, subid, xid); + elog(DEBUG1, "opening file \"%s\" for streamed changes", path); + + /* + * Create/open the buffiles under the logical streaming context so that we + * have those files until stream stop. + */ + oldcxt = MemoryContextSwitchTo(LogicalStreamingContext); + + /* + * If this is the first streamed segment, create the changes file. + * Otherwise, just open the file for writing, in append mode. + */ + if (first_segment) + stream_fd = BufFileCreateFileSet(MyLogicalRepWorker->stream_fileset, + path); + else + { + /* + * Open the file and seek to the end of the file because we always + * append the changes file. + */ + stream_fd = BufFileOpenFileSet(MyLogicalRepWorker->stream_fileset, + path, O_RDWR, false); + BufFileSeek(stream_fd, 0, 0, SEEK_END); + } + + MemoryContextSwitchTo(oldcxt); +} + +/* + * stream_close_file + * Close the currently open file with streamed changes. + */ +static void +stream_close_file(void) +{ + Assert(stream_fd != NULL); + + BufFileClose(stream_fd); + + stream_fd = NULL; +} + +/* + * stream_write_change + * Serialize a change to a file for the current toplevel transaction. + * + * The change is serialized in a simple format, with length (not including + * the length), action code (identifying the message type) and message + * contents (without the subxact TransactionId value). + */ +static void +stream_write_change(char action, StringInfo s) +{ + int len; + + Assert(stream_fd != NULL); + + /* total on-disk size, including the action type character */ + len = (s->len - s->cursor) + sizeof(char); + + /* first write the size */ + BufFileWrite(stream_fd, &len, sizeof(len)); + + /* then the action */ + BufFileWrite(stream_fd, &action, sizeof(action)); + + /* and finally the remaining part of the buffer (after the XID) */ + len = (s->len - s->cursor); + + BufFileWrite(stream_fd, &s->data[s->cursor], len); +} + +/* + * stream_open_and_write_change + * Serialize a message to a file for the given transaction. + * + * This function is similar to stream_write_change except that it will open the + * target file if not already before writing the message and close the file at + * the end. + */ +static void +stream_open_and_write_change(TransactionId xid, char action, StringInfo s) +{ + Assert(!in_streamed_transaction); + + if (!stream_fd) + stream_start_internal(xid, false); + + stream_write_change(action, s); + stream_stop_internal(xid); +} + +/* + * Cleanup the memory for subxacts and reset the related variables. + */ +static inline void +cleanup_subxact_info() +{ + if (subxact_data.subxacts) + pfree(subxact_data.subxacts); + + subxact_data.subxacts = NULL; + subxact_data.subxact_last = InvalidTransactionId; + subxact_data.nsubxacts = 0; + subxact_data.nsubxacts_max = 0; +} + +/* + * Form the prepared transaction GID for two_phase transactions. + * + * Return the GID in the supplied buffer. + */ +static void +TwoPhaseTransactionGid(Oid subid, TransactionId xid, char *gid, int szgid) +{ + Assert(subid != InvalidRepOriginId); + + if (!TransactionIdIsValid(xid)) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("invalid two-phase transaction ID"))); + + snprintf(gid, szgid, "pg_gid_%u_%u", subid, xid); +} + +/* + * Execute the initial sync with error handling. Disable the subscription, + * if it's required. + * + * Allocate the slot name in long-lived context on return. Note that we don't + * handle FATAL errors which are probably because of system resource error and + * are not repeatable. + */ +static void +start_table_sync(XLogRecPtr *origin_startpos, char **myslotname) +{ + char *syncslotname = NULL; + + Assert(am_tablesync_worker()); + + PG_TRY(); + { + /* Call initial sync. */ + syncslotname = LogicalRepSyncTableStart(origin_startpos); + } + PG_CATCH(); + { + if (MySubscription->disableonerr) + DisableSubscriptionAndExit(); + else + { + /* + * Report the worker failed during table synchronization. Abort + * the current transaction so that the stats message is sent in an + * idle state. + */ + AbortOutOfAnyTransaction(); + pgstat_report_subscription_error(MySubscription->oid, false); + + PG_RE_THROW(); + } + } + PG_END_TRY(); + + /* allocate slot name in long-lived context */ + *myslotname = MemoryContextStrdup(ApplyContext, syncslotname); + pfree(syncslotname); +} + +/* + * Run the apply loop with error handling. Disable the subscription, + * if necessary. + * + * Note that we don't handle FATAL errors which are probably because + * of system resource error and are not repeatable. + */ +static void +start_apply(XLogRecPtr origin_startpos) +{ + PG_TRY(); + { + LogicalRepApplyLoop(origin_startpos); + } + PG_CATCH(); + { + if (MySubscription->disableonerr) + DisableSubscriptionAndExit(); + else + { + /* + * Report the worker failed while applying changes. Abort the + * current transaction so that the stats message is sent in an + * idle state. + */ + AbortOutOfAnyTransaction(); + pgstat_report_subscription_error(MySubscription->oid, !am_tablesync_worker()); + + PG_RE_THROW(); + } + } + PG_END_TRY(); +} + +/* + * Common initialization for leader apply worker and parallel apply worker. + * + * Initialize the database connection, in-memory subscription and necessary + * config options. + */ +void +InitializeApplyWorker(void) +{ + MemoryContext oldctx; + + /* Run as replica session replication role. */ + SetConfigOption("session_replication_role", "replica", + PGC_SUSET, PGC_S_OVERRIDE); + + /* Connect to our database. */ + BackgroundWorkerInitializeConnectionByOid(MyLogicalRepWorker->dbid, + MyLogicalRepWorker->userid, + 0); + + /* + * Set always-secure search path, so malicious users can't redirect user + * code (e.g. pg_index.indexprs). + */ + SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE); + + /* Load the subscription into persistent memory context. */ + ApplyContext = AllocSetContextCreate(TopMemoryContext, + "ApplyContext", + ALLOCSET_DEFAULT_SIZES); + StartTransactionCommand(); + oldctx = MemoryContextSwitchTo(ApplyContext); + + MySubscription = GetSubscription(MyLogicalRepWorker->subid, true); + if (!MySubscription) + { + ereport(LOG, + (errmsg("logical replication worker for subscription %u will not start because the subscription was removed during startup", + MyLogicalRepWorker->subid))); + + /* Ensure we remove no-longer-useful entry for worker's start time */ + if (!am_tablesync_worker() && !am_parallel_apply_worker()) + ApplyLauncherForgetWorkerStartTime(MyLogicalRepWorker->subid); + proc_exit(0); + } + + MySubscriptionValid = true; + MemoryContextSwitchTo(oldctx); + + if (!MySubscription->enabled) + { + ereport(LOG, + (errmsg("logical replication worker for subscription \"%s\" will not start because the subscription was disabled during startup", + MySubscription->name))); + + apply_worker_exit(); + } + + /* Setup synchronous commit according to the user's wishes */ + SetConfigOption("synchronous_commit", MySubscription->synccommit, + PGC_BACKEND, PGC_S_OVERRIDE); + + /* Keep us informed about subscription changes. */ + CacheRegisterSyscacheCallback(SUBSCRIPTIONOID, + subscription_change_cb, + (Datum) 0); + + if (am_tablesync_worker()) + ereport(LOG, + (errmsg("logical replication table synchronization worker for subscription \"%s\", table \"%s\" has started", + MySubscription->name, + get_rel_name(MyLogicalRepWorker->relid)))); + else + ereport(LOG, + (errmsg("logical replication apply worker for subscription \"%s\" has started", + MySubscription->name))); + + CommitTransactionCommand(); +} + +/* Logical Replication Apply worker entry point */ +void +ApplyWorkerMain(Datum main_arg) +{ + int worker_slot = DatumGetInt32(main_arg); + char originname[NAMEDATALEN]; + XLogRecPtr origin_startpos = InvalidXLogRecPtr; + char *myslotname = NULL; + WalRcvStreamOptions options; + int server_version; + + InitializingApplyWorker = true; + + /* Attach to slot */ + logicalrep_worker_attach(worker_slot); + + /* Setup signal handling */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); + BackgroundWorkerUnblockSignals(); + + /* + * We don't currently need any ResourceOwner in a walreceiver process, but + * if we did, we could call CreateAuxProcessResourceOwner here. + */ + + /* Initialise stats to a sanish value */ + MyLogicalRepWorker->last_send_time = MyLogicalRepWorker->last_recv_time = + MyLogicalRepWorker->reply_time = GetCurrentTimestamp(); + + /* Load the libpq-specific functions */ + load_file("libpqwalreceiver", false); + + InitializeApplyWorker(); + + InitializingApplyWorker = false; + + /* Connect to the origin and start the replication. */ + elog(DEBUG1, "connecting to publisher using connection string \"%s\"", + MySubscription->conninfo); + + if (am_tablesync_worker()) + { + start_table_sync(&origin_startpos, &myslotname); + + ReplicationOriginNameForLogicalRep(MySubscription->oid, + MyLogicalRepWorker->relid, + originname, + sizeof(originname)); + set_apply_error_context_origin(originname); + } + else + { + /* This is the leader apply worker */ + RepOriginId originid; + TimeLineID startpointTLI; + char *err; + bool must_use_password; + + myslotname = MySubscription->slotname; + + /* + * This shouldn't happen if the subscription is enabled, but guard + * against DDL bugs or manual catalog changes. (libpqwalreceiver will + * crash if slot is NULL.) + */ + if (!myslotname) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("subscription has no replication slot set"))); + + /* Setup replication origin tracking. */ + StartTransactionCommand(); + ReplicationOriginNameForLogicalRep(MySubscription->oid, InvalidOid, + originname, sizeof(originname)); + originid = replorigin_by_name(originname, true); + if (!OidIsValid(originid)) + originid = replorigin_create(originname); + replorigin_session_setup(originid, 0); + replorigin_session_origin = originid; + origin_startpos = replorigin_session_get_progress(false); + + /* Is the use of a password mandatory? */ + must_use_password = MySubscription->passwordrequired && + !superuser_arg(MySubscription->owner); + + /* Note that the superuser_arg call can access the DB */ + CommitTransactionCommand(); + + LogRepWorkerWalRcvConn = walrcv_connect(MySubscription->conninfo, true, + must_use_password, + MySubscription->name, &err); + if (LogRepWorkerWalRcvConn == NULL) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not connect to the publisher: %s", err))); + + /* + * We don't really use the output identify_system for anything but it + * does some initializations on the upstream so let's still call it. + */ + (void) walrcv_identify_system(LogRepWorkerWalRcvConn, &startpointTLI); + + set_apply_error_context_origin(originname); + } + + /* + * Setup callback for syscache so that we know when something changes in + * the subscription relation state. + */ + CacheRegisterSyscacheCallback(SUBSCRIPTIONRELMAP, + invalidate_syncing_table_states, + (Datum) 0); + + /* Build logical replication streaming options. */ + options.logical = true; + options.startpoint = origin_startpos; + options.slotname = myslotname; + + server_version = walrcv_server_version(LogRepWorkerWalRcvConn); + options.proto.logical.proto_version = + server_version >= 160000 ? LOGICALREP_PROTO_STREAM_PARALLEL_VERSION_NUM : + server_version >= 150000 ? LOGICALREP_PROTO_TWOPHASE_VERSION_NUM : + server_version >= 140000 ? LOGICALREP_PROTO_STREAM_VERSION_NUM : + LOGICALREP_PROTO_VERSION_NUM; + + options.proto.logical.publication_names = MySubscription->publications; + options.proto.logical.binary = MySubscription->binary; + + /* + * Assign the appropriate option value for streaming option according to + * the 'streaming' mode and the publisher's ability to support that mode. + */ + if (server_version >= 160000 && + MySubscription->stream == LOGICALREP_STREAM_PARALLEL) + { + options.proto.logical.streaming_str = "parallel"; + MyLogicalRepWorker->parallel_apply = true; + } + else if (server_version >= 140000 && + MySubscription->stream != LOGICALREP_STREAM_OFF) + { + options.proto.logical.streaming_str = "on"; + MyLogicalRepWorker->parallel_apply = false; + } + else + { + options.proto.logical.streaming_str = NULL; + MyLogicalRepWorker->parallel_apply = false; + } + + options.proto.logical.twophase = false; + options.proto.logical.origin = pstrdup(MySubscription->origin); + + if (!am_tablesync_worker()) + { + /* + * Even when the two_phase mode is requested by the user, it remains + * as the tri-state PENDING until all tablesyncs have reached READY + * state. Only then, can it become ENABLED. + * + * Note: If the subscription has no tables then leave the state as + * PENDING, which allows ALTER SUBSCRIPTION ... REFRESH PUBLICATION to + * work. + */ + if (MySubscription->twophasestate == LOGICALREP_TWOPHASE_STATE_PENDING && + AllTablesyncsReady()) + { + /* Start streaming with two_phase enabled */ + options.proto.logical.twophase = true; + walrcv_startstreaming(LogRepWorkerWalRcvConn, &options); + + StartTransactionCommand(); + UpdateTwoPhaseState(MySubscription->oid, LOGICALREP_TWOPHASE_STATE_ENABLED); + MySubscription->twophasestate = LOGICALREP_TWOPHASE_STATE_ENABLED; + CommitTransactionCommand(); + } + else + { + walrcv_startstreaming(LogRepWorkerWalRcvConn, &options); + } + + ereport(DEBUG1, + (errmsg_internal("logical replication apply worker for subscription \"%s\" two_phase is %s", + MySubscription->name, + MySubscription->twophasestate == LOGICALREP_TWOPHASE_STATE_DISABLED ? "DISABLED" : + MySubscription->twophasestate == LOGICALREP_TWOPHASE_STATE_PENDING ? "PENDING" : + MySubscription->twophasestate == LOGICALREP_TWOPHASE_STATE_ENABLED ? "ENABLED" : + "?"))); + } + else + { + /* Start normal logical streaming replication. */ + walrcv_startstreaming(LogRepWorkerWalRcvConn, &options); + } + + /* Run the main loop. */ + start_apply(origin_startpos); + + proc_exit(0); +} + +/* + * After error recovery, disable the subscription in a new transaction + * and exit cleanly. + */ +static void +DisableSubscriptionAndExit(void) +{ + /* + * Emit the error message, and recover from the error state to an idle + * state + */ + HOLD_INTERRUPTS(); + + EmitErrorReport(); + AbortOutOfAnyTransaction(); + FlushErrorState(); + + RESUME_INTERRUPTS(); + + /* Report the worker failed during either table synchronization or apply */ + pgstat_report_subscription_error(MyLogicalRepWorker->subid, + !am_tablesync_worker()); + + /* Disable the subscription */ + StartTransactionCommand(); + DisableSubscription(MySubscription->oid); + CommitTransactionCommand(); + + /* Ensure we remove no-longer-useful entry for worker's start time */ + if (!am_tablesync_worker() && !am_parallel_apply_worker()) + ApplyLauncherForgetWorkerStartTime(MyLogicalRepWorker->subid); + + /* Notify the subscription has been disabled and exit */ + ereport(LOG, + errmsg("subscription \"%s\" has been disabled because of an error", + MySubscription->name)); + + proc_exit(0); +} + +/* + * Is current process a logical replication worker? + */ +bool +IsLogicalWorker(void) +{ + return MyLogicalRepWorker != NULL; +} + +/* + * Is current process a logical replication parallel apply worker? + */ +bool +IsLogicalParallelApplyWorker(void) +{ + return IsLogicalWorker() && am_parallel_apply_worker(); +} + +/* + * Start skipping changes of the transaction if the given LSN matches the + * LSN specified by subscription's skiplsn. + */ +static void +maybe_start_skipping_changes(XLogRecPtr finish_lsn) +{ + Assert(!is_skipping_changes()); + Assert(!in_remote_transaction); + Assert(!in_streamed_transaction); + + /* + * Quick return if it's not requested to skip this transaction. This + * function is called for every remote transaction and we assume that + * skipping the transaction is not used often. + */ + if (likely(XLogRecPtrIsInvalid(MySubscription->skiplsn) || + MySubscription->skiplsn != finish_lsn)) + return; + + /* Start skipping all changes of this transaction */ + skip_xact_finish_lsn = finish_lsn; + + ereport(LOG, + errmsg("logical replication starts skipping transaction at LSN %X/%X", + LSN_FORMAT_ARGS(skip_xact_finish_lsn))); +} + +/* + * Stop skipping changes by resetting skip_xact_finish_lsn if enabled. + */ +static void +stop_skipping_changes(void) +{ + if (!is_skipping_changes()) + return; + + ereport(LOG, + (errmsg("logical replication completed skipping transaction at LSN %X/%X", + LSN_FORMAT_ARGS(skip_xact_finish_lsn)))); + + /* Stop skipping changes */ + skip_xact_finish_lsn = InvalidXLogRecPtr; +} + +/* + * Clear subskiplsn of pg_subscription catalog. + * + * finish_lsn is the transaction's finish LSN that is used to check if the + * subskiplsn matches it. If not matched, we raise a warning when clearing the + * subskiplsn in order to inform users for cases e.g., where the user mistakenly + * specified the wrong subskiplsn. + */ +static void +clear_subscription_skip_lsn(XLogRecPtr finish_lsn) +{ + Relation rel; + Form_pg_subscription subform; + HeapTuple tup; + XLogRecPtr myskiplsn = MySubscription->skiplsn; + bool started_tx = false; + + if (likely(XLogRecPtrIsInvalid(myskiplsn)) || am_parallel_apply_worker()) + return; + + if (!IsTransactionState()) + { + StartTransactionCommand(); + started_tx = true; + } + + /* + * Protect subskiplsn of pg_subscription from being concurrently updated + * while clearing it. + */ + LockSharedObject(SubscriptionRelationId, MySubscription->oid, 0, + AccessShareLock); + + rel = table_open(SubscriptionRelationId, RowExclusiveLock); + + /* Fetch the existing tuple. */ + tup = SearchSysCacheCopy1(SUBSCRIPTIONOID, + ObjectIdGetDatum(MySubscription->oid)); + + if (!HeapTupleIsValid(tup)) + elog(ERROR, "subscription \"%s\" does not exist", MySubscription->name); + + subform = (Form_pg_subscription) GETSTRUCT(tup); + + /* + * Clear the subskiplsn. If the user has already changed subskiplsn before + * clearing it we don't update the catalog and the replication origin + * state won't get advanced. So in the worst case, if the server crashes + * before sending an acknowledgment of the flush position the transaction + * will be sent again and the user needs to set subskiplsn again. We can + * reduce the possibility by logging a replication origin WAL record to + * advance the origin LSN instead but there is no way to advance the + * origin timestamp and it doesn't seem to be worth doing anything about + * it since it's a very rare case. + */ + if (subform->subskiplsn == myskiplsn) + { + bool nulls[Natts_pg_subscription]; + bool replaces[Natts_pg_subscription]; + Datum values[Natts_pg_subscription]; + + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + memset(replaces, false, sizeof(replaces)); + + /* reset subskiplsn */ + values[Anum_pg_subscription_subskiplsn - 1] = LSNGetDatum(InvalidXLogRecPtr); + replaces[Anum_pg_subscription_subskiplsn - 1] = true; + + tup = heap_modify_tuple(tup, RelationGetDescr(rel), values, nulls, + replaces); + CatalogTupleUpdate(rel, &tup->t_self, tup); + + if (myskiplsn != finish_lsn) + ereport(WARNING, + errmsg("skip-LSN of subscription \"%s\" cleared", MySubscription->name), + errdetail("Remote transaction's finish WAL location (LSN) %X/%X did not match skip-LSN %X/%X.", + LSN_FORMAT_ARGS(finish_lsn), + LSN_FORMAT_ARGS(myskiplsn))); + } + + heap_freetuple(tup); + table_close(rel, NoLock); + + if (started_tx) + CommitTransactionCommand(); +} + +/* Error callback to give more context info about the change being applied */ +void +apply_error_callback(void *arg) +{ + ApplyErrorCallbackArg *errarg = &apply_error_callback_arg; + + if (apply_error_callback_arg.command == 0) + return; + + Assert(errarg->origin_name); + + if (errarg->rel == NULL) + { + if (!TransactionIdIsValid(errarg->remote_xid)) + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\"", + errarg->origin_name, + logicalrep_message_type(errarg->command)); + else if (XLogRecPtrIsInvalid(errarg->finish_lsn)) + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" in transaction %u", + errarg->origin_name, + logicalrep_message_type(errarg->command), + errarg->remote_xid); + else + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" in transaction %u, finished at %X/%X", + errarg->origin_name, + logicalrep_message_type(errarg->command), + errarg->remote_xid, + LSN_FORMAT_ARGS(errarg->finish_lsn)); + } + else + { + if (errarg->remote_attnum < 0) + { + if (XLogRecPtrIsInvalid(errarg->finish_lsn)) + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" in transaction %u", + errarg->origin_name, + logicalrep_message_type(errarg->command), + errarg->rel->remoterel.nspname, + errarg->rel->remoterel.relname, + errarg->remote_xid); + else + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" in transaction %u, finished at %X/%X", + errarg->origin_name, + logicalrep_message_type(errarg->command), + errarg->rel->remoterel.nspname, + errarg->rel->remoterel.relname, + errarg->remote_xid, + LSN_FORMAT_ARGS(errarg->finish_lsn)); + } + else + { + if (XLogRecPtrIsInvalid(errarg->finish_lsn)) + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" column \"%s\" in transaction %u", + errarg->origin_name, + logicalrep_message_type(errarg->command), + errarg->rel->remoterel.nspname, + errarg->rel->remoterel.relname, + errarg->rel->remoterel.attnames[errarg->remote_attnum], + errarg->remote_xid); + else + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" column \"%s\" in transaction %u, finished at %X/%X", + errarg->origin_name, + logicalrep_message_type(errarg->command), + errarg->rel->remoterel.nspname, + errarg->rel->remoterel.relname, + errarg->rel->remoterel.attnames[errarg->remote_attnum], + errarg->remote_xid, + LSN_FORMAT_ARGS(errarg->finish_lsn)); + } + } +} + +/* Set transaction information of apply error callback */ +static inline void +set_apply_error_context_xact(TransactionId xid, XLogRecPtr lsn) +{ + apply_error_callback_arg.remote_xid = xid; + apply_error_callback_arg.finish_lsn = lsn; +} + +/* Reset all information of apply error callback */ +static inline void +reset_apply_error_context_info(void) +{ + apply_error_callback_arg.command = 0; + apply_error_callback_arg.rel = NULL; + apply_error_callback_arg.remote_attnum = -1; + set_apply_error_context_xact(InvalidTransactionId, InvalidXLogRecPtr); +} + +/* + * Request wakeup of the workers for the given subscription OID + * at commit of the current transaction. + * + * This is used to ensure that the workers process assorted changes + * as soon as possible. + */ +void +LogicalRepWorkersWakeupAtCommit(Oid subid) +{ + MemoryContext oldcxt; + + oldcxt = MemoryContextSwitchTo(TopTransactionContext); + on_commit_wakeup_workers_subids = + list_append_unique_oid(on_commit_wakeup_workers_subids, subid); + MemoryContextSwitchTo(oldcxt); +} + +/* + * Wake up the workers of any subscriptions that were changed in this xact. + */ +void +AtEOXact_LogicalRepWorkers(bool isCommit) +{ + if (isCommit && on_commit_wakeup_workers_subids != NIL) + { + ListCell *lc; + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + foreach(lc, on_commit_wakeup_workers_subids) + { + Oid subid = lfirst_oid(lc); + List *workers; + ListCell *lc2; + + workers = logicalrep_workers_find(subid, true); + foreach(lc2, workers) + { + LogicalRepWorker *worker = (LogicalRepWorker *) lfirst(lc2); + + logicalrep_worker_wakeup_ptr(worker); + } + } + LWLockRelease(LogicalRepWorkerLock); + } + + /* The List storage will be reclaimed automatically in xact cleanup. */ + on_commit_wakeup_workers_subids = NIL; +} + +/* + * Allocate the origin name in long-lived context for error context message. + */ +void +set_apply_error_context_origin(char *originname) +{ + apply_error_callback_arg.origin_name = MemoryContextStrdup(ApplyContext, + originname); +} + +/* + * Return the action to be taken for the given transaction. See + * TransApplyAction for information on each of the actions. + * + * *winfo is assigned to the destination parallel worker info when the leader + * apply worker has to pass all the transaction's changes to the parallel + * apply worker. + */ +static TransApplyAction +get_transaction_apply_action(TransactionId xid, ParallelApplyWorkerInfo **winfo) +{ + *winfo = NULL; + + if (am_parallel_apply_worker()) + { + return TRANS_PARALLEL_APPLY; + } + + /* + * If we are processing this transaction using a parallel apply worker + * then either we send the changes to the parallel worker or if the worker + * is busy then serialize the changes to the file which will later be + * processed by the parallel worker. + */ + *winfo = pa_find_worker(xid); + + if (*winfo && (*winfo)->serialize_changes) + { + return TRANS_LEADER_PARTIAL_SERIALIZE; + } + else if (*winfo) + { + return TRANS_LEADER_SEND_TO_PARALLEL; + } + + /* + * If there is no parallel worker involved to process this transaction + * then we either directly apply the change or serialize it to a file + * which will later be applied when the transaction finish message is + * processed. + */ + else if (in_streamed_transaction) + { + return TRANS_LEADER_SERIALIZE; + } + else + { + return TRANS_LEADER_APPLY; + } +} diff --git a/src/backend/replication/meson.build b/src/backend/replication/meson.build new file mode 100644 index 0000000..c158a75 --- /dev/null +++ b/src/backend/replication/meson.build @@ -0,0 +1,53 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +backend_sources += files( + 'slot.c', + 'slotfuncs.c', + 'syncrep.c', + 'walreceiver.c', + 'walreceiverfuncs.c', + 'walsender.c', +) + +# see ../parser/meson.build +repl_parser_sources = [] + +repl_scanner = custom_target('repl_scanner', + input: 'repl_scanner.l', + output: 'repl_scanner.c', + command: flex_cmd, +) +generated_sources += repl_scanner +repl_parser_sources += repl_scanner + +repl_gram = custom_target('repl_gram', + input: 'repl_gram.y', + kwargs: bison_kw, +) +generated_sources += repl_gram.to_list() +repl_parser_sources += repl_gram + +syncrep_scanner = custom_target('syncrep_scanner', + input: 'syncrep_scanner.l', + output: 'syncrep_scanner.c', + command: flex_cmd, +) +generated_sources += syncrep_scanner +repl_parser_sources += syncrep_scanner + +syncrep_gram = custom_target('syncrep_gram', + input: 'syncrep_gram.y', + kwargs: bison_kw, +) +generated_sources += syncrep_gram.to_list() +repl_parser_sources += syncrep_gram + +repl_parser = static_library('repl_parser', + repl_parser_sources, + dependencies: [backend_code], + include_directories: include_directories('.'), + kwargs: internal_lib_args, +) +backend_link_with += repl_parser + +subdir('logical') diff --git a/src/backend/replication/pgoutput/Makefile b/src/backend/replication/pgoutput/Makefile new file mode 100644 index 0000000..3b41fbc --- /dev/null +++ b/src/backend/replication/pgoutput/Makefile @@ -0,0 +1,32 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for src/backend/replication/pgoutput +# +# IDENTIFICATION +# src/backend/replication/pgoutput +# +#------------------------------------------------------------------------- + +subdir = src/backend/replication/pgoutput +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + $(WIN32RES) \ + pgoutput.o +PGFILEDESC = "pgoutput - standard logical replication output plugin" +NAME = pgoutput + +all: all-shared-lib + +include $(top_srcdir)/src/Makefile.shlib + +install: all installdirs install-lib + +installdirs: installdirs-lib + +uninstall: uninstall-lib + +clean distclean maintainer-clean: clean-lib + rm -f $(OBJS) diff --git a/src/backend/replication/pgoutput/meson.build b/src/backend/replication/pgoutput/meson.build new file mode 100644 index 0000000..243c92d --- /dev/null +++ b/src/backend/replication/pgoutput/meson.build @@ -0,0 +1,18 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +pgoutput_sources = files( + 'pgoutput.c', +) + +if host_system == 'windows' + pgoutput_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pgoutput', + '--FILEDESC', 'pgoutput - standard logical replication output plugin',]) +endif + +pgoutput = shared_module('pgoutput', + pgoutput_sources, + kwargs: pg_mod_args, +) + +backend_targets += pgoutput diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c new file mode 100644 index 0000000..05688cd --- /dev/null +++ b/src/backend/replication/pgoutput/pgoutput.c @@ -0,0 +1,2362 @@ +/*------------------------------------------------------------------------- + * + * pgoutput.c + * Logical Replication output plugin + * + * Copyright (c) 2012-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/pgoutput/pgoutput.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/tupconvert.h" +#include "catalog/partition.h" +#include "catalog/pg_publication.h" +#include "catalog/pg_publication_rel.h" +#include "catalog/pg_subscription.h" +#include "commands/defrem.h" +#include "commands/subscriptioncmds.h" +#include "executor/executor.h" +#include "fmgr.h" +#include "nodes/makefuncs.h" +#include "optimizer/optimizer.h" +#include "parser/parse_relation.h" +#include "replication/logical.h" +#include "replication/logicalproto.h" +#include "replication/origin.h" +#include "replication/pgoutput.h" +#include "utils/builtins.h" +#include "utils/inval.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/syscache.h" +#include "utils/varlena.h" + +PG_MODULE_MAGIC; + +static void pgoutput_startup(LogicalDecodingContext *ctx, + OutputPluginOptions *opt, bool is_init); +static void pgoutput_shutdown(LogicalDecodingContext *ctx); +static void pgoutput_begin_txn(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn); +static void pgoutput_commit_txn(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, XLogRecPtr commit_lsn); +static void pgoutput_change(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, Relation relation, + ReorderBufferChange *change); +static void pgoutput_truncate(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, int nrelations, Relation relations[], + ReorderBufferChange *change); +static void pgoutput_message(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, XLogRecPtr message_lsn, + bool transactional, const char *prefix, + Size sz, const char *message); +static bool pgoutput_origin_filter(LogicalDecodingContext *ctx, + RepOriginId origin_id); +static void pgoutput_begin_prepare_txn(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn); +static void pgoutput_prepare_txn(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, XLogRecPtr prepare_lsn); +static void pgoutput_commit_prepared_txn(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, XLogRecPtr commit_lsn); +static void pgoutput_rollback_prepared_txn(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + XLogRecPtr prepare_end_lsn, + TimestampTz prepare_time); +static void pgoutput_stream_start(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn); +static void pgoutput_stream_stop(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn); +static void pgoutput_stream_abort(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + XLogRecPtr abort_lsn); +static void pgoutput_stream_commit(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + XLogRecPtr commit_lsn); +static void pgoutput_stream_prepare_txn(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, XLogRecPtr prepare_lsn); + +static bool publications_valid; +static bool in_streaming; + +static List *LoadPublications(List *pubnames); +static void publication_invalidation_cb(Datum arg, int cacheid, + uint32 hashvalue); +static void send_relation_and_attrs(Relation relation, TransactionId xid, + LogicalDecodingContext *ctx, + Bitmapset *columns); +static void send_repl_origin(LogicalDecodingContext *ctx, + RepOriginId origin_id, XLogRecPtr origin_lsn, + bool send_origin); + +/* + * Only 3 publication actions are used for row filtering ("insert", "update", + * "delete"). See RelationSyncEntry.exprstate[]. + */ +enum RowFilterPubAction +{ + PUBACTION_INSERT, + PUBACTION_UPDATE, + PUBACTION_DELETE +}; + +#define NUM_ROWFILTER_PUBACTIONS (PUBACTION_DELETE+1) + +/* + * Entry in the map used to remember which relation schemas we sent. + * + * The schema_sent flag determines if the current schema record for the + * relation (and for its ancestor if publish_as_relid is set) was already + * sent to the subscriber (in which case we don't need to send it again). + * + * The schema cache on downstream is however updated only at commit time, + * and with streamed transactions the commit order may be different from + * the order the transactions are sent in. Also, the (sub) transactions + * might get aborted so we need to send the schema for each (sub) transaction + * so that we don't lose the schema information on abort. For handling this, + * we maintain the list of xids (streamed_txns) for those we have already sent + * the schema. + * + * For partitions, 'pubactions' considers not only the table's own + * publications, but also those of all of its ancestors. + */ +typedef struct RelationSyncEntry +{ + Oid relid; /* relation oid */ + + bool replicate_valid; /* overall validity flag for entry */ + + bool schema_sent; + List *streamed_txns; /* streamed toplevel transactions with this + * schema */ + + /* are we publishing this rel? */ + PublicationActions pubactions; + + /* + * ExprState array for row filter. Different publication actions don't + * allow multiple expressions to always be combined into one, because + * updates or deletes restrict the column in expression to be part of the + * replica identity index whereas inserts do not have this restriction, so + * there is one ExprState per publication action. + */ + ExprState *exprstate[NUM_ROWFILTER_PUBACTIONS]; + EState *estate; /* executor state used for row filter */ + TupleTableSlot *new_slot; /* slot for storing new tuple */ + TupleTableSlot *old_slot; /* slot for storing old tuple */ + + /* + * OID of the relation to publish changes as. For a partition, this may + * be set to one of its ancestors whose schema will be used when + * replicating changes, if publish_via_partition_root is set for the + * publication. + */ + Oid publish_as_relid; + + /* + * Map used when replicating using an ancestor's schema to convert tuples + * from partition's type to the ancestor's; NULL if publish_as_relid is + * same as 'relid' or if unnecessary due to partition and the ancestor + * having identical TupleDesc. + */ + AttrMap *attrmap; + + /* + * Columns included in the publication, or NULL if all columns are + * included implicitly. Note that the attnums in this bitmap are not + * shifted by FirstLowInvalidHeapAttributeNumber. + */ + Bitmapset *columns; + + /* + * Private context to store additional data for this entry - state for the + * row filter expressions, column list, etc. + */ + MemoryContext entry_cxt; +} RelationSyncEntry; + +/* + * Maintain a per-transaction level variable to track whether the transaction + * has sent BEGIN. BEGIN is only sent when the first change in a transaction + * is processed. This makes it possible to skip sending a pair of BEGIN/COMMIT + * messages for empty transactions which saves network bandwidth. + * + * This optimization is not used for prepared transactions because if the + * WALSender restarts after prepare of a transaction and before commit prepared + * of the same transaction then we won't be able to figure out if we have + * skipped sending BEGIN/PREPARE of a transaction as it was empty. This is + * because we would have lost the in-memory txndata information that was + * present prior to the restart. This will result in sending a spurious + * COMMIT PREPARED without a corresponding prepared transaction at the + * downstream which would lead to an error when it tries to process it. + * + * XXX We could achieve this optimization by changing protocol to send + * additional information so that downstream can detect that the corresponding + * prepare has not been sent. However, adding such a check for every + * transaction in the downstream could be costly so we might want to do it + * optionally. + * + * We also don't have this optimization for streamed transactions because + * they can contain prepared transactions. + */ +typedef struct PGOutputTxnData +{ + bool sent_begin_txn; /* flag indicating whether BEGIN has been sent */ +} PGOutputTxnData; + +/* Map used to remember which relation schemas we sent. */ +static HTAB *RelationSyncCache = NULL; + +static void init_rel_sync_cache(MemoryContext cachectx); +static void cleanup_rel_sync_cache(TransactionId xid, bool is_commit); +static RelationSyncEntry *get_rel_sync_entry(PGOutputData *data, + Relation relation); +static void rel_sync_cache_relation_cb(Datum arg, Oid relid); +static void rel_sync_cache_publication_cb(Datum arg, int cacheid, + uint32 hashvalue); +static void set_schema_sent_in_streamed_txn(RelationSyncEntry *entry, + TransactionId xid); +static bool get_schema_sent_in_streamed_txn(RelationSyncEntry *entry, + TransactionId xid); +static void init_tuple_slot(PGOutputData *data, Relation relation, + RelationSyncEntry *entry); + +/* row filter routines */ +static EState *create_estate_for_relation(Relation rel); +static void pgoutput_row_filter_init(PGOutputData *data, + List *publications, + RelationSyncEntry *entry); +static bool pgoutput_row_filter_exec_expr(ExprState *state, + ExprContext *econtext); +static bool pgoutput_row_filter(Relation relation, TupleTableSlot *old_slot, + TupleTableSlot **new_slot_ptr, + RelationSyncEntry *entry, + ReorderBufferChangeType *action); + +/* column list routines */ +static void pgoutput_column_list_init(PGOutputData *data, + List *publications, + RelationSyncEntry *entry); + +/* + * Specify output plugin callbacks + */ +void +_PG_output_plugin_init(OutputPluginCallbacks *cb) +{ + cb->startup_cb = pgoutput_startup; + cb->begin_cb = pgoutput_begin_txn; + cb->change_cb = pgoutput_change; + cb->truncate_cb = pgoutput_truncate; + cb->message_cb = pgoutput_message; + cb->commit_cb = pgoutput_commit_txn; + + cb->begin_prepare_cb = pgoutput_begin_prepare_txn; + cb->prepare_cb = pgoutput_prepare_txn; + cb->commit_prepared_cb = pgoutput_commit_prepared_txn; + cb->rollback_prepared_cb = pgoutput_rollback_prepared_txn; + cb->filter_by_origin_cb = pgoutput_origin_filter; + cb->shutdown_cb = pgoutput_shutdown; + + /* transaction streaming */ + cb->stream_start_cb = pgoutput_stream_start; + cb->stream_stop_cb = pgoutput_stream_stop; + cb->stream_abort_cb = pgoutput_stream_abort; + cb->stream_commit_cb = pgoutput_stream_commit; + cb->stream_change_cb = pgoutput_change; + cb->stream_message_cb = pgoutput_message; + cb->stream_truncate_cb = pgoutput_truncate; + /* transaction streaming - two-phase commit */ + cb->stream_prepare_cb = pgoutput_stream_prepare_txn; +} + +static void +parse_output_parameters(List *options, PGOutputData *data) +{ + ListCell *lc; + bool protocol_version_given = false; + bool publication_names_given = false; + bool binary_option_given = false; + bool messages_option_given = false; + bool streaming_given = false; + bool two_phase_option_given = false; + bool origin_option_given = false; + + data->binary = false; + data->streaming = LOGICALREP_STREAM_OFF; + data->messages = false; + data->two_phase = false; + + foreach(lc, options) + { + DefElem *defel = (DefElem *) lfirst(lc); + + Assert(defel->arg == NULL || IsA(defel->arg, String)); + + /* Check each param, whether or not we recognize it */ + if (strcmp(defel->defname, "proto_version") == 0) + { + unsigned long parsed; + char *endptr; + + if (protocol_version_given) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + protocol_version_given = true; + + errno = 0; + parsed = strtoul(strVal(defel->arg), &endptr, 10); + if (errno != 0 || *endptr != '\0') + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid proto_version"))); + + if (parsed > PG_UINT32_MAX) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("proto_version \"%s\" out of range", + strVal(defel->arg)))); + + data->protocol_version = (uint32) parsed; + } + else if (strcmp(defel->defname, "publication_names") == 0) + { + if (publication_names_given) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + publication_names_given = true; + + if (!SplitIdentifierString(strVal(defel->arg), ',', + &data->publication_names)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_NAME), + errmsg("invalid publication_names syntax"))); + } + else if (strcmp(defel->defname, "binary") == 0) + { + if (binary_option_given) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + binary_option_given = true; + + data->binary = defGetBoolean(defel); + } + else if (strcmp(defel->defname, "messages") == 0) + { + if (messages_option_given) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + messages_option_given = true; + + data->messages = defGetBoolean(defel); + } + else if (strcmp(defel->defname, "streaming") == 0) + { + if (streaming_given) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + streaming_given = true; + + data->streaming = defGetStreamingMode(defel); + } + else if (strcmp(defel->defname, "two_phase") == 0) + { + if (two_phase_option_given) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + two_phase_option_given = true; + + data->two_phase = defGetBoolean(defel); + } + else if (strcmp(defel->defname, "origin") == 0) + { + if (origin_option_given) + ereport(ERROR, + errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options")); + origin_option_given = true; + + data->origin = defGetString(defel); + + if (pg_strcasecmp(data->origin, LOGICALREP_ORIGIN_NONE) != 0 && + pg_strcasecmp(data->origin, LOGICALREP_ORIGIN_ANY) != 0) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized origin value: \"%s\"", data->origin)); + } + else + elog(ERROR, "unrecognized pgoutput option: %s", defel->defname); + } +} + +/* + * Initialize this plugin + */ +static void +pgoutput_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt, + bool is_init) +{ + PGOutputData *data = palloc0(sizeof(PGOutputData)); + static bool publication_callback_registered = false; + + /* Create our memory context for private allocations. */ + data->context = AllocSetContextCreate(ctx->context, + "logical replication output context", + ALLOCSET_DEFAULT_SIZES); + + data->cachectx = AllocSetContextCreate(ctx->context, + "logical replication cache context", + ALLOCSET_DEFAULT_SIZES); + + ctx->output_plugin_private = data; + + /* This plugin uses binary protocol. */ + opt->output_type = OUTPUT_PLUGIN_BINARY_OUTPUT; + + /* + * This is replication start and not slot initialization. + * + * Parse and validate options passed by the client. + */ + if (!is_init) + { + /* Parse the params and ERROR if we see any we don't recognize */ + parse_output_parameters(ctx->output_plugin_options, data); + + /* Check if we support requested protocol */ + if (data->protocol_version > LOGICALREP_PROTO_MAX_VERSION_NUM) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("client sent proto_version=%d but server only supports protocol %d or lower", + data->protocol_version, LOGICALREP_PROTO_MAX_VERSION_NUM))); + + if (data->protocol_version < LOGICALREP_PROTO_MIN_VERSION_NUM) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("client sent proto_version=%d but server only supports protocol %d or higher", + data->protocol_version, LOGICALREP_PROTO_MIN_VERSION_NUM))); + + if (data->publication_names == NIL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("publication_names parameter missing"))); + + /* + * Decide whether to enable streaming. It is disabled by default, in + * which case we just update the flag in decoding context. Otherwise + * we only allow it with sufficient version of the protocol, and when + * the output plugin supports it. + */ + if (data->streaming == LOGICALREP_STREAM_OFF) + ctx->streaming = false; + else if (data->streaming == LOGICALREP_STREAM_ON && + data->protocol_version < LOGICALREP_PROTO_STREAM_VERSION_NUM) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("requested proto_version=%d does not support streaming, need %d or higher", + data->protocol_version, LOGICALREP_PROTO_STREAM_VERSION_NUM))); + else if (data->streaming == LOGICALREP_STREAM_PARALLEL && + data->protocol_version < LOGICALREP_PROTO_STREAM_PARALLEL_VERSION_NUM) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("requested proto_version=%d does not support parallel streaming, need %d or higher", + data->protocol_version, LOGICALREP_PROTO_STREAM_PARALLEL_VERSION_NUM))); + else if (!ctx->streaming) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("streaming requested, but not supported by output plugin"))); + + /* Also remember we're currently not streaming any transaction. */ + in_streaming = false; + + /* + * Here, we just check whether the two-phase option is passed by + * plugin and decide whether to enable it at later point of time. It + * remains enabled if the previous start-up has done so. But we only + * allow the option to be passed in with sufficient version of the + * protocol, and when the output plugin supports it. + */ + if (!data->two_phase) + ctx->twophase_opt_given = false; + else if (data->protocol_version < LOGICALREP_PROTO_TWOPHASE_VERSION_NUM) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("requested proto_version=%d does not support two-phase commit, need %d or higher", + data->protocol_version, LOGICALREP_PROTO_TWOPHASE_VERSION_NUM))); + else if (!ctx->twophase) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("two-phase commit requested, but not supported by output plugin"))); + else + ctx->twophase_opt_given = true; + + /* Init publication state. */ + data->publications = NIL; + publications_valid = false; + + /* + * Register callback for pg_publication if we didn't already do that + * during some previous call in this process. + */ + if (!publication_callback_registered) + { + CacheRegisterSyscacheCallback(PUBLICATIONOID, + publication_invalidation_cb, + (Datum) 0); + publication_callback_registered = true; + } + + /* Initialize relation schema cache. */ + init_rel_sync_cache(CacheMemoryContext); + } + else + { + /* + * Disable the streaming and prepared transactions during the slot + * initialization mode. + */ + ctx->streaming = false; + ctx->twophase = false; + } +} + +/* + * BEGIN callback. + * + * Don't send the BEGIN message here instead postpone it until the first + * change. In logical replication, a common scenario is to replicate a set of + * tables (instead of all tables) and transactions whose changes were on + * the table(s) that are not published will produce empty transactions. These + * empty transactions will send BEGIN and COMMIT messages to subscribers, + * using bandwidth on something with little/no use for logical replication. + */ +static void +pgoutput_begin_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn) +{ + PGOutputTxnData *txndata = MemoryContextAllocZero(ctx->context, + sizeof(PGOutputTxnData)); + + txn->output_plugin_private = txndata; +} + +/* + * Send BEGIN. + * + * This is called while processing the first change of the transaction. + */ +static void +pgoutput_send_begin(LogicalDecodingContext *ctx, ReorderBufferTXN *txn) +{ + bool send_replication_origin = txn->origin_id != InvalidRepOriginId; + PGOutputTxnData *txndata = (PGOutputTxnData *) txn->output_plugin_private; + + Assert(txndata); + Assert(!txndata->sent_begin_txn); + + OutputPluginPrepareWrite(ctx, !send_replication_origin); + logicalrep_write_begin(ctx->out, txn); + txndata->sent_begin_txn = true; + + send_repl_origin(ctx, txn->origin_id, txn->origin_lsn, + send_replication_origin); + + OutputPluginWrite(ctx, true); +} + +/* + * COMMIT callback + */ +static void +pgoutput_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + PGOutputTxnData *txndata = (PGOutputTxnData *) txn->output_plugin_private; + bool sent_begin_txn; + + Assert(txndata); + + /* + * We don't need to send the commit message unless some relevant change + * from this transaction has been sent to the downstream. + */ + sent_begin_txn = txndata->sent_begin_txn; + OutputPluginUpdateProgress(ctx, !sent_begin_txn); + pfree(txndata); + txn->output_plugin_private = NULL; + + if (!sent_begin_txn) + { + elog(DEBUG1, "skipped replication of an empty transaction with XID: %u", txn->xid); + return; + } + + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_commit(ctx->out, txn, commit_lsn); + OutputPluginWrite(ctx, true); +} + +/* + * BEGIN PREPARE callback + */ +static void +pgoutput_begin_prepare_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn) +{ + bool send_replication_origin = txn->origin_id != InvalidRepOriginId; + + OutputPluginPrepareWrite(ctx, !send_replication_origin); + logicalrep_write_begin_prepare(ctx->out, txn); + + send_repl_origin(ctx, txn->origin_id, txn->origin_lsn, + send_replication_origin); + + OutputPluginWrite(ctx, true); +} + +/* + * PREPARE callback + */ +static void +pgoutput_prepare_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, + XLogRecPtr prepare_lsn) +{ + OutputPluginUpdateProgress(ctx, false); + + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_prepare(ctx->out, txn, prepare_lsn); + OutputPluginWrite(ctx, true); +} + +/* + * COMMIT PREPARED callback + */ +static void +pgoutput_commit_prepared_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + OutputPluginUpdateProgress(ctx, false); + + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_commit_prepared(ctx->out, txn, commit_lsn); + OutputPluginWrite(ctx, true); +} + +/* + * ROLLBACK PREPARED callback + */ +static void +pgoutput_rollback_prepared_txn(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + XLogRecPtr prepare_end_lsn, + TimestampTz prepare_time) +{ + OutputPluginUpdateProgress(ctx, false); + + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_rollback_prepared(ctx->out, txn, prepare_end_lsn, + prepare_time); + OutputPluginWrite(ctx, true); +} + +/* + * Write the current schema of the relation and its ancestor (if any) if not + * done yet. + */ +static void +maybe_send_schema(LogicalDecodingContext *ctx, + ReorderBufferChange *change, + Relation relation, RelationSyncEntry *relentry) +{ + bool schema_sent; + TransactionId xid = InvalidTransactionId; + TransactionId topxid = InvalidTransactionId; + + /* + * Remember XID of the (sub)transaction for the change. We don't care if + * it's top-level transaction or not (we have already sent that XID in + * start of the current streaming block). + * + * If we're not in a streaming block, just use InvalidTransactionId and + * the write methods will not include it. + */ + if (in_streaming) + xid = change->txn->xid; + + if (rbtxn_is_subtxn(change->txn)) + topxid = rbtxn_get_toptxn(change->txn)->xid; + else + topxid = xid; + + /* + * Do we need to send the schema? We do track streamed transactions + * separately, because those may be applied later (and the regular + * transactions won't see their effects until then) and in an order that + * we don't know at this point. + * + * XXX There is a scope of optimization here. Currently, we always send + * the schema first time in a streaming transaction but we can probably + * avoid that by checking 'relentry->schema_sent' flag. However, before + * doing that we need to study its impact on the case where we have a mix + * of streaming and non-streaming transactions. + */ + if (in_streaming) + schema_sent = get_schema_sent_in_streamed_txn(relentry, topxid); + else + schema_sent = relentry->schema_sent; + + /* Nothing to do if we already sent the schema. */ + if (schema_sent) + return; + + /* + * Send the schema. If the changes will be published using an ancestor's + * schema, not the relation's own, send that ancestor's schema before + * sending relation's own (XXX - maybe sending only the former suffices?). + */ + if (relentry->publish_as_relid != RelationGetRelid(relation)) + { + Relation ancestor = RelationIdGetRelation(relentry->publish_as_relid); + + send_relation_and_attrs(ancestor, xid, ctx, relentry->columns); + RelationClose(ancestor); + } + + send_relation_and_attrs(relation, xid, ctx, relentry->columns); + + if (in_streaming) + set_schema_sent_in_streamed_txn(relentry, topxid); + else + relentry->schema_sent = true; +} + +/* + * Sends a relation + */ +static void +send_relation_and_attrs(Relation relation, TransactionId xid, + LogicalDecodingContext *ctx, + Bitmapset *columns) +{ + TupleDesc desc = RelationGetDescr(relation); + int i; + + /* + * Write out type info if needed. We do that only for user-created types. + * We use FirstGenbkiObjectId as the cutoff, so that we only consider + * objects with hand-assigned OIDs to be "built in", not for instance any + * function or type defined in the information_schema. This is important + * because only hand-assigned OIDs can be expected to remain stable across + * major versions. + */ + for (i = 0; i < desc->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(desc, i); + + if (att->attisdropped || att->attgenerated) + continue; + + if (att->atttypid < FirstGenbkiObjectId) + continue; + + /* Skip this attribute if it's not present in the column list */ + if (columns != NULL && !bms_is_member(att->attnum, columns)) + continue; + + OutputPluginPrepareWrite(ctx, false); + logicalrep_write_typ(ctx->out, xid, att->atttypid); + OutputPluginWrite(ctx, false); + } + + OutputPluginPrepareWrite(ctx, false); + logicalrep_write_rel(ctx->out, xid, relation, columns); + OutputPluginWrite(ctx, false); +} + +/* + * Executor state preparation for evaluation of row filter expressions for the + * specified relation. + */ +static EState * +create_estate_for_relation(Relation rel) +{ + EState *estate; + RangeTblEntry *rte; + List *perminfos = NIL; + + estate = CreateExecutorState(); + + rte = makeNode(RangeTblEntry); + rte->rtekind = RTE_RELATION; + rte->relid = RelationGetRelid(rel); + rte->relkind = rel->rd_rel->relkind; + rte->rellockmode = AccessShareLock; + + addRTEPermissionInfo(&perminfos, rte); + + ExecInitRangeTable(estate, list_make1(rte), perminfos); + + estate->es_output_cid = GetCurrentCommandId(false); + + return estate; +} + +/* + * Evaluates row filter. + * + * If the row filter evaluates to NULL, it is taken as false i.e. the change + * isn't replicated. + */ +static bool +pgoutput_row_filter_exec_expr(ExprState *state, ExprContext *econtext) +{ + Datum ret; + bool isnull; + + Assert(state != NULL); + + ret = ExecEvalExprSwitchContext(state, econtext, &isnull); + + elog(DEBUG3, "row filter evaluates to %s (isnull: %s)", + isnull ? "false" : DatumGetBool(ret) ? "true" : "false", + isnull ? "true" : "false"); + + if (isnull) + return false; + + return DatumGetBool(ret); +} + +/* + * Make sure the per-entry memory context exists. + */ +static void +pgoutput_ensure_entry_cxt(PGOutputData *data, RelationSyncEntry *entry) +{ + Relation relation; + + /* The context may already exist, in which case bail out. */ + if (entry->entry_cxt) + return; + + relation = RelationIdGetRelation(entry->publish_as_relid); + + entry->entry_cxt = AllocSetContextCreate(data->cachectx, + "entry private context", + ALLOCSET_SMALL_SIZES); + + MemoryContextCopyAndSetIdentifier(entry->entry_cxt, + RelationGetRelationName(relation)); +} + +/* + * Initialize the row filter. + */ +static void +pgoutput_row_filter_init(PGOutputData *data, List *publications, + RelationSyncEntry *entry) +{ + ListCell *lc; + List *rfnodes[] = {NIL, NIL, NIL}; /* One per pubaction */ + bool no_filter[] = {false, false, false}; /* One per pubaction */ + MemoryContext oldctx; + int idx; + bool has_filter = true; + Oid schemaid = get_rel_namespace(entry->publish_as_relid); + + /* + * Find if there are any row filters for this relation. If there are, then + * prepare the necessary ExprState and cache it in entry->exprstate. To + * build an expression state, we need to ensure the following: + * + * All the given publication-table mappings must be checked. + * + * Multiple publications might have multiple row filters for this + * relation. Since row filter usage depends on the DML operation, there + * are multiple lists (one for each operation) to which row filters will + * be appended. + * + * FOR ALL TABLES and FOR TABLES IN SCHEMA implies "don't use row filter + * expression" so it takes precedence. + */ + foreach(lc, publications) + { + Publication *pub = lfirst(lc); + HeapTuple rftuple = NULL; + Datum rfdatum = 0; + bool pub_no_filter = true; + + /* + * If the publication is FOR ALL TABLES, or the publication includes a + * FOR TABLES IN SCHEMA where the table belongs to the referred + * schema, then it is treated the same as if there are no row filters + * (even if other publications have a row filter). + */ + if (!pub->alltables && + !SearchSysCacheExists2(PUBLICATIONNAMESPACEMAP, + ObjectIdGetDatum(schemaid), + ObjectIdGetDatum(pub->oid))) + { + /* + * Check for the presence of a row filter in this publication. + */ + rftuple = SearchSysCache2(PUBLICATIONRELMAP, + ObjectIdGetDatum(entry->publish_as_relid), + ObjectIdGetDatum(pub->oid)); + + if (HeapTupleIsValid(rftuple)) + { + /* Null indicates no filter. */ + rfdatum = SysCacheGetAttr(PUBLICATIONRELMAP, rftuple, + Anum_pg_publication_rel_prqual, + &pub_no_filter); + } + } + + if (pub_no_filter) + { + if (rftuple) + ReleaseSysCache(rftuple); + + no_filter[PUBACTION_INSERT] |= pub->pubactions.pubinsert; + no_filter[PUBACTION_UPDATE] |= pub->pubactions.pubupdate; + no_filter[PUBACTION_DELETE] |= pub->pubactions.pubdelete; + + /* + * Quick exit if all the DML actions are publicized via this + * publication. + */ + if (no_filter[PUBACTION_INSERT] && + no_filter[PUBACTION_UPDATE] && + no_filter[PUBACTION_DELETE]) + { + has_filter = false; + break; + } + + /* No additional work for this publication. Next one. */ + continue; + } + + /* Form the per pubaction row filter lists. */ + if (pub->pubactions.pubinsert && !no_filter[PUBACTION_INSERT]) + rfnodes[PUBACTION_INSERT] = lappend(rfnodes[PUBACTION_INSERT], + TextDatumGetCString(rfdatum)); + if (pub->pubactions.pubupdate && !no_filter[PUBACTION_UPDATE]) + rfnodes[PUBACTION_UPDATE] = lappend(rfnodes[PUBACTION_UPDATE], + TextDatumGetCString(rfdatum)); + if (pub->pubactions.pubdelete && !no_filter[PUBACTION_DELETE]) + rfnodes[PUBACTION_DELETE] = lappend(rfnodes[PUBACTION_DELETE], + TextDatumGetCString(rfdatum)); + + ReleaseSysCache(rftuple); + } /* loop all subscribed publications */ + + /* Clean the row filter */ + for (idx = 0; idx < NUM_ROWFILTER_PUBACTIONS; idx++) + { + if (no_filter[idx]) + { + list_free_deep(rfnodes[idx]); + rfnodes[idx] = NIL; + } + } + + if (has_filter) + { + Relation relation = RelationIdGetRelation(entry->publish_as_relid); + + pgoutput_ensure_entry_cxt(data, entry); + + /* + * Now all the filters for all pubactions are known. Combine them when + * their pubactions are the same. + */ + oldctx = MemoryContextSwitchTo(entry->entry_cxt); + entry->estate = create_estate_for_relation(relation); + for (idx = 0; idx < NUM_ROWFILTER_PUBACTIONS; idx++) + { + List *filters = NIL; + Expr *rfnode; + + if (rfnodes[idx] == NIL) + continue; + + foreach(lc, rfnodes[idx]) + filters = lappend(filters, stringToNode((char *) lfirst(lc))); + + /* combine the row filter and cache the ExprState */ + rfnode = make_orclause(filters); + entry->exprstate[idx] = ExecPrepareExpr(rfnode, entry->estate); + } /* for each pubaction */ + MemoryContextSwitchTo(oldctx); + + RelationClose(relation); + } +} + +/* + * Initialize the column list. + */ +static void +pgoutput_column_list_init(PGOutputData *data, List *publications, + RelationSyncEntry *entry) +{ + ListCell *lc; + bool first = true; + Relation relation = RelationIdGetRelation(entry->publish_as_relid); + + /* + * Find if there are any column lists for this relation. If there are, + * build a bitmap using the column lists. + * + * Multiple publications might have multiple column lists for this + * relation. + * + * Note that we don't support the case where the column list is different + * for the same table when combining publications. See comments atop + * fetch_table_list. But one can later change the publication so we still + * need to check all the given publication-table mappings and report an + * error if any publications have a different column list. + * + * FOR ALL TABLES and FOR TABLES IN SCHEMA imply "don't use column list". + */ + foreach(lc, publications) + { + Publication *pub = lfirst(lc); + HeapTuple cftuple = NULL; + Datum cfdatum = 0; + Bitmapset *cols = NULL; + + /* + * If the publication is FOR ALL TABLES then it is treated the same as + * if there are no column lists (even if other publications have a + * list). + */ + if (!pub->alltables) + { + bool pub_no_list = true; + + /* + * Check for the presence of a column list in this publication. + * + * Note: If we find no pg_publication_rel row, it's a publication + * defined for a whole schema, so it can't have a column list, + * just like a FOR ALL TABLES publication. + */ + cftuple = SearchSysCache2(PUBLICATIONRELMAP, + ObjectIdGetDatum(entry->publish_as_relid), + ObjectIdGetDatum(pub->oid)); + + if (HeapTupleIsValid(cftuple)) + { + /* Lookup the column list attribute. */ + cfdatum = SysCacheGetAttr(PUBLICATIONRELMAP, cftuple, + Anum_pg_publication_rel_prattrs, + &pub_no_list); + + /* Build the column list bitmap in the per-entry context. */ + if (!pub_no_list) /* when not null */ + { + int i; + int nliveatts = 0; + TupleDesc desc = RelationGetDescr(relation); + + pgoutput_ensure_entry_cxt(data, entry); + + cols = pub_collist_to_bitmapset(cols, cfdatum, + entry->entry_cxt); + + /* Get the number of live attributes. */ + for (i = 0; i < desc->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(desc, i); + + if (att->attisdropped || att->attgenerated) + continue; + + nliveatts++; + } + + /* + * If column list includes all the columns of the table, + * set it to NULL. + */ + if (bms_num_members(cols) == nliveatts) + { + bms_free(cols); + cols = NULL; + } + } + + ReleaseSysCache(cftuple); + } + } + + if (first) + { + entry->columns = cols; + first = false; + } + else if (!bms_equal(entry->columns, cols)) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot use different column lists for table \"%s.%s\" in different publications", + get_namespace_name(RelationGetNamespace(relation)), + RelationGetRelationName(relation))); + } /* loop all subscribed publications */ + + RelationClose(relation); +} + +/* + * Initialize the slot for storing new and old tuples, and build the map that + * will be used to convert the relation's tuples into the ancestor's format. + */ +static void +init_tuple_slot(PGOutputData *data, Relation relation, + RelationSyncEntry *entry) +{ + MemoryContext oldctx; + TupleDesc oldtupdesc; + TupleDesc newtupdesc; + + oldctx = MemoryContextSwitchTo(data->cachectx); + + /* + * Create tuple table slots. Create a copy of the TupleDesc as it needs to + * live as long as the cache remains. + */ + oldtupdesc = CreateTupleDescCopyConstr(RelationGetDescr(relation)); + newtupdesc = CreateTupleDescCopyConstr(RelationGetDescr(relation)); + + entry->old_slot = MakeSingleTupleTableSlot(oldtupdesc, &TTSOpsHeapTuple); + entry->new_slot = MakeSingleTupleTableSlot(newtupdesc, &TTSOpsHeapTuple); + + MemoryContextSwitchTo(oldctx); + + /* + * Cache the map that will be used to convert the relation's tuples into + * the ancestor's format, if needed. + */ + if (entry->publish_as_relid != RelationGetRelid(relation)) + { + Relation ancestor = RelationIdGetRelation(entry->publish_as_relid); + TupleDesc indesc = RelationGetDescr(relation); + TupleDesc outdesc = RelationGetDescr(ancestor); + + /* Map must live as long as the session does. */ + oldctx = MemoryContextSwitchTo(CacheMemoryContext); + + entry->attrmap = build_attrmap_by_name_if_req(indesc, outdesc, false); + + MemoryContextSwitchTo(oldctx); + RelationClose(ancestor); + } +} + +/* + * Change is checked against the row filter if any. + * + * Returns true if the change is to be replicated, else false. + * + * For inserts, evaluate the row filter for new tuple. + * For deletes, evaluate the row filter for old tuple. + * For updates, evaluate the row filter for old and new tuple. + * + * For updates, if both evaluations are true, we allow sending the UPDATE and + * if both the evaluations are false, it doesn't replicate the UPDATE. Now, if + * only one of the tuples matches the row filter expression, we transform + * UPDATE to DELETE or INSERT to avoid any data inconsistency based on the + * following rules: + * + * Case 1: old-row (no match) new-row (no match) -> (drop change) + * Case 2: old-row (no match) new row (match) -> INSERT + * Case 3: old-row (match) new-row (no match) -> DELETE + * Case 4: old-row (match) new row (match) -> UPDATE + * + * The new action is updated in the action parameter. + * + * The new slot could be updated when transforming the UPDATE into INSERT, + * because the original new tuple might not have column values from the replica + * identity. + * + * Examples: + * Let's say the old tuple satisfies the row filter but the new tuple doesn't. + * Since the old tuple satisfies, the initial table synchronization copied this + * row (or another method was used to guarantee that there is data + * consistency). However, after the UPDATE the new tuple doesn't satisfy the + * row filter, so from a data consistency perspective, that row should be + * removed on the subscriber. The UPDATE should be transformed into a DELETE + * statement and be sent to the subscriber. Keeping this row on the subscriber + * is undesirable because it doesn't reflect what was defined in the row filter + * expression on the publisher. This row on the subscriber would likely not be + * modified by replication again. If someone inserted a new row with the same + * old identifier, replication could stop due to a constraint violation. + * + * Let's say the old tuple doesn't match the row filter but the new tuple does. + * Since the old tuple doesn't satisfy, the initial table synchronization + * probably didn't copy this row. However, after the UPDATE the new tuple does + * satisfy the row filter, so from a data consistency perspective, that row + * should be inserted on the subscriber. Otherwise, subsequent UPDATE or DELETE + * statements have no effect (it matches no row -- see + * apply_handle_update_internal()). So, the UPDATE should be transformed into a + * INSERT statement and be sent to the subscriber. However, this might surprise + * someone who expects the data set to satisfy the row filter expression on the + * provider. + */ +static bool +pgoutput_row_filter(Relation relation, TupleTableSlot *old_slot, + TupleTableSlot **new_slot_ptr, RelationSyncEntry *entry, + ReorderBufferChangeType *action) +{ + TupleDesc desc; + int i; + bool old_matched, + new_matched, + result; + TupleTableSlot *tmp_new_slot; + TupleTableSlot *new_slot = *new_slot_ptr; + ExprContext *ecxt; + ExprState *filter_exprstate; + + /* + * We need this map to avoid relying on ReorderBufferChangeType enums + * having specific values. + */ + static const int map_changetype_pubaction[] = { + [REORDER_BUFFER_CHANGE_INSERT] = PUBACTION_INSERT, + [REORDER_BUFFER_CHANGE_UPDATE] = PUBACTION_UPDATE, + [REORDER_BUFFER_CHANGE_DELETE] = PUBACTION_DELETE + }; + + Assert(*action == REORDER_BUFFER_CHANGE_INSERT || + *action == REORDER_BUFFER_CHANGE_UPDATE || + *action == REORDER_BUFFER_CHANGE_DELETE); + + Assert(new_slot || old_slot); + + /* Get the corresponding row filter */ + filter_exprstate = entry->exprstate[map_changetype_pubaction[*action]]; + + /* Bail out if there is no row filter */ + if (!filter_exprstate) + return true; + + elog(DEBUG3, "table \"%s.%s\" has row filter", + get_namespace_name(RelationGetNamespace(relation)), + RelationGetRelationName(relation)); + + ResetPerTupleExprContext(entry->estate); + + ecxt = GetPerTupleExprContext(entry->estate); + + /* + * For the following occasions where there is only one tuple, we can + * evaluate the row filter for that tuple and return. + * + * For inserts, we only have the new tuple. + * + * For updates, we can have only a new tuple when none of the replica + * identity columns changed and none of those columns have external data + * but we still need to evaluate the row filter for the new tuple as the + * existing values of those columns might not match the filter. Also, + * users can use constant expressions in the row filter, so we anyway need + * to evaluate it for the new tuple. + * + * For deletes, we only have the old tuple. + */ + if (!new_slot || !old_slot) + { + ecxt->ecxt_scantuple = new_slot ? new_slot : old_slot; + result = pgoutput_row_filter_exec_expr(filter_exprstate, ecxt); + + return result; + } + + /* + * Both the old and new tuples must be valid only for updates and need to + * be checked against the row filter. + */ + Assert(map_changetype_pubaction[*action] == PUBACTION_UPDATE); + + slot_getallattrs(new_slot); + slot_getallattrs(old_slot); + + tmp_new_slot = NULL; + desc = RelationGetDescr(relation); + + /* + * The new tuple might not have all the replica identity columns, in which + * case it needs to be copied over from the old tuple. + */ + for (i = 0; i < desc->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(desc, i); + + /* + * if the column in the new tuple or old tuple is null, nothing to do + */ + if (new_slot->tts_isnull[i] || old_slot->tts_isnull[i]) + continue; + + /* + * Unchanged toasted replica identity columns are only logged in the + * old tuple. Copy this over to the new tuple. The changed (or WAL + * Logged) toast values are always assembled in memory and set as + * VARTAG_INDIRECT. See ReorderBufferToastReplace. + */ + if (att->attlen == -1 && + VARATT_IS_EXTERNAL_ONDISK(new_slot->tts_values[i]) && + !VARATT_IS_EXTERNAL_ONDISK(old_slot->tts_values[i])) + { + if (!tmp_new_slot) + { + tmp_new_slot = MakeSingleTupleTableSlot(desc, &TTSOpsVirtual); + ExecClearTuple(tmp_new_slot); + + memcpy(tmp_new_slot->tts_values, new_slot->tts_values, + desc->natts * sizeof(Datum)); + memcpy(tmp_new_slot->tts_isnull, new_slot->tts_isnull, + desc->natts * sizeof(bool)); + } + + tmp_new_slot->tts_values[i] = old_slot->tts_values[i]; + tmp_new_slot->tts_isnull[i] = old_slot->tts_isnull[i]; + } + } + + ecxt->ecxt_scantuple = old_slot; + old_matched = pgoutput_row_filter_exec_expr(filter_exprstate, ecxt); + + if (tmp_new_slot) + { + ExecStoreVirtualTuple(tmp_new_slot); + ecxt->ecxt_scantuple = tmp_new_slot; + } + else + ecxt->ecxt_scantuple = new_slot; + + new_matched = pgoutput_row_filter_exec_expr(filter_exprstate, ecxt); + + /* + * Case 1: if both tuples don't match the row filter, bailout. Send + * nothing. + */ + if (!old_matched && !new_matched) + return false; + + /* + * Case 2: if the old tuple doesn't satisfy the row filter but the new + * tuple does, transform the UPDATE into INSERT. + * + * Use the newly transformed tuple that must contain the column values for + * all the replica identity columns. This is required to ensure that the + * while inserting the tuple in the downstream node, we have all the + * required column values. + */ + if (!old_matched && new_matched) + { + *action = REORDER_BUFFER_CHANGE_INSERT; + + if (tmp_new_slot) + *new_slot_ptr = tmp_new_slot; + } + + /* + * Case 3: if the old tuple satisfies the row filter but the new tuple + * doesn't, transform the UPDATE into DELETE. + * + * This transformation does not require another tuple. The Old tuple will + * be used for DELETE. + */ + else if (old_matched && !new_matched) + *action = REORDER_BUFFER_CHANGE_DELETE; + + /* + * Case 4: if both tuples match the row filter, transformation isn't + * required. (*action is default UPDATE). + */ + + return true; +} + +/* + * Sends the decoded DML over wire. + * + * This is called both in streaming and non-streaming modes. + */ +static void +pgoutput_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change) +{ + PGOutputData *data = (PGOutputData *) ctx->output_plugin_private; + PGOutputTxnData *txndata = (PGOutputTxnData *) txn->output_plugin_private; + MemoryContext old; + RelationSyncEntry *relentry; + TransactionId xid = InvalidTransactionId; + Relation ancestor = NULL; + Relation targetrel = relation; + ReorderBufferChangeType action = change->action; + TupleTableSlot *old_slot = NULL; + TupleTableSlot *new_slot = NULL; + + if (!is_publishable_relation(relation)) + return; + + /* + * Remember the xid for the change in streaming mode. We need to send xid + * with each change in the streaming mode so that subscriber can make + * their association and on aborts, it can discard the corresponding + * changes. + */ + if (in_streaming) + xid = change->txn->xid; + + relentry = get_rel_sync_entry(data, relation); + + /* First check the table filter */ + switch (action) + { + case REORDER_BUFFER_CHANGE_INSERT: + if (!relentry->pubactions.pubinsert) + return; + break; + case REORDER_BUFFER_CHANGE_UPDATE: + if (!relentry->pubactions.pubupdate) + return; + break; + case REORDER_BUFFER_CHANGE_DELETE: + if (!relentry->pubactions.pubdelete) + return; + + /* + * This is only possible if deletes are allowed even when replica + * identity is not defined for a table. Since the DELETE action + * can't be published, we simply return. + */ + if (!change->data.tp.oldtuple) + { + elog(DEBUG1, "didn't send DELETE change because of missing oldtuple"); + return; + } + break; + default: + Assert(false); + } + + /* Avoid leaking memory by using and resetting our own context */ + old = MemoryContextSwitchTo(data->context); + + /* Switch relation if publishing via root. */ + if (relentry->publish_as_relid != RelationGetRelid(relation)) + { + Assert(relation->rd_rel->relispartition); + ancestor = RelationIdGetRelation(relentry->publish_as_relid); + targetrel = ancestor; + } + + if (change->data.tp.oldtuple) + { + old_slot = relentry->old_slot; + ExecStoreHeapTuple(&change->data.tp.oldtuple->tuple, old_slot, false); + + /* Convert tuple if needed. */ + if (relentry->attrmap) + { + TupleTableSlot *slot = MakeTupleTableSlot(RelationGetDescr(targetrel), + &TTSOpsVirtual); + + old_slot = execute_attr_map_slot(relentry->attrmap, old_slot, slot); + } + } + + if (change->data.tp.newtuple) + { + new_slot = relentry->new_slot; + ExecStoreHeapTuple(&change->data.tp.newtuple->tuple, new_slot, false); + + /* Convert tuple if needed. */ + if (relentry->attrmap) + { + TupleTableSlot *slot = MakeTupleTableSlot(RelationGetDescr(targetrel), + &TTSOpsVirtual); + + new_slot = execute_attr_map_slot(relentry->attrmap, new_slot, slot); + } + } + + /* + * Check row filter. + * + * Updates could be transformed to inserts or deletes based on the results + * of the row filter for old and new tuple. + */ + if (!pgoutput_row_filter(targetrel, old_slot, &new_slot, relentry, &action)) + goto cleanup; + + /* + * Send BEGIN if we haven't yet. + * + * We send the BEGIN message after ensuring that we will actually send the + * change. This avoids sending a pair of BEGIN/COMMIT messages for empty + * transactions. + */ + if (txndata && !txndata->sent_begin_txn) + pgoutput_send_begin(ctx, txn); + + /* + * Schema should be sent using the original relation because it also sends + * the ancestor's relation. + */ + maybe_send_schema(ctx, change, relation, relentry); + + OutputPluginPrepareWrite(ctx, true); + + /* Send the data */ + switch (action) + { + case REORDER_BUFFER_CHANGE_INSERT: + logicalrep_write_insert(ctx->out, xid, targetrel, new_slot, + data->binary, relentry->columns); + break; + case REORDER_BUFFER_CHANGE_UPDATE: + logicalrep_write_update(ctx->out, xid, targetrel, old_slot, + new_slot, data->binary, relentry->columns); + break; + case REORDER_BUFFER_CHANGE_DELETE: + logicalrep_write_delete(ctx->out, xid, targetrel, old_slot, + data->binary, relentry->columns); + break; + default: + Assert(false); + } + + OutputPluginWrite(ctx, true); + +cleanup: + if (RelationIsValid(ancestor)) + { + RelationClose(ancestor); + ancestor = NULL; + } + + MemoryContextSwitchTo(old); + MemoryContextReset(data->context); +} + +static void +pgoutput_truncate(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, + int nrelations, Relation relations[], ReorderBufferChange *change) +{ + PGOutputData *data = (PGOutputData *) ctx->output_plugin_private; + PGOutputTxnData *txndata = (PGOutputTxnData *) txn->output_plugin_private; + MemoryContext old; + RelationSyncEntry *relentry; + int i; + int nrelids; + Oid *relids; + TransactionId xid = InvalidTransactionId; + + /* Remember the xid for the change in streaming mode. See pgoutput_change. */ + if (in_streaming) + xid = change->txn->xid; + + old = MemoryContextSwitchTo(data->context); + + relids = palloc0(nrelations * sizeof(Oid)); + nrelids = 0; + + for (i = 0; i < nrelations; i++) + { + Relation relation = relations[i]; + Oid relid = RelationGetRelid(relation); + + if (!is_publishable_relation(relation)) + continue; + + relentry = get_rel_sync_entry(data, relation); + + if (!relentry->pubactions.pubtruncate) + continue; + + /* + * Don't send partitions if the publication wants to send only the + * root tables through it. + */ + if (relation->rd_rel->relispartition && + relentry->publish_as_relid != relid) + continue; + + relids[nrelids++] = relid; + + /* Send BEGIN if we haven't yet */ + if (txndata && !txndata->sent_begin_txn) + pgoutput_send_begin(ctx, txn); + + maybe_send_schema(ctx, change, relation, relentry); + } + + if (nrelids > 0) + { + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_truncate(ctx->out, + xid, + nrelids, + relids, + change->data.truncate.cascade, + change->data.truncate.restart_seqs); + OutputPluginWrite(ctx, true); + } + + MemoryContextSwitchTo(old); + MemoryContextReset(data->context); +} + +static void +pgoutput_message(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, + XLogRecPtr message_lsn, bool transactional, const char *prefix, Size sz, + const char *message) +{ + PGOutputData *data = (PGOutputData *) ctx->output_plugin_private; + TransactionId xid = InvalidTransactionId; + + if (!data->messages) + return; + + /* + * Remember the xid for the message in streaming mode. See + * pgoutput_change. + */ + if (in_streaming) + xid = txn->xid; + + /* + * Output BEGIN if we haven't yet. Avoid for non-transactional messages. + */ + if (transactional) + { + PGOutputTxnData *txndata = (PGOutputTxnData *) txn->output_plugin_private; + + /* Send BEGIN if we haven't yet */ + if (txndata && !txndata->sent_begin_txn) + pgoutput_send_begin(ctx, txn); + } + + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_message(ctx->out, + xid, + message_lsn, + transactional, + prefix, + sz, + message); + OutputPluginWrite(ctx, true); +} + +/* + * Return true if the data is associated with an origin and the user has + * requested the changes that don't have an origin, false otherwise. + */ +static bool +pgoutput_origin_filter(LogicalDecodingContext *ctx, + RepOriginId origin_id) +{ + PGOutputData *data = (PGOutputData *) ctx->output_plugin_private; + + if (data->origin && (pg_strcasecmp(data->origin, LOGICALREP_ORIGIN_NONE) == 0) && + origin_id != InvalidRepOriginId) + return true; + + return false; +} + +/* + * Shutdown the output plugin. + * + * Note, we don't need to clean the data->context and data->cachectx as + * they are child contexts of the ctx->context so they will be cleaned up by + * logical decoding machinery. + */ +static void +pgoutput_shutdown(LogicalDecodingContext *ctx) +{ + if (RelationSyncCache) + { + hash_destroy(RelationSyncCache); + RelationSyncCache = NULL; + } +} + +/* + * Load publications from the list of publication names. + */ +static List * +LoadPublications(List *pubnames) +{ + List *result = NIL; + ListCell *lc; + + foreach(lc, pubnames) + { + char *pubname = (char *) lfirst(lc); + Publication *pub = GetPublicationByName(pubname, false); + + result = lappend(result, pub); + } + + return result; +} + +/* + * Publication syscache invalidation callback. + * + * Called for invalidations on pg_publication. + */ +static void +publication_invalidation_cb(Datum arg, int cacheid, uint32 hashvalue) +{ + publications_valid = false; + + /* + * Also invalidate per-relation cache so that next time the filtering info + * is checked it will be updated with the new publication settings. + */ + rel_sync_cache_publication_cb(arg, cacheid, hashvalue); +} + +/* + * START STREAM callback + */ +static void +pgoutput_stream_start(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn) +{ + bool send_replication_origin = txn->origin_id != InvalidRepOriginId; + + /* we can't nest streaming of transactions */ + Assert(!in_streaming); + + /* + * If we already sent the first stream for this transaction then don't + * send the origin id in the subsequent streams. + */ + if (rbtxn_is_streamed(txn)) + send_replication_origin = false; + + OutputPluginPrepareWrite(ctx, !send_replication_origin); + logicalrep_write_stream_start(ctx->out, txn->xid, !rbtxn_is_streamed(txn)); + + send_repl_origin(ctx, txn->origin_id, InvalidXLogRecPtr, + send_replication_origin); + + OutputPluginWrite(ctx, true); + + /* we're streaming a chunk of transaction now */ + in_streaming = true; +} + +/* + * STOP STREAM callback + */ +static void +pgoutput_stream_stop(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn) +{ + /* we should be streaming a transaction */ + Assert(in_streaming); + + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_stream_stop(ctx->out); + OutputPluginWrite(ctx, true); + + /* we've stopped streaming a transaction */ + in_streaming = false; +} + +/* + * Notify downstream to discard the streamed transaction (along with all + * it's subtransactions, if it's a toplevel transaction). + */ +static void +pgoutput_stream_abort(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + XLogRecPtr abort_lsn) +{ + ReorderBufferTXN *toptxn; + PGOutputData *data = (PGOutputData *) ctx->output_plugin_private; + bool write_abort_info = (data->streaming == LOGICALREP_STREAM_PARALLEL); + + /* + * The abort should happen outside streaming block, even for streamed + * transactions. The transaction has to be marked as streamed, though. + */ + Assert(!in_streaming); + + /* determine the toplevel transaction */ + toptxn = rbtxn_get_toptxn(txn); + + Assert(rbtxn_is_streamed(toptxn)); + + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_stream_abort(ctx->out, toptxn->xid, txn->xid, abort_lsn, + txn->xact_time.abort_time, write_abort_info); + + OutputPluginWrite(ctx, true); + + cleanup_rel_sync_cache(toptxn->xid, false); +} + +/* + * Notify downstream to apply the streamed transaction (along with all + * it's subtransactions). + */ +static void +pgoutput_stream_commit(struct LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + XLogRecPtr commit_lsn) +{ + /* + * The commit should happen outside streaming block, even for streamed + * transactions. The transaction has to be marked as streamed, though. + */ + Assert(!in_streaming); + Assert(rbtxn_is_streamed(txn)); + + OutputPluginUpdateProgress(ctx, false); + + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_stream_commit(ctx->out, txn, commit_lsn); + OutputPluginWrite(ctx, true); + + cleanup_rel_sync_cache(txn->xid, true); +} + +/* + * PREPARE callback (for streaming two-phase commit). + * + * Notify the downstream to prepare the transaction. + */ +static void +pgoutput_stream_prepare_txn(LogicalDecodingContext *ctx, + ReorderBufferTXN *txn, + XLogRecPtr prepare_lsn) +{ + Assert(rbtxn_is_streamed(txn)); + + OutputPluginUpdateProgress(ctx, false); + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_stream_prepare(ctx->out, txn, prepare_lsn); + OutputPluginWrite(ctx, true); +} + +/* + * Initialize the relation schema sync cache for a decoding session. + * + * The hash table is destroyed at the end of a decoding session. While + * relcache invalidations still exist and will still be invoked, they + * will just see the null hash table global and take no action. + */ +static void +init_rel_sync_cache(MemoryContext cachectx) +{ + HASHCTL ctl; + static bool relation_callbacks_registered = false; + + /* Nothing to do if hash table already exists */ + if (RelationSyncCache != NULL) + return; + + /* Make a new hash table for the cache */ + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(RelationSyncEntry); + ctl.hcxt = cachectx; + + RelationSyncCache = hash_create("logical replication output relation cache", + 128, &ctl, + HASH_ELEM | HASH_CONTEXT | HASH_BLOBS); + + Assert(RelationSyncCache != NULL); + + /* No more to do if we already registered callbacks */ + if (relation_callbacks_registered) + return; + + /* We must update the cache entry for a relation after a relcache flush */ + CacheRegisterRelcacheCallback(rel_sync_cache_relation_cb, (Datum) 0); + + /* + * Flush all cache entries after a pg_namespace change, in case it was a + * schema rename affecting a relation being replicated. + */ + CacheRegisterSyscacheCallback(NAMESPACEOID, + rel_sync_cache_publication_cb, + (Datum) 0); + + /* + * Flush all cache entries after any publication changes. (We need no + * callback entry for pg_publication, because publication_invalidation_cb + * will take care of it.) + */ + CacheRegisterSyscacheCallback(PUBLICATIONRELMAP, + rel_sync_cache_publication_cb, + (Datum) 0); + CacheRegisterSyscacheCallback(PUBLICATIONNAMESPACEMAP, + rel_sync_cache_publication_cb, + (Datum) 0); + + relation_callbacks_registered = true; +} + +/* + * We expect relatively small number of streamed transactions. + */ +static bool +get_schema_sent_in_streamed_txn(RelationSyncEntry *entry, TransactionId xid) +{ + return list_member_xid(entry->streamed_txns, xid); +} + +/* + * Add the xid in the rel sync entry for which we have already sent the schema + * of the relation. + */ +static void +set_schema_sent_in_streamed_txn(RelationSyncEntry *entry, TransactionId xid) +{ + MemoryContext oldctx; + + oldctx = MemoryContextSwitchTo(CacheMemoryContext); + + entry->streamed_txns = lappend_xid(entry->streamed_txns, xid); + + MemoryContextSwitchTo(oldctx); +} + +/* + * Find or create entry in the relation schema cache. + * + * This looks up publications that the given relation is directly or + * indirectly part of (the latter if it's really the relation's ancestor that + * is part of a publication) and fills up the found entry with the information + * about which operations to publish and whether to use an ancestor's schema + * when publishing. + */ +static RelationSyncEntry * +get_rel_sync_entry(PGOutputData *data, Relation relation) +{ + RelationSyncEntry *entry; + bool found; + MemoryContext oldctx; + Oid relid = RelationGetRelid(relation); + + Assert(RelationSyncCache != NULL); + + /* Find cached relation info, creating if not found */ + entry = (RelationSyncEntry *) hash_search(RelationSyncCache, + &relid, + HASH_ENTER, &found); + Assert(entry != NULL); + + /* initialize entry, if it's new */ + if (!found) + { + entry->replicate_valid = false; + entry->schema_sent = false; + entry->streamed_txns = NIL; + entry->pubactions.pubinsert = entry->pubactions.pubupdate = + entry->pubactions.pubdelete = entry->pubactions.pubtruncate = false; + entry->new_slot = NULL; + entry->old_slot = NULL; + memset(entry->exprstate, 0, sizeof(entry->exprstate)); + entry->entry_cxt = NULL; + entry->publish_as_relid = InvalidOid; + entry->columns = NULL; + entry->attrmap = NULL; + } + + /* Validate the entry */ + if (!entry->replicate_valid) + { + Oid schemaId = get_rel_namespace(relid); + List *pubids = GetRelationPublications(relid); + + /* + * We don't acquire a lock on the namespace system table as we build + * the cache entry using a historic snapshot and all the later changes + * are absorbed while decoding WAL. + */ + List *schemaPubids = GetSchemaPublications(schemaId); + ListCell *lc; + Oid publish_as_relid = relid; + int publish_ancestor_level = 0; + bool am_partition = get_rel_relispartition(relid); + char relkind = get_rel_relkind(relid); + List *rel_publications = NIL; + + /* Reload publications if needed before use. */ + if (!publications_valid) + { + oldctx = MemoryContextSwitchTo(CacheMemoryContext); + if (data->publications) + { + list_free_deep(data->publications); + data->publications = NIL; + } + data->publications = LoadPublications(data->publication_names); + MemoryContextSwitchTo(oldctx); + publications_valid = true; + } + + /* + * Reset schema_sent status as the relation definition may have + * changed. Also reset pubactions to empty in case rel was dropped + * from a publication. Also free any objects that depended on the + * earlier definition. + */ + entry->schema_sent = false; + list_free(entry->streamed_txns); + entry->streamed_txns = NIL; + bms_free(entry->columns); + entry->columns = NULL; + entry->pubactions.pubinsert = false; + entry->pubactions.pubupdate = false; + entry->pubactions.pubdelete = false; + entry->pubactions.pubtruncate = false; + + /* + * Tuple slots cleanups. (Will be rebuilt later if needed). + */ + if (entry->old_slot) + ExecDropSingleTupleTableSlot(entry->old_slot); + if (entry->new_slot) + ExecDropSingleTupleTableSlot(entry->new_slot); + + entry->old_slot = NULL; + entry->new_slot = NULL; + + if (entry->attrmap) + free_attrmap(entry->attrmap); + entry->attrmap = NULL; + + /* + * Row filter cache cleanups. + */ + if (entry->entry_cxt) + MemoryContextDelete(entry->entry_cxt); + + entry->entry_cxt = NULL; + entry->estate = NULL; + memset(entry->exprstate, 0, sizeof(entry->exprstate)); + + /* + * Build publication cache. We can't use one provided by relcache as + * relcache considers all publications that the given relation is in, + * but here we only need to consider ones that the subscriber + * requested. + */ + foreach(lc, data->publications) + { + Publication *pub = lfirst(lc); + bool publish = false; + + /* + * Under what relid should we publish changes in this publication? + * We'll use the top-most relid across all publications. Also + * track the ancestor level for this publication. + */ + Oid pub_relid = relid; + int ancestor_level = 0; + + /* + * If this is a FOR ALL TABLES publication, pick the partition + * root and set the ancestor level accordingly. + */ + if (pub->alltables) + { + publish = true; + if (pub->pubviaroot && am_partition) + { + List *ancestors = get_partition_ancestors(relid); + + pub_relid = llast_oid(ancestors); + ancestor_level = list_length(ancestors); + } + } + + if (!publish) + { + bool ancestor_published = false; + + /* + * For a partition, check if any of the ancestors are + * published. If so, note down the topmost ancestor that is + * published via this publication, which will be used as the + * relation via which to publish the partition's changes. + */ + if (am_partition) + { + Oid ancestor; + int level; + List *ancestors = get_partition_ancestors(relid); + + ancestor = GetTopMostAncestorInPublication(pub->oid, + ancestors, + &level); + + if (ancestor != InvalidOid) + { + ancestor_published = true; + if (pub->pubviaroot) + { + pub_relid = ancestor; + ancestor_level = level; + } + } + } + + if (list_member_oid(pubids, pub->oid) || + list_member_oid(schemaPubids, pub->oid) || + ancestor_published) + publish = true; + } + + /* + * If the relation is to be published, determine actions to + * publish, and list of columns, if appropriate. + * + * Don't publish changes for partitioned tables, because + * publishing those of its partitions suffices, unless partition + * changes won't be published due to pubviaroot being set. + */ + if (publish && + (relkind != RELKIND_PARTITIONED_TABLE || pub->pubviaroot)) + { + entry->pubactions.pubinsert |= pub->pubactions.pubinsert; + entry->pubactions.pubupdate |= pub->pubactions.pubupdate; + entry->pubactions.pubdelete |= pub->pubactions.pubdelete; + entry->pubactions.pubtruncate |= pub->pubactions.pubtruncate; + + /* + * We want to publish the changes as the top-most ancestor + * across all publications. So we need to check if the already + * calculated level is higher than the new one. If yes, we can + * ignore the new value (as it's a child). Otherwise the new + * value is an ancestor, so we keep it. + */ + if (publish_ancestor_level > ancestor_level) + continue; + + /* + * If we found an ancestor higher up in the tree, discard the + * list of publications through which we replicate it, and use + * the new ancestor. + */ + if (publish_ancestor_level < ancestor_level) + { + publish_as_relid = pub_relid; + publish_ancestor_level = ancestor_level; + + /* reset the publication list for this relation */ + rel_publications = NIL; + } + else + { + /* Same ancestor level, has to be the same OID. */ + Assert(publish_as_relid == pub_relid); + } + + /* Track publications for this ancestor. */ + rel_publications = lappend(rel_publications, pub); + } + } + + entry->publish_as_relid = publish_as_relid; + + /* + * Initialize the tuple slot, map, and row filter. These are only used + * when publishing inserts, updates, or deletes. + */ + if (entry->pubactions.pubinsert || entry->pubactions.pubupdate || + entry->pubactions.pubdelete) + { + /* Initialize the tuple slot and map */ + init_tuple_slot(data, relation, entry); + + /* Initialize the row filter */ + pgoutput_row_filter_init(data, rel_publications, entry); + + /* Initialize the column list */ + pgoutput_column_list_init(data, rel_publications, entry); + } + + list_free(pubids); + list_free(schemaPubids); + list_free(rel_publications); + + entry->replicate_valid = true; + } + + return entry; +} + +/* + * Cleanup list of streamed transactions and update the schema_sent flag. + * + * When a streamed transaction commits or aborts, we need to remove the + * toplevel XID from the schema cache. If the transaction aborted, the + * subscriber will simply throw away the schema records we streamed, so + * we don't need to do anything else. + * + * If the transaction is committed, the subscriber will update the relation + * cache - so tweak the schema_sent flag accordingly. + */ +static void +cleanup_rel_sync_cache(TransactionId xid, bool is_commit) +{ + HASH_SEQ_STATUS hash_seq; + RelationSyncEntry *entry; + ListCell *lc; + + Assert(RelationSyncCache != NULL); + + hash_seq_init(&hash_seq, RelationSyncCache); + while ((entry = hash_seq_search(&hash_seq)) != NULL) + { + /* + * We can set the schema_sent flag for an entry that has committed xid + * in the list as that ensures that the subscriber would have the + * corresponding schema and we don't need to send it unless there is + * any invalidation for that relation. + */ + foreach(lc, entry->streamed_txns) + { + if (xid == lfirst_xid(lc)) + { + if (is_commit) + entry->schema_sent = true; + + entry->streamed_txns = + foreach_delete_current(entry->streamed_txns, lc); + break; + } + } + } +} + +/* + * Relcache invalidation callback + */ +static void +rel_sync_cache_relation_cb(Datum arg, Oid relid) +{ + RelationSyncEntry *entry; + + /* + * We can get here if the plugin was used in SQL interface as the + * RelSchemaSyncCache is destroyed when the decoding finishes, but there + * is no way to unregister the relcache invalidation callback. + */ + if (RelationSyncCache == NULL) + return; + + /* + * Nobody keeps pointers to entries in this hash table around outside + * logical decoding callback calls - but invalidation events can come in + * *during* a callback if we do any syscache access in the callback. + * Because of that we must mark the cache entry as invalid but not damage + * any of its substructure here. The next get_rel_sync_entry() call will + * rebuild it all. + */ + if (OidIsValid(relid)) + { + /* + * Getting invalidations for relations that aren't in the table is + * entirely normal. So we don't care if it's found or not. + */ + entry = (RelationSyncEntry *) hash_search(RelationSyncCache, &relid, + HASH_FIND, NULL); + if (entry != NULL) + entry->replicate_valid = false; + } + else + { + /* Whole cache must be flushed. */ + HASH_SEQ_STATUS status; + + hash_seq_init(&status, RelationSyncCache); + while ((entry = (RelationSyncEntry *) hash_seq_search(&status)) != NULL) + { + entry->replicate_valid = false; + } + } +} + +/* + * Publication relation/schema map syscache invalidation callback + * + * Called for invalidations on pg_publication, pg_publication_rel, + * pg_publication_namespace, and pg_namespace. + */ +static void +rel_sync_cache_publication_cb(Datum arg, int cacheid, uint32 hashvalue) +{ + HASH_SEQ_STATUS status; + RelationSyncEntry *entry; + + /* + * We can get here if the plugin was used in SQL interface as the + * RelSchemaSyncCache is destroyed when the decoding finishes, but there + * is no way to unregister the invalidation callbacks. + */ + if (RelationSyncCache == NULL) + return; + + /* + * We have no easy way to identify which cache entries this invalidation + * event might have affected, so just mark them all invalid. + */ + hash_seq_init(&status, RelationSyncCache); + while ((entry = (RelationSyncEntry *) hash_seq_search(&status)) != NULL) + { + entry->replicate_valid = false; + } +} + +/* Send Replication origin */ +static void +send_repl_origin(LogicalDecodingContext *ctx, RepOriginId origin_id, + XLogRecPtr origin_lsn, bool send_origin) +{ + if (send_origin) + { + char *origin; + + /*---------- + * XXX: which behaviour do we want here? + * + * Alternatives: + * - don't send origin message if origin name not found + * (that's what we do now) + * - throw error - that will break replication, not good + * - send some special "unknown" origin + *---------- + */ + if (replorigin_by_oid(origin_id, true, &origin)) + { + /* Message boundary */ + OutputPluginWrite(ctx, false); + OutputPluginPrepareWrite(ctx, true); + + logicalrep_write_origin(ctx->out, origin, origin_lsn); + } + } +} diff --git a/src/backend/replication/repl_gram.c b/src/backend/replication/repl_gram.c new file mode 100644 index 0000000..53a385f --- /dev/null +++ b/src/backend/replication/repl_gram.c @@ -0,0 +1,1917 @@ +/* A Bison parser, made by GNU Bison 3.7.5. */ + +/* Bison implementation for Yacc-like parsers in C + + Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2021 Free Software Foundation, + Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. + + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + +/* C LALR(1) parser skeleton written by Richard Stallman, by + simplifying the original so-called "semantic" parser. */ + +/* DO NOT RELY ON FEATURES THAT ARE NOT DOCUMENTED in the manual, + especially those whose name start with YY_ or yy_. They are + private implementation details that can be changed or removed. */ + +/* All symbols defined below should begin with yy or YY, to avoid + infringing on user name space. This should be done even for local + variables, as they might otherwise be expanded by user macros. + There are some unavoidable exceptions within include files to + define necessary library symbols; they are noted "INFRINGES ON + USER NAME SPACE" below. */ + +/* Identify Bison output, and Bison version. */ +#define YYBISON 30705 + +/* Bison version string. */ +#define YYBISON_VERSION "3.7.5" + +/* Skeleton name. */ +#define YYSKELETON_NAME "yacc.c" + +/* Pure parsers. */ +#define YYPURE 0 + +/* Push parsers. */ +#define YYPUSH 0 + +/* Pull parsers. */ +#define YYPULL 1 + + +/* Substitute the variable and function names. */ +#define yyparse replication_yyparse +#define yylex replication_yylex +#define yyerror replication_yyerror +#define yydebug replication_yydebug +#define yynerrs replication_yynerrs +#define yylval replication_yylval +#define yychar replication_yychar + +/* First part of user prologue. */ +#line 1 "repl_gram.y" + +/*------------------------------------------------------------------------- + * + * repl_gram.y - Parser for the replication commands + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/replication/repl_gram.y + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/xlogdefs.h" +#include "nodes/makefuncs.h" +#include "nodes/parsenodes.h" +#include "nodes/replnodes.h" +#include "replication/walsender.h" +#include "replication/walsender_private.h" + + +/* Result of the parsing is returned here */ +Node *replication_parse_result; + + +/* + * Bison doesn't allocate anything that needs to live across parser calls, + * so we can easily have it use palloc instead of malloc. This prevents + * memory leaks if we error out during parsing. + */ +#define YYMALLOC palloc +#define YYFREE pfree + + +#line 117 "repl_gram.c" + +# ifndef YY_CAST +# ifdef __cplusplus +# define YY_CAST(Type, Val) static_cast<Type> (Val) +# define YY_REINTERPRET_CAST(Type, Val) reinterpret_cast<Type> (Val) +# else +# define YY_CAST(Type, Val) ((Type) (Val)) +# define YY_REINTERPRET_CAST(Type, Val) ((Type) (Val)) +# endif +# endif +# ifndef YY_NULLPTR +# if defined __cplusplus +# if 201103L <= __cplusplus +# define YY_NULLPTR nullptr +# else +# define YY_NULLPTR 0 +# endif +# else +# define YY_NULLPTR ((void*)0) +# endif +# endif + +#include "repl_gram.h" +/* Symbol kind. */ +enum yysymbol_kind_t +{ + YYSYMBOL_YYEMPTY = -2, + YYSYMBOL_YYEOF = 0, /* "end of file" */ + YYSYMBOL_YYerror = 1, /* error */ + YYSYMBOL_YYUNDEF = 2, /* "invalid token" */ + YYSYMBOL_SCONST = 3, /* SCONST */ + YYSYMBOL_IDENT = 4, /* IDENT */ + YYSYMBOL_UCONST = 5, /* UCONST */ + YYSYMBOL_RECPTR = 6, /* RECPTR */ + YYSYMBOL_K_BASE_BACKUP = 7, /* K_BASE_BACKUP */ + YYSYMBOL_K_IDENTIFY_SYSTEM = 8, /* K_IDENTIFY_SYSTEM */ + YYSYMBOL_K_READ_REPLICATION_SLOT = 9, /* K_READ_REPLICATION_SLOT */ + YYSYMBOL_K_SHOW = 10, /* K_SHOW */ + YYSYMBOL_K_START_REPLICATION = 11, /* K_START_REPLICATION */ + YYSYMBOL_K_CREATE_REPLICATION_SLOT = 12, /* K_CREATE_REPLICATION_SLOT */ + YYSYMBOL_K_DROP_REPLICATION_SLOT = 13, /* K_DROP_REPLICATION_SLOT */ + YYSYMBOL_K_TIMELINE_HISTORY = 14, /* K_TIMELINE_HISTORY */ + YYSYMBOL_K_WAIT = 15, /* K_WAIT */ + YYSYMBOL_K_TIMELINE = 16, /* K_TIMELINE */ + YYSYMBOL_K_PHYSICAL = 17, /* K_PHYSICAL */ + YYSYMBOL_K_LOGICAL = 18, /* K_LOGICAL */ + YYSYMBOL_K_SLOT = 19, /* K_SLOT */ + YYSYMBOL_K_RESERVE_WAL = 20, /* K_RESERVE_WAL */ + YYSYMBOL_K_TEMPORARY = 21, /* K_TEMPORARY */ + YYSYMBOL_K_TWO_PHASE = 22, /* K_TWO_PHASE */ + YYSYMBOL_K_EXPORT_SNAPSHOT = 23, /* K_EXPORT_SNAPSHOT */ + YYSYMBOL_K_NOEXPORT_SNAPSHOT = 24, /* K_NOEXPORT_SNAPSHOT */ + YYSYMBOL_K_USE_SNAPSHOT = 25, /* K_USE_SNAPSHOT */ + YYSYMBOL_26_ = 26, /* ';' */ + YYSYMBOL_27_ = 27, /* '.' */ + YYSYMBOL_28_ = 28, /* '(' */ + YYSYMBOL_29_ = 29, /* ')' */ + YYSYMBOL_30_ = 30, /* ',' */ + YYSYMBOL_YYACCEPT = 31, /* $accept */ + YYSYMBOL_firstcmd = 32, /* firstcmd */ + YYSYMBOL_opt_semicolon = 33, /* opt_semicolon */ + YYSYMBOL_command = 34, /* command */ + YYSYMBOL_identify_system = 35, /* identify_system */ + YYSYMBOL_read_replication_slot = 36, /* read_replication_slot */ + YYSYMBOL_show = 37, /* show */ + YYSYMBOL_var_name = 38, /* var_name */ + YYSYMBOL_base_backup = 39, /* base_backup */ + YYSYMBOL_create_replication_slot = 40, /* create_replication_slot */ + YYSYMBOL_create_slot_options = 41, /* create_slot_options */ + YYSYMBOL_create_slot_legacy_opt_list = 42, /* create_slot_legacy_opt_list */ + YYSYMBOL_create_slot_legacy_opt = 43, /* create_slot_legacy_opt */ + YYSYMBOL_drop_replication_slot = 44, /* drop_replication_slot */ + YYSYMBOL_start_replication = 45, /* start_replication */ + YYSYMBOL_start_logical_replication = 46, /* start_logical_replication */ + YYSYMBOL_timeline_history = 47, /* timeline_history */ + YYSYMBOL_opt_physical = 48, /* opt_physical */ + YYSYMBOL_opt_temporary = 49, /* opt_temporary */ + YYSYMBOL_opt_slot = 50, /* opt_slot */ + YYSYMBOL_opt_timeline = 51, /* opt_timeline */ + YYSYMBOL_plugin_options = 52, /* plugin_options */ + YYSYMBOL_plugin_opt_list = 53, /* plugin_opt_list */ + YYSYMBOL_plugin_opt_elem = 54, /* plugin_opt_elem */ + YYSYMBOL_plugin_opt_arg = 55, /* plugin_opt_arg */ + YYSYMBOL_generic_option_list = 56, /* generic_option_list */ + YYSYMBOL_generic_option = 57, /* generic_option */ + YYSYMBOL_ident_or_keyword = 58 /* ident_or_keyword */ +}; +typedef enum yysymbol_kind_t yysymbol_kind_t; + + + + +#ifdef short +# undef short +#endif + +/* On compilers that do not define __PTRDIFF_MAX__ etc., make sure + <limits.h> and (if available) <stdint.h> are included + so that the code can choose integer types of a good width. */ + +#ifndef __PTRDIFF_MAX__ +# include <limits.h> /* INFRINGES ON USER NAME SPACE */ +# if defined __STDC_VERSION__ && 199901 <= __STDC_VERSION__ +# include <stdint.h> /* INFRINGES ON USER NAME SPACE */ +# define YY_STDINT_H +# endif +#endif + +/* Narrow types that promote to a signed type and that can represent a + signed or unsigned integer of at least N bits. In tables they can + save space and decrease cache pressure. Promoting to a signed type + helps avoid bugs in integer arithmetic. */ + +#ifdef __INT_LEAST8_MAX__ +typedef __INT_LEAST8_TYPE__ yytype_int8; +#elif defined YY_STDINT_H +typedef int_least8_t yytype_int8; +#else +typedef signed char yytype_int8; +#endif + +#ifdef __INT_LEAST16_MAX__ +typedef __INT_LEAST16_TYPE__ yytype_int16; +#elif defined YY_STDINT_H +typedef int_least16_t yytype_int16; +#else +typedef short yytype_int16; +#endif + +/* Work around bug in HP-UX 11.23, which defines these macros + incorrectly for preprocessor constants. This workaround can likely + be removed in 2023, as HPE has promised support for HP-UX 11.23 + (aka HP-UX 11i v2) only through the end of 2022; see Table 2 of + <https://h20195.www2.hpe.com/V2/getpdf.aspx/4AA4-7673ENW.pdf>. */ +#ifdef __hpux +# undef UINT_LEAST8_MAX +# undef UINT_LEAST16_MAX +# define UINT_LEAST8_MAX 255 +# define UINT_LEAST16_MAX 65535 +#endif + +#if defined __UINT_LEAST8_MAX__ && __UINT_LEAST8_MAX__ <= __INT_MAX__ +typedef __UINT_LEAST8_TYPE__ yytype_uint8; +#elif (!defined __UINT_LEAST8_MAX__ && defined YY_STDINT_H \ + && UINT_LEAST8_MAX <= INT_MAX) +typedef uint_least8_t yytype_uint8; +#elif !defined __UINT_LEAST8_MAX__ && UCHAR_MAX <= INT_MAX +typedef unsigned char yytype_uint8; +#else +typedef short yytype_uint8; +#endif + +#if defined __UINT_LEAST16_MAX__ && __UINT_LEAST16_MAX__ <= __INT_MAX__ +typedef __UINT_LEAST16_TYPE__ yytype_uint16; +#elif (!defined __UINT_LEAST16_MAX__ && defined YY_STDINT_H \ + && UINT_LEAST16_MAX <= INT_MAX) +typedef uint_least16_t yytype_uint16; +#elif !defined __UINT_LEAST16_MAX__ && USHRT_MAX <= INT_MAX +typedef unsigned short yytype_uint16; +#else +typedef int yytype_uint16; +#endif + +#ifndef YYPTRDIFF_T +# if defined __PTRDIFF_TYPE__ && defined __PTRDIFF_MAX__ +# define YYPTRDIFF_T __PTRDIFF_TYPE__ +# define YYPTRDIFF_MAXIMUM __PTRDIFF_MAX__ +# elif defined PTRDIFF_MAX +# ifndef ptrdiff_t +# include <stddef.h> /* INFRINGES ON USER NAME SPACE */ +# endif +# define YYPTRDIFF_T ptrdiff_t +# define YYPTRDIFF_MAXIMUM PTRDIFF_MAX +# else +# define YYPTRDIFF_T long +# define YYPTRDIFF_MAXIMUM LONG_MAX +# endif +#endif + +#ifndef YYSIZE_T +# ifdef __SIZE_TYPE__ +# define YYSIZE_T __SIZE_TYPE__ +# elif defined size_t +# define YYSIZE_T size_t +# elif defined __STDC_VERSION__ && 199901 <= __STDC_VERSION__ +# include <stddef.h> /* INFRINGES ON USER NAME SPACE */ +# define YYSIZE_T size_t +# else +# define YYSIZE_T unsigned +# endif +#endif + +#define YYSIZE_MAXIMUM \ + YY_CAST (YYPTRDIFF_T, \ + (YYPTRDIFF_MAXIMUM < YY_CAST (YYSIZE_T, -1) \ + ? YYPTRDIFF_MAXIMUM \ + : YY_CAST (YYSIZE_T, -1))) + +#define YYSIZEOF(X) YY_CAST (YYPTRDIFF_T, sizeof (X)) + + +/* Stored state numbers (used for stacks). */ +typedef yytype_int8 yy_state_t; + +/* State numbers in computations. */ +typedef int yy_state_fast_t; + +#ifndef YY_ +# if defined YYENABLE_NLS && YYENABLE_NLS +# if ENABLE_NLS +# include <libintl.h> /* INFRINGES ON USER NAME SPACE */ +# define YY_(Msgid) dgettext ("bison-runtime", Msgid) +# endif +# endif +# ifndef YY_ +# define YY_(Msgid) Msgid +# endif +#endif + + +#ifndef YY_ATTRIBUTE_PURE +# if defined __GNUC__ && 2 < __GNUC__ + (96 <= __GNUC_MINOR__) +# define YY_ATTRIBUTE_PURE __attribute__ ((__pure__)) +# else +# define YY_ATTRIBUTE_PURE +# endif +#endif + +#ifndef YY_ATTRIBUTE_UNUSED +# if defined __GNUC__ && 2 < __GNUC__ + (7 <= __GNUC_MINOR__) +# define YY_ATTRIBUTE_UNUSED __attribute__ ((__unused__)) +# else +# define YY_ATTRIBUTE_UNUSED +# endif +#endif + +/* Suppress unused-variable warnings by "using" E. */ +#if ! defined lint || defined __GNUC__ +# define YY_USE(E) ((void) (E)) +#else +# define YY_USE(E) /* empty */ +#endif + +#if defined __GNUC__ && ! defined __ICC && 407 <= __GNUC__ * 100 + __GNUC_MINOR__ +/* Suppress an incorrect diagnostic about yylval being uninitialized. */ +# define YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN \ + _Pragma ("GCC diagnostic push") \ + _Pragma ("GCC diagnostic ignored \"-Wuninitialized\"") \ + _Pragma ("GCC diagnostic ignored \"-Wmaybe-uninitialized\"") +# define YY_IGNORE_MAYBE_UNINITIALIZED_END \ + _Pragma ("GCC diagnostic pop") +#else +# define YY_INITIAL_VALUE(Value) Value +#endif +#ifndef YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN +# define YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN +# define YY_IGNORE_MAYBE_UNINITIALIZED_END +#endif +#ifndef YY_INITIAL_VALUE +# define YY_INITIAL_VALUE(Value) /* Nothing. */ +#endif + +#if defined __cplusplus && defined __GNUC__ && ! defined __ICC && 6 <= __GNUC__ +# define YY_IGNORE_USELESS_CAST_BEGIN \ + _Pragma ("GCC diagnostic push") \ + _Pragma ("GCC diagnostic ignored \"-Wuseless-cast\"") +# define YY_IGNORE_USELESS_CAST_END \ + _Pragma ("GCC diagnostic pop") +#endif +#ifndef YY_IGNORE_USELESS_CAST_BEGIN +# define YY_IGNORE_USELESS_CAST_BEGIN +# define YY_IGNORE_USELESS_CAST_END +#endif + + +#define YY_ASSERT(E) ((void) (0 && (E))) + +#if !defined yyoverflow + +/* The parser invokes alloca or malloc; define the necessary symbols. */ + +# ifdef YYSTACK_USE_ALLOCA +# if YYSTACK_USE_ALLOCA +# ifdef __GNUC__ +# define YYSTACK_ALLOC __builtin_alloca +# elif defined __BUILTIN_VA_ARG_INCR +# include <alloca.h> /* INFRINGES ON USER NAME SPACE */ +# elif defined _AIX +# define YYSTACK_ALLOC __alloca +# elif defined _MSC_VER +# include <malloc.h> /* INFRINGES ON USER NAME SPACE */ +# define alloca _alloca +# else +# define YYSTACK_ALLOC alloca +# if ! defined _ALLOCA_H && ! defined EXIT_SUCCESS +# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */ + /* Use EXIT_SUCCESS as a witness for stdlib.h. */ +# ifndef EXIT_SUCCESS +# define EXIT_SUCCESS 0 +# endif +# endif +# endif +# endif +# endif + +# ifdef YYSTACK_ALLOC + /* Pacify GCC's 'empty if-body' warning. */ +# define YYSTACK_FREE(Ptr) do { /* empty */; } while (0) +# ifndef YYSTACK_ALLOC_MAXIMUM + /* The OS might guarantee only one guard page at the bottom of the stack, + and a page size can be as small as 4096 bytes. So we cannot safely + invoke alloca (N) if N exceeds 4096. Use a slightly smaller number + to allow for a few compiler-allocated temporary stack slots. */ +# define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */ +# endif +# else +# define YYSTACK_ALLOC YYMALLOC +# define YYSTACK_FREE YYFREE +# ifndef YYSTACK_ALLOC_MAXIMUM +# define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM +# endif +# if (defined __cplusplus && ! defined EXIT_SUCCESS \ + && ! ((defined YYMALLOC || defined malloc) \ + && (defined YYFREE || defined free))) +# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */ +# ifndef EXIT_SUCCESS +# define EXIT_SUCCESS 0 +# endif +# endif +# ifndef YYMALLOC +# define YYMALLOC malloc +# if ! defined malloc && ! defined EXIT_SUCCESS +void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# ifndef YYFREE +# define YYFREE free +# if ! defined free && ! defined EXIT_SUCCESS +void free (void *); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# endif +#endif /* !defined yyoverflow */ + +#if (! defined yyoverflow \ + && (! defined __cplusplus \ + || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL))) + +/* A type that is properly aligned for any stack member. */ +union yyalloc +{ + yy_state_t yyss_alloc; + YYSTYPE yyvs_alloc; +}; + +/* The size of the maximum gap between one aligned stack and the next. */ +# define YYSTACK_GAP_MAXIMUM (YYSIZEOF (union yyalloc) - 1) + +/* The size of an array large to enough to hold all stacks, each with + N elements. */ +# define YYSTACK_BYTES(N) \ + ((N) * (YYSIZEOF (yy_state_t) + YYSIZEOF (YYSTYPE)) \ + + YYSTACK_GAP_MAXIMUM) + +# define YYCOPY_NEEDED 1 + +/* Relocate STACK from its old location to the new one. The + local variables YYSIZE and YYSTACKSIZE give the old and new number of + elements in the stack, and YYPTR gives the new location of the + stack. Advance YYPTR to a properly aligned location for the next + stack. */ +# define YYSTACK_RELOCATE(Stack_alloc, Stack) \ + do \ + { \ + YYPTRDIFF_T yynewbytes; \ + YYCOPY (&yyptr->Stack_alloc, Stack, yysize); \ + Stack = &yyptr->Stack_alloc; \ + yynewbytes = yystacksize * YYSIZEOF (*Stack) + YYSTACK_GAP_MAXIMUM; \ + yyptr += yynewbytes / YYSIZEOF (*yyptr); \ + } \ + while (0) + +#endif + +#if defined YYCOPY_NEEDED && YYCOPY_NEEDED +/* Copy COUNT objects from SRC to DST. The source and destination do + not overlap. */ +# ifndef YYCOPY +# if defined __GNUC__ && 1 < __GNUC__ +# define YYCOPY(Dst, Src, Count) \ + __builtin_memcpy (Dst, Src, YY_CAST (YYSIZE_T, (Count)) * sizeof (*(Src))) +# else +# define YYCOPY(Dst, Src, Count) \ + do \ + { \ + YYPTRDIFF_T yyi; \ + for (yyi = 0; yyi < (Count); yyi++) \ + (Dst)[yyi] = (Src)[yyi]; \ + } \ + while (0) +# endif +# endif +#endif /* !YYCOPY_NEEDED */ + +/* YYFINAL -- State number of the termination state. */ +#define YYFINAL 29 +/* YYLAST -- Last index in YYTABLE. */ +#define YYLAST 71 + +/* YYNTOKENS -- Number of terminals. */ +#define YYNTOKENS 31 +/* YYNNTS -- Number of nonterminals. */ +#define YYNNTS 28 +/* YYNRULES -- Number of rules. */ +#define YYNRULES 76 +/* YYNSTATES -- Number of states. */ +#define YYNSTATES 99 + +/* YYMAXUTOK -- Last valid token kind. */ +#define YYMAXUTOK 280 + + +/* YYTRANSLATE(TOKEN-NUM) -- Symbol number corresponding to TOKEN-NUM + as returned by yylex, with out-of-bounds checking. */ +#define YYTRANSLATE(YYX) \ + (0 <= (YYX) && (YYX) <= YYMAXUTOK \ + ? YY_CAST (yysymbol_kind_t, yytranslate[YYX]) \ + : YYSYMBOL_YYUNDEF) + +/* YYTRANSLATE[TOKEN-NUM] -- Symbol number corresponding to TOKEN-NUM + as returned by yylex. */ +static const yytype_int8 yytranslate[] = +{ + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 28, 29, 2, 2, 30, 2, 27, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 26, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25 +}; + +#if YYDEBUG + /* YYRLINE[YYN] -- Source line where rule number YYN was defined. */ +static const yytype_int16 yyrline[] = +{ + 0, 97, 97, 103, 104, 108, 109, 110, 111, 112, + 113, 114, 115, 116, 123, 133, 145, 152, 153, 161, + 167, 176, 187, 201, 202, 206, 209, 213, 218, 223, + 228, 233, 242, 250, 264, 279, 294, 311, 312, 316, + 317, 321, 324, 328, 336, 341, 342, 346, 350, 357, + 364, 365, 369, 371, 376, 380, 384, 388, 395, 396, + 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, + 407, 408, 409, 410, 411, 412, 413 +}; +#endif + +/** Accessing symbol of state STATE. */ +#define YY_ACCESSING_SYMBOL(State) YY_CAST (yysymbol_kind_t, yystos[State]) + +#if YYDEBUG || 0 +/* The user-facing name of the symbol whose (internal) number is + YYSYMBOL. No bounds checking. */ +static const char *yysymbol_name (yysymbol_kind_t yysymbol) YY_ATTRIBUTE_UNUSED; + +/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM. + First, the terminals, then, starting at YYNTOKENS, nonterminals. */ +static const char *const yytname[] = +{ + "\"end of file\"", "error", "\"invalid token\"", "SCONST", "IDENT", + "UCONST", "RECPTR", "K_BASE_BACKUP", "K_IDENTIFY_SYSTEM", + "K_READ_REPLICATION_SLOT", "K_SHOW", "K_START_REPLICATION", + "K_CREATE_REPLICATION_SLOT", "K_DROP_REPLICATION_SLOT", + "K_TIMELINE_HISTORY", "K_WAIT", "K_TIMELINE", "K_PHYSICAL", "K_LOGICAL", + "K_SLOT", "K_RESERVE_WAL", "K_TEMPORARY", "K_TWO_PHASE", + "K_EXPORT_SNAPSHOT", "K_NOEXPORT_SNAPSHOT", "K_USE_SNAPSHOT", "';'", + "'.'", "'('", "')'", "','", "$accept", "firstcmd", "opt_semicolon", + "command", "identify_system", "read_replication_slot", "show", + "var_name", "base_backup", "create_replication_slot", + "create_slot_options", "create_slot_legacy_opt_list", + "create_slot_legacy_opt", "drop_replication_slot", "start_replication", + "start_logical_replication", "timeline_history", "opt_physical", + "opt_temporary", "opt_slot", "opt_timeline", "plugin_options", + "plugin_opt_list", "plugin_opt_elem", "plugin_opt_arg", + "generic_option_list", "generic_option", "ident_or_keyword", YY_NULLPTR +}; + +static const char * +yysymbol_name (yysymbol_kind_t yysymbol) +{ + return yytname[yysymbol]; +} +#endif + +#ifdef YYPRINT +/* YYTOKNUM[NUM] -- (External) token number corresponding to the + (internal) symbol number NUM (which must be that of a token). */ +static const yytype_int16 yytoknum[] = +{ + 0, 256, 257, 258, 259, 260, 261, 262, 263, 264, + 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 59, 46, 40, 41, + 44 +}; +#endif + +#define YYPACT_NINF (-29) + +#define yypact_value_is_default(Yyn) \ + ((Yyn) == YYPACT_NINF) + +#define YYTABLE_NINF (-1) + +#define yytable_value_is_error(Yyn) \ + 0 + + /* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing + STATE-NUM. */ +static const yytype_int8 yypact[] = +{ + 15, -23, -29, 27, 27, 26, 42, 43, 44, 48, + 24, -29, -29, -29, -29, -29, -29, -29, -29, -29, + -4, -29, 25, 25, 47, 36, 34, 39, -29, -29, + -29, -29, -29, -29, -29, -29, -29, -29, -29, -29, + -29, -29, -29, -29, -29, -29, -29, -29, -29, -29, + -29, -28, -29, 33, 52, 40, -29, 51, -29, 22, + -29, -29, -4, -29, -29, -29, -29, 53, 45, 32, + 58, -29, 35, 59, -29, -4, -29, 10, 32, 61, + -29, -29, 12, -29, -29, -29, -29, -29, -29, -29, + 63, 14, -29, -29, -29, -29, -29, 61, -29 +}; + + /* YYDEFACT[STATE-NUM] -- Default reduction number in state STATE-NUM. + Performed when YYTABLE does not specify something else to do. Zero + means the default is an error. */ +static const yytype_int8 yydefact[] = +{ + 0, 20, 14, 0, 0, 42, 0, 0, 0, 0, + 4, 5, 11, 13, 6, 9, 10, 7, 8, 12, + 0, 17, 15, 16, 0, 38, 40, 32, 36, 1, + 3, 2, 58, 59, 60, 61, 62, 63, 64, 65, + 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, + 76, 0, 53, 54, 0, 41, 37, 0, 39, 0, + 33, 19, 0, 56, 55, 57, 18, 0, 44, 26, + 0, 52, 46, 0, 34, 0, 21, 24, 26, 0, + 35, 43, 0, 30, 31, 27, 28, 29, 25, 22, + 51, 0, 47, 23, 50, 49, 45, 0, 48 +}; + + /* YYPGOTO[NTERM-NUM]. */ +static const yytype_int8 yypgoto[] = +{ + -29, -29, -29, -29, -29, -29, -29, 64, -29, -29, + -11, -29, -29, -29, -29, -29, -29, -29, -29, -29, + -29, -29, -29, -27, -29, -6, 9, -29 +}; + + /* YYDEFGOTO[NTERM-NUM]. */ +static const yytype_int8 yydefgoto[] = +{ + 0, 9, 31, 10, 11, 12, 13, 22, 14, 15, + 76, 77, 88, 16, 17, 18, 19, 57, 59, 25, + 74, 80, 91, 92, 95, 51, 52, 53 +}; + + /* YYTABLE[YYPACT[STATE-NUM]] -- What to do in state STATE-NUM. If + positive, shift that token. If negative, reduce the rule whose + number is the opposite. If YYTABLE_NINF, syntax error. */ +static const yytype_int8 yytable[] = +{ + 32, 61, 62, 33, 34, 20, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 1, 2, 3, 4, 5, 6, 7, 8, + 83, 21, 84, 85, 86, 87, 63, 64, 65, 69, + 70, 93, 62, 96, 97, 24, 26, 27, 29, 28, + 30, 55, 54, 56, 60, 58, 66, 68, 67, 72, + 75, 73, 78, 79, 81, 90, 94, 89, 23, 82, + 98, 71 +}; + +static const yytype_int8 yycheck[] = +{ + 4, 29, 30, 7, 8, 28, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 7, 8, 9, 10, 11, 12, 13, 14, + 20, 4, 22, 23, 24, 25, 3, 4, 5, 17, + 18, 29, 30, 29, 30, 19, 4, 4, 0, 5, + 26, 4, 27, 17, 15, 21, 4, 6, 18, 6, + 28, 16, 4, 28, 5, 4, 3, 78, 4, 75, + 97, 62 +}; + + /* YYSTOS[STATE-NUM] -- The (internal number of the) accessing + symbol of state STATE-NUM. */ +static const yytype_int8 yystos[] = +{ + 0, 7, 8, 9, 10, 11, 12, 13, 14, 32, + 34, 35, 36, 37, 39, 40, 44, 45, 46, 47, + 28, 4, 38, 38, 19, 50, 4, 4, 5, 0, + 26, 33, 4, 7, 8, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 56, 57, 58, 27, 4, 17, 48, 21, 49, + 15, 29, 30, 3, 4, 5, 4, 18, 6, 17, + 18, 57, 6, 16, 51, 28, 41, 42, 4, 28, + 52, 5, 56, 20, 22, 23, 24, 25, 43, 41, + 4, 53, 54, 29, 3, 55, 29, 30, 54 +}; + + /* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. */ +static const yytype_int8 yyr1[] = +{ + 0, 31, 32, 33, 33, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 35, 36, 37, 38, 38, 39, + 39, 40, 40, 41, 41, 42, 42, 43, 43, 43, + 43, 43, 44, 44, 45, 46, 47, 48, 48, 49, + 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, + 55, 55, 56, 56, 57, 57, 57, 57, 58, 58, + 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, + 58, 58, 58, 58, 58, 58, 58 +}; + + /* YYR2[YYN] -- Number of symbols on the right hand side of rule YYN. */ +static const yytype_int8 yyr2[] = +{ + 0, 2, 2, 1, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 2, 2, 1, 3, 4, + 1, 5, 6, 3, 1, 2, 0, 1, 1, 1, + 1, 1, 2, 3, 5, 6, 2, 1, 0, 1, + 0, 2, 0, 2, 0, 3, 0, 1, 3, 2, + 1, 0, 3, 1, 1, 2, 2, 2, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1 +}; + + +enum { YYENOMEM = -2 }; + +#define yyerrok (yyerrstatus = 0) +#define yyclearin (yychar = YYEMPTY) + +#define YYACCEPT goto yyacceptlab +#define YYABORT goto yyabortlab +#define YYERROR goto yyerrorlab + + +#define YYRECOVERING() (!!yyerrstatus) + +#define YYBACKUP(Token, Value) \ + do \ + if (yychar == YYEMPTY) \ + { \ + yychar = (Token); \ + yylval = (Value); \ + YYPOPSTACK (yylen); \ + yystate = *yyssp; \ + goto yybackup; \ + } \ + else \ + { \ + yyerror (YY_("syntax error: cannot back up")); \ + YYERROR; \ + } \ + while (0) + +/* Backward compatibility with an undocumented macro. + Use YYerror or YYUNDEF. */ +#define YYERRCODE YYUNDEF + + +/* Enable debugging if requested. */ +#if YYDEBUG + +# ifndef YYFPRINTF +# include <stdio.h> /* INFRINGES ON USER NAME SPACE */ +# define YYFPRINTF fprintf +# endif + +# define YYDPRINTF(Args) \ +do { \ + if (yydebug) \ + YYFPRINTF Args; \ +} while (0) + +/* This macro is provided for backward compatibility. */ +# ifndef YY_LOCATION_PRINT +# define YY_LOCATION_PRINT(File, Loc) ((void) 0) +# endif + + +# define YY_SYMBOL_PRINT(Title, Kind, Value, Location) \ +do { \ + if (yydebug) \ + { \ + YYFPRINTF (stderr, "%s ", Title); \ + yy_symbol_print (stderr, \ + Kind, Value); \ + YYFPRINTF (stderr, "\n"); \ + } \ +} while (0) + + +/*-----------------------------------. +| Print this symbol's value on YYO. | +`-----------------------------------*/ + +static void +yy_symbol_value_print (FILE *yyo, + yysymbol_kind_t yykind, YYSTYPE const * const yyvaluep) +{ + FILE *yyoutput = yyo; + YY_USE (yyoutput); + if (!yyvaluep) + return; +# ifdef YYPRINT + if (yykind < YYNTOKENS) + YYPRINT (yyo, yytoknum[yykind], *yyvaluep); +# endif + YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN + YY_USE (yykind); + YY_IGNORE_MAYBE_UNINITIALIZED_END +} + + +/*---------------------------. +| Print this symbol on YYO. | +`---------------------------*/ + +static void +yy_symbol_print (FILE *yyo, + yysymbol_kind_t yykind, YYSTYPE const * const yyvaluep) +{ + YYFPRINTF (yyo, "%s %s (", + yykind < YYNTOKENS ? "token" : "nterm", yysymbol_name (yykind)); + + yy_symbol_value_print (yyo, yykind, yyvaluep); + YYFPRINTF (yyo, ")"); +} + +/*------------------------------------------------------------------. +| yy_stack_print -- Print the state stack from its BOTTOM up to its | +| TOP (included). | +`------------------------------------------------------------------*/ + +static void +yy_stack_print (yy_state_t *yybottom, yy_state_t *yytop) +{ + YYFPRINTF (stderr, "Stack now"); + for (; yybottom <= yytop; yybottom++) + { + int yybot = *yybottom; + YYFPRINTF (stderr, " %d", yybot); + } + YYFPRINTF (stderr, "\n"); +} + +# define YY_STACK_PRINT(Bottom, Top) \ +do { \ + if (yydebug) \ + yy_stack_print ((Bottom), (Top)); \ +} while (0) + + +/*------------------------------------------------. +| Report that the YYRULE is going to be reduced. | +`------------------------------------------------*/ + +static void +yy_reduce_print (yy_state_t *yyssp, YYSTYPE *yyvsp, + int yyrule) +{ + int yylno = yyrline[yyrule]; + int yynrhs = yyr2[yyrule]; + int yyi; + YYFPRINTF (stderr, "Reducing stack by rule %d (line %d):\n", + yyrule - 1, yylno); + /* The symbols being reduced. */ + for (yyi = 0; yyi < yynrhs; yyi++) + { + YYFPRINTF (stderr, " $%d = ", yyi + 1); + yy_symbol_print (stderr, + YY_ACCESSING_SYMBOL (+yyssp[yyi + 1 - yynrhs]), + &yyvsp[(yyi + 1) - (yynrhs)]); + YYFPRINTF (stderr, "\n"); + } +} + +# define YY_REDUCE_PRINT(Rule) \ +do { \ + if (yydebug) \ + yy_reduce_print (yyssp, yyvsp, Rule); \ +} while (0) + +/* Nonzero means print parse trace. It is left uninitialized so that + multiple parsers can coexist. */ +int yydebug; +#else /* !YYDEBUG */ +# define YYDPRINTF(Args) ((void) 0) +# define YY_SYMBOL_PRINT(Title, Kind, Value, Location) +# define YY_STACK_PRINT(Bottom, Top) +# define YY_REDUCE_PRINT(Rule) +#endif /* !YYDEBUG */ + + +/* YYINITDEPTH -- initial size of the parser's stacks. */ +#ifndef YYINITDEPTH +# define YYINITDEPTH 200 +#endif + +/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only + if the built-in stack extension method is used). + + Do not make this value too large; the results are undefined if + YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH) + evaluated with infinite-precision integer arithmetic. */ + +#ifndef YYMAXDEPTH +# define YYMAXDEPTH 10000 +#endif + + + + + + +/*-----------------------------------------------. +| Release the memory associated to this symbol. | +`-----------------------------------------------*/ + +static void +yydestruct (const char *yymsg, + yysymbol_kind_t yykind, YYSTYPE *yyvaluep) +{ + YY_USE (yyvaluep); + if (!yymsg) + yymsg = "Deleting"; + YY_SYMBOL_PRINT (yymsg, yykind, yyvaluep, yylocationp); + + YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN + YY_USE (yykind); + YY_IGNORE_MAYBE_UNINITIALIZED_END +} + + +/* Lookahead token kind. */ +int yychar; + +/* The semantic value of the lookahead symbol. */ +YYSTYPE yylval; +/* Number of syntax errors so far. */ +int yynerrs; + + + + +/*----------. +| yyparse. | +`----------*/ + +int +yyparse (void) +{ + yy_state_fast_t yystate = 0; + /* Number of tokens to shift before error messages enabled. */ + int yyerrstatus = 0; + + /* Refer to the stacks through separate pointers, to allow yyoverflow + to reallocate them elsewhere. */ + + /* Their size. */ + YYPTRDIFF_T yystacksize = YYINITDEPTH; + + /* The state stack: array, bottom, top. */ + yy_state_t yyssa[YYINITDEPTH]; + yy_state_t *yyss = yyssa; + yy_state_t *yyssp = yyss; + + /* The semantic value stack: array, bottom, top. */ + YYSTYPE yyvsa[YYINITDEPTH]; + YYSTYPE *yyvs = yyvsa; + YYSTYPE *yyvsp = yyvs; + + int yyn; + /* The return value of yyparse. */ + int yyresult; + /* Lookahead symbol kind. */ + yysymbol_kind_t yytoken = YYSYMBOL_YYEMPTY; + /* The variables used to return semantic value and location from the + action routines. */ + YYSTYPE yyval; + + + +#define YYPOPSTACK(N) (yyvsp -= (N), yyssp -= (N)) + + /* The number of symbols on the RHS of the reduced rule. + Keep to zero when no symbol should be popped. */ + int yylen = 0; + + YYDPRINTF ((stderr, "Starting parse\n")); + + yychar = YYEMPTY; /* Cause a token to be read. */ + goto yysetstate; + + +/*------------------------------------------------------------. +| yynewstate -- push a new state, which is found in yystate. | +`------------------------------------------------------------*/ +yynewstate: + /* In all cases, when you get here, the value and location stacks + have just been pushed. So pushing a state here evens the stacks. */ + yyssp++; + + +/*--------------------------------------------------------------------. +| yysetstate -- set current state (the top of the stack) to yystate. | +`--------------------------------------------------------------------*/ +yysetstate: + YYDPRINTF ((stderr, "Entering state %d\n", yystate)); + YY_ASSERT (0 <= yystate && yystate < YYNSTATES); + YY_IGNORE_USELESS_CAST_BEGIN + *yyssp = YY_CAST (yy_state_t, yystate); + YY_IGNORE_USELESS_CAST_END + YY_STACK_PRINT (yyss, yyssp); + + if (yyss + yystacksize - 1 <= yyssp) +#if !defined yyoverflow && !defined YYSTACK_RELOCATE + goto yyexhaustedlab; +#else + { + /* Get the current used size of the three stacks, in elements. */ + YYPTRDIFF_T yysize = yyssp - yyss + 1; + +# if defined yyoverflow + { + /* Give user a chance to reallocate the stack. Use copies of + these so that the &'s don't force the real ones into + memory. */ + yy_state_t *yyss1 = yyss; + YYSTYPE *yyvs1 = yyvs; + + /* Each stack pointer address is followed by the size of the + data in use in that stack, in bytes. This used to be a + conditional around just the two extra args, but that might + be undefined if yyoverflow is a macro. */ + yyoverflow (YY_("memory exhausted"), + &yyss1, yysize * YYSIZEOF (*yyssp), + &yyvs1, yysize * YYSIZEOF (*yyvsp), + &yystacksize); + yyss = yyss1; + yyvs = yyvs1; + } +# else /* defined YYSTACK_RELOCATE */ + /* Extend the stack our own way. */ + if (YYMAXDEPTH <= yystacksize) + goto yyexhaustedlab; + yystacksize *= 2; + if (YYMAXDEPTH < yystacksize) + yystacksize = YYMAXDEPTH; + + { + yy_state_t *yyss1 = yyss; + union yyalloc *yyptr = + YY_CAST (union yyalloc *, + YYSTACK_ALLOC (YY_CAST (YYSIZE_T, YYSTACK_BYTES (yystacksize)))); + if (! yyptr) + goto yyexhaustedlab; + YYSTACK_RELOCATE (yyss_alloc, yyss); + YYSTACK_RELOCATE (yyvs_alloc, yyvs); +# undef YYSTACK_RELOCATE + if (yyss1 != yyssa) + YYSTACK_FREE (yyss1); + } +# endif + + yyssp = yyss + yysize - 1; + yyvsp = yyvs + yysize - 1; + + YY_IGNORE_USELESS_CAST_BEGIN + YYDPRINTF ((stderr, "Stack size increased to %ld\n", + YY_CAST (long, yystacksize))); + YY_IGNORE_USELESS_CAST_END + + if (yyss + yystacksize - 1 <= yyssp) + YYABORT; + } +#endif /* !defined yyoverflow && !defined YYSTACK_RELOCATE */ + + if (yystate == YYFINAL) + YYACCEPT; + + goto yybackup; + + +/*-----------. +| yybackup. | +`-----------*/ +yybackup: + /* Do appropriate processing given the current state. Read a + lookahead token if we need one and don't already have one. */ + + /* First try to decide what to do without reference to lookahead token. */ + yyn = yypact[yystate]; + if (yypact_value_is_default (yyn)) + goto yydefault; + + /* Not known => get a lookahead token if don't already have one. */ + + /* YYCHAR is either empty, or end-of-input, or a valid lookahead. */ + if (yychar == YYEMPTY) + { + YYDPRINTF ((stderr, "Reading a token\n")); + yychar = yylex (); + } + + if (yychar <= YYEOF) + { + yychar = YYEOF; + yytoken = YYSYMBOL_YYEOF; + YYDPRINTF ((stderr, "Now at end of input.\n")); + } + else if (yychar == YYerror) + { + /* The scanner already issued an error message, process directly + to error recovery. But do not keep the error token as + lookahead, it is too special and may lead us to an endless + loop in error recovery. */ + yychar = YYUNDEF; + yytoken = YYSYMBOL_YYerror; + goto yyerrlab1; + } + else + { + yytoken = YYTRANSLATE (yychar); + YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc); + } + + /* If the proper action on seeing token YYTOKEN is to reduce or to + detect an error, take that action. */ + yyn += yytoken; + if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken) + goto yydefault; + yyn = yytable[yyn]; + if (yyn <= 0) + { + if (yytable_value_is_error (yyn)) + goto yyerrlab; + yyn = -yyn; + goto yyreduce; + } + + /* Count tokens shifted since error; after three, turn off error + status. */ + if (yyerrstatus) + yyerrstatus--; + + /* Shift the lookahead token. */ + YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc); + yystate = yyn; + YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN + *++yyvsp = yylval; + YY_IGNORE_MAYBE_UNINITIALIZED_END + + /* Discard the shifted token. */ + yychar = YYEMPTY; + goto yynewstate; + + +/*-----------------------------------------------------------. +| yydefault -- do the default action for the current state. | +`-----------------------------------------------------------*/ +yydefault: + yyn = yydefact[yystate]; + if (yyn == 0) + goto yyerrlab; + goto yyreduce; + + +/*-----------------------------. +| yyreduce -- do a reduction. | +`-----------------------------*/ +yyreduce: + /* yyn is the number of a rule to reduce with. */ + yylen = yyr2[yyn]; + + /* If YYLEN is nonzero, implement the default value of the action: + '$$ = $1'. + + Otherwise, the following line sets YYVAL to garbage. + This behavior is undocumented and Bison + users should not rely upon it. Assigning to YYVAL + unconditionally makes the parser a bit smaller, and it avoids a + GCC warning that YYVAL may be used uninitialized. */ + yyval = yyvsp[1-yylen]; + + + YY_REDUCE_PRINT (yyn); + switch (yyn) + { + case 2: /* firstcmd: command opt_semicolon */ +#line 98 "repl_gram.y" + { + replication_parse_result = (yyvsp[-1].node); + } +#line 1243 "repl_gram.c" + break; + + case 14: /* identify_system: K_IDENTIFY_SYSTEM */ +#line 124 "repl_gram.y" + { + (yyval.node) = (Node *) makeNode(IdentifySystemCmd); + } +#line 1251 "repl_gram.c" + break; + + case 15: /* read_replication_slot: K_READ_REPLICATION_SLOT var_name */ +#line 134 "repl_gram.y" + { + ReadReplicationSlotCmd *n = makeNode(ReadReplicationSlotCmd); + n->slotname = (yyvsp[0].str); + (yyval.node) = (Node *) n; + } +#line 1261 "repl_gram.c" + break; + + case 16: /* show: K_SHOW var_name */ +#line 146 "repl_gram.y" + { + VariableShowStmt *n = makeNode(VariableShowStmt); + n->name = (yyvsp[0].str); + (yyval.node) = (Node *) n; + } +#line 1271 "repl_gram.c" + break; + + case 17: /* var_name: IDENT */ +#line 152 "repl_gram.y" + { (yyval.str) = (yyvsp[0].str); } +#line 1277 "repl_gram.c" + break; + + case 18: /* var_name: var_name '.' IDENT */ +#line 154 "repl_gram.y" + { (yyval.str) = psprintf("%s.%s", (yyvsp[-2].str), (yyvsp[0].str)); } +#line 1283 "repl_gram.c" + break; + + case 19: /* base_backup: K_BASE_BACKUP '(' generic_option_list ')' */ +#line 162 "repl_gram.y" + { + BaseBackupCmd *cmd = makeNode(BaseBackupCmd); + cmd->options = (yyvsp[-1].list); + (yyval.node) = (Node *) cmd; + } +#line 1293 "repl_gram.c" + break; + + case 20: /* base_backup: K_BASE_BACKUP */ +#line 168 "repl_gram.y" + { + BaseBackupCmd *cmd = makeNode(BaseBackupCmd); + (yyval.node) = (Node *) cmd; + } +#line 1302 "repl_gram.c" + break; + + case 21: /* create_replication_slot: K_CREATE_REPLICATION_SLOT IDENT opt_temporary K_PHYSICAL create_slot_options */ +#line 177 "repl_gram.y" + { + CreateReplicationSlotCmd *cmd; + cmd = makeNode(CreateReplicationSlotCmd); + cmd->kind = REPLICATION_KIND_PHYSICAL; + cmd->slotname = (yyvsp[-3].str); + cmd->temporary = (yyvsp[-2].boolval); + cmd->options = (yyvsp[0].list); + (yyval.node) = (Node *) cmd; + } +#line 1316 "repl_gram.c" + break; + + case 22: /* create_replication_slot: K_CREATE_REPLICATION_SLOT IDENT opt_temporary K_LOGICAL IDENT create_slot_options */ +#line 188 "repl_gram.y" + { + CreateReplicationSlotCmd *cmd; + cmd = makeNode(CreateReplicationSlotCmd); + cmd->kind = REPLICATION_KIND_LOGICAL; + cmd->slotname = (yyvsp[-4].str); + cmd->temporary = (yyvsp[-3].boolval); + cmd->plugin = (yyvsp[-1].str); + cmd->options = (yyvsp[0].list); + (yyval.node) = (Node *) cmd; + } +#line 1331 "repl_gram.c" + break; + + case 23: /* create_slot_options: '(' generic_option_list ')' */ +#line 201 "repl_gram.y" + { (yyval.list) = (yyvsp[-1].list); } +#line 1337 "repl_gram.c" + break; + + case 24: /* create_slot_options: create_slot_legacy_opt_list */ +#line 202 "repl_gram.y" + { (yyval.list) = (yyvsp[0].list); } +#line 1343 "repl_gram.c" + break; + + case 25: /* create_slot_legacy_opt_list: create_slot_legacy_opt_list create_slot_legacy_opt */ +#line 207 "repl_gram.y" + { (yyval.list) = lappend((yyvsp[-1].list), (yyvsp[0].defelt)); } +#line 1349 "repl_gram.c" + break; + + case 26: /* create_slot_legacy_opt_list: %empty */ +#line 209 "repl_gram.y" + { (yyval.list) = NIL; } +#line 1355 "repl_gram.c" + break; + + case 27: /* create_slot_legacy_opt: K_EXPORT_SNAPSHOT */ +#line 214 "repl_gram.y" + { + (yyval.defelt) = makeDefElem("snapshot", + (Node *) makeString("export"), -1); + } +#line 1364 "repl_gram.c" + break; + + case 28: /* create_slot_legacy_opt: K_NOEXPORT_SNAPSHOT */ +#line 219 "repl_gram.y" + { + (yyval.defelt) = makeDefElem("snapshot", + (Node *) makeString("nothing"), -1); + } +#line 1373 "repl_gram.c" + break; + + case 29: /* create_slot_legacy_opt: K_USE_SNAPSHOT */ +#line 224 "repl_gram.y" + { + (yyval.defelt) = makeDefElem("snapshot", + (Node *) makeString("use"), -1); + } +#line 1382 "repl_gram.c" + break; + + case 30: /* create_slot_legacy_opt: K_RESERVE_WAL */ +#line 229 "repl_gram.y" + { + (yyval.defelt) = makeDefElem("reserve_wal", + (Node *) makeBoolean(true), -1); + } +#line 1391 "repl_gram.c" + break; + + case 31: /* create_slot_legacy_opt: K_TWO_PHASE */ +#line 234 "repl_gram.y" + { + (yyval.defelt) = makeDefElem("two_phase", + (Node *) makeBoolean(true), -1); + } +#line 1400 "repl_gram.c" + break; + + case 32: /* drop_replication_slot: K_DROP_REPLICATION_SLOT IDENT */ +#line 243 "repl_gram.y" + { + DropReplicationSlotCmd *cmd; + cmd = makeNode(DropReplicationSlotCmd); + cmd->slotname = (yyvsp[0].str); + cmd->wait = false; + (yyval.node) = (Node *) cmd; + } +#line 1412 "repl_gram.c" + break; + + case 33: /* drop_replication_slot: K_DROP_REPLICATION_SLOT IDENT K_WAIT */ +#line 251 "repl_gram.y" + { + DropReplicationSlotCmd *cmd; + cmd = makeNode(DropReplicationSlotCmd); + cmd->slotname = (yyvsp[-1].str); + cmd->wait = true; + (yyval.node) = (Node *) cmd; + } +#line 1424 "repl_gram.c" + break; + + case 34: /* start_replication: K_START_REPLICATION opt_slot opt_physical RECPTR opt_timeline */ +#line 265 "repl_gram.y" + { + StartReplicationCmd *cmd; + + cmd = makeNode(StartReplicationCmd); + cmd->kind = REPLICATION_KIND_PHYSICAL; + cmd->slotname = (yyvsp[-3].str); + cmd->startpoint = (yyvsp[-1].recptr); + cmd->timeline = (yyvsp[0].uintval); + (yyval.node) = (Node *) cmd; + } +#line 1439 "repl_gram.c" + break; + + case 35: /* start_logical_replication: K_START_REPLICATION K_SLOT IDENT K_LOGICAL RECPTR plugin_options */ +#line 280 "repl_gram.y" + { + StartReplicationCmd *cmd; + cmd = makeNode(StartReplicationCmd); + cmd->kind = REPLICATION_KIND_LOGICAL; + cmd->slotname = (yyvsp[-3].str); + cmd->startpoint = (yyvsp[-1].recptr); + cmd->options = (yyvsp[0].list); + (yyval.node) = (Node *) cmd; + } +#line 1453 "repl_gram.c" + break; + + case 36: /* timeline_history: K_TIMELINE_HISTORY UCONST */ +#line 295 "repl_gram.y" + { + TimeLineHistoryCmd *cmd; + + if ((yyvsp[0].uintval) <= 0) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid timeline %u", (yyvsp[0].uintval)))); + + cmd = makeNode(TimeLineHistoryCmd); + cmd->timeline = (yyvsp[0].uintval); + + (yyval.node) = (Node *) cmd; + } +#line 1471 "repl_gram.c" + break; + + case 39: /* opt_temporary: K_TEMPORARY */ +#line 316 "repl_gram.y" + { (yyval.boolval) = true; } +#line 1477 "repl_gram.c" + break; + + case 40: /* opt_temporary: %empty */ +#line 317 "repl_gram.y" + { (yyval.boolval) = false; } +#line 1483 "repl_gram.c" + break; + + case 41: /* opt_slot: K_SLOT IDENT */ +#line 322 "repl_gram.y" + { (yyval.str) = (yyvsp[0].str); } +#line 1489 "repl_gram.c" + break; + + case 42: /* opt_slot: %empty */ +#line 324 "repl_gram.y" + { (yyval.str) = NULL; } +#line 1495 "repl_gram.c" + break; + + case 43: /* opt_timeline: K_TIMELINE UCONST */ +#line 329 "repl_gram.y" + { + if ((yyvsp[0].uintval) <= 0) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid timeline %u", (yyvsp[0].uintval)))); + (yyval.uintval) = (yyvsp[0].uintval); + } +#line 1507 "repl_gram.c" + break; + + case 44: /* opt_timeline: %empty */ +#line 336 "repl_gram.y" + { (yyval.uintval) = 0; } +#line 1513 "repl_gram.c" + break; + + case 45: /* plugin_options: '(' plugin_opt_list ')' */ +#line 341 "repl_gram.y" + { (yyval.list) = (yyvsp[-1].list); } +#line 1519 "repl_gram.c" + break; + + case 46: /* plugin_options: %empty */ +#line 342 "repl_gram.y" + { (yyval.list) = NIL; } +#line 1525 "repl_gram.c" + break; + + case 47: /* plugin_opt_list: plugin_opt_elem */ +#line 347 "repl_gram.y" + { + (yyval.list) = list_make1((yyvsp[0].defelt)); + } +#line 1533 "repl_gram.c" + break; + + case 48: /* plugin_opt_list: plugin_opt_list ',' plugin_opt_elem */ +#line 351 "repl_gram.y" + { + (yyval.list) = lappend((yyvsp[-2].list), (yyvsp[0].defelt)); + } +#line 1541 "repl_gram.c" + break; + + case 49: /* plugin_opt_elem: IDENT plugin_opt_arg */ +#line 358 "repl_gram.y" + { + (yyval.defelt) = makeDefElem((yyvsp[-1].str), (yyvsp[0].node), -1); + } +#line 1549 "repl_gram.c" + break; + + case 50: /* plugin_opt_arg: SCONST */ +#line 364 "repl_gram.y" + { (yyval.node) = (Node *) makeString((yyvsp[0].str)); } +#line 1555 "repl_gram.c" + break; + + case 51: /* plugin_opt_arg: %empty */ +#line 365 "repl_gram.y" + { (yyval.node) = NULL; } +#line 1561 "repl_gram.c" + break; + + case 52: /* generic_option_list: generic_option_list ',' generic_option */ +#line 370 "repl_gram.y" + { (yyval.list) = lappend((yyvsp[-2].list), (yyvsp[0].defelt)); } +#line 1567 "repl_gram.c" + break; + + case 53: /* generic_option_list: generic_option */ +#line 372 "repl_gram.y" + { (yyval.list) = list_make1((yyvsp[0].defelt)); } +#line 1573 "repl_gram.c" + break; + + case 54: /* generic_option: ident_or_keyword */ +#line 377 "repl_gram.y" + { + (yyval.defelt) = makeDefElem((yyvsp[0].str), NULL, -1); + } +#line 1581 "repl_gram.c" + break; + + case 55: /* generic_option: ident_or_keyword IDENT */ +#line 381 "repl_gram.y" + { + (yyval.defelt) = makeDefElem((yyvsp[-1].str), (Node *) makeString((yyvsp[0].str)), -1); + } +#line 1589 "repl_gram.c" + break; + + case 56: /* generic_option: ident_or_keyword SCONST */ +#line 385 "repl_gram.y" + { + (yyval.defelt) = makeDefElem((yyvsp[-1].str), (Node *) makeString((yyvsp[0].str)), -1); + } +#line 1597 "repl_gram.c" + break; + + case 57: /* generic_option: ident_or_keyword UCONST */ +#line 389 "repl_gram.y" + { + (yyval.defelt) = makeDefElem((yyvsp[-1].str), (Node *) makeInteger((yyvsp[0].uintval)), -1); + } +#line 1605 "repl_gram.c" + break; + + case 58: /* ident_or_keyword: IDENT */ +#line 395 "repl_gram.y" + { (yyval.str) = (yyvsp[0].str); } +#line 1611 "repl_gram.c" + break; + + case 59: /* ident_or_keyword: K_BASE_BACKUP */ +#line 396 "repl_gram.y" + { (yyval.str) = "base_backup"; } +#line 1617 "repl_gram.c" + break; + + case 60: /* ident_or_keyword: K_IDENTIFY_SYSTEM */ +#line 397 "repl_gram.y" + { (yyval.str) = "identify_system"; } +#line 1623 "repl_gram.c" + break; + + case 61: /* ident_or_keyword: K_SHOW */ +#line 398 "repl_gram.y" + { (yyval.str) = "show"; } +#line 1629 "repl_gram.c" + break; + + case 62: /* ident_or_keyword: K_START_REPLICATION */ +#line 399 "repl_gram.y" + { (yyval.str) = "start_replication"; } +#line 1635 "repl_gram.c" + break; + + case 63: /* ident_or_keyword: K_CREATE_REPLICATION_SLOT */ +#line 400 "repl_gram.y" + { (yyval.str) = "create_replication_slot"; } +#line 1641 "repl_gram.c" + break; + + case 64: /* ident_or_keyword: K_DROP_REPLICATION_SLOT */ +#line 401 "repl_gram.y" + { (yyval.str) = "drop_replication_slot"; } +#line 1647 "repl_gram.c" + break; + + case 65: /* ident_or_keyword: K_TIMELINE_HISTORY */ +#line 402 "repl_gram.y" + { (yyval.str) = "timeline_history"; } +#line 1653 "repl_gram.c" + break; + + case 66: /* ident_or_keyword: K_WAIT */ +#line 403 "repl_gram.y" + { (yyval.str) = "wait"; } +#line 1659 "repl_gram.c" + break; + + case 67: /* ident_or_keyword: K_TIMELINE */ +#line 404 "repl_gram.y" + { (yyval.str) = "timeline"; } +#line 1665 "repl_gram.c" + break; + + case 68: /* ident_or_keyword: K_PHYSICAL */ +#line 405 "repl_gram.y" + { (yyval.str) = "physical"; } +#line 1671 "repl_gram.c" + break; + + case 69: /* ident_or_keyword: K_LOGICAL */ +#line 406 "repl_gram.y" + { (yyval.str) = "logical"; } +#line 1677 "repl_gram.c" + break; + + case 70: /* ident_or_keyword: K_SLOT */ +#line 407 "repl_gram.y" + { (yyval.str) = "slot"; } +#line 1683 "repl_gram.c" + break; + + case 71: /* ident_or_keyword: K_RESERVE_WAL */ +#line 408 "repl_gram.y" + { (yyval.str) = "reserve_wal"; } +#line 1689 "repl_gram.c" + break; + + case 72: /* ident_or_keyword: K_TEMPORARY */ +#line 409 "repl_gram.y" + { (yyval.str) = "temporary"; } +#line 1695 "repl_gram.c" + break; + + case 73: /* ident_or_keyword: K_TWO_PHASE */ +#line 410 "repl_gram.y" + { (yyval.str) = "two_phase"; } +#line 1701 "repl_gram.c" + break; + + case 74: /* ident_or_keyword: K_EXPORT_SNAPSHOT */ +#line 411 "repl_gram.y" + { (yyval.str) = "export_snapshot"; } +#line 1707 "repl_gram.c" + break; + + case 75: /* ident_or_keyword: K_NOEXPORT_SNAPSHOT */ +#line 412 "repl_gram.y" + { (yyval.str) = "noexport_snapshot"; } +#line 1713 "repl_gram.c" + break; + + case 76: /* ident_or_keyword: K_USE_SNAPSHOT */ +#line 413 "repl_gram.y" + { (yyval.str) = "use_snapshot"; } +#line 1719 "repl_gram.c" + break; + + +#line 1723 "repl_gram.c" + + default: break; + } + /* User semantic actions sometimes alter yychar, and that requires + that yytoken be updated with the new translation. We take the + approach of translating immediately before every use of yytoken. + One alternative is translating here after every semantic action, + but that translation would be missed if the semantic action invokes + YYABORT, YYACCEPT, or YYERROR immediately after altering yychar or + if it invokes YYBACKUP. In the case of YYABORT or YYACCEPT, an + incorrect destructor might then be invoked immediately. In the + case of YYERROR or YYBACKUP, subsequent parser actions might lead + to an incorrect destructor call or verbose syntax error message + before the lookahead is translated. */ + YY_SYMBOL_PRINT ("-> $$ =", YY_CAST (yysymbol_kind_t, yyr1[yyn]), &yyval, &yyloc); + + YYPOPSTACK (yylen); + yylen = 0; + + *++yyvsp = yyval; + + /* Now 'shift' the result of the reduction. Determine what state + that goes to, based on the state we popped back to and the rule + number reduced by. */ + { + const int yylhs = yyr1[yyn] - YYNTOKENS; + const int yyi = yypgoto[yylhs] + *yyssp; + yystate = (0 <= yyi && yyi <= YYLAST && yycheck[yyi] == *yyssp + ? yytable[yyi] + : yydefgoto[yylhs]); + } + + goto yynewstate; + + +/*--------------------------------------. +| yyerrlab -- here on detecting error. | +`--------------------------------------*/ +yyerrlab: + /* Make sure we have latest lookahead translation. See comments at + user semantic actions for why this is necessary. */ + yytoken = yychar == YYEMPTY ? YYSYMBOL_YYEMPTY : YYTRANSLATE (yychar); + /* If not already recovering from an error, report this error. */ + if (!yyerrstatus) + { + ++yynerrs; + yyerror (YY_("syntax error")); + } + + if (yyerrstatus == 3) + { + /* If just tried and failed to reuse lookahead token after an + error, discard it. */ + + if (yychar <= YYEOF) + { + /* Return failure if at end of input. */ + if (yychar == YYEOF) + YYABORT; + } + else + { + yydestruct ("Error: discarding", + yytoken, &yylval); + yychar = YYEMPTY; + } + } + + /* Else will try to reuse lookahead token after shifting the error + token. */ + goto yyerrlab1; + + +/*---------------------------------------------------. +| yyerrorlab -- error raised explicitly by YYERROR. | +`---------------------------------------------------*/ +yyerrorlab: + /* Pacify compilers when the user code never invokes YYERROR and the + label yyerrorlab therefore never appears in user code. */ + if (0) + YYERROR; + + /* Do not reclaim the symbols of the rule whose action triggered + this YYERROR. */ + YYPOPSTACK (yylen); + yylen = 0; + YY_STACK_PRINT (yyss, yyssp); + yystate = *yyssp; + goto yyerrlab1; + + +/*-------------------------------------------------------------. +| yyerrlab1 -- common code for both syntax error and YYERROR. | +`-------------------------------------------------------------*/ +yyerrlab1: + yyerrstatus = 3; /* Each real token shifted decrements this. */ + + /* Pop stack until we find a state that shifts the error token. */ + for (;;) + { + yyn = yypact[yystate]; + if (!yypact_value_is_default (yyn)) + { + yyn += YYSYMBOL_YYerror; + if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYSYMBOL_YYerror) + { + yyn = yytable[yyn]; + if (0 < yyn) + break; + } + } + + /* Pop the current state because it cannot handle the error token. */ + if (yyssp == yyss) + YYABORT; + + + yydestruct ("Error: popping", + YY_ACCESSING_SYMBOL (yystate), yyvsp); + YYPOPSTACK (1); + yystate = *yyssp; + YY_STACK_PRINT (yyss, yyssp); + } + + YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN + *++yyvsp = yylval; + YY_IGNORE_MAYBE_UNINITIALIZED_END + + + /* Shift the error token. */ + YY_SYMBOL_PRINT ("Shifting", YY_ACCESSING_SYMBOL (yyn), yyvsp, yylsp); + + yystate = yyn; + goto yynewstate; + + +/*-------------------------------------. +| yyacceptlab -- YYACCEPT comes here. | +`-------------------------------------*/ +yyacceptlab: + yyresult = 0; + goto yyreturn; + + +/*-----------------------------------. +| yyabortlab -- YYABORT comes here. | +`-----------------------------------*/ +yyabortlab: + yyresult = 1; + goto yyreturn; + + +#if !defined yyoverflow +/*-------------------------------------------------. +| yyexhaustedlab -- memory exhaustion comes here. | +`-------------------------------------------------*/ +yyexhaustedlab: + yyerror (YY_("memory exhausted")); + yyresult = 2; + goto yyreturn; +#endif + + +/*-------------------------------------------------------. +| yyreturn -- parsing is finished, clean up and return. | +`-------------------------------------------------------*/ +yyreturn: + if (yychar != YYEMPTY) + { + /* Make sure we have latest lookahead translation. See comments at + user semantic actions for why this is necessary. */ + yytoken = YYTRANSLATE (yychar); + yydestruct ("Cleanup: discarding lookahead", + yytoken, &yylval); + } + /* Do not reclaim the symbols of the rule whose action triggered + this YYABORT or YYACCEPT. */ + YYPOPSTACK (yylen); + YY_STACK_PRINT (yyss, yyssp); + while (yyssp != yyss) + { + yydestruct ("Cleanup: popping", + YY_ACCESSING_SYMBOL (+*yyssp), yyvsp); + YYPOPSTACK (1); + } +#ifndef yyoverflow + if (yyss != yyssa) + YYSTACK_FREE (yyss); +#endif + + return yyresult; +} + +#line 416 "repl_gram.y" + diff --git a/src/backend/replication/repl_gram.h b/src/backend/replication/repl_gram.h new file mode 100644 index 0000000..63b64b2 --- /dev/null +++ b/src/backend/replication/repl_gram.h @@ -0,0 +1,111 @@ +/* A Bison parser, made by GNU Bison 3.7.5. */ + +/* Bison interface for Yacc-like parsers in C + + Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2021 Free Software Foundation, + Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. + + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + +/* DO NOT RELY ON FEATURES THAT ARE NOT DOCUMENTED in the manual, + especially those whose name start with YY_ or yy_. They are + private implementation details that can be changed or removed. */ + +#ifndef YY_REPLICATION_YY_REPL_GRAM_H_INCLUDED +# define YY_REPLICATION_YY_REPL_GRAM_H_INCLUDED +/* Debug traces. */ +#ifndef YYDEBUG +# define YYDEBUG 0 +#endif +#if YYDEBUG +extern int replication_yydebug; +#endif + +/* Token kinds. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + enum yytokentype + { + YYEMPTY = -2, + YYEOF = 0, /* "end of file" */ + YYerror = 256, /* error */ + YYUNDEF = 257, /* "invalid token" */ + SCONST = 258, /* SCONST */ + IDENT = 259, /* IDENT */ + UCONST = 260, /* UCONST */ + RECPTR = 261, /* RECPTR */ + K_BASE_BACKUP = 262, /* K_BASE_BACKUP */ + K_IDENTIFY_SYSTEM = 263, /* K_IDENTIFY_SYSTEM */ + K_READ_REPLICATION_SLOT = 264, /* K_READ_REPLICATION_SLOT */ + K_SHOW = 265, /* K_SHOW */ + K_START_REPLICATION = 266, /* K_START_REPLICATION */ + K_CREATE_REPLICATION_SLOT = 267, /* K_CREATE_REPLICATION_SLOT */ + K_DROP_REPLICATION_SLOT = 268, /* K_DROP_REPLICATION_SLOT */ + K_TIMELINE_HISTORY = 269, /* K_TIMELINE_HISTORY */ + K_WAIT = 270, /* K_WAIT */ + K_TIMELINE = 271, /* K_TIMELINE */ + K_PHYSICAL = 272, /* K_PHYSICAL */ + K_LOGICAL = 273, /* K_LOGICAL */ + K_SLOT = 274, /* K_SLOT */ + K_RESERVE_WAL = 275, /* K_RESERVE_WAL */ + K_TEMPORARY = 276, /* K_TEMPORARY */ + K_TWO_PHASE = 277, /* K_TWO_PHASE */ + K_EXPORT_SNAPSHOT = 278, /* K_EXPORT_SNAPSHOT */ + K_NOEXPORT_SNAPSHOT = 279, /* K_NOEXPORT_SNAPSHOT */ + K_USE_SNAPSHOT = 280 /* K_USE_SNAPSHOT */ + }; + typedef enum yytokentype yytoken_kind_t; +#endif + +/* Value type. */ +#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED +union YYSTYPE +{ +#line 44 "repl_gram.y" + + char *str; + bool boolval; + uint32 uintval; + XLogRecPtr recptr; + Node *node; + List *list; + DefElem *defelt; + +#line 99 "repl_gram.h" + +}; +typedef union YYSTYPE YYSTYPE; +# define YYSTYPE_IS_TRIVIAL 1 +# define YYSTYPE_IS_DECLARED 1 +#endif + + +extern YYSTYPE replication_yylval; + +int replication_yyparse (void); + +#endif /* !YY_REPLICATION_YY_REPL_GRAM_H_INCLUDED */ diff --git a/src/backend/replication/repl_gram.y b/src/backend/replication/repl_gram.y new file mode 100644 index 0000000..0c874e3 --- /dev/null +++ b/src/backend/replication/repl_gram.y @@ -0,0 +1,416 @@ +%{ +/*------------------------------------------------------------------------- + * + * repl_gram.y - Parser for the replication commands + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/replication/repl_gram.y + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/xlogdefs.h" +#include "nodes/makefuncs.h" +#include "nodes/parsenodes.h" +#include "nodes/replnodes.h" +#include "replication/walsender.h" +#include "replication/walsender_private.h" + + +/* Result of the parsing is returned here */ +Node *replication_parse_result; + + +/* + * Bison doesn't allocate anything that needs to live across parser calls, + * so we can easily have it use palloc instead of malloc. This prevents + * memory leaks if we error out during parsing. + */ +#define YYMALLOC palloc +#define YYFREE pfree + +%} + +%expect 0 +%name-prefix="replication_yy" + +%union +{ + char *str; + bool boolval; + uint32 uintval; + XLogRecPtr recptr; + Node *node; + List *list; + DefElem *defelt; +} + +/* Non-keyword tokens */ +%token <str> SCONST IDENT +%token <uintval> UCONST +%token <recptr> RECPTR + +/* Keyword tokens. */ +%token K_BASE_BACKUP +%token K_IDENTIFY_SYSTEM +%token K_READ_REPLICATION_SLOT +%token K_SHOW +%token K_START_REPLICATION +%token K_CREATE_REPLICATION_SLOT +%token K_DROP_REPLICATION_SLOT +%token K_TIMELINE_HISTORY +%token K_WAIT +%token K_TIMELINE +%token K_PHYSICAL +%token K_LOGICAL +%token K_SLOT +%token K_RESERVE_WAL +%token K_TEMPORARY +%token K_TWO_PHASE +%token K_EXPORT_SNAPSHOT +%token K_NOEXPORT_SNAPSHOT +%token K_USE_SNAPSHOT + +%type <node> command +%type <node> base_backup start_replication start_logical_replication + create_replication_slot drop_replication_slot identify_system + read_replication_slot timeline_history show +%type <list> generic_option_list +%type <defelt> generic_option +%type <uintval> opt_timeline +%type <list> plugin_options plugin_opt_list +%type <defelt> plugin_opt_elem +%type <node> plugin_opt_arg +%type <str> opt_slot var_name ident_or_keyword +%type <boolval> opt_temporary +%type <list> create_slot_options create_slot_legacy_opt_list +%type <defelt> create_slot_legacy_opt + +%% + +firstcmd: command opt_semicolon + { + replication_parse_result = $1; + } + ; + +opt_semicolon: ';' + | /* EMPTY */ + ; + +command: + identify_system + | base_backup + | start_replication + | start_logical_replication + | create_replication_slot + | drop_replication_slot + | read_replication_slot + | timeline_history + | show + ; + +/* + * IDENTIFY_SYSTEM + */ +identify_system: + K_IDENTIFY_SYSTEM + { + $$ = (Node *) makeNode(IdentifySystemCmd); + } + ; + +/* + * READ_REPLICATION_SLOT %s + */ +read_replication_slot: + K_READ_REPLICATION_SLOT var_name + { + ReadReplicationSlotCmd *n = makeNode(ReadReplicationSlotCmd); + n->slotname = $2; + $$ = (Node *) n; + } + ; + +/* + * SHOW setting + */ +show: + K_SHOW var_name + { + VariableShowStmt *n = makeNode(VariableShowStmt); + n->name = $2; + $$ = (Node *) n; + } + +var_name: IDENT { $$ = $1; } + | var_name '.' IDENT + { $$ = psprintf("%s.%s", $1, $3); } + ; + +/* + * BASE_BACKUP [ ( option [ 'value' ] [, ...] ) ] + */ +base_backup: + K_BASE_BACKUP '(' generic_option_list ')' + { + BaseBackupCmd *cmd = makeNode(BaseBackupCmd); + cmd->options = $3; + $$ = (Node *) cmd; + } + | K_BASE_BACKUP + { + BaseBackupCmd *cmd = makeNode(BaseBackupCmd); + $$ = (Node *) cmd; + } + ; + +create_replication_slot: + /* CREATE_REPLICATION_SLOT slot [TEMPORARY] PHYSICAL [options] */ + K_CREATE_REPLICATION_SLOT IDENT opt_temporary K_PHYSICAL create_slot_options + { + CreateReplicationSlotCmd *cmd; + cmd = makeNode(CreateReplicationSlotCmd); + cmd->kind = REPLICATION_KIND_PHYSICAL; + cmd->slotname = $2; + cmd->temporary = $3; + cmd->options = $5; + $$ = (Node *) cmd; + } + /* CREATE_REPLICATION_SLOT slot [TEMPORARY] LOGICAL plugin [options] */ + | K_CREATE_REPLICATION_SLOT IDENT opt_temporary K_LOGICAL IDENT create_slot_options + { + CreateReplicationSlotCmd *cmd; + cmd = makeNode(CreateReplicationSlotCmd); + cmd->kind = REPLICATION_KIND_LOGICAL; + cmd->slotname = $2; + cmd->temporary = $3; + cmd->plugin = $5; + cmd->options = $6; + $$ = (Node *) cmd; + } + ; + +create_slot_options: + '(' generic_option_list ')' { $$ = $2; } + | create_slot_legacy_opt_list { $$ = $1; } + ; + +create_slot_legacy_opt_list: + create_slot_legacy_opt_list create_slot_legacy_opt + { $$ = lappend($1, $2); } + | /* EMPTY */ + { $$ = NIL; } + ; + +create_slot_legacy_opt: + K_EXPORT_SNAPSHOT + { + $$ = makeDefElem("snapshot", + (Node *) makeString("export"), -1); + } + | K_NOEXPORT_SNAPSHOT + { + $$ = makeDefElem("snapshot", + (Node *) makeString("nothing"), -1); + } + | K_USE_SNAPSHOT + { + $$ = makeDefElem("snapshot", + (Node *) makeString("use"), -1); + } + | K_RESERVE_WAL + { + $$ = makeDefElem("reserve_wal", + (Node *) makeBoolean(true), -1); + } + | K_TWO_PHASE + { + $$ = makeDefElem("two_phase", + (Node *) makeBoolean(true), -1); + } + ; + +/* DROP_REPLICATION_SLOT slot */ +drop_replication_slot: + K_DROP_REPLICATION_SLOT IDENT + { + DropReplicationSlotCmd *cmd; + cmd = makeNode(DropReplicationSlotCmd); + cmd->slotname = $2; + cmd->wait = false; + $$ = (Node *) cmd; + } + | K_DROP_REPLICATION_SLOT IDENT K_WAIT + { + DropReplicationSlotCmd *cmd; + cmd = makeNode(DropReplicationSlotCmd); + cmd->slotname = $2; + cmd->wait = true; + $$ = (Node *) cmd; + } + ; + +/* + * START_REPLICATION [SLOT slot] [PHYSICAL] %X/%X [TIMELINE %d] + */ +start_replication: + K_START_REPLICATION opt_slot opt_physical RECPTR opt_timeline + { + StartReplicationCmd *cmd; + + cmd = makeNode(StartReplicationCmd); + cmd->kind = REPLICATION_KIND_PHYSICAL; + cmd->slotname = $2; + cmd->startpoint = $4; + cmd->timeline = $5; + $$ = (Node *) cmd; + } + ; + +/* START_REPLICATION SLOT slot LOGICAL %X/%X options */ +start_logical_replication: + K_START_REPLICATION K_SLOT IDENT K_LOGICAL RECPTR plugin_options + { + StartReplicationCmd *cmd; + cmd = makeNode(StartReplicationCmd); + cmd->kind = REPLICATION_KIND_LOGICAL; + cmd->slotname = $3; + cmd->startpoint = $5; + cmd->options = $6; + $$ = (Node *) cmd; + } + ; +/* + * TIMELINE_HISTORY %d + */ +timeline_history: + K_TIMELINE_HISTORY UCONST + { + TimeLineHistoryCmd *cmd; + + if ($2 <= 0) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid timeline %u", $2))); + + cmd = makeNode(TimeLineHistoryCmd); + cmd->timeline = $2; + + $$ = (Node *) cmd; + } + ; + +opt_physical: + K_PHYSICAL + | /* EMPTY */ + ; + +opt_temporary: + K_TEMPORARY { $$ = true; } + | /* EMPTY */ { $$ = false; } + ; + +opt_slot: + K_SLOT IDENT + { $$ = $2; } + | /* EMPTY */ + { $$ = NULL; } + ; + +opt_timeline: + K_TIMELINE UCONST + { + if ($2 <= 0) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid timeline %u", $2))); + $$ = $2; + } + | /* EMPTY */ { $$ = 0; } + ; + + +plugin_options: + '(' plugin_opt_list ')' { $$ = $2; } + | /* EMPTY */ { $$ = NIL; } + ; + +plugin_opt_list: + plugin_opt_elem + { + $$ = list_make1($1); + } + | plugin_opt_list ',' plugin_opt_elem + { + $$ = lappend($1, $3); + } + ; + +plugin_opt_elem: + IDENT plugin_opt_arg + { + $$ = makeDefElem($1, $2, -1); + } + ; + +plugin_opt_arg: + SCONST { $$ = (Node *) makeString($1); } + | /* EMPTY */ { $$ = NULL; } + ; + +generic_option_list: + generic_option_list ',' generic_option + { $$ = lappend($1, $3); } + | generic_option + { $$ = list_make1($1); } + ; + +generic_option: + ident_or_keyword + { + $$ = makeDefElem($1, NULL, -1); + } + | ident_or_keyword IDENT + { + $$ = makeDefElem($1, (Node *) makeString($2), -1); + } + | ident_or_keyword SCONST + { + $$ = makeDefElem($1, (Node *) makeString($2), -1); + } + | ident_or_keyword UCONST + { + $$ = makeDefElem($1, (Node *) makeInteger($2), -1); + } + ; + +ident_or_keyword: + IDENT { $$ = $1; } + | K_BASE_BACKUP { $$ = "base_backup"; } + | K_IDENTIFY_SYSTEM { $$ = "identify_system"; } + | K_SHOW { $$ = "show"; } + | K_START_REPLICATION { $$ = "start_replication"; } + | K_CREATE_REPLICATION_SLOT { $$ = "create_replication_slot"; } + | K_DROP_REPLICATION_SLOT { $$ = "drop_replication_slot"; } + | K_TIMELINE_HISTORY { $$ = "timeline_history"; } + | K_WAIT { $$ = "wait"; } + | K_TIMELINE { $$ = "timeline"; } + | K_PHYSICAL { $$ = "physical"; } + | K_LOGICAL { $$ = "logical"; } + | K_SLOT { $$ = "slot"; } + | K_RESERVE_WAL { $$ = "reserve_wal"; } + | K_TEMPORARY { $$ = "temporary"; } + | K_TWO_PHASE { $$ = "two_phase"; } + | K_EXPORT_SNAPSHOT { $$ = "export_snapshot"; } + | K_NOEXPORT_SNAPSHOT { $$ = "noexport_snapshot"; } + | K_USE_SNAPSHOT { $$ = "use_snapshot"; } + ; + +%% diff --git a/src/backend/replication/repl_scanner.c b/src/backend/replication/repl_scanner.c new file mode 100644 index 0000000..b1d0601 --- /dev/null +++ b/src/backend/replication/repl_scanner.c @@ -0,0 +1,2547 @@ +#line 2 "repl_scanner.c" +/*------------------------------------------------------------------------- + * + * repl_scanner.l + * a lexical scanner for the replication commands + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/replication/repl_scanner.l + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "nodes/parsenodes.h" +#include "utils/builtins.h" +#include "parser/scansup.h" + +/* + * NB: include repl_gram.h only AFTER including walsender_private.h, because + * walsender_private includes headers that define XLogRecPtr. + */ +#include "replication/walsender_private.h" +#include "repl_gram.h" + +#line 30 "repl_scanner.c" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define yy_create_buffer replication_yy_create_buffer +#define yy_delete_buffer replication_yy_delete_buffer +#define yy_scan_buffer replication_yy_scan_buffer +#define yy_scan_string replication_yy_scan_string +#define yy_scan_bytes replication_yy_scan_bytes +#define yy_init_buffer replication_yy_init_buffer +#define yy_flush_buffer replication_yy_flush_buffer +#define yy_load_buffer_state replication_yy_load_buffer_state +#define yy_switch_to_buffer replication_yy_switch_to_buffer +#define yypush_buffer_state replication_yypush_buffer_state +#define yypop_buffer_state replication_yypop_buffer_state +#define yyensure_buffer_stack replication_yyensure_buffer_stack +#define yy_flex_debug replication_yy_flex_debug +#define yyin replication_yyin +#define yyleng replication_yyleng +#define yylex replication_yylex +#define yylineno replication_yylineno +#define yyout replication_yyout +#define yyrestart replication_yyrestart +#define yytext replication_yytext +#define yywrap replication_yywrap +#define yyalloc replication_yyalloc +#define yyrealloc replication_yyrealloc +#define yyfree replication_yyfree + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 6 +#define YY_FLEX_SUBMINOR_VERSION 4 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +#ifdef yy_create_buffer +#define replication_yy_create_buffer_ALREADY_DEFINED +#else +#define yy_create_buffer replication_yy_create_buffer +#endif + +#ifdef yy_delete_buffer +#define replication_yy_delete_buffer_ALREADY_DEFINED +#else +#define yy_delete_buffer replication_yy_delete_buffer +#endif + +#ifdef yy_scan_buffer +#define replication_yy_scan_buffer_ALREADY_DEFINED +#else +#define yy_scan_buffer replication_yy_scan_buffer +#endif + +#ifdef yy_scan_string +#define replication_yy_scan_string_ALREADY_DEFINED +#else +#define yy_scan_string replication_yy_scan_string +#endif + +#ifdef yy_scan_bytes +#define replication_yy_scan_bytes_ALREADY_DEFINED +#else +#define yy_scan_bytes replication_yy_scan_bytes +#endif + +#ifdef yy_init_buffer +#define replication_yy_init_buffer_ALREADY_DEFINED +#else +#define yy_init_buffer replication_yy_init_buffer +#endif + +#ifdef yy_flush_buffer +#define replication_yy_flush_buffer_ALREADY_DEFINED +#else +#define yy_flush_buffer replication_yy_flush_buffer +#endif + +#ifdef yy_load_buffer_state +#define replication_yy_load_buffer_state_ALREADY_DEFINED +#else +#define yy_load_buffer_state replication_yy_load_buffer_state +#endif + +#ifdef yy_switch_to_buffer +#define replication_yy_switch_to_buffer_ALREADY_DEFINED +#else +#define yy_switch_to_buffer replication_yy_switch_to_buffer +#endif + +#ifdef yypush_buffer_state +#define replication_yypush_buffer_state_ALREADY_DEFINED +#else +#define yypush_buffer_state replication_yypush_buffer_state +#endif + +#ifdef yypop_buffer_state +#define replication_yypop_buffer_state_ALREADY_DEFINED +#else +#define yypop_buffer_state replication_yypop_buffer_state +#endif + +#ifdef yyensure_buffer_stack +#define replication_yyensure_buffer_stack_ALREADY_DEFINED +#else +#define yyensure_buffer_stack replication_yyensure_buffer_stack +#endif + +#ifdef yylex +#define replication_yylex_ALREADY_DEFINED +#else +#define yylex replication_yylex +#endif + +#ifdef yyrestart +#define replication_yyrestart_ALREADY_DEFINED +#else +#define yyrestart replication_yyrestart +#endif + +#ifdef yylex_init +#define replication_yylex_init_ALREADY_DEFINED +#else +#define yylex_init replication_yylex_init +#endif + +#ifdef yylex_init_extra +#define replication_yylex_init_extra_ALREADY_DEFINED +#else +#define yylex_init_extra replication_yylex_init_extra +#endif + +#ifdef yylex_destroy +#define replication_yylex_destroy_ALREADY_DEFINED +#else +#define yylex_destroy replication_yylex_destroy +#endif + +#ifdef yyget_debug +#define replication_yyget_debug_ALREADY_DEFINED +#else +#define yyget_debug replication_yyget_debug +#endif + +#ifdef yyset_debug +#define replication_yyset_debug_ALREADY_DEFINED +#else +#define yyset_debug replication_yyset_debug +#endif + +#ifdef yyget_extra +#define replication_yyget_extra_ALREADY_DEFINED +#else +#define yyget_extra replication_yyget_extra +#endif + +#ifdef yyset_extra +#define replication_yyset_extra_ALREADY_DEFINED +#else +#define yyset_extra replication_yyset_extra +#endif + +#ifdef yyget_in +#define replication_yyget_in_ALREADY_DEFINED +#else +#define yyget_in replication_yyget_in +#endif + +#ifdef yyset_in +#define replication_yyset_in_ALREADY_DEFINED +#else +#define yyset_in replication_yyset_in +#endif + +#ifdef yyget_out +#define replication_yyget_out_ALREADY_DEFINED +#else +#define yyget_out replication_yyget_out +#endif + +#ifdef yyset_out +#define replication_yyset_out_ALREADY_DEFINED +#else +#define yyset_out replication_yyset_out +#endif + +#ifdef yyget_leng +#define replication_yyget_leng_ALREADY_DEFINED +#else +#define yyget_leng replication_yyget_leng +#endif + +#ifdef yyget_text +#define replication_yyget_text_ALREADY_DEFINED +#else +#define yyget_text replication_yyget_text +#endif + +#ifdef yyget_lineno +#define replication_yyget_lineno_ALREADY_DEFINED +#else +#define yyget_lineno replication_yyget_lineno +#endif + +#ifdef yyset_lineno +#define replication_yyset_lineno_ALREADY_DEFINED +#else +#define yyset_lineno replication_yyset_lineno +#endif + +#ifdef yywrap +#define replication_yywrap_ALREADY_DEFINED +#else +#define yywrap replication_yywrap +#endif + +#ifdef yyalloc +#define replication_yyalloc_ALREADY_DEFINED +#else +#define yyalloc replication_yyalloc +#endif + +#ifdef yyrealloc +#define replication_yyrealloc_ALREADY_DEFINED +#else +#define yyrealloc replication_yyrealloc +#endif + +#ifdef yyfree +#define replication_yyfree_ALREADY_DEFINED +#else +#define yyfree replication_yyfree +#endif + +#ifdef yytext +#define replication_yytext_ALREADY_DEFINED +#else +#define yytext replication_yytext +#endif + +#ifdef yyleng +#define replication_yyleng_ALREADY_DEFINED +#else +#define yyleng replication_yyleng +#endif + +#ifdef yyin +#define replication_yyin_ALREADY_DEFINED +#else +#define yyin replication_yyin +#endif + +#ifdef yyout +#define replication_yyout_ALREADY_DEFINED +#else +#define yyout replication_yyout +#endif + +#ifdef yy_flex_debug +#define replication_yy_flex_debug_ALREADY_DEFINED +#else +#define yy_flex_debug replication_yy_flex_debug +#endif + +#ifdef yylineno +#define replication_yylineno_ALREADY_DEFINED +#else +#define yylineno replication_yylineno +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include <inttypes.h> +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#ifndef SIZE_MAX +#define SIZE_MAX (~(size_t)0) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +/* begin standard C++ headers. */ + +/* TODO: this is always defined, so inline it */ +#define yyconst const + +#if defined(__GNUC__) && __GNUC__ >= 3 +#define yynoreturn __attribute__((__noreturn__)) +#else +#define yynoreturn +#endif + +/* Returned upon end-of-file. */ +#define YY_NULL 0 + +/* Promotes a possibly negative, possibly signed char to an + * integer in range [0..255] for use as an array index. + */ +#define YY_SC_TO_UI(c) ((YY_CHAR) (c)) + +/* Enter a start condition. This macro really ought to take a parameter, + * but we do it the disgusting crufty way forced on us by the ()-less + * definition of BEGIN. + */ +#define BEGIN (yy_start) = 1 + 2 * +/* Translate the current start state into a value that can be later handed + * to BEGIN to return to the state. The YYSTATE alias is for lex + * compatibility. + */ +#define YY_START (((yy_start) - 1) / 2) +#define YYSTATE YY_START +/* Action number for EOF rule of a given start state. */ +#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) +/* Special action meaning "start processing a new file". */ +#define YY_NEW_FILE yyrestart( yyin ) +#define YY_END_OF_BUFFER_CHAR 0 + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +/* The state buf must be large enough to hold one state per character in the main buffer. + */ +#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type)) + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +extern int yyleng; + +extern FILE *yyin, *yyout; + +#define EOB_ACT_CONTINUE_SCAN 0 +#define EOB_ACT_END_OF_FILE 1 +#define EOB_ACT_LAST_MATCH 2 + + #define YY_LESS_LINENO(n) + #define YY_LINENO_REWIND_TO(ptr) + +/* Return all but the first "n" matched characters back to the input stream. */ +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + *yy_cp = (yy_hold_char); \ + YY_RESTORE_YY_MORE_OFFSET \ + (yy_c_buf_p) = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ + YY_DO_BEFORE_ACTION; /* set up yytext again */ \ + } \ + while ( 0 ) +#define unput(c) yyunput( c, (yytext_ptr) ) + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + int yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + +#define YY_BUFFER_NEW 0 +#define YY_BUFFER_NORMAL 1 + /* When an EOF's been seen but there's still some text to process + * then we mark the buffer as YY_EOF_PENDING, to indicate that we + * shouldn't try reading from the input source any more. We might + * still have a bunch of tokens to match, though, because of + * possible backing-up. + * + * When we actually see the EOF, we change the status to "new" + * (via yyrestart()), so that the user can continue scanning by + * just pointing yyin at a new input file. + */ +#define YY_BUFFER_EOF_PENDING 2 + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +/* Stack of input buffers. */ +static size_t yy_buffer_stack_top = 0; /**< index of top of stack. */ +static size_t yy_buffer_stack_max = 0; /**< capacity of stack. */ +static YY_BUFFER_STATE * yy_buffer_stack = NULL; /**< Stack as an array. */ + +/* We provide macros for accessing buffer states in case in the + * future we want to put the buffer states in a more general + * "scanner state". + * + * Returns the top of the stack, or NULL. + */ +#define YY_CURRENT_BUFFER ( (yy_buffer_stack) \ + ? (yy_buffer_stack)[(yy_buffer_stack_top)] \ + : NULL) +/* Same as previous macro, but useful when we know that the buffer stack is not + * NULL or when we need an lvalue. For internal use only. + */ +#define YY_CURRENT_BUFFER_LVALUE (yy_buffer_stack)[(yy_buffer_stack_top)] + +/* yy_hold_char holds the character lost when yytext is formed. */ +static char yy_hold_char; +static int yy_n_chars; /* number of characters read into yy_ch_buf */ +int yyleng; + +/* Points to current character in buffer. */ +static char *yy_c_buf_p = NULL; +static int yy_init = 0; /* whether we need to initialize */ +static int yy_start = 0; /* start state number */ + +/* Flag which is used to allow yywrap()'s to do buffer switches + * instead of setting up a fresh yyin. A bit of a hack ... + */ +static int yy_did_buffer_switch_on_eof; + +void yyrestart ( FILE *input_file ); +void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer ); +YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size ); +void yy_delete_buffer ( YY_BUFFER_STATE b ); +void yy_flush_buffer ( YY_BUFFER_STATE b ); +void yypush_buffer_state ( YY_BUFFER_STATE new_buffer ); +void yypop_buffer_state ( void ); + +static void yyensure_buffer_stack ( void ); +static void yy_load_buffer_state ( void ); +static void yy_init_buffer ( YY_BUFFER_STATE b, FILE *file ); +#define YY_FLUSH_BUFFER yy_flush_buffer( YY_CURRENT_BUFFER ) + +YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size ); +YY_BUFFER_STATE yy_scan_string ( const char *yy_str ); +YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len ); + +void *yyalloc ( yy_size_t ); +void *yyrealloc ( void *, yy_size_t ); +void yyfree ( void * ); + +#define yy_new_buffer yy_create_buffer +#define yy_set_interactive(is_interactive) \ + { \ + if ( ! YY_CURRENT_BUFFER ){ \ + yyensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer( yyin, YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ + } +#define yy_set_bol(at_bol) \ + { \ + if ( ! YY_CURRENT_BUFFER ){\ + yyensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer( yyin, YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ + } +#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) + +/* Begin user sect3 */ + +#define replication_yywrap() (/*CONSTCOND*/1) +#define YY_SKIP_YYWRAP +typedef flex_uint8_t YY_CHAR; + +FILE *yyin = NULL, *yyout = NULL; + +typedef int yy_state_type; + +extern int yylineno; +int yylineno = 1; + +extern char *yytext; +#ifdef yytext_ptr +#undef yytext_ptr +#endif +#define yytext_ptr yytext + +static yy_state_type yy_get_previous_state ( void ); +static yy_state_type yy_try_NUL_trans ( yy_state_type current_state ); +static int yy_get_next_buffer ( void ); +static void yynoreturn yy_fatal_error ( const char* msg ); + +/* Done after the current pattern has been matched and before the + * corresponding action - sets up yytext. + */ +#define YY_DO_BEFORE_ACTION \ + (yytext_ptr) = yy_bp; \ + yyleng = (int) (yy_cp - yy_bp); \ + (yy_hold_char) = *yy_cp; \ + *yy_cp = '\0'; \ + (yy_c_buf_p) = yy_cp; +#define YY_NUM_RULES 32 +#define YY_END_OF_BUFFER 33 +/* This struct is not used in this scanner, + but its presence is necessary. */ +struct yy_trans_info + { + flex_int32_t yy_verify; + flex_int32_t yy_nxt; + }; +static const flex_int16_t yy_accept[249] = + { 0, + 0, 0, 0, 0, 0, 0, 33, 31, 20, 20, + 27, 23, 21, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, + 26, 24, 20, 0, 21, 0, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 29, 26, 25, 22, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 4, 13, 30, + 30, 30, 30, 30, 19, 30, 30, 30, 30, 30, + + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 12, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 10, 30, 30, 30, + 30, 5, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 14, 30, 15, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 1, 30, + 30, 30, 30, 30, 30, 11, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 18, 30, 30, + + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 16, 2, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 9, 30, 30, + 17, 30, 6, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 8, 3, 30, 7, 0 + } ; + +static const YY_CHAR yy_ec[256] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 4, 1, 5, 1, 1, 6, 1, + 1, 1, 1, 1, 1, 1, 7, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 1, 1, 1, + 1, 1, 1, 1, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 18, 25, 26, 27, 28, 29, 30, 31, 32, 18, + 1, 1, 1, 1, 33, 1, 34, 34, 34, 34, + + 34, 34, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 1, 1, 1, 1, 1, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18 + } ; + +static const YY_CHAR yy_meta[35] = + { 0, + 1, 1, 1, 2, 3, 4, 5, 6, 6, 6, + 6, 6, 6, 6, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 6 + } ; + +static const flex_int16_t yy_base[256] = + { 0, + 0, 0, 517, 516, 513, 512, 517, 522, 33, 35, + 522, 522, 32, 34, 62, 40, 41, 46, 509, 44, + 55, 56, 45, 46, 73, 74, 57, 51, 0, 522, + 0, 509, 82, 0, 87, 507, 506, 501, 73, 79, + 83, 81, 90, 94, 95, 91, 103, 104, 107, 106, + 100, 110, 109, 111, 50, 0, 0, 522, 0, 112, + 113, 119, 121, 126, 128, 127, 129, 130, 133, 131, + 132, 140, 142, 143, 134, 144, 145, 146, 147, 150, + 153, 155, 157, 156, 164, 162, 163, 495, 491, 166, + 168, 169, 177, 178, 474, 180, 179, 187, 189, 190, + + 191, 192, 195, 196, 198, 201, 203, 202, 204, 207, + 215, 206, 210, 211, 218, 223, 224, 226, 229, 233, + 230, 238, 219, 241, 244, 245, 247, 250, 251, 231, + 473, 252, 253, 254, 255, 257, 258, 262, 259, 260, + 261, 269, 274, 264, 280, 282, 472, 283, 284, 285, + 286, 288, 289, 290, 291, 293, 294, 297, 298, 300, + 303, 316, 292, 471, 315, 469, 320, 321, 322, 323, + 325, 301, 328, 330, 331, 336, 337, 332, 468, 339, + 350, 340, 341, 351, 354, 467, 357, 345, 355, 358, + 363, 365, 366, 367, 368, 369, 370, 466, 376, 372, + + 373, 379, 377, 381, 380, 387, 392, 393, 394, 397, + 395, 399, 398, 401, 406, 402, 465, 464, 405, 407, + 410, 413, 418, 420, 423, 424, 425, 463, 427, 428, + 462, 429, 460, 430, 431, 432, 433, 435, 437, 436, + 439, 441, 454, 458, 457, 455, 333, 522, 482, 488, + 490, 494, 500, 506, 48 + } ; + +static const flex_int16_t yy_def[256] = + { 0, + 248, 1, 249, 249, 250, 250, 248, 248, 248, 248, + 248, 248, 251, 252, 252, 15, 15, 15, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 253, 248, + 254, 248, 248, 255, 251, 251, 252, 15, 15, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 253, 254, 248, 255, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 0, 248, 248, + 248, 248, 248, 248, 248 + } ; + +static const flex_int16_t yy_nxt[557] = + { 0, + 8, 9, 10, 11, 8, 12, 8, 13, 14, 15, + 16, 17, 18, 14, 19, 19, 20, 19, 19, 21, + 19, 22, 19, 23, 24, 25, 26, 27, 19, 28, + 19, 19, 19, 14, 33, 33, 33, 33, 34, 35, + 34, 38, 38, 38, 38, 38, 38, 38, 38, 38, + 248, 248, 248, 59, 38, 43, 248, 248, 47, 55, + 46, 248, 248, 248, 40, 41, 77, 38, 34, 38, + 39, 38, 38, 38, 38, 38, 42, 44, 45, 248, + 248, 38, 54, 33, 33, 248, 51, 248, 48, 248, + 52, 61, 49, 34, 35, 38, 248, 248, 60, 50, + + 248, 248, 64, 53, 63, 62, 248, 66, 65, 248, + 248, 68, 248, 248, 72, 248, 248, 248, 248, 248, + 73, 79, 67, 76, 78, 248, 70, 248, 69, 71, + 74, 75, 248, 248, 248, 248, 248, 248, 248, 248, + 248, 86, 80, 81, 83, 87, 248, 82, 248, 248, + 248, 248, 248, 248, 85, 92, 248, 84, 89, 248, + 88, 248, 248, 248, 90, 91, 93, 101, 248, 248, + 248, 95, 248, 97, 248, 248, 94, 99, 96, 102, + 103, 100, 98, 248, 248, 248, 248, 105, 108, 111, + 107, 112, 106, 248, 104, 248, 248, 248, 248, 116, + + 109, 248, 248, 110, 248, 118, 115, 248, 248, 248, + 248, 113, 248, 248, 117, 114, 248, 248, 123, 124, + 119, 248, 128, 126, 248, 248, 120, 122, 125, 248, + 248, 130, 248, 121, 133, 248, 248, 248, 127, 248, + 138, 134, 131, 129, 248, 135, 137, 248, 132, 139, + 248, 248, 140, 248, 136, 141, 248, 248, 248, 248, + 248, 248, 145, 248, 248, 248, 248, 248, 248, 150, + 248, 142, 147, 143, 152, 248, 144, 148, 146, 155, + 248, 156, 151, 154, 153, 158, 248, 149, 248, 248, + 248, 248, 248, 157, 248, 248, 248, 248, 248, 248, + + 248, 166, 161, 248, 248, 171, 248, 248, 163, 248, + 170, 176, 159, 162, 160, 167, 169, 164, 168, 174, + 165, 248, 248, 172, 175, 173, 248, 248, 248, 248, + 177, 248, 183, 181, 248, 178, 248, 248, 248, 248, + 185, 180, 248, 248, 179, 248, 248, 248, 182, 184, + 186, 248, 187, 188, 189, 190, 248, 248, 191, 194, + 248, 248, 195, 248, 248, 192, 193, 196, 199, 248, + 197, 248, 248, 248, 248, 248, 248, 205, 248, 248, + 201, 198, 248, 248, 207, 248, 248, 248, 208, 200, + 203, 210, 202, 248, 204, 209, 206, 212, 248, 248, + + 248, 248, 211, 248, 248, 248, 213, 248, 248, 214, + 219, 248, 248, 248, 221, 216, 248, 218, 215, 248, + 217, 220, 223, 224, 248, 222, 248, 225, 226, 248, + 248, 248, 227, 248, 248, 248, 248, 248, 248, 248, + 229, 248, 248, 248, 228, 248, 233, 248, 234, 231, + 238, 239, 230, 235, 236, 243, 232, 241, 240, 242, + 248, 248, 237, 248, 248, 244, 248, 245, 248, 248, + 248, 248, 248, 248, 248, 248, 246, 248, 248, 248, + 248, 247, 29, 29, 29, 29, 29, 29, 31, 31, + 31, 31, 31, 31, 36, 36, 37, 248, 37, 37, + + 56, 248, 56, 56, 56, 56, 57, 57, 57, 38, + 57, 57, 248, 34, 58, 248, 248, 32, 32, 30, + 30, 7, 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248 + } ; + +static const flex_int16_t yy_chk[557] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 9, 9, 10, 10, 13, 13, + 14, 14, 14, 14, 14, 14, 14, 14, 16, 17, + 20, 23, 24, 255, 18, 20, 55, 28, 24, 28, + 23, 21, 22, 27, 16, 17, 55, 14, 15, 15, + 15, 15, 15, 15, 15, 15, 18, 21, 22, 25, + 26, 39, 27, 33, 33, 40, 26, 42, 25, 41, + 26, 40, 25, 35, 35, 15, 43, 46, 39, 25, + + 44, 45, 43, 26, 42, 41, 51, 45, 44, 47, + 48, 47, 50, 49, 50, 53, 52, 54, 60, 61, + 51, 61, 46, 54, 60, 62, 48, 63, 47, 49, + 52, 53, 64, 66, 65, 67, 68, 70, 71, 69, + 75, 68, 62, 63, 65, 69, 72, 64, 73, 74, + 76, 77, 78, 79, 67, 74, 80, 66, 71, 81, + 70, 82, 84, 83, 72, 73, 75, 83, 86, 87, + 85, 77, 90, 79, 91, 92, 76, 81, 78, 84, + 85, 82, 80, 93, 94, 97, 96, 87, 92, 96, + 91, 97, 90, 98, 86, 99, 100, 101, 102, 101, + + 93, 103, 104, 94, 105, 103, 100, 106, 108, 107, + 109, 98, 112, 110, 102, 99, 113, 114, 108, 109, + 104, 111, 113, 111, 115, 123, 105, 107, 110, 116, + 117, 115, 118, 106, 118, 119, 121, 130, 112, 120, + 123, 119, 116, 114, 122, 120, 122, 124, 117, 124, + 125, 126, 125, 127, 121, 126, 128, 129, 132, 133, + 134, 135, 130, 136, 137, 139, 140, 141, 138, 136, + 144, 127, 133, 128, 138, 142, 129, 134, 132, 141, + 143, 142, 137, 140, 139, 144, 145, 135, 146, 148, + 149, 150, 151, 143, 152, 153, 154, 155, 163, 156, + + 157, 153, 148, 158, 159, 158, 160, 172, 150, 161, + 157, 163, 145, 149, 146, 154, 156, 151, 155, 161, + 152, 165, 162, 159, 162, 160, 167, 168, 169, 170, + 165, 171, 172, 170, 173, 167, 174, 175, 178, 247, + 174, 169, 176, 177, 168, 180, 182, 183, 171, 173, + 175, 188, 176, 177, 178, 180, 181, 184, 181, 184, + 185, 189, 185, 187, 190, 182, 183, 187, 190, 191, + 188, 192, 193, 194, 195, 196, 197, 196, 200, 201, + 192, 189, 199, 203, 199, 202, 205, 204, 200, 191, + 194, 202, 193, 206, 195, 201, 197, 204, 207, 208, + + 209, 211, 203, 210, 213, 212, 205, 214, 216, 206, + 211, 219, 215, 220, 213, 208, 221, 210, 207, 222, + 209, 212, 215, 216, 223, 214, 224, 219, 220, 225, + 226, 227, 221, 229, 230, 232, 234, 235, 236, 237, + 223, 238, 240, 239, 222, 241, 227, 242, 229, 225, + 235, 236, 224, 230, 232, 240, 226, 238, 237, 239, + 243, 246, 234, 245, 244, 241, 233, 242, 231, 228, + 218, 217, 198, 186, 179, 166, 243, 164, 147, 131, + 95, 246, 249, 249, 249, 249, 249, 249, 250, 250, + 250, 250, 250, 250, 251, 251, 252, 89, 252, 252, + + 253, 88, 253, 253, 253, 253, 254, 254, 254, 38, + 254, 254, 37, 36, 32, 19, 7, 6, 5, 4, + 3, 248, 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248 + } ; + +static yy_state_type yy_last_accepting_state; +static char *yy_last_accepting_cpos; + +extern int yy_flex_debug; +int yy_flex_debug = 0; + +/* The intent behind this definition is that it'll catch + * any uses of REJECT which flex missed. + */ +#define REJECT reject_used_but_not_detected +#define yymore() yymore_used_but_not_detected +#define YY_MORE_ADJ 0 +#define YY_RESTORE_YY_MORE_OFFSET +char *yytext; +#line 1 "repl_scanner.l" + +#line 31 "repl_scanner.l" +/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */ +#undef fprintf +#define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg) + +static void +fprintf_to_ereport(const char *fmt, const char *msg) +{ + ereport(ERROR, (errmsg_internal("%s", msg))); +} + +/* Handle to the buffer that the lexer uses internally */ +static YY_BUFFER_STATE scanbufhandle; + +/* Pushed-back token (we only handle one) */ +static int repl_pushed_back_token; + +/* Work area for collecting literals */ +static StringInfoData litbuf; + +static void startlit(void); +static char *litbufdup(void); +static void addlit(char *ytext, int yleng); +static void addlitchar(unsigned char ychar); + +/* LCOV_EXCL_START */ + +#line 961 "repl_scanner.c" +#define YY_NO_INPUT 1 +/* + * Exclusive states: + * <xd> delimited identifiers (double-quoted identifiers) + * <xq> standard single-quoted strings + */ + +/* Extended quote + * xqdouble implements embedded quote, '''' + */ +/* Double quote + * Allows embedded spaces and other special characters into identifiers. + */ +#line 975 "repl_scanner.c" + +#define INITIAL 0 +#define xd 1 +#define xq 2 + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include <unistd.h> +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +static int yy_init_globals ( void ); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. */ + +int yylex_destroy ( void ); + +int yyget_debug ( void ); + +void yyset_debug ( int debug_flag ); + +YY_EXTRA_TYPE yyget_extra ( void ); + +void yyset_extra ( YY_EXTRA_TYPE user_defined ); + +FILE *yyget_in ( void ); + +void yyset_in ( FILE * _in_str ); + +FILE *yyget_out ( void ); + +void yyset_out ( FILE * _out_str ); + + int yyget_leng ( void ); + +char *yyget_text ( void ); + +int yyget_lineno ( void ); + +void yyset_lineno ( int _line_number ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. + */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int yywrap ( void ); +#else +extern int yywrap ( void ); +#endif +#endif + +#ifndef YY_NO_UNPUT + +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy ( char *, const char *, int ); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen ( const char * ); +#endif + +#ifndef YY_NO_INPUT +#ifdef __cplusplus +static int yyinput ( void ); +#else +static int input ( void ); +#endif + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else +#define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ +#endif + +/* Copy whatever the last rule matched to the standard output. */ +#ifndef ECHO +/* This used to be an fputs(), but since the string might contain NUL's, + * we now use fwrite(). + */ +#define ECHO do { if (fwrite( yytext, (size_t) yyleng, 1, yyout )) {} } while (0) +#endif + +/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL, + * is returned in "result". + */ +#ifndef YY_INPUT +#define YY_INPUT(buf,result,max_size) \ + if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \ + { \ + int c = '*'; \ + int n; \ + for ( n = 0; n < max_size && \ + (c = getc( yyin )) != EOF && c != '\n'; ++n ) \ + buf[n] = (char) c; \ + if ( c == '\n' ) \ + buf[n++] = (char) c; \ + if ( c == EOF && ferror( yyin ) ) \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + result = n; \ + } \ + else \ + { \ + errno=0; \ + while ( (result = (int) fread(buf, 1, (yy_size_t) max_size, yyin)) == 0 && ferror(yyin)) \ + { \ + if( errno != EINTR) \ + { \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + break; \ + } \ + errno=0; \ + clearerr(yyin); \ + } \ + }\ +\ + +#endif + +/* No semi-colon after return; correct usage is to write "yyterminate();" - + * we don't want an extra ';' after the "return" because that will cause + * some compilers to complain about unreachable statements. + */ +#ifndef yyterminate +#define yyterminate() return YY_NULL +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Report a fatal error. */ +#ifndef YY_FATAL_ERROR +#define YY_FATAL_ERROR(msg) yy_fatal_error( msg ) +#endif + +/* end tables serialization structures and prototypes */ + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. + */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int yylex (void); + +#define YY_DECL int yylex (void) +#endif /* !YY_DECL */ + +/* Code executed at the beginning of each rule, after yytext and yyleng + * have been set up. + */ +#ifndef YY_USER_ACTION +#define YY_USER_ACTION +#endif + +/* Code executed at the end of each rule. */ +#ifndef YY_BREAK +#define YY_BREAK /*LINTED*/break; +#endif + +#define YY_RULE_SETUP \ + YY_USER_ACTION + +/** The main scanner function which does all the work. + */ +YY_DECL +{ + yy_state_type yy_current_state; + char *yy_cp, *yy_bp; + int yy_act; + + if ( !(yy_init) ) + { + (yy_init) = 1; + +#ifdef YY_USER_INIT + YY_USER_INIT; +#endif + + if ( ! (yy_start) ) + (yy_start) = 1; /* first start state */ + + if ( ! yyin ) + yyin = stdin; + + if ( ! yyout ) + yyout = stdout; + + if ( ! YY_CURRENT_BUFFER ) { + yyensure_buffer_stack (); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer( yyin, YY_BUF_SIZE ); + } + + yy_load_buffer_state( ); + } + + { +#line 105 "repl_scanner.l" + + + +#line 109 "repl_scanner.l" + /* This code is inserted at the start of replication_yylex() */ + + /* If we have a pushed-back token, return that. */ + if (repl_pushed_back_token) + { + int result = repl_pushed_back_token; + + repl_pushed_back_token = 0; + return result; + } + + +#line 1209 "repl_scanner.c" + + while ( /*CONSTCOND*/1 ) /* loops until end-of-file is reached */ + { + yy_cp = (yy_c_buf_p); + + /* Support of yytext. */ + *yy_cp = (yy_hold_char); + + /* yy_bp points to the position in yy_ch_buf of the start of + * the current run. + */ + yy_bp = yy_cp; + + yy_current_state = (yy_start); +yy_match: + do + { + YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)] ; + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 249 ) + yy_c = yy_meta[yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; + ++yy_cp; + } + while ( yy_current_state != 248 ); + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + +yy_find_action: + yy_act = yy_accept[yy_current_state]; + + YY_DO_BEFORE_ACTION; + +do_action: /* This label is used only to access EOF actions. */ + + switch ( yy_act ) + { /* beginning of action switch */ + case 0: /* must back up */ + /* undo the effects of YY_DO_BEFORE_ACTION */ + *yy_cp = (yy_hold_char); + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + goto yy_find_action; + +case 1: +YY_RULE_SETUP +#line 121 "repl_scanner.l" +{ return K_BASE_BACKUP; } + YY_BREAK +case 2: +YY_RULE_SETUP +#line 122 "repl_scanner.l" +{ return K_IDENTIFY_SYSTEM; } + YY_BREAK +case 3: +YY_RULE_SETUP +#line 123 "repl_scanner.l" +{ return K_READ_REPLICATION_SLOT; } + YY_BREAK +case 4: +YY_RULE_SETUP +#line 124 "repl_scanner.l" +{ return K_SHOW; } + YY_BREAK +case 5: +YY_RULE_SETUP +#line 125 "repl_scanner.l" +{ return K_TIMELINE; } + YY_BREAK +case 6: +YY_RULE_SETUP +#line 126 "repl_scanner.l" +{ return K_START_REPLICATION; } + YY_BREAK +case 7: +YY_RULE_SETUP +#line 127 "repl_scanner.l" +{ return K_CREATE_REPLICATION_SLOT; } + YY_BREAK +case 8: +YY_RULE_SETUP +#line 128 "repl_scanner.l" +{ return K_DROP_REPLICATION_SLOT; } + YY_BREAK +case 9: +YY_RULE_SETUP +#line 129 "repl_scanner.l" +{ return K_TIMELINE_HISTORY; } + YY_BREAK +case 10: +YY_RULE_SETUP +#line 130 "repl_scanner.l" +{ return K_PHYSICAL; } + YY_BREAK +case 11: +YY_RULE_SETUP +#line 131 "repl_scanner.l" +{ return K_RESERVE_WAL; } + YY_BREAK +case 12: +YY_RULE_SETUP +#line 132 "repl_scanner.l" +{ return K_LOGICAL; } + YY_BREAK +case 13: +YY_RULE_SETUP +#line 133 "repl_scanner.l" +{ return K_SLOT; } + YY_BREAK +case 14: +YY_RULE_SETUP +#line 134 "repl_scanner.l" +{ return K_TEMPORARY; } + YY_BREAK +case 15: +YY_RULE_SETUP +#line 135 "repl_scanner.l" +{ return K_TWO_PHASE; } + YY_BREAK +case 16: +YY_RULE_SETUP +#line 136 "repl_scanner.l" +{ return K_EXPORT_SNAPSHOT; } + YY_BREAK +case 17: +YY_RULE_SETUP +#line 137 "repl_scanner.l" +{ return K_NOEXPORT_SNAPSHOT; } + YY_BREAK +case 18: +YY_RULE_SETUP +#line 138 "repl_scanner.l" +{ return K_USE_SNAPSHOT; } + YY_BREAK +case 19: +YY_RULE_SETUP +#line 139 "repl_scanner.l" +{ return K_WAIT; } + YY_BREAK +case 20: +/* rule 20 can match eol */ +YY_RULE_SETUP +#line 141 "repl_scanner.l" +{ /* do nothing */ } + YY_BREAK +case 21: +YY_RULE_SETUP +#line 143 "repl_scanner.l" +{ + replication_yylval.uintval = strtoul(yytext, NULL, 10); + return UCONST; + } + YY_BREAK +case 22: +YY_RULE_SETUP +#line 148 "repl_scanner.l" +{ + uint32 hi, + lo; + if (sscanf(yytext, "%X/%X", &hi, &lo) != 2) + replication_yyerror("invalid streaming start location"); + replication_yylval.recptr = ((uint64) hi) << 32 | lo; + return RECPTR; + } + YY_BREAK +case 23: +YY_RULE_SETUP +#line 157 "repl_scanner.l" +{ + BEGIN(xq); + startlit(); + } + YY_BREAK +case 24: +YY_RULE_SETUP +#line 162 "repl_scanner.l" +{ + yyless(1); + BEGIN(INITIAL); + replication_yylval.str = litbufdup(); + return SCONST; + } + YY_BREAK +case 25: +YY_RULE_SETUP +#line 169 "repl_scanner.l" +{ + addlitchar('\''); + } + YY_BREAK +case 26: +/* rule 26 can match eol */ +YY_RULE_SETUP +#line 173 "repl_scanner.l" +{ + addlit(yytext, yyleng); + } + YY_BREAK +case 27: +YY_RULE_SETUP +#line 177 "repl_scanner.l" +{ + BEGIN(xd); + startlit(); + } + YY_BREAK +case 28: +YY_RULE_SETUP +#line 182 "repl_scanner.l" +{ + int len; + + yyless(1); + BEGIN(INITIAL); + replication_yylval.str = litbufdup(); + len = strlen(replication_yylval.str); + truncate_identifier(replication_yylval.str, len, true); + return IDENT; + } + YY_BREAK +case 29: +/* rule 29 can match eol */ +YY_RULE_SETUP +#line 193 "repl_scanner.l" +{ + addlit(yytext, yyleng); + } + YY_BREAK +case 30: +YY_RULE_SETUP +#line 197 "repl_scanner.l" +{ + int len = strlen(yytext); + + replication_yylval.str = downcase_truncate_identifier(yytext, len, true); + return IDENT; + } + YY_BREAK +case 31: +YY_RULE_SETUP +#line 204 "repl_scanner.l" +{ + /* Any char not recognized above is returned as itself */ + return yytext[0]; + } + YY_BREAK +case YY_STATE_EOF(xq): +case YY_STATE_EOF(xd): +#line 209 "repl_scanner.l" +{ replication_yyerror("unterminated quoted string"); } + YY_BREAK +case YY_STATE_EOF(INITIAL): +#line 212 "repl_scanner.l" +{ + yyterminate(); + } + YY_BREAK +case 32: +YY_RULE_SETUP +#line 216 "repl_scanner.l" +YY_FATAL_ERROR( "flex scanner jammed" ); + YY_BREAK +#line 1480 "repl_scanner.c" + + case YY_END_OF_BUFFER: + { + /* Amount of text matched not including the EOB char. */ + int yy_amount_of_matched_text = (int) (yy_cp - (yytext_ptr)) - 1; + + /* Undo the effects of YY_DO_BEFORE_ACTION. */ + *yy_cp = (yy_hold_char); + YY_RESTORE_YY_MORE_OFFSET + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) + { + /* We're scanning a new file or input source. It's + * possible that this happened because the user + * just pointed yyin at a new source and called + * yylex(). If so, then we have to assure + * consistency between YY_CURRENT_BUFFER and our + * globals. Here is the right place to do so, because + * this is the first action (other than possibly a + * back-up) that will match for the new input source. + */ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; + } + + /* Note that here we test for yy_c_buf_p "<=" to the position + * of the first EOB in the buffer, since yy_c_buf_p will + * already have been incremented past the NUL character + * (since all states make transitions on EOB to the + * end-of-buffer state). Contrast this with the test + * in input(). + */ + if ( (yy_c_buf_p) <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + { /* This was really a NUL. */ + yy_state_type yy_next_state; + + (yy_c_buf_p) = (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + /* Okay, we're now positioned to make the NUL + * transition. We couldn't have + * yy_get_previous_state() go ahead and do it + * for us because it doesn't know how to deal + * with the possibility of jamming (and we don't + * want to build jamming into it because then it + * will run more slowly). + */ + + yy_next_state = yy_try_NUL_trans( yy_current_state ); + + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + + if ( yy_next_state ) + { + /* Consume the NUL. */ + yy_cp = ++(yy_c_buf_p); + yy_current_state = yy_next_state; + goto yy_match; + } + + else + { + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + goto yy_find_action; + } + } + + else switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_END_OF_FILE: + { + (yy_did_buffer_switch_on_eof) = 0; + + if ( yywrap( ) ) + { + /* Note: because we've taken care in + * yy_get_next_buffer() to have set up + * yytext, we can now set up + * yy_c_buf_p so that if some total + * hoser (like flex itself) wants to + * call the scanner after we return the + * YY_NULL, it'll still work - another + * YY_NULL will get returned. + */ + (yy_c_buf_p) = (yytext_ptr) + YY_MORE_ADJ; + + yy_act = YY_STATE_EOF(YY_START); + goto do_action; + } + + else + { + if ( ! (yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; + } + break; + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = + (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_match; + + case EOB_ACT_LAST_MATCH: + (yy_c_buf_p) = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)]; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_find_action; + } + break; + } + + default: + YY_FATAL_ERROR( + "fatal flex scanner internal error--no action found" ); + } /* end of action switch */ + } /* end of scanning one token */ + } /* end of user's declarations */ +} /* end of yylex */ + +/* yy_get_next_buffer - try to read in a new buffer + * + * Returns a code representing an action: + * EOB_ACT_LAST_MATCH - + * EOB_ACT_CONTINUE_SCAN - continue scanning from current position + * EOB_ACT_END_OF_FILE - end of file + */ +static int yy_get_next_buffer (void) +{ + char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; + char *source = (yytext_ptr); + int number_to_move, i; + int ret_val; + + if ( (yy_c_buf_p) > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] ) + YY_FATAL_ERROR( + "fatal flex scanner internal error--end of buffer missed" ); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) + { /* Don't try to fill the buffer, so this is an EOF. */ + if ( (yy_c_buf_p) - (yytext_ptr) - YY_MORE_ADJ == 1 ) + { + /* We matched a single character, the EOB, so + * treat this as a final EOF. + */ + return EOB_ACT_END_OF_FILE; + } + + else + { + /* We matched some text prior to the EOB, first + * process it. + */ + return EOB_ACT_LAST_MATCH; + } + } + + /* Try to read more data. */ + + /* First move last chars to start of buffer. */ + number_to_move = (int) ((yy_c_buf_p) - (yytext_ptr) - 1); + + for ( i = 0; i < number_to_move; ++i ) + *(dest++) = *(source++); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) + /* don't do the read, it's not guaranteed to return an EOF, + * just force an EOF + */ + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars) = 0; + + else + { + int num_to_read = + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1; + + while ( num_to_read <= 0 ) + { /* Not enough room in the buffer - grow it. */ + + /* just a shorter name for the current buffer */ + YY_BUFFER_STATE b = YY_CURRENT_BUFFER_LVALUE; + + int yy_c_buf_p_offset = + (int) ((yy_c_buf_p) - b->yy_ch_buf); + + if ( b->yy_is_our_buffer ) + { + int new_size = b->yy_buf_size * 2; + + if ( new_size <= 0 ) + b->yy_buf_size += b->yy_buf_size / 8; + else + b->yy_buf_size *= 2; + + b->yy_ch_buf = (char *) + /* Include room in for 2 EOB chars. */ + yyrealloc( (void *) b->yy_ch_buf, + (yy_size_t) (b->yy_buf_size + 2) ); + } + else + /* Can't grow it, we don't own it. */ + b->yy_ch_buf = NULL; + + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( + "fatal error - scanner input buffer overflow" ); + + (yy_c_buf_p) = &b->yy_ch_buf[yy_c_buf_p_offset]; + + num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size - + number_to_move - 1; + + } + + if ( num_to_read > YY_READ_BUF_SIZE ) + num_to_read = YY_READ_BUF_SIZE; + + /* Read in more data. */ + YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), + (yy_n_chars), num_to_read ); + + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + if ( (yy_n_chars) == 0 ) + { + if ( number_to_move == YY_MORE_ADJ ) + { + ret_val = EOB_ACT_END_OF_FILE; + yyrestart( yyin ); + } + + else + { + ret_val = EOB_ACT_LAST_MATCH; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = + YY_BUFFER_EOF_PENDING; + } + } + + else + ret_val = EOB_ACT_CONTINUE_SCAN; + + if (((yy_n_chars) + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) { + /* Extend the array by 50%, plus the number we really need. */ + int new_size = (yy_n_chars) + number_to_move + ((yy_n_chars) >> 1); + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) yyrealloc( + (void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf, (yy_size_t) new_size ); + if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); + /* "- 2" to take care of EOB's */ + YY_CURRENT_BUFFER_LVALUE->yy_buf_size = (int) (new_size - 2); + } + + (yy_n_chars) += number_to_move; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] = YY_END_OF_BUFFER_CHAR; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] = YY_END_OF_BUFFER_CHAR; + + (yytext_ptr) = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; + + return ret_val; +} + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + + static yy_state_type yy_get_previous_state (void) +{ + yy_state_type yy_current_state; + char *yy_cp; + + yy_current_state = (yy_start); + + for ( yy_cp = (yytext_ptr) + YY_MORE_ADJ; yy_cp < (yy_c_buf_p); ++yy_cp ) + { + YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 249 ) + yy_c = yy_meta[yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; + } + + return yy_current_state; +} + +/* yy_try_NUL_trans - try to make a transition on the NUL character + * + * synopsis + * next_state = yy_try_NUL_trans( current_state ); + */ + static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state ) +{ + int yy_is_jam; + char *yy_cp = (yy_c_buf_p); + + YY_CHAR yy_c = 1; + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 249 ) + yy_c = yy_meta[yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; + yy_is_jam = (yy_current_state == 248); + + return yy_is_jam ? 0 : yy_current_state; +} + +#ifndef YY_NO_UNPUT + +#endif + +#ifndef YY_NO_INPUT +#ifdef __cplusplus + static int yyinput (void) +#else + static int input (void) +#endif + +{ + int c; + + *(yy_c_buf_p) = (yy_hold_char); + + if ( *(yy_c_buf_p) == YY_END_OF_BUFFER_CHAR ) + { + /* yy_c_buf_p now points to the character we want to return. + * If this occurs *before* the EOB characters, then it's a + * valid NUL; if not, then we've hit the end of the buffer. + */ + if ( (yy_c_buf_p) < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + /* This was really a NUL. */ + *(yy_c_buf_p) = '\0'; + + else + { /* need more input */ + int offset = (int) ((yy_c_buf_p) - (yytext_ptr)); + ++(yy_c_buf_p); + + switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_LAST_MATCH: + /* This happens because yy_g_n_b() + * sees that we've accumulated a + * token and flags that we need to + * try matching the token before + * proceeding. But for input(), + * there's no matching to consider. + * So convert the EOB_ACT_LAST_MATCH + * to EOB_ACT_END_OF_FILE. + */ + + /* Reset buffer status. */ + yyrestart( yyin ); + + /*FALLTHROUGH*/ + + case EOB_ACT_END_OF_FILE: + { + if ( yywrap( ) ) + return 0; + + if ( ! (yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; +#ifdef __cplusplus + return yyinput(); +#else + return input(); +#endif + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = (yytext_ptr) + offset; + break; + } + } + } + + c = *(unsigned char *) (yy_c_buf_p); /* cast for 8-bit char's */ + *(yy_c_buf_p) = '\0'; /* preserve yytext */ + (yy_hold_char) = *++(yy_c_buf_p); + + return c; +} +#endif /* ifndef YY_NO_INPUT */ + +/** Immediately switch to a different input stream. + * @param input_file A readable stream. + * + * @note This function does not reset the start condition to @c INITIAL . + */ + void yyrestart (FILE * input_file ) +{ + + if ( ! YY_CURRENT_BUFFER ){ + yyensure_buffer_stack (); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer( yyin, YY_BUF_SIZE ); + } + + yy_init_buffer( YY_CURRENT_BUFFER, input_file ); + yy_load_buffer_state( ); +} + +/** Switch to a different input buffer. + * @param new_buffer The new input buffer. + * + */ + void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ) +{ + + /* TODO. We should be able to replace this entire function body + * with + * yypop_buffer_state(); + * yypush_buffer_state(new_buffer); + */ + yyensure_buffer_stack (); + if ( YY_CURRENT_BUFFER == new_buffer ) + return; + + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + YY_CURRENT_BUFFER_LVALUE = new_buffer; + yy_load_buffer_state( ); + + /* We don't actually know whether we did this switch during + * EOF (yywrap()) processing, but the only time this flag + * is looked at is after yywrap() is called, so it's safe + * to go ahead and always set it. + */ + (yy_did_buffer_switch_on_eof) = 1; +} + +static void yy_load_buffer_state (void) +{ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + (yytext_ptr) = (yy_c_buf_p) = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; + yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; + (yy_hold_char) = *(yy_c_buf_p); +} + +/** Allocate and initialize an input buffer state. + * @param file A readable stream. + * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE. + * + * @return the allocated buffer state. + */ + YY_BUFFER_STATE yy_create_buffer (FILE * file, int size ) +{ + YY_BUFFER_STATE b; + + b = (YY_BUFFER_STATE) yyalloc( sizeof( struct yy_buffer_state ) ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_buf_size = size; + + /* yy_ch_buf has to be 2 characters longer than the size given because + * we need to put in 2 end-of-buffer characters. + */ + b->yy_ch_buf = (char *) yyalloc( (yy_size_t) (b->yy_buf_size + 2) ); + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_is_our_buffer = 1; + + yy_init_buffer( b, file ); + + return b; +} + +/** Destroy the buffer. + * @param b a buffer created with yy_create_buffer() + * + */ + void yy_delete_buffer (YY_BUFFER_STATE b ) +{ + + if ( ! b ) + return; + + if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */ + YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; + + if ( b->yy_is_our_buffer ) + yyfree( (void *) b->yy_ch_buf ); + + yyfree( (void *) b ); +} + +/* Initializes or reinitializes a buffer. + * This function is sometimes called more than once on the same buffer, + * such as during a yyrestart() or at EOF. + */ + static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file ) + +{ + int oerrno = errno; + + yy_flush_buffer( b ); + + b->yy_input_file = file; + b->yy_fill_buffer = 1; + + /* If b is the current buffer, then yy_init_buffer was _probably_ + * called from yyrestart() or through yy_get_next_buffer. + * In that case, we don't want to reset the lineno or column. + */ + if (b != YY_CURRENT_BUFFER){ + b->yy_bs_lineno = 1; + b->yy_bs_column = 0; + } + + b->yy_is_interactive = 0; + + errno = oerrno; +} + +/** Discard all buffered characters. On the next scan, YY_INPUT will be called. + * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. + * + */ + void yy_flush_buffer (YY_BUFFER_STATE b ) +{ + if ( ! b ) + return; + + b->yy_n_chars = 0; + + /* We always need two end-of-buffer characters. The first causes + * a transition to the end-of-buffer state. The second causes + * a jam in that state. + */ + b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; + b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; + + b->yy_buf_pos = &b->yy_ch_buf[0]; + + b->yy_at_bol = 1; + b->yy_buffer_status = YY_BUFFER_NEW; + + if ( b == YY_CURRENT_BUFFER ) + yy_load_buffer_state( ); +} + +/** Pushes the new state onto the stack. The new state becomes + * the current state. This function will allocate the stack + * if necessary. + * @param new_buffer The new state. + * + */ +void yypush_buffer_state (YY_BUFFER_STATE new_buffer ) +{ + if (new_buffer == NULL) + return; + + yyensure_buffer_stack(); + + /* This block is copied from yy_switch_to_buffer. */ + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + /* Only push if top exists. Otherwise, replace top. */ + if (YY_CURRENT_BUFFER) + (yy_buffer_stack_top)++; + YY_CURRENT_BUFFER_LVALUE = new_buffer; + + /* copied from yy_switch_to_buffer. */ + yy_load_buffer_state( ); + (yy_did_buffer_switch_on_eof) = 1; +} + +/** Removes and deletes the top of the stack, if present. + * The next element becomes the new top. + * + */ +void yypop_buffer_state (void) +{ + if (!YY_CURRENT_BUFFER) + return; + + yy_delete_buffer(YY_CURRENT_BUFFER ); + YY_CURRENT_BUFFER_LVALUE = NULL; + if ((yy_buffer_stack_top) > 0) + --(yy_buffer_stack_top); + + if (YY_CURRENT_BUFFER) { + yy_load_buffer_state( ); + (yy_did_buffer_switch_on_eof) = 1; + } +} + +/* Allocates the stack if it does not exist. + * Guarantees space for at least one push. + */ +static void yyensure_buffer_stack (void) +{ + yy_size_t num_to_alloc; + + if (!(yy_buffer_stack)) { + + /* First allocation is just for 2 elements, since we don't know if this + * scanner will even need a stack. We use 2 instead of 1 to avoid an + * immediate realloc on the next call. + */ + num_to_alloc = 1; /* After all that talk, this was set to 1 anyways... */ + (yy_buffer_stack) = (struct yy_buffer_state**)yyalloc + (num_to_alloc * sizeof(struct yy_buffer_state*) + ); + if ( ! (yy_buffer_stack) ) + YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); + + memset((yy_buffer_stack), 0, num_to_alloc * sizeof(struct yy_buffer_state*)); + + (yy_buffer_stack_max) = num_to_alloc; + (yy_buffer_stack_top) = 0; + return; + } + + if ((yy_buffer_stack_top) >= ((yy_buffer_stack_max)) - 1){ + + /* Increase the buffer to prepare for a possible push. */ + yy_size_t grow_size = 8 /* arbitrary grow size */; + + num_to_alloc = (yy_buffer_stack_max) + grow_size; + (yy_buffer_stack) = (struct yy_buffer_state**)yyrealloc + ((yy_buffer_stack), + num_to_alloc * sizeof(struct yy_buffer_state*) + ); + if ( ! (yy_buffer_stack) ) + YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); + + /* zero only the new slots.*/ + memset((yy_buffer_stack) + (yy_buffer_stack_max), 0, grow_size * sizeof(struct yy_buffer_state*)); + (yy_buffer_stack_max) = num_to_alloc; + } +} + +/** Setup the input buffer state to scan directly from a user-specified character buffer. + * @param base the character buffer + * @param size the size in bytes of the character buffer + * + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE yy_scan_buffer (char * base, yy_size_t size ) +{ + YY_BUFFER_STATE b; + + if ( size < 2 || + base[size-2] != YY_END_OF_BUFFER_CHAR || + base[size-1] != YY_END_OF_BUFFER_CHAR ) + /* They forgot to leave room for the EOB's. */ + return NULL; + + b = (YY_BUFFER_STATE) yyalloc( sizeof( struct yy_buffer_state ) ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_buffer()" ); + + b->yy_buf_size = (int) (size - 2); /* "- 2" to take care of EOB's */ + b->yy_buf_pos = b->yy_ch_buf = base; + b->yy_is_our_buffer = 0; + b->yy_input_file = NULL; + b->yy_n_chars = b->yy_buf_size; + b->yy_is_interactive = 0; + b->yy_at_bol = 1; + b->yy_fill_buffer = 0; + b->yy_buffer_status = YY_BUFFER_NEW; + + yy_switch_to_buffer( b ); + + return b; +} + +/** Setup the input buffer state to scan a string. The next call to yylex() will + * scan from a @e copy of @a str. + * @param yystr a NUL-terminated string to scan + * + * @return the newly allocated buffer state object. + * @note If you want to scan bytes that may contain NUL values, then use + * yy_scan_bytes() instead. + */ +YY_BUFFER_STATE yy_scan_string (const char * yystr ) +{ + + return yy_scan_bytes( yystr, (int) strlen(yystr) ); +} + +/** Setup the input buffer state to scan the given bytes. The next call to yylex() will + * scan from a @e copy of @a bytes. + * @param yybytes the byte buffer to scan + * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes. + * + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE yy_scan_bytes (const char * yybytes, int _yybytes_len ) +{ + YY_BUFFER_STATE b; + char *buf; + yy_size_t n; + int i; + + /* Get memory for full buffer, including space for trailing EOB's. */ + n = (yy_size_t) (_yybytes_len + 2); + buf = (char *) yyalloc( n ); + if ( ! buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_bytes()" ); + + for ( i = 0; i < _yybytes_len; ++i ) + buf[i] = yybytes[i]; + + buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR; + + b = yy_scan_buffer( buf, n ); + if ( ! b ) + YY_FATAL_ERROR( "bad buffer in yy_scan_bytes()" ); + + /* It's okay to grow etc. this buffer, and we should throw it + * away when we're done. + */ + b->yy_is_our_buffer = 1; + + return b; +} + +#ifndef YY_EXIT_FAILURE +#define YY_EXIT_FAILURE 2 +#endif + +static void yynoreturn yy_fatal_error (const char* msg ) +{ + fprintf( stderr, "%s\n", msg ); + exit( YY_EXIT_FAILURE ); +} + +/* Redefine yyless() so it works in section 3 code. */ + +#undef yyless +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + yytext[yyleng] = (yy_hold_char); \ + (yy_c_buf_p) = yytext + yyless_macro_arg; \ + (yy_hold_char) = *(yy_c_buf_p); \ + *(yy_c_buf_p) = '\0'; \ + yyleng = yyless_macro_arg; \ + } \ + while ( 0 ) + +/* Accessor methods (get/set functions) to struct members. */ + +/** Get the current line number. + * + */ +int yyget_lineno (void) +{ + + return yylineno; +} + +/** Get the input stream. + * + */ +FILE *yyget_in (void) +{ + return yyin; +} + +/** Get the output stream. + * + */ +FILE *yyget_out (void) +{ + return yyout; +} + +/** Get the length of the current token. + * + */ +int yyget_leng (void) +{ + return yyleng; +} + +/** Get the current token. + * + */ + +char *yyget_text (void) +{ + return yytext; +} + +/** Set the current line number. + * @param _line_number line number + * + */ +void yyset_lineno (int _line_number ) +{ + + yylineno = _line_number; +} + +/** Set the input stream. This does not discard the current + * input buffer. + * @param _in_str A readable stream. + * + * @see yy_switch_to_buffer + */ +void yyset_in (FILE * _in_str ) +{ + yyin = _in_str ; +} + +void yyset_out (FILE * _out_str ) +{ + yyout = _out_str ; +} + +int yyget_debug (void) +{ + return yy_flex_debug; +} + +void yyset_debug (int _bdebug ) +{ + yy_flex_debug = _bdebug ; +} + +static int yy_init_globals (void) +{ + /* Initialization is the same as for the non-reentrant scanner. + * This function is called from yylex_destroy(), so don't allocate here. + */ + + (yy_buffer_stack) = NULL; + (yy_buffer_stack_top) = 0; + (yy_buffer_stack_max) = 0; + (yy_c_buf_p) = NULL; + (yy_init) = 0; + (yy_start) = 0; + +/* Defined in main.c */ +#ifdef YY_STDINIT + yyin = stdin; + yyout = stdout; +#else + yyin = NULL; + yyout = NULL; +#endif + + /* For future reference: Set errno on error, since we are called by + * yylex_init() + */ + return 0; +} + +/* yylex_destroy is for both reentrant and non-reentrant scanners. */ +int yylex_destroy (void) +{ + + /* Pop the buffer stack, destroying each element. */ + while(YY_CURRENT_BUFFER){ + yy_delete_buffer( YY_CURRENT_BUFFER ); + YY_CURRENT_BUFFER_LVALUE = NULL; + yypop_buffer_state(); + } + + /* Destroy the stack itself. */ + yyfree((yy_buffer_stack) ); + (yy_buffer_stack) = NULL; + + /* Reset the globals. This is important in a non-reentrant scanner so the next time + * yylex() is called, initialization will occur. */ + yy_init_globals( ); + + return 0; +} + +/* + * Internal utility routines. + */ + +#ifndef yytext_ptr +static void yy_flex_strncpy (char* s1, const char * s2, int n ) +{ + + int i; + for ( i = 0; i < n; ++i ) + s1[i] = s2[i]; +} +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (const char * s ) +{ + int n; + for ( n = 0; s[n]; ++n ) + ; + + return n; +} +#endif + +void *yyalloc (yy_size_t size ) +{ + return malloc(size); +} + +void *yyrealloc (void * ptr, yy_size_t size ) +{ + + /* The cast to (char *) in the following accommodates both + * implementations that use char* generic pointers, and those + * that use void* generic pointers. It works with the latter + * because both ANSI C and C++ allow castless assignment from + * any pointer type to void*, and deal with argument conversions + * as though doing an assignment. + */ + return realloc(ptr, size); +} + +void yyfree (void * ptr ) +{ + free( (char *) ptr ); /* see yyrealloc() for (char *) cast */ +} + +#define YYTABLES_NAME "yytables" + +#line 216 "repl_scanner.l" + + +/* LCOV_EXCL_STOP */ + +static void +startlit(void) +{ + initStringInfo(&litbuf); +} + +static char * +litbufdup(void) +{ + return litbuf.data; +} + +static void +addlit(char *ytext, int yleng) +{ + appendBinaryStringInfo(&litbuf, ytext, yleng); +} + +static void +addlitchar(unsigned char ychar) +{ + appendStringInfoChar(&litbuf, ychar); +} + +void +replication_yyerror(const char *message) +{ + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg_internal("%s", message))); +} + + +void +replication_scanner_init(const char *str) +{ + Size slen = strlen(str); + char *scanbuf; + + /* + * Might be left over after ereport() + */ + if (YY_CURRENT_BUFFER) + yy_delete_buffer(YY_CURRENT_BUFFER); + + /* + * Make a scan buffer with special termination needed by flex. + */ + scanbuf = (char *) palloc(slen + 2); + memcpy(scanbuf, str, slen); + scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR; + scanbufhandle = yy_scan_buffer(scanbuf, slen + 2); + + /* Make sure we start in proper state */ + BEGIN(INITIAL); + repl_pushed_back_token = 0; +} + +void +replication_scanner_finish(void) +{ + yy_delete_buffer(scanbufhandle); + scanbufhandle = NULL; +} + +/* + * Check to see if the first token of a command is a WalSender keyword. + * + * To keep repl_scanner.l minimal, we don't ask it to know every construct + * that the core lexer knows. Therefore, we daren't lex more than the + * first token of a general SQL command. That will usually look like an + * IDENT token here, although some other cases are possible. + */ +bool +replication_scanner_is_replication_command(void) +{ + int first_token = replication_yylex(); + + switch (first_token) + { + case K_IDENTIFY_SYSTEM: + case K_BASE_BACKUP: + case K_START_REPLICATION: + case K_CREATE_REPLICATION_SLOT: + case K_DROP_REPLICATION_SLOT: + case K_READ_REPLICATION_SLOT: + case K_TIMELINE_HISTORY: + case K_SHOW: + /* Yes; push back the first token so we can parse later. */ + repl_pushed_back_token = first_token; + return true; + default: + /* Nope; we don't bother to push back the token. */ + return false; + } +} + diff --git a/src/backend/replication/repl_scanner.l b/src/backend/replication/repl_scanner.l new file mode 100644 index 0000000..cb467ca --- /dev/null +++ b/src/backend/replication/repl_scanner.l @@ -0,0 +1,314 @@ +%top{ +/*------------------------------------------------------------------------- + * + * repl_scanner.l + * a lexical scanner for the replication commands + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/replication/repl_scanner.l + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "nodes/parsenodes.h" +#include "utils/builtins.h" +#include "parser/scansup.h" + +/* + * NB: include repl_gram.h only AFTER including walsender_private.h, because + * walsender_private includes headers that define XLogRecPtr. + */ +#include "replication/walsender_private.h" +#include "repl_gram.h" +} + +%{ +/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */ +#undef fprintf +#define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg) + +static void +fprintf_to_ereport(const char *fmt, const char *msg) +{ + ereport(ERROR, (errmsg_internal("%s", msg))); +} + +/* Handle to the buffer that the lexer uses internally */ +static YY_BUFFER_STATE scanbufhandle; + +/* Pushed-back token (we only handle one) */ +static int repl_pushed_back_token; + +/* Work area for collecting literals */ +static StringInfoData litbuf; + +static void startlit(void); +static char *litbufdup(void); +static void addlit(char *ytext, int yleng); +static void addlitchar(unsigned char ychar); + +/* LCOV_EXCL_START */ + +%} + +%option 8bit +%option never-interactive +%option nodefault +%option noinput +%option nounput +%option noyywrap +%option warn +%option prefix="replication_yy" + +/* + * Exclusive states: + * <xd> delimited identifiers (double-quoted identifiers) + * <xq> standard single-quoted strings + */ +%x xd +%x xq + +space [ \t\n\r\f] + +quote ' +quotestop {quote} + +/* Extended quote + * xqdouble implements embedded quote, '''' + */ +xqstart {quote} +xqdouble {quote}{quote} +xqinside [^']+ + +/* Double quote + * Allows embedded spaces and other special characters into identifiers. + */ +dquote \" +xdstart {dquote} +xdstop {dquote} +xddouble {dquote}{dquote} +xdinside [^"]+ + +digit [0-9] +hexdigit [0-9A-Fa-f] + +ident_start [A-Za-z\200-\377_] +ident_cont [A-Za-z\200-\377_0-9\$] + +identifier {ident_start}{ident_cont}* + +%% + +%{ + /* This code is inserted at the start of replication_yylex() */ + + /* If we have a pushed-back token, return that. */ + if (repl_pushed_back_token) + { + int result = repl_pushed_back_token; + + repl_pushed_back_token = 0; + return result; + } +%} + +BASE_BACKUP { return K_BASE_BACKUP; } +IDENTIFY_SYSTEM { return K_IDENTIFY_SYSTEM; } +READ_REPLICATION_SLOT { return K_READ_REPLICATION_SLOT; } +SHOW { return K_SHOW; } +TIMELINE { return K_TIMELINE; } +START_REPLICATION { return K_START_REPLICATION; } +CREATE_REPLICATION_SLOT { return K_CREATE_REPLICATION_SLOT; } +DROP_REPLICATION_SLOT { return K_DROP_REPLICATION_SLOT; } +TIMELINE_HISTORY { return K_TIMELINE_HISTORY; } +PHYSICAL { return K_PHYSICAL; } +RESERVE_WAL { return K_RESERVE_WAL; } +LOGICAL { return K_LOGICAL; } +SLOT { return K_SLOT; } +TEMPORARY { return K_TEMPORARY; } +TWO_PHASE { return K_TWO_PHASE; } +EXPORT_SNAPSHOT { return K_EXPORT_SNAPSHOT; } +NOEXPORT_SNAPSHOT { return K_NOEXPORT_SNAPSHOT; } +USE_SNAPSHOT { return K_USE_SNAPSHOT; } +WAIT { return K_WAIT; } + +{space}+ { /* do nothing */ } + +{digit}+ { + replication_yylval.uintval = strtoul(yytext, NULL, 10); + return UCONST; + } + +{hexdigit}+\/{hexdigit}+ { + uint32 hi, + lo; + if (sscanf(yytext, "%X/%X", &hi, &lo) != 2) + replication_yyerror("invalid streaming start location"); + replication_yylval.recptr = ((uint64) hi) << 32 | lo; + return RECPTR; + } + +{xqstart} { + BEGIN(xq); + startlit(); + } + +<xq>{quotestop} { + yyless(1); + BEGIN(INITIAL); + replication_yylval.str = litbufdup(); + return SCONST; + } + +<xq>{xqdouble} { + addlitchar('\''); + } + +<xq>{xqinside} { + addlit(yytext, yyleng); + } + +{xdstart} { + BEGIN(xd); + startlit(); + } + +<xd>{xdstop} { + int len; + + yyless(1); + BEGIN(INITIAL); + replication_yylval.str = litbufdup(); + len = strlen(replication_yylval.str); + truncate_identifier(replication_yylval.str, len, true); + return IDENT; + } + +<xd>{xdinside} { + addlit(yytext, yyleng); + } + +{identifier} { + int len = strlen(yytext); + + replication_yylval.str = downcase_truncate_identifier(yytext, len, true); + return IDENT; + } + +. { + /* Any char not recognized above is returned as itself */ + return yytext[0]; + } + +<xq,xd><<EOF>> { replication_yyerror("unterminated quoted string"); } + + +<<EOF>> { + yyterminate(); + } + +%% + +/* LCOV_EXCL_STOP */ + +static void +startlit(void) +{ + initStringInfo(&litbuf); +} + +static char * +litbufdup(void) +{ + return litbuf.data; +} + +static void +addlit(char *ytext, int yleng) +{ + appendBinaryStringInfo(&litbuf, ytext, yleng); +} + +static void +addlitchar(unsigned char ychar) +{ + appendStringInfoChar(&litbuf, ychar); +} + +void +replication_yyerror(const char *message) +{ + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg_internal("%s", message))); +} + + +void +replication_scanner_init(const char *str) +{ + Size slen = strlen(str); + char *scanbuf; + + /* + * Might be left over after ereport() + */ + if (YY_CURRENT_BUFFER) + yy_delete_buffer(YY_CURRENT_BUFFER); + + /* + * Make a scan buffer with special termination needed by flex. + */ + scanbuf = (char *) palloc(slen + 2); + memcpy(scanbuf, str, slen); + scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR; + scanbufhandle = yy_scan_buffer(scanbuf, slen + 2); + + /* Make sure we start in proper state */ + BEGIN(INITIAL); + repl_pushed_back_token = 0; +} + +void +replication_scanner_finish(void) +{ + yy_delete_buffer(scanbufhandle); + scanbufhandle = NULL; +} + +/* + * Check to see if the first token of a command is a WalSender keyword. + * + * To keep repl_scanner.l minimal, we don't ask it to know every construct + * that the core lexer knows. Therefore, we daren't lex more than the + * first token of a general SQL command. That will usually look like an + * IDENT token here, although some other cases are possible. + */ +bool +replication_scanner_is_replication_command(void) +{ + int first_token = replication_yylex(); + + switch (first_token) + { + case K_IDENTIFY_SYSTEM: + case K_BASE_BACKUP: + case K_START_REPLICATION: + case K_CREATE_REPLICATION_SLOT: + case K_DROP_REPLICATION_SLOT: + case K_READ_REPLICATION_SLOT: + case K_TIMELINE_HISTORY: + case K_SHOW: + /* Yes; push back the first token so we can parse later. */ + repl_pushed_back_token = first_token; + return true; + default: + /* Nope; we don't bother to push back the token. */ + return false; + } +} diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c new file mode 100644 index 0000000..bb09c40 --- /dev/null +++ b/src/backend/replication/slot.c @@ -0,0 +1,2094 @@ +/*------------------------------------------------------------------------- + * + * slot.c + * Replication slot management. + * + * + * Copyright (c) 2012-2023, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/replication/slot.c + * + * NOTES + * + * Replication slots are used to keep state about replication streams + * originating from this cluster. Their primary purpose is to prevent the + * premature removal of WAL or of old tuple versions in a manner that would + * interfere with replication; they are also useful for monitoring purposes. + * Slots need to be permanent (to allow restarts), crash-safe, and allocatable + * on standbys (to support cascading setups). The requirement that slots be + * usable on standbys precludes storing them in the system catalogs. + * + * Each replication slot gets its own directory inside the $PGDATA/pg_replslot + * directory. Inside that directory the state file will contain the slot's + * own data. Additional data can be stored alongside that file if required. + * While the server is running, the state data is also cached in memory for + * efficiency. + * + * ReplicationSlotAllocationLock must be taken in exclusive mode to allocate + * or free a slot. ReplicationSlotControlLock must be taken in shared mode + * to iterate over the slots, and in exclusive mode to change the in_use flag + * of a slot. The remaining data in each slot is protected by its mutex. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <unistd.h> +#include <sys/stat.h> + +#include "access/transam.h" +#include "access/xlog_internal.h" +#include "access/xlogrecovery.h" +#include "common/file_utils.h" +#include "common/string.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "replication/slot.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "utils/builtins.h" + +/* + * Replication slot on-disk data structure. + */ +typedef struct ReplicationSlotOnDisk +{ + /* first part of this struct needs to be version independent */ + + /* data not covered by checksum */ + uint32 magic; + pg_crc32c checksum; + + /* data covered by checksum */ + uint32 version; + uint32 length; + + /* + * The actual data in the slot that follows can differ based on the above + * 'version'. + */ + + ReplicationSlotPersistentData slotdata; +} ReplicationSlotOnDisk; + +/* size of version independent data */ +#define ReplicationSlotOnDiskConstantSize \ + offsetof(ReplicationSlotOnDisk, slotdata) +/* size of the part of the slot not covered by the checksum */ +#define ReplicationSlotOnDiskNotChecksummedSize \ + offsetof(ReplicationSlotOnDisk, version) +/* size of the part covered by the checksum */ +#define ReplicationSlotOnDiskChecksummedSize \ + sizeof(ReplicationSlotOnDisk) - ReplicationSlotOnDiskNotChecksummedSize +/* size of the slot data that is version dependent */ +#define ReplicationSlotOnDiskV2Size \ + sizeof(ReplicationSlotOnDisk) - ReplicationSlotOnDiskConstantSize + +#define SLOT_MAGIC 0x1051CA1 /* format identifier */ +#define SLOT_VERSION 3 /* version for new files */ + +/* Control array for replication slot management */ +ReplicationSlotCtlData *ReplicationSlotCtl = NULL; + +/* My backend's replication slot in the shared memory array */ +ReplicationSlot *MyReplicationSlot = NULL; + +/* GUC variable */ +int max_replication_slots = 10; /* the maximum number of replication + * slots */ + +static void ReplicationSlotShmemExit(int code, Datum arg); +static void ReplicationSlotDropAcquired(void); +static void ReplicationSlotDropPtr(ReplicationSlot *slot); + +/* internal persistency functions */ +static void RestoreSlotFromDisk(const char *name); +static void CreateSlotOnDisk(ReplicationSlot *slot); +static void SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel); + +/* + * Report shared-memory space needed by ReplicationSlotsShmemInit. + */ +Size +ReplicationSlotsShmemSize(void) +{ + Size size = 0; + + if (max_replication_slots == 0) + return size; + + size = offsetof(ReplicationSlotCtlData, replication_slots); + size = add_size(size, + mul_size(max_replication_slots, sizeof(ReplicationSlot))); + + return size; +} + +/* + * Allocate and initialize shared memory for replication slots. + */ +void +ReplicationSlotsShmemInit(void) +{ + bool found; + + if (max_replication_slots == 0) + return; + + ReplicationSlotCtl = (ReplicationSlotCtlData *) + ShmemInitStruct("ReplicationSlot Ctl", ReplicationSlotsShmemSize(), + &found); + + if (!found) + { + int i; + + /* First time through, so initialize */ + MemSet(ReplicationSlotCtl, 0, ReplicationSlotsShmemSize()); + + for (i = 0; i < max_replication_slots; i++) + { + ReplicationSlot *slot = &ReplicationSlotCtl->replication_slots[i]; + + /* everything else is zeroed by the memset above */ + SpinLockInit(&slot->mutex); + LWLockInitialize(&slot->io_in_progress_lock, + LWTRANCHE_REPLICATION_SLOT_IO); + ConditionVariableInit(&slot->active_cv); + } + } +} + +/* + * Register the callback for replication slot cleanup and releasing. + */ +void +ReplicationSlotInitialize(void) +{ + before_shmem_exit(ReplicationSlotShmemExit, 0); +} + +/* + * Release and cleanup replication slots. + */ +static void +ReplicationSlotShmemExit(int code, Datum arg) +{ + /* Make sure active replication slots are released */ + if (MyReplicationSlot != NULL) + ReplicationSlotRelease(); + + /* Also cleanup all the temporary slots. */ + ReplicationSlotCleanup(); +} + +/* + * Check whether the passed slot name is valid and report errors at elevel. + * + * Slot names may consist out of [a-z0-9_]{1,NAMEDATALEN-1} which should allow + * the name to be used as a directory name on every supported OS. + * + * Returns whether the directory name is valid or not if elevel < ERROR. + */ +bool +ReplicationSlotValidateName(const char *name, int elevel) +{ + const char *cp; + + if (strlen(name) == 0) + { + ereport(elevel, + (errcode(ERRCODE_INVALID_NAME), + errmsg("replication slot name \"%s\" is too short", + name))); + return false; + } + + if (strlen(name) >= NAMEDATALEN) + { + ereport(elevel, + (errcode(ERRCODE_NAME_TOO_LONG), + errmsg("replication slot name \"%s\" is too long", + name))); + return false; + } + + for (cp = name; *cp; cp++) + { + if (!((*cp >= 'a' && *cp <= 'z') + || (*cp >= '0' && *cp <= '9') + || (*cp == '_'))) + { + ereport(elevel, + (errcode(ERRCODE_INVALID_NAME), + errmsg("replication slot name \"%s\" contains invalid character", + name), + errhint("Replication slot names may only contain lower case letters, numbers, and the underscore character."))); + return false; + } + } + return true; +} + +/* + * Create a new replication slot and mark it as used by this backend. + * + * name: Name of the slot + * db_specific: logical decoding is db specific; if the slot is going to + * be used for that pass true, otherwise false. + * two_phase: Allows decoding of prepared transactions. We allow this option + * to be enabled only at the slot creation time. If we allow this option + * to be changed during decoding then it is quite possible that we skip + * prepare first time because this option was not enabled. Now next time + * during getting changes, if the two_phase option is enabled it can skip + * prepare because by that time start decoding point has been moved. So the + * user will only get commit prepared. + */ +void +ReplicationSlotCreate(const char *name, bool db_specific, + ReplicationSlotPersistency persistency, bool two_phase) +{ + ReplicationSlot *slot = NULL; + int i; + + Assert(MyReplicationSlot == NULL); + + ReplicationSlotValidateName(name, ERROR); + + /* + * If some other backend ran this code concurrently with us, we'd likely + * both allocate the same slot, and that would be bad. We'd also be at + * risk of missing a name collision. Also, we don't want to try to create + * a new slot while somebody's busy cleaning up an old one, because we + * might both be monkeying with the same directory. + */ + LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE); + + /* + * Check for name collision, and identify an allocatable slot. We need to + * hold ReplicationSlotControlLock in shared mode for this, so that nobody + * else can change the in_use flags while we're looking at them. + */ + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + for (i = 0; i < max_replication_slots; i++) + { + ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; + + if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0) + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("replication slot \"%s\" already exists", name))); + if (!s->in_use && slot == NULL) + slot = s; + } + LWLockRelease(ReplicationSlotControlLock); + + /* If all slots are in use, we're out of luck. */ + if (slot == NULL) + ereport(ERROR, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("all replication slots are in use"), + errhint("Free one or increase max_replication_slots."))); + + /* + * Since this slot is not in use, nobody should be looking at any part of + * it other than the in_use field unless they're trying to allocate it. + * And since we hold ReplicationSlotAllocationLock, nobody except us can + * be doing that. So it's safe to initialize the slot. + */ + Assert(!slot->in_use); + Assert(slot->active_pid == 0); + + /* first initialize persistent data */ + memset(&slot->data, 0, sizeof(ReplicationSlotPersistentData)); + namestrcpy(&slot->data.name, name); + slot->data.database = db_specific ? MyDatabaseId : InvalidOid; + slot->data.persistency = persistency; + slot->data.two_phase = two_phase; + slot->data.two_phase_at = InvalidXLogRecPtr; + + /* and then data only present in shared memory */ + slot->just_dirtied = false; + slot->dirty = false; + slot->effective_xmin = InvalidTransactionId; + slot->effective_catalog_xmin = InvalidTransactionId; + slot->candidate_catalog_xmin = InvalidTransactionId; + slot->candidate_xmin_lsn = InvalidXLogRecPtr; + slot->candidate_restart_valid = InvalidXLogRecPtr; + slot->candidate_restart_lsn = InvalidXLogRecPtr; + + /* + * Create the slot on disk. We haven't actually marked the slot allocated + * yet, so no special cleanup is required if this errors out. + */ + CreateSlotOnDisk(slot); + + /* + * We need to briefly prevent any other backend from iterating over the + * slots while we flip the in_use flag. We also need to set the active + * flag while holding the ControlLock as otherwise a concurrent + * ReplicationSlotAcquire() could acquire the slot as well. + */ + LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE); + + slot->in_use = true; + + /* We can now mark the slot active, and that makes it our slot. */ + SpinLockAcquire(&slot->mutex); + Assert(slot->active_pid == 0); + slot->active_pid = MyProcPid; + SpinLockRelease(&slot->mutex); + MyReplicationSlot = slot; + + LWLockRelease(ReplicationSlotControlLock); + + /* + * Create statistics entry for the new logical slot. We don't collect any + * stats for physical slots, so no need to create an entry for the same. + * See ReplicationSlotDropPtr for why we need to do this before releasing + * ReplicationSlotAllocationLock. + */ + if (SlotIsLogical(slot)) + pgstat_create_replslot(slot); + + /* + * Now that the slot has been marked as in_use and active, it's safe to + * let somebody else try to allocate a slot. + */ + LWLockRelease(ReplicationSlotAllocationLock); + + /* Let everybody know we've modified this slot */ + ConditionVariableBroadcast(&slot->active_cv); +} + +/* + * Search for the named replication slot. + * + * Return the replication slot if found, otherwise NULL. + */ +ReplicationSlot * +SearchNamedReplicationSlot(const char *name, bool need_lock) +{ + int i; + ReplicationSlot *slot = NULL; + + if (need_lock) + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + + for (i = 0; i < max_replication_slots; i++) + { + ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; + + if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0) + { + slot = s; + break; + } + } + + if (need_lock) + LWLockRelease(ReplicationSlotControlLock); + + return slot; +} + +/* + * Return the index of the replication slot in + * ReplicationSlotCtl->replication_slots. + * + * This is mainly useful to have an efficient key for storing replication slot + * stats. + */ +int +ReplicationSlotIndex(ReplicationSlot *slot) +{ + Assert(slot >= ReplicationSlotCtl->replication_slots && + slot < ReplicationSlotCtl->replication_slots + max_replication_slots); + + return slot - ReplicationSlotCtl->replication_slots; +} + +/* + * If the slot at 'index' is unused, return false. Otherwise 'name' is set to + * the slot's name and true is returned. + * + * This likely is only useful for pgstat_replslot.c during shutdown, in other + * cases there are obvious TOCTOU issues. + */ +bool +ReplicationSlotName(int index, Name name) +{ + ReplicationSlot *slot; + bool found; + + slot = &ReplicationSlotCtl->replication_slots[index]; + + /* + * Ensure that the slot cannot be dropped while we copy the name. Don't + * need the spinlock as the name of an existing slot cannot change. + */ + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + found = slot->in_use; + if (slot->in_use) + namestrcpy(name, NameStr(slot->data.name)); + LWLockRelease(ReplicationSlotControlLock); + + return found; +} + +/* + * Find a previously created slot and mark it as used by this process. + * + * An error is raised if nowait is true and the slot is currently in use. If + * nowait is false, we sleep until the slot is released by the owning process. + */ +void +ReplicationSlotAcquire(const char *name, bool nowait) +{ + ReplicationSlot *s; + int active_pid; + + Assert(name != NULL); + +retry: + Assert(MyReplicationSlot == NULL); + + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + + /* + * Search for the slot with the specified name if the slot to acquire is + * not given. If the slot is not found, we either return -1 or error out. + */ + s = SearchNamedReplicationSlot(name, false); + if (s == NULL || !s->in_use) + { + LWLockRelease(ReplicationSlotControlLock); + + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("replication slot \"%s\" does not exist", + name))); + } + + /* + * This is the slot we want; check if it's active under some other + * process. In single user mode, we don't need this check. + */ + if (IsUnderPostmaster) + { + /* + * Get ready to sleep on the slot in case it is active. (We may end + * up not sleeping, but we don't want to do this while holding the + * spinlock.) + */ + if (!nowait) + ConditionVariablePrepareToSleep(&s->active_cv); + + SpinLockAcquire(&s->mutex); + if (s->active_pid == 0) + s->active_pid = MyProcPid; + active_pid = s->active_pid; + SpinLockRelease(&s->mutex); + } + else + active_pid = MyProcPid; + LWLockRelease(ReplicationSlotControlLock); + + /* + * If we found the slot but it's already active in another process, we + * wait until the owning process signals us that it's been released, or + * error out. + */ + if (active_pid != MyProcPid) + { + if (!nowait) + { + /* Wait here until we get signaled, and then restart */ + ConditionVariableSleep(&s->active_cv, + WAIT_EVENT_REPLICATION_SLOT_DROP); + ConditionVariableCancelSleep(); + goto retry; + } + + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("replication slot \"%s\" is active for PID %d", + NameStr(s->data.name), active_pid))); + } + else if (!nowait) + ConditionVariableCancelSleep(); /* no sleep needed after all */ + + /* Let everybody know we've modified this slot */ + ConditionVariableBroadcast(&s->active_cv); + + /* We made this slot active, so it's ours now. */ + MyReplicationSlot = s; + + /* + * The call to pgstat_acquire_replslot() protects against stats for a + * different slot, from before a restart or such, being present during + * pgstat_report_replslot(). + */ + if (SlotIsLogical(s)) + pgstat_acquire_replslot(s); +} + +/* + * Release the replication slot that this backend considers to own. + * + * This or another backend can re-acquire the slot later. + * Resources this slot requires will be preserved. + */ +void +ReplicationSlotRelease(void) +{ + ReplicationSlot *slot = MyReplicationSlot; + + Assert(slot != NULL && slot->active_pid != 0); + + if (slot->data.persistency == RS_EPHEMERAL) + { + /* + * Delete the slot. There is no !PANIC case where this is allowed to + * fail, all that may happen is an incomplete cleanup of the on-disk + * data. + */ + ReplicationSlotDropAcquired(); + } + + /* + * If slot needed to temporarily restrain both data and catalog xmin to + * create the catalog snapshot, remove that temporary constraint. + * Snapshots can only be exported while the initial snapshot is still + * acquired. + */ + if (!TransactionIdIsValid(slot->data.xmin) && + TransactionIdIsValid(slot->effective_xmin)) + { + SpinLockAcquire(&slot->mutex); + slot->effective_xmin = InvalidTransactionId; + SpinLockRelease(&slot->mutex); + ReplicationSlotsComputeRequiredXmin(false); + } + + if (slot->data.persistency == RS_PERSISTENT) + { + /* + * Mark persistent slot inactive. We're not freeing it, just + * disconnecting, but wake up others that may be waiting for it. + */ + SpinLockAcquire(&slot->mutex); + slot->active_pid = 0; + SpinLockRelease(&slot->mutex); + ConditionVariableBroadcast(&slot->active_cv); + } + + MyReplicationSlot = NULL; + + /* might not have been set when we've been a plain slot */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + MyProc->statusFlags &= ~PROC_IN_LOGICAL_DECODING; + ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags; + LWLockRelease(ProcArrayLock); +} + +/* + * Cleanup all temporary slots created in current session. + */ +void +ReplicationSlotCleanup(void) +{ + int i; + + Assert(MyReplicationSlot == NULL); + +restart: + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + for (i = 0; i < max_replication_slots; i++) + { + ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; + + if (!s->in_use) + continue; + + SpinLockAcquire(&s->mutex); + if (s->active_pid == MyProcPid) + { + Assert(s->data.persistency == RS_TEMPORARY); + SpinLockRelease(&s->mutex); + LWLockRelease(ReplicationSlotControlLock); /* avoid deadlock */ + + ReplicationSlotDropPtr(s); + + ConditionVariableBroadcast(&s->active_cv); + goto restart; + } + else + SpinLockRelease(&s->mutex); + } + + LWLockRelease(ReplicationSlotControlLock); +} + +/* + * Permanently drop replication slot identified by the passed in name. + */ +void +ReplicationSlotDrop(const char *name, bool nowait) +{ + Assert(MyReplicationSlot == NULL); + + ReplicationSlotAcquire(name, nowait); + + ReplicationSlotDropAcquired(); +} + +/* + * Permanently drop the currently acquired replication slot. + */ +static void +ReplicationSlotDropAcquired(void) +{ + ReplicationSlot *slot = MyReplicationSlot; + + Assert(MyReplicationSlot != NULL); + + /* slot isn't acquired anymore */ + MyReplicationSlot = NULL; + + ReplicationSlotDropPtr(slot); +} + +/* + * Permanently drop the replication slot which will be released by the point + * this function returns. + */ +static void +ReplicationSlotDropPtr(ReplicationSlot *slot) +{ + char path[MAXPGPATH]; + char tmppath[MAXPGPATH]; + + /* + * If some other backend ran this code concurrently with us, we might try + * to delete a slot with a certain name while someone else was trying to + * create a slot with the same name. + */ + LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE); + + /* Generate pathnames. */ + sprintf(path, "pg_replslot/%s", NameStr(slot->data.name)); + sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name)); + + /* + * Rename the slot directory on disk, so that we'll no longer recognize + * this as a valid slot. Note that if this fails, we've got to mark the + * slot inactive before bailing out. If we're dropping an ephemeral or a + * temporary slot, we better never fail hard as the caller won't expect + * the slot to survive and this might get called during error handling. + */ + if (rename(path, tmppath) == 0) + { + /* + * We need to fsync() the directory we just renamed and its parent to + * make sure that our changes are on disk in a crash-safe fashion. If + * fsync() fails, we can't be sure whether the changes are on disk or + * not. For now, we handle that by panicking; + * StartupReplicationSlots() will try to straighten it out after + * restart. + */ + START_CRIT_SECTION(); + fsync_fname(tmppath, true); + fsync_fname("pg_replslot", true); + END_CRIT_SECTION(); + } + else + { + bool fail_softly = slot->data.persistency != RS_PERSISTENT; + + SpinLockAcquire(&slot->mutex); + slot->active_pid = 0; + SpinLockRelease(&slot->mutex); + + /* wake up anyone waiting on this slot */ + ConditionVariableBroadcast(&slot->active_cv); + + ereport(fail_softly ? WARNING : ERROR, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + path, tmppath))); + } + + /* + * The slot is definitely gone. Lock out concurrent scans of the array + * long enough to kill it. It's OK to clear the active PID here without + * grabbing the mutex because nobody else can be scanning the array here, + * and nobody can be attached to this slot and thus access it without + * scanning the array. + * + * Also wake up processes waiting for it. + */ + LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE); + slot->active_pid = 0; + slot->in_use = false; + LWLockRelease(ReplicationSlotControlLock); + ConditionVariableBroadcast(&slot->active_cv); + + /* + * Slot is dead and doesn't prevent resource removal anymore, recompute + * limits. + */ + ReplicationSlotsComputeRequiredXmin(false); + ReplicationSlotsComputeRequiredLSN(); + + /* + * If removing the directory fails, the worst thing that will happen is + * that the user won't be able to create a new slot with the same name + * until the next server restart. We warn about it, but that's all. + */ + if (!rmtree(tmppath, true)) + ereport(WARNING, + (errmsg("could not remove directory \"%s\"", tmppath))); + + /* + * Drop the statistics entry for the replication slot. Do this while + * holding ReplicationSlotAllocationLock so that we don't drop a + * statistics entry for another slot with the same name just created in + * another session. + */ + if (SlotIsLogical(slot)) + pgstat_drop_replslot(slot); + + /* + * We release this at the very end, so that nobody starts trying to create + * a slot while we're still cleaning up the detritus of the old one. + */ + LWLockRelease(ReplicationSlotAllocationLock); +} + +/* + * Serialize the currently acquired slot's state from memory to disk, thereby + * guaranteeing the current state will survive a crash. + */ +void +ReplicationSlotSave(void) +{ + char path[MAXPGPATH]; + + Assert(MyReplicationSlot != NULL); + + sprintf(path, "pg_replslot/%s", NameStr(MyReplicationSlot->data.name)); + SaveSlotToPath(MyReplicationSlot, path, ERROR); +} + +/* + * Signal that it would be useful if the currently acquired slot would be + * flushed out to disk. + * + * Note that the actual flush to disk can be delayed for a long time, if + * required for correctness explicitly do a ReplicationSlotSave(). + */ +void +ReplicationSlotMarkDirty(void) +{ + ReplicationSlot *slot = MyReplicationSlot; + + Assert(MyReplicationSlot != NULL); + + SpinLockAcquire(&slot->mutex); + MyReplicationSlot->just_dirtied = true; + MyReplicationSlot->dirty = true; + SpinLockRelease(&slot->mutex); +} + +/* + * Convert a slot that's marked as RS_EPHEMERAL to a RS_PERSISTENT slot, + * guaranteeing it will be there after an eventual crash. + */ +void +ReplicationSlotPersist(void) +{ + ReplicationSlot *slot = MyReplicationSlot; + + Assert(slot != NULL); + Assert(slot->data.persistency != RS_PERSISTENT); + + SpinLockAcquire(&slot->mutex); + slot->data.persistency = RS_PERSISTENT; + SpinLockRelease(&slot->mutex); + + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); +} + +/* + * Compute the oldest xmin across all slots and store it in the ProcArray. + * + * If already_locked is true, ProcArrayLock has already been acquired + * exclusively. + */ +void +ReplicationSlotsComputeRequiredXmin(bool already_locked) +{ + int i; + TransactionId agg_xmin = InvalidTransactionId; + TransactionId agg_catalog_xmin = InvalidTransactionId; + + Assert(ReplicationSlotCtl != NULL); + + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + + for (i = 0; i < max_replication_slots; i++) + { + ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; + TransactionId effective_xmin; + TransactionId effective_catalog_xmin; + bool invalidated; + + if (!s->in_use) + continue; + + SpinLockAcquire(&s->mutex); + effective_xmin = s->effective_xmin; + effective_catalog_xmin = s->effective_catalog_xmin; + invalidated = s->data.invalidated != RS_INVAL_NONE; + SpinLockRelease(&s->mutex); + + /* invalidated slots need not apply */ + if (invalidated) + continue; + + /* check the data xmin */ + if (TransactionIdIsValid(effective_xmin) && + (!TransactionIdIsValid(agg_xmin) || + TransactionIdPrecedes(effective_xmin, agg_xmin))) + agg_xmin = effective_xmin; + + /* check the catalog xmin */ + if (TransactionIdIsValid(effective_catalog_xmin) && + (!TransactionIdIsValid(agg_catalog_xmin) || + TransactionIdPrecedes(effective_catalog_xmin, agg_catalog_xmin))) + agg_catalog_xmin = effective_catalog_xmin; + } + + LWLockRelease(ReplicationSlotControlLock); + + ProcArraySetReplicationSlotXmin(agg_xmin, agg_catalog_xmin, already_locked); +} + +/* + * Compute the oldest restart LSN across all slots and inform xlog module. + * + * Note: while max_slot_wal_keep_size is theoretically relevant for this + * purpose, we don't try to account for that, because this module doesn't + * know what to compare against. + */ +void +ReplicationSlotsComputeRequiredLSN(void) +{ + int i; + XLogRecPtr min_required = InvalidXLogRecPtr; + + Assert(ReplicationSlotCtl != NULL); + + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + for (i = 0; i < max_replication_slots; i++) + { + ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; + XLogRecPtr restart_lsn; + bool invalidated; + + if (!s->in_use) + continue; + + SpinLockAcquire(&s->mutex); + restart_lsn = s->data.restart_lsn; + invalidated = s->data.invalidated != RS_INVAL_NONE; + SpinLockRelease(&s->mutex); + + /* invalidated slots need not apply */ + if (invalidated) + continue; + + if (restart_lsn != InvalidXLogRecPtr && + (min_required == InvalidXLogRecPtr || + restart_lsn < min_required)) + min_required = restart_lsn; + } + LWLockRelease(ReplicationSlotControlLock); + + XLogSetReplicationSlotMinimumLSN(min_required); +} + +/* + * Compute the oldest WAL LSN required by *logical* decoding slots.. + * + * Returns InvalidXLogRecPtr if logical decoding is disabled or no logical + * slots exist. + * + * NB: this returns a value >= ReplicationSlotsComputeRequiredLSN(), since it + * ignores physical replication slots. + * + * The results aren't required frequently, so we don't maintain a precomputed + * value like we do for ComputeRequiredLSN() and ComputeRequiredXmin(). + */ +XLogRecPtr +ReplicationSlotsComputeLogicalRestartLSN(void) +{ + XLogRecPtr result = InvalidXLogRecPtr; + int i; + + if (max_replication_slots <= 0) + return InvalidXLogRecPtr; + + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + + for (i = 0; i < max_replication_slots; i++) + { + ReplicationSlot *s; + XLogRecPtr restart_lsn; + bool invalidated; + + s = &ReplicationSlotCtl->replication_slots[i]; + + /* cannot change while ReplicationSlotCtlLock is held */ + if (!s->in_use) + continue; + + /* we're only interested in logical slots */ + if (!SlotIsLogical(s)) + continue; + + /* read once, it's ok if it increases while we're checking */ + SpinLockAcquire(&s->mutex); + restart_lsn = s->data.restart_lsn; + invalidated = s->data.invalidated != RS_INVAL_NONE; + SpinLockRelease(&s->mutex); + + /* invalidated slots need not apply */ + if (invalidated) + continue; + + if (restart_lsn == InvalidXLogRecPtr) + continue; + + if (result == InvalidXLogRecPtr || + restart_lsn < result) + result = restart_lsn; + } + + LWLockRelease(ReplicationSlotControlLock); + + return result; +} + +/* + * ReplicationSlotsCountDBSlots -- count the number of slots that refer to the + * passed database oid. + * + * Returns true if there are any slots referencing the database. *nslots will + * be set to the absolute number of slots in the database, *nactive to ones + * currently active. + */ +bool +ReplicationSlotsCountDBSlots(Oid dboid, int *nslots, int *nactive) +{ + int i; + + *nslots = *nactive = 0; + + if (max_replication_slots <= 0) + return false; + + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + for (i = 0; i < max_replication_slots; i++) + { + ReplicationSlot *s; + + s = &ReplicationSlotCtl->replication_slots[i]; + + /* cannot change while ReplicationSlotCtlLock is held */ + if (!s->in_use) + continue; + + /* only logical slots are database specific, skip */ + if (!SlotIsLogical(s)) + continue; + + /* not our database, skip */ + if (s->data.database != dboid) + continue; + + /* NB: intentionally counting invalidated slots */ + + /* count slots with spinlock held */ + SpinLockAcquire(&s->mutex); + (*nslots)++; + if (s->active_pid != 0) + (*nactive)++; + SpinLockRelease(&s->mutex); + } + LWLockRelease(ReplicationSlotControlLock); + + if (*nslots > 0) + return true; + return false; +} + +/* + * ReplicationSlotsDropDBSlots -- Drop all db-specific slots relating to the + * passed database oid. The caller should hold an exclusive lock on the + * pg_database oid for the database to prevent creation of new slots on the db + * or replay from existing slots. + * + * Another session that concurrently acquires an existing slot on the target DB + * (most likely to drop it) may cause this function to ERROR. If that happens + * it may have dropped some but not all slots. + * + * This routine isn't as efficient as it could be - but we don't drop + * databases often, especially databases with lots of slots. + */ +void +ReplicationSlotsDropDBSlots(Oid dboid) +{ + int i; + + if (max_replication_slots <= 0) + return; + +restart: + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + for (i = 0; i < max_replication_slots; i++) + { + ReplicationSlot *s; + char *slotname; + int active_pid; + + s = &ReplicationSlotCtl->replication_slots[i]; + + /* cannot change while ReplicationSlotCtlLock is held */ + if (!s->in_use) + continue; + + /* only logical slots are database specific, skip */ + if (!SlotIsLogical(s)) + continue; + + /* not our database, skip */ + if (s->data.database != dboid) + continue; + + /* NB: intentionally including invalidated slots */ + + /* acquire slot, so ReplicationSlotDropAcquired can be reused */ + SpinLockAcquire(&s->mutex); + /* can't change while ReplicationSlotControlLock is held */ + slotname = NameStr(s->data.name); + active_pid = s->active_pid; + if (active_pid == 0) + { + MyReplicationSlot = s; + s->active_pid = MyProcPid; + } + SpinLockRelease(&s->mutex); + + /* + * Even though we hold an exclusive lock on the database object a + * logical slot for that DB can still be active, e.g. if it's + * concurrently being dropped by a backend connected to another DB. + * + * That's fairly unlikely in practice, so we'll just bail out. + */ + if (active_pid) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("replication slot \"%s\" is active for PID %d", + slotname, active_pid))); + + /* + * To avoid duplicating ReplicationSlotDropAcquired() and to avoid + * holding ReplicationSlotControlLock over filesystem operations, + * release ReplicationSlotControlLock and use + * ReplicationSlotDropAcquired. + * + * As that means the set of slots could change, restart scan from the + * beginning each time we release the lock. + */ + LWLockRelease(ReplicationSlotControlLock); + ReplicationSlotDropAcquired(); + goto restart; + } + LWLockRelease(ReplicationSlotControlLock); +} + + +/* + * Check whether the server's configuration supports using replication + * slots. + */ +void +CheckSlotRequirements(void) +{ + /* + * NB: Adding a new requirement likely means that RestoreSlotFromDisk() + * needs the same check. + */ + + if (max_replication_slots == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("replication slots can only be used if max_replication_slots > 0"))); + + if (wal_level < WAL_LEVEL_REPLICA) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("replication slots can only be used if wal_level >= replica"))); +} + +/* + * Check whether the user has privilege to use replication slots. + */ +void +CheckSlotPermissions(void) +{ + if (!has_rolreplication(GetUserId())) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to use replication slots"), + errdetail("Only roles with the %s attribute may use replication slots.", + "REPLICATION"))); +} + +/* + * Reserve WAL for the currently active slot. + * + * Compute and set restart_lsn in a manner that's appropriate for the type of + * the slot and concurrency safe. + */ +void +ReplicationSlotReserveWal(void) +{ + ReplicationSlot *slot = MyReplicationSlot; + + Assert(slot != NULL); + Assert(slot->data.restart_lsn == InvalidXLogRecPtr); + + /* + * The replication slot mechanism is used to prevent removal of required + * WAL. As there is no interlock between this routine and checkpoints, WAL + * segments could concurrently be removed when a now stale return value of + * ReplicationSlotsComputeRequiredLSN() is used. In the unlikely case that + * this happens we'll just retry. + */ + while (true) + { + XLogSegNo segno; + XLogRecPtr restart_lsn; + + /* + * For logical slots log a standby snapshot and start logical decoding + * at exactly that position. That allows the slot to start up more + * quickly. But on a standby we cannot do WAL writes, so just use the + * replay pointer; effectively, an attempt to create a logical slot on + * standby will cause it to wait for an xl_running_xact record to be + * logged independently on the primary, so that a snapshot can be + * built using the record. + * + * None of this is needed (or indeed helpful) for physical slots as + * they'll start replay at the last logged checkpoint anyway. Instead + * return the location of the last redo LSN. While that slightly + * increases the chance that we have to retry, it's where a base + * backup has to start replay at. + */ + if (SlotIsPhysical(slot)) + restart_lsn = GetRedoRecPtr(); + else if (RecoveryInProgress()) + restart_lsn = GetXLogReplayRecPtr(NULL); + else + restart_lsn = GetXLogInsertRecPtr(); + + SpinLockAcquire(&slot->mutex); + slot->data.restart_lsn = restart_lsn; + SpinLockRelease(&slot->mutex); + + /* prevent WAL removal as fast as possible */ + ReplicationSlotsComputeRequiredLSN(); + + /* + * If all required WAL is still there, great, otherwise retry. The + * slot should prevent further removal of WAL, unless there's a + * concurrent ReplicationSlotsComputeRequiredLSN() after we've written + * the new restart_lsn above, so normally we should never need to loop + * more than twice. + */ + XLByteToSeg(slot->data.restart_lsn, segno, wal_segment_size); + if (XLogGetLastRemovedSegno() < segno) + break; + } + + if (!RecoveryInProgress() && SlotIsLogical(slot)) + { + XLogRecPtr flushptr; + + /* make sure we have enough information to start */ + flushptr = LogStandbySnapshot(); + + /* and make sure it's fsynced to disk */ + XLogFlush(flushptr); + } +} + +/* + * Report that replication slot needs to be invalidated + */ +static void +ReportSlotInvalidation(ReplicationSlotInvalidationCause cause, + bool terminating, + int pid, + NameData slotname, + XLogRecPtr restart_lsn, + XLogRecPtr oldestLSN, + TransactionId snapshotConflictHorizon) +{ + StringInfoData err_detail; + bool hint = false; + + initStringInfo(&err_detail); + + switch (cause) + { + case RS_INVAL_WAL_REMOVED: + { + unsigned long long ex = oldestLSN - restart_lsn; + + hint = true; + appendStringInfo(&err_detail, + ngettext("The slot's restart_lsn %X/%X exceeds the limit by %llu byte.", + "The slot's restart_lsn %X/%X exceeds the limit by %llu bytes.", + ex), + LSN_FORMAT_ARGS(restart_lsn), + ex); + break; + } + case RS_INVAL_HORIZON: + appendStringInfo(&err_detail, _("The slot conflicted with xid horizon %u."), + snapshotConflictHorizon); + break; + + case RS_INVAL_WAL_LEVEL: + appendStringInfo(&err_detail, _("Logical decoding on standby requires wal_level >= logical on the primary server.")); + break; + case RS_INVAL_NONE: + pg_unreachable(); + } + + ereport(LOG, + terminating ? + errmsg("terminating process %d to release replication slot \"%s\"", + pid, NameStr(slotname)) : + errmsg("invalidating obsolete replication slot \"%s\"", + NameStr(slotname)), + errdetail_internal("%s", err_detail.data), + hint ? errhint("You might need to increase %s.", "max_slot_wal_keep_size") : 0); + + pfree(err_detail.data); +} + +/* + * Helper for InvalidateObsoleteReplicationSlots + * + * Acquires the given slot and mark it invalid, if necessary and possible. + * + * Returns whether ReplicationSlotControlLock was released in the interim (and + * in that case we're not holding the lock at return, otherwise we are). + * + * Sets *invalidated true if the slot was invalidated. (Untouched otherwise.) + * + * This is inherently racy, because we release the LWLock + * for syscalls, so caller must restart if we return true. + */ +static bool +InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause, + ReplicationSlot *s, + XLogRecPtr oldestLSN, + Oid dboid, TransactionId snapshotConflictHorizon, + bool *invalidated) +{ + int last_signaled_pid = 0; + bool released_lock = false; + + for (;;) + { + XLogRecPtr restart_lsn; + NameData slotname; + int active_pid = 0; + ReplicationSlotInvalidationCause conflict = RS_INVAL_NONE; + + Assert(LWLockHeldByMeInMode(ReplicationSlotControlLock, LW_SHARED)); + + if (!s->in_use) + { + if (released_lock) + LWLockRelease(ReplicationSlotControlLock); + break; + } + + /* + * Check if the slot needs to be invalidated. If it needs to be + * invalidated, and is not currently acquired, acquire it and mark it + * as having been invalidated. We do this with the spinlock held to + * avoid race conditions -- for example the restart_lsn could move + * forward, or the slot could be dropped. + */ + SpinLockAcquire(&s->mutex); + + restart_lsn = s->data.restart_lsn; + + /* + * If the slot is already invalid or is a non conflicting slot, we + * don't need to do anything. + */ + if (s->data.invalidated == RS_INVAL_NONE) + { + switch (cause) + { + case RS_INVAL_WAL_REMOVED: + if (s->data.restart_lsn != InvalidXLogRecPtr && + s->data.restart_lsn < oldestLSN) + conflict = cause; + break; + case RS_INVAL_HORIZON: + if (!SlotIsLogical(s)) + break; + /* invalid DB oid signals a shared relation */ + if (dboid != InvalidOid && dboid != s->data.database) + break; + if (TransactionIdIsValid(s->effective_xmin) && + TransactionIdPrecedesOrEquals(s->effective_xmin, + snapshotConflictHorizon)) + conflict = cause; + else if (TransactionIdIsValid(s->effective_catalog_xmin) && + TransactionIdPrecedesOrEquals(s->effective_catalog_xmin, + snapshotConflictHorizon)) + conflict = cause; + break; + case RS_INVAL_WAL_LEVEL: + if (SlotIsLogical(s)) + conflict = cause; + break; + case RS_INVAL_NONE: + pg_unreachable(); + } + } + + /* if there's no conflict, we're done */ + if (conflict == RS_INVAL_NONE) + { + SpinLockRelease(&s->mutex); + if (released_lock) + LWLockRelease(ReplicationSlotControlLock); + break; + } + + slotname = s->data.name; + active_pid = s->active_pid; + + /* + * If the slot can be acquired, do so and mark it invalidated + * immediately. Otherwise we'll signal the owning process, below, and + * retry. + */ + if (active_pid == 0) + { + MyReplicationSlot = s; + s->active_pid = MyProcPid; + s->data.invalidated = conflict; + + /* + * XXX: We should consider not overwriting restart_lsn and instead + * just rely on .invalidated. + */ + if (conflict == RS_INVAL_WAL_REMOVED) + s->data.restart_lsn = InvalidXLogRecPtr; + + /* Let caller know */ + *invalidated = true; + } + + SpinLockRelease(&s->mutex); + + if (active_pid != 0) + { + /* + * Prepare the sleep on the slot's condition variable before + * releasing the lock, to close a possible race condition if the + * slot is released before the sleep below. + */ + ConditionVariablePrepareToSleep(&s->active_cv); + + LWLockRelease(ReplicationSlotControlLock); + released_lock = true; + + /* + * Signal to terminate the process that owns the slot, if we + * haven't already signalled it. (Avoidance of repeated + * signalling is the only reason for there to be a loop in this + * routine; otherwise we could rely on caller's restart loop.) + * + * There is the race condition that other process may own the slot + * after its current owner process is terminated and before this + * process owns it. To handle that, we signal only if the PID of + * the owning process has changed from the previous time. (This + * logic assumes that the same PID is not reused very quickly.) + */ + if (last_signaled_pid != active_pid) + { + ReportSlotInvalidation(conflict, true, active_pid, + slotname, restart_lsn, + oldestLSN, snapshotConflictHorizon); + + if (MyBackendType == B_STARTUP) + (void) SendProcSignal(active_pid, + PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT, + InvalidBackendId); + else + (void) kill(active_pid, SIGTERM); + + last_signaled_pid = active_pid; + } + + /* Wait until the slot is released. */ + ConditionVariableSleep(&s->active_cv, + WAIT_EVENT_REPLICATION_SLOT_DROP); + + /* + * Re-acquire lock and start over; we expect to invalidate the + * slot next time (unless another process acquires the slot in the + * meantime). + */ + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + continue; + } + else + { + /* + * We hold the slot now and have already invalidated it; flush it + * to ensure that state persists. + * + * Don't want to hold ReplicationSlotControlLock across file + * system operations, so release it now but be sure to tell caller + * to restart from scratch. + */ + LWLockRelease(ReplicationSlotControlLock); + released_lock = true; + + /* Make sure the invalidated state persists across server restart */ + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + ReplicationSlotRelease(); + pgstat_drop_replslot(s); + + ReportSlotInvalidation(conflict, false, active_pid, + slotname, restart_lsn, + oldestLSN, snapshotConflictHorizon); + + /* done with this slot for now */ + break; + } + } + + Assert(released_lock == !LWLockHeldByMe(ReplicationSlotControlLock)); + + return released_lock; +} + +/* + * Invalidate slots that require resources about to be removed. + * + * Returns true when any slot have got invalidated. + * + * Whether a slot needs to be invalidated depends on the cause. A slot is + * removed if it: + * - RS_INVAL_WAL_REMOVED: requires a LSN older than the given segment + * - RS_INVAL_HORIZON: requires a snapshot <= the given horizon in the given + * db; dboid may be InvalidOid for shared relations + * - RS_INVAL_WAL_LEVEL: is logical + * + * NB - this runs as part of checkpoint, so avoid raising errors if possible. + */ +bool +InvalidateObsoleteReplicationSlots(ReplicationSlotInvalidationCause cause, + XLogSegNo oldestSegno, Oid dboid, + TransactionId snapshotConflictHorizon) +{ + XLogRecPtr oldestLSN; + bool invalidated = false; + + Assert(cause != RS_INVAL_HORIZON || TransactionIdIsValid(snapshotConflictHorizon)); + Assert(cause != RS_INVAL_WAL_REMOVED || oldestSegno > 0); + Assert(cause != RS_INVAL_NONE); + + if (max_replication_slots == 0) + return invalidated; + + XLogSegNoOffsetToRecPtr(oldestSegno, 0, wal_segment_size, oldestLSN); + +restart: + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + for (int i = 0; i < max_replication_slots; i++) + { + ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; + + if (!s->in_use) + continue; + + if (InvalidatePossiblyObsoleteSlot(cause, s, oldestLSN, dboid, + snapshotConflictHorizon, + &invalidated)) + { + /* if the lock was released, start from scratch */ + goto restart; + } + } + LWLockRelease(ReplicationSlotControlLock); + + /* + * If any slots have been invalidated, recalculate the resource limits. + */ + if (invalidated) + { + ReplicationSlotsComputeRequiredXmin(false); + ReplicationSlotsComputeRequiredLSN(); + } + + return invalidated; +} + +/* + * Flush all replication slots to disk. + * + * This needn't actually be part of a checkpoint, but it's a convenient + * location. + */ +void +CheckPointReplicationSlots(void) +{ + int i; + + elog(DEBUG1, "performing replication slot checkpoint"); + + /* + * Prevent any slot from being created/dropped while we're active. As we + * explicitly do *not* want to block iterating over replication_slots or + * acquiring a slot we cannot take the control lock - but that's OK, + * because holding ReplicationSlotAllocationLock is strictly stronger, and + * enough to guarantee that nobody can change the in_use bits on us. + */ + LWLockAcquire(ReplicationSlotAllocationLock, LW_SHARED); + + for (i = 0; i < max_replication_slots; i++) + { + ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; + char path[MAXPGPATH]; + + if (!s->in_use) + continue; + + /* save the slot to disk, locking is handled in SaveSlotToPath() */ + sprintf(path, "pg_replslot/%s", NameStr(s->data.name)); + SaveSlotToPath(s, path, LOG); + } + LWLockRelease(ReplicationSlotAllocationLock); +} + +/* + * Load all replication slots from disk into memory at server startup. This + * needs to be run before we start crash recovery. + */ +void +StartupReplicationSlots(void) +{ + DIR *replication_dir; + struct dirent *replication_de; + + elog(DEBUG1, "starting up replication slots"); + + /* restore all slots by iterating over all on-disk entries */ + replication_dir = AllocateDir("pg_replslot"); + while ((replication_de = ReadDir(replication_dir, "pg_replslot")) != NULL) + { + char path[MAXPGPATH + 12]; + PGFileType de_type; + + if (strcmp(replication_de->d_name, ".") == 0 || + strcmp(replication_de->d_name, "..") == 0) + continue; + + snprintf(path, sizeof(path), "pg_replslot/%s", replication_de->d_name); + de_type = get_dirent_type(path, replication_de, false, DEBUG1); + + /* we're only creating directories here, skip if it's not our's */ + if (de_type != PGFILETYPE_ERROR && de_type != PGFILETYPE_DIR) + continue; + + /* we crashed while a slot was being setup or deleted, clean up */ + if (pg_str_endswith(replication_de->d_name, ".tmp")) + { + if (!rmtree(path, true)) + { + ereport(WARNING, + (errmsg("could not remove directory \"%s\"", + path))); + continue; + } + fsync_fname("pg_replslot", true); + continue; + } + + /* looks like a slot in a normal state, restore */ + RestoreSlotFromDisk(replication_de->d_name); + } + FreeDir(replication_dir); + + /* currently no slots exist, we're done. */ + if (max_replication_slots <= 0) + return; + + /* Now that we have recovered all the data, compute replication xmin */ + ReplicationSlotsComputeRequiredXmin(false); + ReplicationSlotsComputeRequiredLSN(); +} + +/* ---- + * Manipulation of on-disk state of replication slots + * + * NB: none of the routines below should take any notice whether a slot is the + * current one or not, that's all handled a layer above. + * ---- + */ +static void +CreateSlotOnDisk(ReplicationSlot *slot) +{ + char tmppath[MAXPGPATH]; + char path[MAXPGPATH]; + struct stat st; + + /* + * No need to take out the io_in_progress_lock, nobody else can see this + * slot yet, so nobody else will write. We're reusing SaveSlotToPath which + * takes out the lock, if we'd take the lock here, we'd deadlock. + */ + + sprintf(path, "pg_replslot/%s", NameStr(slot->data.name)); + sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name)); + + /* + * It's just barely possible that some previous effort to create or drop a + * slot with this name left a temp directory lying around. If that seems + * to be the case, try to remove it. If the rmtree() fails, we'll error + * out at the MakePGDirectory() below, so we don't bother checking + * success. + */ + if (stat(tmppath, &st) == 0 && S_ISDIR(st.st_mode)) + rmtree(tmppath, true); + + /* Create and fsync the temporary slot directory. */ + if (MakePGDirectory(tmppath) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + tmppath))); + fsync_fname(tmppath, true); + + /* Write the actual state file. */ + slot->dirty = true; /* signal that we really need to write */ + SaveSlotToPath(slot, tmppath, ERROR); + + /* Rename the directory into place. */ + if (rename(tmppath, path) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + tmppath, path))); + + /* + * If we'd now fail - really unlikely - we wouldn't know whether this slot + * would persist after an OS crash or not - so, force a restart. The + * restart would try to fsync this again till it works. + */ + START_CRIT_SECTION(); + + fsync_fname(path, true); + fsync_fname("pg_replslot", true); + + END_CRIT_SECTION(); +} + +/* + * Shared functionality between saving and creating a replication slot. + */ +static void +SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) +{ + char tmppath[MAXPGPATH]; + char path[MAXPGPATH]; + int fd; + ReplicationSlotOnDisk cp; + bool was_dirty; + + /* first check whether there's something to write out */ + SpinLockAcquire(&slot->mutex); + was_dirty = slot->dirty; + slot->just_dirtied = false; + SpinLockRelease(&slot->mutex); + + /* and don't do anything if there's nothing to write */ + if (!was_dirty) + return; + + LWLockAcquire(&slot->io_in_progress_lock, LW_EXCLUSIVE); + + /* silence valgrind :( */ + memset(&cp, 0, sizeof(ReplicationSlotOnDisk)); + + sprintf(tmppath, "%s/state.tmp", dir); + sprintf(path, "%s/state", dir); + + fd = OpenTransientFile(tmppath, O_CREAT | O_EXCL | O_WRONLY | PG_BINARY); + if (fd < 0) + { + /* + * If not an ERROR, then release the lock before returning. In case + * of an ERROR, the error recovery path automatically releases the + * lock, but no harm in explicitly releasing even in that case. Note + * that LWLockRelease() could affect errno. + */ + int save_errno = errno; + + LWLockRelease(&slot->io_in_progress_lock); + errno = save_errno; + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + tmppath))); + return; + } + + cp.magic = SLOT_MAGIC; + INIT_CRC32C(cp.checksum); + cp.version = SLOT_VERSION; + cp.length = ReplicationSlotOnDiskV2Size; + + SpinLockAcquire(&slot->mutex); + + memcpy(&cp.slotdata, &slot->data, sizeof(ReplicationSlotPersistentData)); + + SpinLockRelease(&slot->mutex); + + COMP_CRC32C(cp.checksum, + (char *) (&cp) + ReplicationSlotOnDiskNotChecksummedSize, + ReplicationSlotOnDiskChecksummedSize); + FIN_CRC32C(cp.checksum); + + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_WRITE); + if ((write(fd, &cp, sizeof(cp))) != sizeof(cp)) + { + int save_errno = errno; + + pgstat_report_wait_end(); + CloseTransientFile(fd); + LWLockRelease(&slot->io_in_progress_lock); + + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? save_errno : ENOSPC; + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", + tmppath))); + return; + } + pgstat_report_wait_end(); + + /* fsync the temporary file */ + pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_SYNC); + if (pg_fsync(fd) != 0) + { + int save_errno = errno; + + pgstat_report_wait_end(); + CloseTransientFile(fd); + LWLockRelease(&slot->io_in_progress_lock); + errno = save_errno; + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + tmppath))); + return; + } + pgstat_report_wait_end(); + + if (CloseTransientFile(fd) != 0) + { + int save_errno = errno; + + LWLockRelease(&slot->io_in_progress_lock); + errno = save_errno; + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", + tmppath))); + return; + } + + /* rename to permanent file, fsync file and directory */ + if (rename(tmppath, path) != 0) + { + int save_errno = errno; + + LWLockRelease(&slot->io_in_progress_lock); + errno = save_errno; + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + tmppath, path))); + return; + } + + /* + * Check CreateSlotOnDisk() for the reasoning of using a critical section. + */ + START_CRIT_SECTION(); + + fsync_fname(path, false); + fsync_fname(dir, true); + fsync_fname("pg_replslot", true); + + END_CRIT_SECTION(); + + /* + * Successfully wrote, unset dirty bit, unless somebody dirtied again + * already. + */ + SpinLockAcquire(&slot->mutex); + if (!slot->just_dirtied) + slot->dirty = false; + SpinLockRelease(&slot->mutex); + + LWLockRelease(&slot->io_in_progress_lock); +} + +/* + * Load a single slot from disk into memory. + */ +static void +RestoreSlotFromDisk(const char *name) +{ + ReplicationSlotOnDisk cp; + int i; + char slotdir[MAXPGPATH + 12]; + char path[MAXPGPATH + 22]; + int fd; + bool restored = false; + int readBytes; + pg_crc32c checksum; + + /* no need to lock here, no concurrent access allowed yet */ + + /* delete temp file if it exists */ + sprintf(slotdir, "pg_replslot/%s", name); + sprintf(path, "%s/state.tmp", slotdir); + if (unlink(path) < 0 && errno != ENOENT) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", path))); + + sprintf(path, "%s/state", slotdir); + + elog(DEBUG1, "restoring replication slot from \"%s\"", path); + + /* on some operating systems fsyncing a file requires O_RDWR */ + fd = OpenTransientFile(path, O_RDWR | PG_BINARY); + + /* + * We do not need to handle this as we are rename()ing the directory into + * place only after we fsync()ed the state file. + */ + if (fd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + + /* + * Sync state file before we're reading from it. We might have crashed + * while it wasn't synced yet and we shouldn't continue on that basis. + */ + pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC); + if (pg_fsync(fd) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + path))); + pgstat_report_wait_end(); + + /* Also sync the parent directory */ + START_CRIT_SECTION(); + fsync_fname(slotdir, true); + END_CRIT_SECTION(); + + /* read part of statefile that's guaranteed to be version independent */ + pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_READ); + readBytes = read(fd, &cp, ReplicationSlotOnDiskConstantSize); + pgstat_report_wait_end(); + if (readBytes != ReplicationSlotOnDiskConstantSize) + { + if (readBytes < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", path))); + else + ereport(PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %zu", + path, readBytes, + (Size) ReplicationSlotOnDiskConstantSize))); + } + + /* verify magic */ + if (cp.magic != SLOT_MAGIC) + ereport(PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("replication slot file \"%s\" has wrong magic number: %u instead of %u", + path, cp.magic, SLOT_MAGIC))); + + /* verify version */ + if (cp.version != SLOT_VERSION) + ereport(PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("replication slot file \"%s\" has unsupported version %u", + path, cp.version))); + + /* boundary check on length */ + if (cp.length != ReplicationSlotOnDiskV2Size) + ereport(PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("replication slot file \"%s\" has corrupted length %u", + path, cp.length))); + + /* Now that we know the size, read the entire file */ + pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_READ); + readBytes = read(fd, + (char *) &cp + ReplicationSlotOnDiskConstantSize, + cp.length); + pgstat_report_wait_end(); + if (readBytes != cp.length) + { + if (readBytes < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", path))); + else + ereport(PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %zu", + path, readBytes, (Size) cp.length))); + } + + if (CloseTransientFile(fd) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + + /* now verify the CRC */ + INIT_CRC32C(checksum); + COMP_CRC32C(checksum, + (char *) &cp + ReplicationSlotOnDiskNotChecksummedSize, + ReplicationSlotOnDiskChecksummedSize); + FIN_CRC32C(checksum); + + if (!EQ_CRC32C(checksum, cp.checksum)) + ereport(PANIC, + (errmsg("checksum mismatch for replication slot file \"%s\": is %u, should be %u", + path, checksum, cp.checksum))); + + /* + * If we crashed with an ephemeral slot active, don't restore but delete + * it. + */ + if (cp.slotdata.persistency != RS_PERSISTENT) + { + if (!rmtree(slotdir, true)) + { + ereport(WARNING, + (errmsg("could not remove directory \"%s\"", + slotdir))); + } + fsync_fname("pg_replslot", true); + return; + } + + /* + * Verify that requirements for the specific slot type are met. That's + * important because if these aren't met we're not guaranteed to retain + * all the necessary resources for the slot. + * + * NB: We have to do so *after* the above checks for ephemeral slots, + * because otherwise a slot that shouldn't exist anymore could prevent + * restarts. + * + * NB: Changing the requirements here also requires adapting + * CheckSlotRequirements() and CheckLogicalDecodingRequirements(). + */ + if (cp.slotdata.database != InvalidOid && wal_level < WAL_LEVEL_LOGICAL) + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("logical replication slot \"%s\" exists, but wal_level < logical", + NameStr(cp.slotdata.name)), + errhint("Change wal_level to be logical or higher."))); + else if (wal_level < WAL_LEVEL_REPLICA) + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("physical replication slot \"%s\" exists, but wal_level < replica", + NameStr(cp.slotdata.name)), + errhint("Change wal_level to be replica or higher."))); + + /* nothing can be active yet, don't lock anything */ + for (i = 0; i < max_replication_slots; i++) + { + ReplicationSlot *slot; + + slot = &ReplicationSlotCtl->replication_slots[i]; + + if (slot->in_use) + continue; + + /* restore the entire set of persistent data */ + memcpy(&slot->data, &cp.slotdata, + sizeof(ReplicationSlotPersistentData)); + + /* initialize in memory state */ + slot->effective_xmin = cp.slotdata.xmin; + slot->effective_catalog_xmin = cp.slotdata.catalog_xmin; + + slot->candidate_catalog_xmin = InvalidTransactionId; + slot->candidate_xmin_lsn = InvalidXLogRecPtr; + slot->candidate_restart_lsn = InvalidXLogRecPtr; + slot->candidate_restart_valid = InvalidXLogRecPtr; + + slot->in_use = true; + slot->active_pid = 0; + + restored = true; + break; + } + + if (!restored) + ereport(FATAL, + (errmsg("too many replication slots active before shutdown"), + errhint("Increase max_replication_slots and try again."))); +} diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c new file mode 100644 index 0000000..6035cf4 --- /dev/null +++ b/src/backend/replication/slotfuncs.c @@ -0,0 +1,925 @@ +/*------------------------------------------------------------------------- + * + * slotfuncs.c + * Support functions for replication slots + * + * Copyright (c) 2012-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/slotfuncs.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/xlog_internal.h" +#include "access/xlogrecovery.h" +#include "access/xlogutils.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/slot.h" +#include "utils/builtins.h" +#include "utils/inval.h" +#include "utils/pg_lsn.h" +#include "utils/resowner.h" + +/* + * Helper function for creating a new physical replication slot with + * given arguments. Note that this function doesn't release the created + * slot. + * + * If restart_lsn is a valid value, we use it without WAL reservation + * routine. So the caller must guarantee that WAL is available. + */ +static void +create_physical_replication_slot(char *name, bool immediately_reserve, + bool temporary, XLogRecPtr restart_lsn) +{ + Assert(!MyReplicationSlot); + + /* acquire replication slot, this will check for conflicting names */ + ReplicationSlotCreate(name, false, + temporary ? RS_TEMPORARY : RS_PERSISTENT, false); + + if (immediately_reserve) + { + /* Reserve WAL as the user asked for it */ + if (XLogRecPtrIsInvalid(restart_lsn)) + ReplicationSlotReserveWal(); + else + MyReplicationSlot->data.restart_lsn = restart_lsn; + + /* Write this slot to disk */ + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + } +} + +/* + * SQL function for creating a new physical (streaming replication) + * replication slot. + */ +Datum +pg_create_physical_replication_slot(PG_FUNCTION_ARGS) +{ + Name name = PG_GETARG_NAME(0); + bool immediately_reserve = PG_GETARG_BOOL(1); + bool temporary = PG_GETARG_BOOL(2); + Datum values[2]; + bool nulls[2]; + TupleDesc tupdesc; + HeapTuple tuple; + Datum result; + + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + CheckSlotPermissions(); + + CheckSlotRequirements(); + + create_physical_replication_slot(NameStr(*name), + immediately_reserve, + temporary, + InvalidXLogRecPtr); + + values[0] = NameGetDatum(&MyReplicationSlot->data.name); + nulls[0] = false; + + if (immediately_reserve) + { + values[1] = LSNGetDatum(MyReplicationSlot->data.restart_lsn); + nulls[1] = false; + } + else + nulls[1] = true; + + tuple = heap_form_tuple(tupdesc, values, nulls); + result = HeapTupleGetDatum(tuple); + + ReplicationSlotRelease(); + + PG_RETURN_DATUM(result); +} + + +/* + * Helper function for creating a new logical replication slot with + * given arguments. Note that this function doesn't release the created + * slot. + * + * When find_startpoint is false, the slot's confirmed_flush is not set; it's + * caller's responsibility to ensure it's set to something sensible. + */ +static void +create_logical_replication_slot(char *name, char *plugin, + bool temporary, bool two_phase, + XLogRecPtr restart_lsn, + bool find_startpoint) +{ + LogicalDecodingContext *ctx = NULL; + + Assert(!MyReplicationSlot); + + /* + * Acquire a logical decoding slot, this will check for conflicting names. + * Initially create persistent slot as ephemeral - that allows us to + * nicely handle errors during initialization because it'll get dropped if + * this transaction fails. We'll make it persistent at the end. Temporary + * slots can be created as temporary from beginning as they get dropped on + * error as well. + */ + ReplicationSlotCreate(name, true, + temporary ? RS_TEMPORARY : RS_EPHEMERAL, two_phase); + + /* + * Create logical decoding context to find start point or, if we don't + * need it, to 1) bump slot's restart_lsn and xmin 2) check plugin sanity. + * + * Note: when !find_startpoint this is still important, because it's at + * this point that the output plugin is validated. + */ + ctx = CreateInitDecodingContext(plugin, NIL, + false, /* just catalogs is OK */ + restart_lsn, + XL_ROUTINE(.page_read = read_local_xlog_page, + .segment_open = wal_segment_open, + .segment_close = wal_segment_close), + NULL, NULL, NULL); + + /* + * If caller needs us to determine the decoding start point, do so now. + * This might take a while. + */ + if (find_startpoint) + DecodingContextFindStartpoint(ctx); + + /* don't need the decoding context anymore */ + FreeDecodingContext(ctx); +} + +/* + * SQL function for creating a new logical replication slot. + */ +Datum +pg_create_logical_replication_slot(PG_FUNCTION_ARGS) +{ + Name name = PG_GETARG_NAME(0); + Name plugin = PG_GETARG_NAME(1); + bool temporary = PG_GETARG_BOOL(2); + bool two_phase = PG_GETARG_BOOL(3); + Datum result; + TupleDesc tupdesc; + HeapTuple tuple; + Datum values[2]; + bool nulls[2]; + + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + CheckSlotPermissions(); + + CheckLogicalDecodingRequirements(); + + create_logical_replication_slot(NameStr(*name), + NameStr(*plugin), + temporary, + two_phase, + InvalidXLogRecPtr, + true); + + values[0] = NameGetDatum(&MyReplicationSlot->data.name); + values[1] = LSNGetDatum(MyReplicationSlot->data.confirmed_flush); + + memset(nulls, 0, sizeof(nulls)); + + tuple = heap_form_tuple(tupdesc, values, nulls); + result = HeapTupleGetDatum(tuple); + + /* ok, slot is now fully created, mark it as persistent if needed */ + if (!temporary) + ReplicationSlotPersist(); + ReplicationSlotRelease(); + + PG_RETURN_DATUM(result); +} + + +/* + * SQL function for dropping a replication slot. + */ +Datum +pg_drop_replication_slot(PG_FUNCTION_ARGS) +{ + Name name = PG_GETARG_NAME(0); + + CheckSlotPermissions(); + + CheckSlotRequirements(); + + ReplicationSlotDrop(NameStr(*name), true); + + PG_RETURN_VOID(); +} + +/* + * pg_get_replication_slots - SQL SRF showing all replication slots + * that currently exist on the database cluster. + */ +Datum +pg_get_replication_slots(PG_FUNCTION_ARGS) +{ +#define PG_GET_REPLICATION_SLOTS_COLS 15 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + XLogRecPtr currlsn; + int slotno; + + /* + * We don't require any special permission to see this function's data + * because nothing should be sensitive. The most critical being the slot + * name, which shouldn't contain anything particularly sensitive. + */ + + InitMaterializedSRF(fcinfo, 0); + + currlsn = GetXLogWriteRecPtr(); + + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + for (slotno = 0; slotno < max_replication_slots; slotno++) + { + ReplicationSlot *slot = &ReplicationSlotCtl->replication_slots[slotno]; + ReplicationSlot slot_contents; + Datum values[PG_GET_REPLICATION_SLOTS_COLS]; + bool nulls[PG_GET_REPLICATION_SLOTS_COLS]; + WALAvailability walstate; + int i; + + if (!slot->in_use) + continue; + + /* Copy slot contents while holding spinlock, then examine at leisure */ + SpinLockAcquire(&slot->mutex); + slot_contents = *slot; + SpinLockRelease(&slot->mutex); + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + i = 0; + values[i++] = NameGetDatum(&slot_contents.data.name); + + if (slot_contents.data.database == InvalidOid) + nulls[i++] = true; + else + values[i++] = NameGetDatum(&slot_contents.data.plugin); + + if (slot_contents.data.database == InvalidOid) + values[i++] = CStringGetTextDatum("physical"); + else + values[i++] = CStringGetTextDatum("logical"); + + if (slot_contents.data.database == InvalidOid) + nulls[i++] = true; + else + values[i++] = ObjectIdGetDatum(slot_contents.data.database); + + values[i++] = BoolGetDatum(slot_contents.data.persistency == RS_TEMPORARY); + values[i++] = BoolGetDatum(slot_contents.active_pid != 0); + + if (slot_contents.active_pid != 0) + values[i++] = Int32GetDatum(slot_contents.active_pid); + else + nulls[i++] = true; + + if (slot_contents.data.xmin != InvalidTransactionId) + values[i++] = TransactionIdGetDatum(slot_contents.data.xmin); + else + nulls[i++] = true; + + if (slot_contents.data.catalog_xmin != InvalidTransactionId) + values[i++] = TransactionIdGetDatum(slot_contents.data.catalog_xmin); + else + nulls[i++] = true; + + if (slot_contents.data.restart_lsn != InvalidXLogRecPtr) + values[i++] = LSNGetDatum(slot_contents.data.restart_lsn); + else + nulls[i++] = true; + + if (slot_contents.data.confirmed_flush != InvalidXLogRecPtr) + values[i++] = LSNGetDatum(slot_contents.data.confirmed_flush); + else + nulls[i++] = true; + + /* + * If the slot has not been invalidated, test availability from + * restart_lsn. + */ + if (slot_contents.data.invalidated != RS_INVAL_NONE) + walstate = WALAVAIL_REMOVED; + else + walstate = GetWALAvailability(slot_contents.data.restart_lsn); + + switch (walstate) + { + case WALAVAIL_INVALID_LSN: + nulls[i++] = true; + break; + + case WALAVAIL_RESERVED: + values[i++] = CStringGetTextDatum("reserved"); + break; + + case WALAVAIL_EXTENDED: + values[i++] = CStringGetTextDatum("extended"); + break; + + case WALAVAIL_UNRESERVED: + values[i++] = CStringGetTextDatum("unreserved"); + break; + + case WALAVAIL_REMOVED: + + /* + * If we read the restart_lsn long enough ago, maybe that file + * has been removed by now. However, the walsender could have + * moved forward enough that it jumped to another file after + * we looked. If checkpointer signalled the process to + * termination, then it's definitely lost; but if a process is + * still alive, then "unreserved" seems more appropriate. + * + * If we do change it, save the state for safe_wal_size below. + */ + if (!XLogRecPtrIsInvalid(slot_contents.data.restart_lsn)) + { + int pid; + + SpinLockAcquire(&slot->mutex); + pid = slot->active_pid; + slot_contents.data.restart_lsn = slot->data.restart_lsn; + SpinLockRelease(&slot->mutex); + if (pid != 0) + { + values[i++] = CStringGetTextDatum("unreserved"); + walstate = WALAVAIL_UNRESERVED; + break; + } + } + values[i++] = CStringGetTextDatum("lost"); + break; + } + + /* + * safe_wal_size is only computed for slots that have not been lost, + * and only if there's a configured maximum size. + */ + if (walstate == WALAVAIL_REMOVED || max_slot_wal_keep_size_mb < 0) + nulls[i++] = true; + else + { + XLogSegNo targetSeg; + uint64 slotKeepSegs; + uint64 keepSegs; + XLogSegNo failSeg; + XLogRecPtr failLSN; + + XLByteToSeg(slot_contents.data.restart_lsn, targetSeg, wal_segment_size); + + /* determine how many segments can be kept by slots */ + slotKeepSegs = XLogMBVarToSegs(max_slot_wal_keep_size_mb, wal_segment_size); + /* ditto for wal_keep_size */ + keepSegs = XLogMBVarToSegs(wal_keep_size_mb, wal_segment_size); + + /* if currpos reaches failLSN, we lose our segment */ + failSeg = targetSeg + Max(slotKeepSegs, keepSegs) + 1; + XLogSegNoOffsetToRecPtr(failSeg, 0, wal_segment_size, failLSN); + + values[i++] = Int64GetDatum(failLSN - currlsn); + } + + values[i++] = BoolGetDatum(slot_contents.data.two_phase); + + if (slot_contents.data.database == InvalidOid) + nulls[i++] = true; + else + { + if (slot_contents.data.invalidated != RS_INVAL_NONE) + values[i++] = BoolGetDatum(true); + else + values[i++] = BoolGetDatum(false); + } + + Assert(i == PG_GET_REPLICATION_SLOTS_COLS); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + + LWLockRelease(ReplicationSlotControlLock); + + return (Datum) 0; +} + +/* + * Helper function for advancing our physical replication slot forward. + * + * The LSN position to move to is compared simply to the slot's restart_lsn, + * knowing that any position older than that would be removed by successive + * checkpoints. + */ +static XLogRecPtr +pg_physical_replication_slot_advance(XLogRecPtr moveto) +{ + XLogRecPtr startlsn = MyReplicationSlot->data.restart_lsn; + XLogRecPtr retlsn = startlsn; + + Assert(moveto != InvalidXLogRecPtr); + + if (startlsn < moveto) + { + SpinLockAcquire(&MyReplicationSlot->mutex); + MyReplicationSlot->data.restart_lsn = moveto; + SpinLockRelease(&MyReplicationSlot->mutex); + retlsn = moveto; + + /* + * Dirty the slot so as it is written out at the next checkpoint. Note + * that the LSN position advanced may still be lost in the event of a + * crash, but this makes the data consistent after a clean shutdown. + */ + ReplicationSlotMarkDirty(); + } + + return retlsn; +} + +/* + * Helper function for advancing our logical replication slot forward. + * + * The slot's restart_lsn is used as start point for reading records, while + * confirmed_flush is used as base point for the decoding context. + * + * We cannot just do LogicalConfirmReceivedLocation to update confirmed_flush, + * because we need to digest WAL to advance restart_lsn allowing to recycle + * WAL and removal of old catalog tuples. As decoding is done in fast_forward + * mode, no changes are generated anyway. + */ +static XLogRecPtr +pg_logical_replication_slot_advance(XLogRecPtr moveto) +{ + LogicalDecodingContext *ctx; + ResourceOwner old_resowner = CurrentResourceOwner; + XLogRecPtr retlsn; + + Assert(moveto != InvalidXLogRecPtr); + + PG_TRY(); + { + /* + * Create our decoding context in fast_forward mode, passing start_lsn + * as InvalidXLogRecPtr, so that we start processing from my slot's + * confirmed_flush. + */ + ctx = CreateDecodingContext(InvalidXLogRecPtr, + NIL, + true, /* fast_forward */ + XL_ROUTINE(.page_read = read_local_xlog_page, + .segment_open = wal_segment_open, + .segment_close = wal_segment_close), + NULL, NULL, NULL); + + /* + * Start reading at the slot's restart_lsn, which we know to point to + * a valid record. + */ + XLogBeginRead(ctx->reader, MyReplicationSlot->data.restart_lsn); + + /* invalidate non-timetravel entries */ + InvalidateSystemCaches(); + + /* Decode at least one record, until we run out of records */ + while (ctx->reader->EndRecPtr < moveto) + { + char *errm = NULL; + XLogRecord *record; + + /* + * Read records. No changes are generated in fast_forward mode, + * but snapbuilder/slot statuses are updated properly. + */ + record = XLogReadRecord(ctx->reader, &errm); + if (errm) + elog(ERROR, "could not find record while advancing replication slot: %s", + errm); + + /* + * Process the record. Storage-level changes are ignored in + * fast_forward mode, but other modules (such as snapbuilder) + * might still have critical updates to do. + */ + if (record) + LogicalDecodingProcessRecord(ctx, ctx->reader); + + /* Stop once the requested target has been reached */ + if (moveto <= ctx->reader->EndRecPtr) + break; + + CHECK_FOR_INTERRUPTS(); + } + + /* + * Logical decoding could have clobbered CurrentResourceOwner during + * transaction management, so restore the executor's value. (This is + * a kluge, but it's not worth cleaning up right now.) + */ + CurrentResourceOwner = old_resowner; + + if (ctx->reader->EndRecPtr != InvalidXLogRecPtr) + { + LogicalConfirmReceivedLocation(moveto); + + /* + * If only the confirmed_flush LSN has changed the slot won't get + * marked as dirty by the above. Callers on the walsender + * interface are expected to keep track of their own progress and + * don't need it written out. But SQL-interface users cannot + * specify their own start positions and it's harder for them to + * keep track of their progress, so we should make more of an + * effort to save it for them. + * + * Dirty the slot so it is written out at the next checkpoint. The + * LSN position advanced to may still be lost on a crash but this + * makes the data consistent after a clean shutdown. + */ + ReplicationSlotMarkDirty(); + } + + retlsn = MyReplicationSlot->data.confirmed_flush; + + /* free context, call shutdown callback */ + FreeDecodingContext(ctx); + + InvalidateSystemCaches(); + } + PG_CATCH(); + { + /* clear all timetravel entries */ + InvalidateSystemCaches(); + + PG_RE_THROW(); + } + PG_END_TRY(); + + return retlsn; +} + +/* + * SQL function for moving the position in a replication slot. + */ +Datum +pg_replication_slot_advance(PG_FUNCTION_ARGS) +{ + Name slotname = PG_GETARG_NAME(0); + XLogRecPtr moveto = PG_GETARG_LSN(1); + XLogRecPtr endlsn; + XLogRecPtr minlsn; + TupleDesc tupdesc; + Datum values[2]; + bool nulls[2]; + HeapTuple tuple; + Datum result; + + Assert(!MyReplicationSlot); + + CheckSlotPermissions(); + + if (XLogRecPtrIsInvalid(moveto)) + ereport(ERROR, + (errmsg("invalid target WAL LSN"))); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + /* + * We can't move slot past what's been flushed/replayed so clamp the + * target position accordingly. + */ + if (!RecoveryInProgress()) + moveto = Min(moveto, GetFlushRecPtr(NULL)); + else + moveto = Min(moveto, GetXLogReplayRecPtr(NULL)); + + /* Acquire the slot so we "own" it */ + ReplicationSlotAcquire(NameStr(*slotname), true); + + /* A slot whose restart_lsn has never been reserved cannot be advanced */ + if (XLogRecPtrIsInvalid(MyReplicationSlot->data.restart_lsn)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("replication slot \"%s\" cannot be advanced", + NameStr(*slotname)), + errdetail("This slot has never previously reserved WAL, or it has been invalidated."))); + + /* + * Check if the slot is not moving backwards. Physical slots rely simply + * on restart_lsn as a minimum point, while logical slots have confirmed + * consumption up to confirmed_flush, meaning that in both cases data + * older than that is not available anymore. + */ + if (OidIsValid(MyReplicationSlot->data.database)) + minlsn = MyReplicationSlot->data.confirmed_flush; + else + minlsn = MyReplicationSlot->data.restart_lsn; + + if (moveto < minlsn) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot advance replication slot to %X/%X, minimum is %X/%X", + LSN_FORMAT_ARGS(moveto), LSN_FORMAT_ARGS(minlsn)))); + + /* Do the actual slot update, depending on the slot type */ + if (OidIsValid(MyReplicationSlot->data.database)) + endlsn = pg_logical_replication_slot_advance(moveto); + else + endlsn = pg_physical_replication_slot_advance(moveto); + + values[0] = NameGetDatum(&MyReplicationSlot->data.name); + nulls[0] = false; + + /* + * Recompute the minimum LSN and xmin across all slots to adjust with the + * advancing potentially done. + */ + ReplicationSlotsComputeRequiredXmin(false); + ReplicationSlotsComputeRequiredLSN(); + + ReplicationSlotRelease(); + + /* Return the reached position. */ + values[1] = LSNGetDatum(endlsn); + nulls[1] = false; + + tuple = heap_form_tuple(tupdesc, values, nulls); + result = HeapTupleGetDatum(tuple); + + PG_RETURN_DATUM(result); +} + +/* + * Helper function of copying a replication slot. + */ +static Datum +copy_replication_slot(FunctionCallInfo fcinfo, bool logical_slot) +{ + Name src_name = PG_GETARG_NAME(0); + Name dst_name = PG_GETARG_NAME(1); + ReplicationSlot *src = NULL; + ReplicationSlot first_slot_contents; + ReplicationSlot second_slot_contents; + XLogRecPtr src_restart_lsn; + bool src_islogical; + bool temporary; + char *plugin; + Datum values[2]; + bool nulls[2]; + Datum result; + TupleDesc tupdesc; + HeapTuple tuple; + + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + CheckSlotPermissions(); + + if (logical_slot) + CheckLogicalDecodingRequirements(); + else + CheckSlotRequirements(); + + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + + /* + * We need to prevent the source slot's reserved WAL from being removed, + * but we don't want to lock that slot for very long, and it can advance + * in the meantime. So obtain the source slot's data, and create a new + * slot using its restart_lsn. Afterwards we lock the source slot again + * and verify that the data we copied (name, type) has not changed + * incompatibly. No inconvenient WAL removal can occur once the new slot + * is created -- but since WAL removal could have occurred before we + * managed to create the new slot, we advance the new slot's restart_lsn + * to the source slot's updated restart_lsn the second time we lock it. + */ + for (int i = 0; i < max_replication_slots; i++) + { + ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; + + if (s->in_use && strcmp(NameStr(s->data.name), NameStr(*src_name)) == 0) + { + /* Copy the slot contents while holding spinlock */ + SpinLockAcquire(&s->mutex); + first_slot_contents = *s; + SpinLockRelease(&s->mutex); + src = s; + break; + } + } + + LWLockRelease(ReplicationSlotControlLock); + + if (src == NULL) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("replication slot \"%s\" does not exist", NameStr(*src_name)))); + + src_islogical = SlotIsLogical(&first_slot_contents); + src_restart_lsn = first_slot_contents.data.restart_lsn; + temporary = (first_slot_contents.data.persistency == RS_TEMPORARY); + plugin = logical_slot ? NameStr(first_slot_contents.data.plugin) : NULL; + + /* Check type of replication slot */ + if (src_islogical != logical_slot) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + src_islogical ? + errmsg("cannot copy physical replication slot \"%s\" as a logical replication slot", + NameStr(*src_name)) : + errmsg("cannot copy logical replication slot \"%s\" as a physical replication slot", + NameStr(*src_name)))); + + /* Copying non-reserved slot doesn't make sense */ + if (XLogRecPtrIsInvalid(src_restart_lsn)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot copy a replication slot that doesn't reserve WAL"))); + + /* Overwrite params from optional arguments */ + if (PG_NARGS() >= 3) + temporary = PG_GETARG_BOOL(2); + if (PG_NARGS() >= 4) + { + Assert(logical_slot); + plugin = NameStr(*(PG_GETARG_NAME(3))); + } + + /* Create new slot and acquire it */ + if (logical_slot) + { + /* + * We must not try to read WAL, since we haven't reserved it yet -- + * hence pass find_startpoint false. confirmed_flush will be set + * below, by copying from the source slot. + */ + create_logical_replication_slot(NameStr(*dst_name), + plugin, + temporary, + false, + src_restart_lsn, + false); + } + else + create_physical_replication_slot(NameStr(*dst_name), + true, + temporary, + src_restart_lsn); + + /* + * Update the destination slot to current values of the source slot; + * recheck that the source slot is still the one we saw previously. + */ + { + TransactionId copy_effective_xmin; + TransactionId copy_effective_catalog_xmin; + TransactionId copy_xmin; + TransactionId copy_catalog_xmin; + XLogRecPtr copy_restart_lsn; + XLogRecPtr copy_confirmed_flush; + bool copy_islogical; + char *copy_name; + + /* Copy data of source slot again */ + SpinLockAcquire(&src->mutex); + second_slot_contents = *src; + SpinLockRelease(&src->mutex); + + copy_effective_xmin = second_slot_contents.effective_xmin; + copy_effective_catalog_xmin = second_slot_contents.effective_catalog_xmin; + + copy_xmin = second_slot_contents.data.xmin; + copy_catalog_xmin = second_slot_contents.data.catalog_xmin; + copy_restart_lsn = second_slot_contents.data.restart_lsn; + copy_confirmed_flush = second_slot_contents.data.confirmed_flush; + + /* for existence check */ + copy_name = NameStr(second_slot_contents.data.name); + copy_islogical = SlotIsLogical(&second_slot_contents); + + /* + * Check if the source slot still exists and is valid. We regard it as + * invalid if the type of replication slot or name has been changed, + * or the restart_lsn either is invalid or has gone backward. (The + * restart_lsn could go backwards if the source slot is dropped and + * copied from an older slot during installation.) + * + * Since erroring out will release and drop the destination slot we + * don't need to release it here. + */ + if (copy_restart_lsn < src_restart_lsn || + src_islogical != copy_islogical || + strcmp(copy_name, NameStr(*src_name)) != 0) + ereport(ERROR, + (errmsg("could not copy replication slot \"%s\"", + NameStr(*src_name)), + errdetail("The source replication slot was modified incompatibly during the copy operation."))); + + /* The source slot must have a consistent snapshot */ + if (src_islogical && XLogRecPtrIsInvalid(copy_confirmed_flush)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot copy unfinished logical replication slot \"%s\"", + NameStr(*src_name)), + errhint("Retry when the source replication slot's confirmed_flush_lsn is valid."))); + + /* Install copied values again */ + SpinLockAcquire(&MyReplicationSlot->mutex); + MyReplicationSlot->effective_xmin = copy_effective_xmin; + MyReplicationSlot->effective_catalog_xmin = copy_effective_catalog_xmin; + + MyReplicationSlot->data.xmin = copy_xmin; + MyReplicationSlot->data.catalog_xmin = copy_catalog_xmin; + MyReplicationSlot->data.restart_lsn = copy_restart_lsn; + MyReplicationSlot->data.confirmed_flush = copy_confirmed_flush; + SpinLockRelease(&MyReplicationSlot->mutex); + + ReplicationSlotMarkDirty(); + ReplicationSlotsComputeRequiredXmin(false); + ReplicationSlotsComputeRequiredLSN(); + ReplicationSlotSave(); + +#ifdef USE_ASSERT_CHECKING + /* Check that the restart_lsn is available */ + { + XLogSegNo segno; + + XLByteToSeg(copy_restart_lsn, segno, wal_segment_size); + Assert(XLogGetLastRemovedSegno() < segno); + } +#endif + } + + /* target slot fully created, mark as persistent if needed */ + if (logical_slot && !temporary) + ReplicationSlotPersist(); + + /* All done. Set up the return values */ + values[0] = NameGetDatum(dst_name); + nulls[0] = false; + if (!XLogRecPtrIsInvalid(MyReplicationSlot->data.confirmed_flush)) + { + values[1] = LSNGetDatum(MyReplicationSlot->data.confirmed_flush); + nulls[1] = false; + } + else + nulls[1] = true; + + tuple = heap_form_tuple(tupdesc, values, nulls); + result = HeapTupleGetDatum(tuple); + + ReplicationSlotRelease(); + + PG_RETURN_DATUM(result); +} + +/* The wrappers below are all to appease opr_sanity */ +Datum +pg_copy_logical_replication_slot_a(PG_FUNCTION_ARGS) +{ + return copy_replication_slot(fcinfo, true); +} + +Datum +pg_copy_logical_replication_slot_b(PG_FUNCTION_ARGS) +{ + return copy_replication_slot(fcinfo, true); +} + +Datum +pg_copy_logical_replication_slot_c(PG_FUNCTION_ARGS) +{ + return copy_replication_slot(fcinfo, true); +} + +Datum +pg_copy_physical_replication_slot_a(PG_FUNCTION_ARGS) +{ + return copy_replication_slot(fcinfo, false); +} + +Datum +pg_copy_physical_replication_slot_b(PG_FUNCTION_ARGS) +{ + return copy_replication_slot(fcinfo, false); +} diff --git a/src/backend/replication/syncrep.c b/src/backend/replication/syncrep.c new file mode 100644 index 0000000..0ea71b5 --- /dev/null +++ b/src/backend/replication/syncrep.c @@ -0,0 +1,1077 @@ +/*------------------------------------------------------------------------- + * + * syncrep.c + * + * Synchronous replication is new as of PostgreSQL 9.1. + * + * If requested, transaction commits wait until their commit LSN are + * acknowledged by the synchronous standbys. + * + * This module contains the code for waiting and release of backends. + * All code in this module executes on the primary. The core streaming + * replication transport remains within WALreceiver/WALsender modules. + * + * The essence of this design is that it isolates all logic about + * waiting/releasing onto the primary. The primary defines which standbys + * it wishes to wait for. The standbys are completely unaware of the + * durability requirements of transactions on the primary, reducing the + * complexity of the code and streamlining both standby operations and + * network bandwidth because there is no requirement to ship + * per-transaction state information. + * + * Replication is either synchronous or not synchronous (async). If it is + * async, we just fastpath out of here. If it is sync, then we wait for + * the write, flush or apply location on the standby before releasing + * the waiting backend. Further complexity in that interaction is + * expected in later releases. + * + * The best performing way to manage the waiting backends is to have a + * single ordered queue of waiting backends, so that we can avoid + * searching the through all waiters each time we receive a reply. + * + * In 9.5 or before only a single standby could be considered as + * synchronous. In 9.6 we support a priority-based multiple synchronous + * standbys. In 10.0 a quorum-based multiple synchronous standbys is also + * supported. The number of synchronous standbys that transactions + * must wait for replies from is specified in synchronous_standby_names. + * This parameter also specifies a list of standby names and the method + * (FIRST and ANY) to choose synchronous standbys from the listed ones. + * + * The method FIRST specifies a priority-based synchronous replication + * and makes transaction commits wait until their WAL records are + * replicated to the requested number of synchronous standbys chosen based + * on their priorities. The standbys whose names appear earlier in the list + * are given higher priority and will be considered as synchronous. + * Other standby servers appearing later in this list represent potential + * synchronous standbys. If any of the current synchronous standbys + * disconnects for whatever reason, it will be replaced immediately with + * the next-highest-priority standby. + * + * The method ANY specifies a quorum-based synchronous replication + * and makes transaction commits wait until their WAL records are + * replicated to at least the requested number of synchronous standbys + * in the list. All the standbys appearing in the list are considered as + * candidates for quorum synchronous standbys. + * + * If neither FIRST nor ANY is specified, FIRST is used as the method. + * This is for backward compatibility with 9.6 or before where only a + * priority-based sync replication was supported. + * + * Before the standbys chosen from synchronous_standby_names can + * become the synchronous standbys they must have caught up with + * the primary; that may take some time. Once caught up, + * the standbys which are considered as synchronous at that moment + * will release waiters from the queue. + * + * Portions Copyright (c) 2010-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/syncrep.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <unistd.h> + +#include "access/xact.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "replication/syncrep.h" +#include "replication/walsender.h" +#include "replication/walsender_private.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "tcop/tcopprot.h" +#include "utils/builtins.h" +#include "utils/guc_hooks.h" +#include "utils/ps_status.h" + +/* User-settable parameters for sync rep */ +char *SyncRepStandbyNames; + +#define SyncStandbysDefined() \ + (SyncRepStandbyNames != NULL && SyncRepStandbyNames[0] != '\0') + +static bool announce_next_takeover = true; + +SyncRepConfigData *SyncRepConfig = NULL; +static int SyncRepWaitMode = SYNC_REP_NO_WAIT; + +static void SyncRepQueueInsert(int mode); +static void SyncRepCancelWait(void); +static int SyncRepWakeQueue(bool all, int mode); + +static bool SyncRepGetSyncRecPtr(XLogRecPtr *writePtr, + XLogRecPtr *flushPtr, + XLogRecPtr *applyPtr, + bool *am_sync); +static void SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr, + XLogRecPtr *flushPtr, + XLogRecPtr *applyPtr, + SyncRepStandbyData *sync_standbys, + int num_standbys); +static void SyncRepGetNthLatestSyncRecPtr(XLogRecPtr *writePtr, + XLogRecPtr *flushPtr, + XLogRecPtr *applyPtr, + SyncRepStandbyData *sync_standbys, + int num_standbys, + uint8 nth); +static int SyncRepGetStandbyPriority(void); +static int standby_priority_comparator(const void *a, const void *b); +static int cmp_lsn(const void *a, const void *b); + +#ifdef USE_ASSERT_CHECKING +static bool SyncRepQueueIsOrderedByLSN(int mode); +#endif + +/* + * =========================================================== + * Synchronous Replication functions for normal user backends + * =========================================================== + */ + +/* + * Wait for synchronous replication, if requested by user. + * + * Initially backends start in state SYNC_REP_NOT_WAITING and then + * change that state to SYNC_REP_WAITING before adding ourselves + * to the wait queue. During SyncRepWakeQueue() a WALSender changes + * the state to SYNC_REP_WAIT_COMPLETE once replication is confirmed. + * This backend then resets its state to SYNC_REP_NOT_WAITING. + * + * 'lsn' represents the LSN to wait for. 'commit' indicates whether this LSN + * represents a commit record. If it doesn't, then we wait only for the WAL + * to be flushed if synchronous_commit is set to the higher level of + * remote_apply, because only commit records provide apply feedback. + */ +void +SyncRepWaitForLSN(XLogRecPtr lsn, bool commit) +{ + int mode; + + /* + * This should be called while holding interrupts during a transaction + * commit to prevent the follow-up shared memory queue cleanups to be + * influenced by external interruptions. + */ + Assert(InterruptHoldoffCount > 0); + + /* + * Fast exit if user has not requested sync replication, or there are no + * sync replication standby names defined. + * + * Since this routine gets called every commit time, it's important to + * exit quickly if sync replication is not requested. So we check + * WalSndCtl->sync_standbys_defined flag without the lock and exit + * immediately if it's false. If it's true, we need to check it again + * later while holding the lock, to check the flag and operate the sync + * rep queue atomically. This is necessary to avoid the race condition + * described in SyncRepUpdateSyncStandbysDefined(). On the other hand, if + * it's false, the lock is not necessary because we don't touch the queue. + */ + if (!SyncRepRequested() || + !((volatile WalSndCtlData *) WalSndCtl)->sync_standbys_defined) + return; + + /* Cap the level for anything other than commit to remote flush only. */ + if (commit) + mode = SyncRepWaitMode; + else + mode = Min(SyncRepWaitMode, SYNC_REP_WAIT_FLUSH); + + Assert(dlist_node_is_detached(&MyProc->syncRepLinks)); + Assert(WalSndCtl != NULL); + + LWLockAcquire(SyncRepLock, LW_EXCLUSIVE); + Assert(MyProc->syncRepState == SYNC_REP_NOT_WAITING); + + /* + * We don't wait for sync rep if WalSndCtl->sync_standbys_defined is not + * set. See SyncRepUpdateSyncStandbysDefined. + * + * Also check that the standby hasn't already replied. Unlikely race + * condition but we'll be fetching that cache line anyway so it's likely + * to be a low cost check. + */ + if (!WalSndCtl->sync_standbys_defined || + lsn <= WalSndCtl->lsn[mode]) + { + LWLockRelease(SyncRepLock); + return; + } + + /* + * Set our waitLSN so WALSender will know when to wake us, and add + * ourselves to the queue. + */ + MyProc->waitLSN = lsn; + MyProc->syncRepState = SYNC_REP_WAITING; + SyncRepQueueInsert(mode); + Assert(SyncRepQueueIsOrderedByLSN(mode)); + LWLockRelease(SyncRepLock); + + /* Alter ps display to show waiting for sync rep. */ + if (update_process_title) + { + char buffer[32]; + + sprintf(buffer, "waiting for %X/%X", LSN_FORMAT_ARGS(lsn)); + set_ps_display_suffix(buffer); + } + + /* + * Wait for specified LSN to be confirmed. + * + * Each proc has its own wait latch, so we perform a normal latch + * check/wait loop here. + */ + for (;;) + { + int rc; + + /* Must reset the latch before testing state. */ + ResetLatch(MyLatch); + + /* + * Acquiring the lock is not needed, the latch ensures proper + * barriers. If it looks like we're done, we must really be done, + * because once walsender changes the state to SYNC_REP_WAIT_COMPLETE, + * it will never update it again, so we can't be seeing a stale value + * in that case. + */ + if (MyProc->syncRepState == SYNC_REP_WAIT_COMPLETE) + break; + + /* + * If a wait for synchronous replication is pending, we can neither + * acknowledge the commit nor raise ERROR or FATAL. The latter would + * lead the client to believe that the transaction aborted, which is + * not true: it's already committed locally. The former is no good + * either: the client has requested synchronous replication, and is + * entitled to assume that an acknowledged commit is also replicated, + * which might not be true. So in this case we issue a WARNING (which + * some clients may be able to interpret) and shut off further output. + * We do NOT reset ProcDiePending, so that the process will die after + * the commit is cleaned up. + */ + if (ProcDiePending) + { + ereport(WARNING, + (errcode(ERRCODE_ADMIN_SHUTDOWN), + errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"), + errdetail("The transaction has already committed locally, but might not have been replicated to the standby."))); + whereToSendOutput = DestNone; + SyncRepCancelWait(); + break; + } + + /* + * It's unclear what to do if a query cancel interrupt arrives. We + * can't actually abort at this point, but ignoring the interrupt + * altogether is not helpful, so we just terminate the wait with a + * suitable warning. + */ + if (QueryCancelPending) + { + QueryCancelPending = false; + ereport(WARNING, + (errmsg("canceling wait for synchronous replication due to user request"), + errdetail("The transaction has already committed locally, but might not have been replicated to the standby."))); + SyncRepCancelWait(); + break; + } + + /* + * Wait on latch. Any condition that should wake us up will set the + * latch, so no need for timeout. + */ + rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1, + WAIT_EVENT_SYNC_REP); + + /* + * If the postmaster dies, we'll probably never get an acknowledgment, + * because all the wal sender processes will exit. So just bail out. + */ + if (rc & WL_POSTMASTER_DEATH) + { + ProcDiePending = true; + whereToSendOutput = DestNone; + SyncRepCancelWait(); + break; + } + } + + /* + * WalSender has checked our LSN and has removed us from queue. Clean up + * state and leave. It's OK to reset these shared memory fields without + * holding SyncRepLock, because any walsenders will ignore us anyway when + * we're not on the queue. We need a read barrier to make sure we see the + * changes to the queue link (this might be unnecessary without + * assertions, but better safe than sorry). + */ + pg_read_barrier(); + Assert(dlist_node_is_detached(&MyProc->syncRepLinks)); + MyProc->syncRepState = SYNC_REP_NOT_WAITING; + MyProc->waitLSN = 0; + + /* reset ps display to remove the suffix */ + if (update_process_title) + set_ps_display_remove_suffix(); +} + +/* + * Insert MyProc into the specified SyncRepQueue, maintaining sorted invariant. + * + * Usually we will go at tail of queue, though it's possible that we arrive + * here out of order, so start at tail and work back to insertion point. + */ +static void +SyncRepQueueInsert(int mode) +{ + dlist_head *queue; + dlist_iter iter; + + Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE); + queue = &WalSndCtl->SyncRepQueue[mode]; + + dlist_reverse_foreach(iter, queue) + { + PGPROC *proc = dlist_container(PGPROC, syncRepLinks, iter.cur); + + /* + * Stop at the queue element that we should insert after to ensure the + * queue is ordered by LSN. + */ + if (proc->waitLSN < MyProc->waitLSN) + { + dlist_insert_after(&proc->syncRepLinks, &MyProc->syncRepLinks); + return; + } + } + + /* + * If we get here, the list was either empty, or this process needs to be + * at the head. + */ + dlist_push_head(queue, &MyProc->syncRepLinks); +} + +/* + * Acquire SyncRepLock and cancel any wait currently in progress. + */ +static void +SyncRepCancelWait(void) +{ + LWLockAcquire(SyncRepLock, LW_EXCLUSIVE); + if (!dlist_node_is_detached(&MyProc->syncRepLinks)) + dlist_delete_thoroughly(&MyProc->syncRepLinks); + MyProc->syncRepState = SYNC_REP_NOT_WAITING; + LWLockRelease(SyncRepLock); +} + +void +SyncRepCleanupAtProcExit(void) +{ + /* + * First check if we are removed from the queue without the lock to not + * slow down backend exit. + */ + if (!dlist_node_is_detached(&MyProc->syncRepLinks)) + { + LWLockAcquire(SyncRepLock, LW_EXCLUSIVE); + + /* maybe we have just been removed, so recheck */ + if (!dlist_node_is_detached(&MyProc->syncRepLinks)) + dlist_delete_thoroughly(&MyProc->syncRepLinks); + + LWLockRelease(SyncRepLock); + } +} + +/* + * =========================================================== + * Synchronous Replication functions for wal sender processes + * =========================================================== + */ + +/* + * Take any action required to initialise sync rep state from config + * data. Called at WALSender startup and after each SIGHUP. + */ +void +SyncRepInitConfig(void) +{ + int priority; + + /* + * Determine if we are a potential sync standby and remember the result + * for handling replies from standby. + */ + priority = SyncRepGetStandbyPriority(); + if (MyWalSnd->sync_standby_priority != priority) + { + SpinLockAcquire(&MyWalSnd->mutex); + MyWalSnd->sync_standby_priority = priority; + SpinLockRelease(&MyWalSnd->mutex); + + ereport(DEBUG1, + (errmsg_internal("standby \"%s\" now has synchronous standby priority %u", + application_name, priority))); + } +} + +/* + * Update the LSNs on each queue based upon our latest state. This + * implements a simple policy of first-valid-sync-standby-releases-waiter. + * + * Other policies are possible, which would change what we do here and + * perhaps also which information we store as well. + */ +void +SyncRepReleaseWaiters(void) +{ + volatile WalSndCtlData *walsndctl = WalSndCtl; + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; + bool got_recptr; + bool am_sync; + int numwrite = 0; + int numflush = 0; + int numapply = 0; + + /* + * If this WALSender is serving a standby that is not on the list of + * potential sync standbys then we have nothing to do. If we are still + * starting up, still running base backup or the current flush position is + * still invalid, then leave quickly also. Streaming or stopping WAL + * senders are allowed to release waiters. + */ + if (MyWalSnd->sync_standby_priority == 0 || + (MyWalSnd->state != WALSNDSTATE_STREAMING && + MyWalSnd->state != WALSNDSTATE_STOPPING) || + XLogRecPtrIsInvalid(MyWalSnd->flush)) + { + announce_next_takeover = true; + return; + } + + /* + * We're a potential sync standby. Release waiters if there are enough + * sync standbys and we are considered as sync. + */ + LWLockAcquire(SyncRepLock, LW_EXCLUSIVE); + + /* + * Check whether we are a sync standby or not, and calculate the synced + * positions among all sync standbys. (Note: although this step does not + * of itself require holding SyncRepLock, it seems like a good idea to do + * it after acquiring the lock. This ensures that the WAL pointers we use + * to release waiters are newer than any previous execution of this + * routine used.) + */ + got_recptr = SyncRepGetSyncRecPtr(&writePtr, &flushPtr, &applyPtr, &am_sync); + + /* + * If we are managing a sync standby, though we weren't prior to this, + * then announce we are now a sync standby. + */ + if (announce_next_takeover && am_sync) + { + announce_next_takeover = false; + + if (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY) + ereport(LOG, + (errmsg("standby \"%s\" is now a synchronous standby with priority %u", + application_name, MyWalSnd->sync_standby_priority))); + else + ereport(LOG, + (errmsg("standby \"%s\" is now a candidate for quorum synchronous standby", + application_name))); + } + + /* + * If the number of sync standbys is less than requested or we aren't + * managing a sync standby then just leave. + */ + if (!got_recptr || !am_sync) + { + LWLockRelease(SyncRepLock); + announce_next_takeover = !am_sync; + return; + } + + /* + * Set the lsn first so that when we wake backends they will release up to + * this location. + */ + if (walsndctl->lsn[SYNC_REP_WAIT_WRITE] < writePtr) + { + walsndctl->lsn[SYNC_REP_WAIT_WRITE] = writePtr; + numwrite = SyncRepWakeQueue(false, SYNC_REP_WAIT_WRITE); + } + if (walsndctl->lsn[SYNC_REP_WAIT_FLUSH] < flushPtr) + { + walsndctl->lsn[SYNC_REP_WAIT_FLUSH] = flushPtr; + numflush = SyncRepWakeQueue(false, SYNC_REP_WAIT_FLUSH); + } + if (walsndctl->lsn[SYNC_REP_WAIT_APPLY] < applyPtr) + { + walsndctl->lsn[SYNC_REP_WAIT_APPLY] = applyPtr; + numapply = SyncRepWakeQueue(false, SYNC_REP_WAIT_APPLY); + } + + LWLockRelease(SyncRepLock); + + elog(DEBUG3, "released %d procs up to write %X/%X, %d procs up to flush %X/%X, %d procs up to apply %X/%X", + numwrite, LSN_FORMAT_ARGS(writePtr), + numflush, LSN_FORMAT_ARGS(flushPtr), + numapply, LSN_FORMAT_ARGS(applyPtr)); +} + +/* + * Calculate the synced Write, Flush and Apply positions among sync standbys. + * + * Return false if the number of sync standbys is less than + * synchronous_standby_names specifies. Otherwise return true and + * store the positions into *writePtr, *flushPtr and *applyPtr. + * + * On return, *am_sync is set to true if this walsender is connecting to + * sync standby. Otherwise it's set to false. + */ +static bool +SyncRepGetSyncRecPtr(XLogRecPtr *writePtr, XLogRecPtr *flushPtr, + XLogRecPtr *applyPtr, bool *am_sync) +{ + SyncRepStandbyData *sync_standbys; + int num_standbys; + int i; + + /* Initialize default results */ + *writePtr = InvalidXLogRecPtr; + *flushPtr = InvalidXLogRecPtr; + *applyPtr = InvalidXLogRecPtr; + *am_sync = false; + + /* Quick out if not even configured to be synchronous */ + if (SyncRepConfig == NULL) + return false; + + /* Get standbys that are considered as synchronous at this moment */ + num_standbys = SyncRepGetCandidateStandbys(&sync_standbys); + + /* Am I among the candidate sync standbys? */ + for (i = 0; i < num_standbys; i++) + { + if (sync_standbys[i].is_me) + { + *am_sync = true; + break; + } + } + + /* + * Nothing more to do if we are not managing a sync standby or there are + * not enough synchronous standbys. + */ + if (!(*am_sync) || + num_standbys < SyncRepConfig->num_sync) + { + pfree(sync_standbys); + return false; + } + + /* + * In a priority-based sync replication, the synced positions are the + * oldest ones among sync standbys. In a quorum-based, they are the Nth + * latest ones. + * + * SyncRepGetNthLatestSyncRecPtr() also can calculate the oldest + * positions. But we use SyncRepGetOldestSyncRecPtr() for that calculation + * because it's a bit more efficient. + * + * XXX If the numbers of current and requested sync standbys are the same, + * we can use SyncRepGetOldestSyncRecPtr() to calculate the synced + * positions even in a quorum-based sync replication. + */ + if (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY) + { + SyncRepGetOldestSyncRecPtr(writePtr, flushPtr, applyPtr, + sync_standbys, num_standbys); + } + else + { + SyncRepGetNthLatestSyncRecPtr(writePtr, flushPtr, applyPtr, + sync_standbys, num_standbys, + SyncRepConfig->num_sync); + } + + pfree(sync_standbys); + return true; +} + +/* + * Calculate the oldest Write, Flush and Apply positions among sync standbys. + */ +static void +SyncRepGetOldestSyncRecPtr(XLogRecPtr *writePtr, + XLogRecPtr *flushPtr, + XLogRecPtr *applyPtr, + SyncRepStandbyData *sync_standbys, + int num_standbys) +{ + int i; + + /* + * Scan through all sync standbys and calculate the oldest Write, Flush + * and Apply positions. We assume *writePtr et al were initialized to + * InvalidXLogRecPtr. + */ + for (i = 0; i < num_standbys; i++) + { + XLogRecPtr write = sync_standbys[i].write; + XLogRecPtr flush = sync_standbys[i].flush; + XLogRecPtr apply = sync_standbys[i].apply; + + if (XLogRecPtrIsInvalid(*writePtr) || *writePtr > write) + *writePtr = write; + if (XLogRecPtrIsInvalid(*flushPtr) || *flushPtr > flush) + *flushPtr = flush; + if (XLogRecPtrIsInvalid(*applyPtr) || *applyPtr > apply) + *applyPtr = apply; + } +} + +/* + * Calculate the Nth latest Write, Flush and Apply positions among sync + * standbys. + */ +static void +SyncRepGetNthLatestSyncRecPtr(XLogRecPtr *writePtr, + XLogRecPtr *flushPtr, + XLogRecPtr *applyPtr, + SyncRepStandbyData *sync_standbys, + int num_standbys, + uint8 nth) +{ + XLogRecPtr *write_array; + XLogRecPtr *flush_array; + XLogRecPtr *apply_array; + int i; + + /* Should have enough candidates, or somebody messed up */ + Assert(nth > 0 && nth <= num_standbys); + + write_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * num_standbys); + flush_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * num_standbys); + apply_array = (XLogRecPtr *) palloc(sizeof(XLogRecPtr) * num_standbys); + + for (i = 0; i < num_standbys; i++) + { + write_array[i] = sync_standbys[i].write; + flush_array[i] = sync_standbys[i].flush; + apply_array[i] = sync_standbys[i].apply; + } + + /* Sort each array in descending order */ + qsort(write_array, num_standbys, sizeof(XLogRecPtr), cmp_lsn); + qsort(flush_array, num_standbys, sizeof(XLogRecPtr), cmp_lsn); + qsort(apply_array, num_standbys, sizeof(XLogRecPtr), cmp_lsn); + + /* Get Nth latest Write, Flush, Apply positions */ + *writePtr = write_array[nth - 1]; + *flushPtr = flush_array[nth - 1]; + *applyPtr = apply_array[nth - 1]; + + pfree(write_array); + pfree(flush_array); + pfree(apply_array); +} + +/* + * Compare lsn in order to sort array in descending order. + */ +static int +cmp_lsn(const void *a, const void *b) +{ + XLogRecPtr lsn1 = *((const XLogRecPtr *) a); + XLogRecPtr lsn2 = *((const XLogRecPtr *) b); + + if (lsn1 > lsn2) + return -1; + else if (lsn1 == lsn2) + return 0; + else + return 1; +} + +/* + * Return data about walsenders that are candidates to be sync standbys. + * + * *standbys is set to a palloc'd array of structs of per-walsender data, + * and the number of valid entries (candidate sync senders) is returned. + * (This might be more or fewer than num_sync; caller must check.) + */ +int +SyncRepGetCandidateStandbys(SyncRepStandbyData **standbys) +{ + int i; + int n; + + /* Create result array */ + *standbys = (SyncRepStandbyData *) + palloc(max_wal_senders * sizeof(SyncRepStandbyData)); + + /* Quick exit if sync replication is not requested */ + if (SyncRepConfig == NULL) + return 0; + + /* Collect raw data from shared memory */ + n = 0; + for (i = 0; i < max_wal_senders; i++) + { + volatile WalSnd *walsnd; /* Use volatile pointer to prevent code + * rearrangement */ + SyncRepStandbyData *stby; + WalSndState state; /* not included in SyncRepStandbyData */ + + walsnd = &WalSndCtl->walsnds[i]; + stby = *standbys + n; + + SpinLockAcquire(&walsnd->mutex); + stby->pid = walsnd->pid; + state = walsnd->state; + stby->write = walsnd->write; + stby->flush = walsnd->flush; + stby->apply = walsnd->apply; + stby->sync_standby_priority = walsnd->sync_standby_priority; + SpinLockRelease(&walsnd->mutex); + + /* Must be active */ + if (stby->pid == 0) + continue; + + /* Must be streaming or stopping */ + if (state != WALSNDSTATE_STREAMING && + state != WALSNDSTATE_STOPPING) + continue; + + /* Must be synchronous */ + if (stby->sync_standby_priority == 0) + continue; + + /* Must have a valid flush position */ + if (XLogRecPtrIsInvalid(stby->flush)) + continue; + + /* OK, it's a candidate */ + stby->walsnd_index = i; + stby->is_me = (walsnd == MyWalSnd); + n++; + } + + /* + * In quorum mode, we return all the candidates. In priority mode, if we + * have too many candidates then return only the num_sync ones of highest + * priority. + */ + if (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY && + n > SyncRepConfig->num_sync) + { + /* Sort by priority ... */ + qsort(*standbys, n, sizeof(SyncRepStandbyData), + standby_priority_comparator); + /* ... then report just the first num_sync ones */ + n = SyncRepConfig->num_sync; + } + + return n; +} + +/* + * qsort comparator to sort SyncRepStandbyData entries by priority + */ +static int +standby_priority_comparator(const void *a, const void *b) +{ + const SyncRepStandbyData *sa = (const SyncRepStandbyData *) a; + const SyncRepStandbyData *sb = (const SyncRepStandbyData *) b; + + /* First, sort by increasing priority value */ + if (sa->sync_standby_priority != sb->sync_standby_priority) + return sa->sync_standby_priority - sb->sync_standby_priority; + + /* + * We might have equal priority values; arbitrarily break ties by position + * in the WalSnd array. (This is utterly bogus, since that is arrival + * order dependent, but there are regression tests that rely on it.) + */ + return sa->walsnd_index - sb->walsnd_index; +} + + +/* + * Check if we are in the list of sync standbys, and if so, determine + * priority sequence. Return priority if set, or zero to indicate that + * we are not a potential sync standby. + * + * Compare the parameter SyncRepStandbyNames against the application_name + * for this WALSender, or allow any name if we find a wildcard "*". + */ +static int +SyncRepGetStandbyPriority(void) +{ + const char *standby_name; + int priority; + bool found = false; + + /* + * Since synchronous cascade replication is not allowed, we always set the + * priority of cascading walsender to zero. + */ + if (am_cascading_walsender) + return 0; + + if (!SyncStandbysDefined() || SyncRepConfig == NULL) + return 0; + + standby_name = SyncRepConfig->member_names; + for (priority = 1; priority <= SyncRepConfig->nmembers; priority++) + { + if (pg_strcasecmp(standby_name, application_name) == 0 || + strcmp(standby_name, "*") == 0) + { + found = true; + break; + } + standby_name += strlen(standby_name) + 1; + } + + if (!found) + return 0; + + /* + * In quorum-based sync replication, all the standbys in the list have the + * same priority, one. + */ + return (SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY) ? priority : 1; +} + +/* + * Walk the specified queue from head. Set the state of any backends that + * need to be woken, remove them from the queue, and then wake them. + * Pass all = true to wake whole queue; otherwise, just wake up to + * the walsender's LSN. + * + * The caller must hold SyncRepLock in exclusive mode. + */ +static int +SyncRepWakeQueue(bool all, int mode) +{ + volatile WalSndCtlData *walsndctl = WalSndCtl; + int numprocs = 0; + dlist_mutable_iter iter; + + Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE); + Assert(LWLockHeldByMeInMode(SyncRepLock, LW_EXCLUSIVE)); + Assert(SyncRepQueueIsOrderedByLSN(mode)); + + dlist_foreach_modify(iter, &WalSndCtl->SyncRepQueue[mode]) + { + PGPROC *proc = dlist_container(PGPROC, syncRepLinks, iter.cur); + + /* + * Assume the queue is ordered by LSN + */ + if (!all && walsndctl->lsn[mode] < proc->waitLSN) + return numprocs; + + /* + * Remove from queue. + */ + dlist_delete_thoroughly(&proc->syncRepLinks); + + /* + * SyncRepWaitForLSN() reads syncRepState without holding the lock, so + * make sure that it sees the queue link being removed before the + * syncRepState change. + */ + pg_write_barrier(); + + /* + * Set state to complete; see SyncRepWaitForLSN() for discussion of + * the various states. + */ + proc->syncRepState = SYNC_REP_WAIT_COMPLETE; + + /* + * Wake only when we have set state and removed from queue. + */ + SetLatch(&(proc->procLatch)); + + numprocs++; + } + + return numprocs; +} + +/* + * The checkpointer calls this as needed to update the shared + * sync_standbys_defined flag, so that backends don't remain permanently wedged + * if synchronous_standby_names is unset. It's safe to check the current value + * without the lock, because it's only ever updated by one process. But we + * must take the lock to change it. + */ +void +SyncRepUpdateSyncStandbysDefined(void) +{ + bool sync_standbys_defined = SyncStandbysDefined(); + + if (sync_standbys_defined != WalSndCtl->sync_standbys_defined) + { + LWLockAcquire(SyncRepLock, LW_EXCLUSIVE); + + /* + * If synchronous_standby_names has been reset to empty, it's futile + * for backends to continue waiting. Since the user no longer wants + * synchronous replication, we'd better wake them up. + */ + if (!sync_standbys_defined) + { + int i; + + for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; i++) + SyncRepWakeQueue(true, i); + } + + /* + * Only allow people to join the queue when there are synchronous + * standbys defined. Without this interlock, there's a race + * condition: we might wake up all the current waiters; then, some + * backend that hasn't yet reloaded its config might go to sleep on + * the queue (and never wake up). This prevents that. + */ + WalSndCtl->sync_standbys_defined = sync_standbys_defined; + + LWLockRelease(SyncRepLock); + } +} + +#ifdef USE_ASSERT_CHECKING +static bool +SyncRepQueueIsOrderedByLSN(int mode) +{ + XLogRecPtr lastLSN; + dlist_iter iter; + + Assert(mode >= 0 && mode < NUM_SYNC_REP_WAIT_MODE); + + lastLSN = 0; + + dlist_foreach(iter, &WalSndCtl->SyncRepQueue[mode]) + { + PGPROC *proc = dlist_container(PGPROC, syncRepLinks, iter.cur); + + /* + * Check the queue is ordered by LSN and that multiple procs don't + * have matching LSNs + */ + if (proc->waitLSN <= lastLSN) + return false; + + lastLSN = proc->waitLSN; + } + + return true; +} +#endif + +/* + * =========================================================== + * Synchronous Replication functions executed by any process + * =========================================================== + */ + +bool +check_synchronous_standby_names(char **newval, void **extra, GucSource source) +{ + if (*newval != NULL && (*newval)[0] != '\0') + { + int parse_rc; + SyncRepConfigData *pconf; + + /* Reset communication variables to ensure a fresh start */ + syncrep_parse_result = NULL; + syncrep_parse_error_msg = NULL; + + /* Parse the synchronous_standby_names string */ + syncrep_scanner_init(*newval); + parse_rc = syncrep_yyparse(); + syncrep_scanner_finish(); + + if (parse_rc != 0 || syncrep_parse_result == NULL) + { + GUC_check_errcode(ERRCODE_SYNTAX_ERROR); + if (syncrep_parse_error_msg) + GUC_check_errdetail("%s", syncrep_parse_error_msg); + else + GUC_check_errdetail("synchronous_standby_names parser failed"); + return false; + } + + if (syncrep_parse_result->num_sync <= 0) + { + GUC_check_errmsg("number of synchronous standbys (%d) must be greater than zero", + syncrep_parse_result->num_sync); + return false; + } + + /* GUC extra value must be guc_malloc'd, not palloc'd */ + pconf = (SyncRepConfigData *) + guc_malloc(LOG, syncrep_parse_result->config_size); + if (pconf == NULL) + return false; + memcpy(pconf, syncrep_parse_result, syncrep_parse_result->config_size); + + *extra = (void *) pconf; + + /* + * We need not explicitly clean up syncrep_parse_result. It, and any + * other cruft generated during parsing, will be freed when the + * current memory context is deleted. (This code is generally run in + * a short-lived context used for config file processing, so that will + * not be very long.) + */ + } + else + *extra = NULL; + + return true; +} + +void +assign_synchronous_standby_names(const char *newval, void *extra) +{ + SyncRepConfig = (SyncRepConfigData *) extra; +} + +void +assign_synchronous_commit(int newval, void *extra) +{ + switch (newval) + { + case SYNCHRONOUS_COMMIT_REMOTE_WRITE: + SyncRepWaitMode = SYNC_REP_WAIT_WRITE; + break; + case SYNCHRONOUS_COMMIT_REMOTE_FLUSH: + SyncRepWaitMode = SYNC_REP_WAIT_FLUSH; + break; + case SYNCHRONOUS_COMMIT_REMOTE_APPLY: + SyncRepWaitMode = SYNC_REP_WAIT_APPLY; + break; + default: + SyncRepWaitMode = SYNC_REP_NO_WAIT; + break; + } +} diff --git a/src/backend/replication/syncrep_gram.c b/src/backend/replication/syncrep_gram.c new file mode 100644 index 0000000..b837870 --- /dev/null +++ b/src/backend/replication/syncrep_gram.c @@ -0,0 +1,1407 @@ +/* A Bison parser, made by GNU Bison 3.7.5. */ + +/* Bison implementation for Yacc-like parsers in C + + Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2021 Free Software Foundation, + Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. + + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + +/* C LALR(1) parser skeleton written by Richard Stallman, by + simplifying the original so-called "semantic" parser. */ + +/* DO NOT RELY ON FEATURES THAT ARE NOT DOCUMENTED in the manual, + especially those whose name start with YY_ or yy_. They are + private implementation details that can be changed or removed. */ + +/* All symbols defined below should begin with yy or YY, to avoid + infringing on user name space. This should be done even for local + variables, as they might otherwise be expanded by user macros. + There are some unavoidable exceptions within include files to + define necessary library symbols; they are noted "INFRINGES ON + USER NAME SPACE" below. */ + +/* Identify Bison output, and Bison version. */ +#define YYBISON 30705 + +/* Bison version string. */ +#define YYBISON_VERSION "3.7.5" + +/* Skeleton name. */ +#define YYSKELETON_NAME "yacc.c" + +/* Pure parsers. */ +#define YYPURE 0 + +/* Push parsers. */ +#define YYPUSH 0 + +/* Pull parsers. */ +#define YYPULL 1 + + +/* Substitute the variable and function names. */ +#define yyparse syncrep_yyparse +#define yylex syncrep_yylex +#define yyerror syncrep_yyerror +#define yydebug syncrep_yydebug +#define yynerrs syncrep_yynerrs +#define yylval syncrep_yylval +#define yychar syncrep_yychar + +/* First part of user prologue. */ +#line 1 "syncrep_gram.y" + +/*------------------------------------------------------------------------- + * + * syncrep_gram.y - Parser for synchronous_standby_names + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/replication/syncrep_gram.y + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "nodes/pg_list.h" +#include "replication/syncrep.h" + +/* Result of parsing is returned in one of these two variables */ +SyncRepConfigData *syncrep_parse_result; +char *syncrep_parse_error_msg; + +static SyncRepConfigData *create_syncrep_config(const char *num_sync, + List *members, uint8 syncrep_method); + +/* + * Bison doesn't allocate anything that needs to live across parser calls, + * so we can easily have it use palloc instead of malloc. This prevents + * memory leaks if we error out during parsing. + */ +#define YYMALLOC palloc +#define YYFREE pfree + + +#line 114 "syncrep_gram.c" + +# ifndef YY_CAST +# ifdef __cplusplus +# define YY_CAST(Type, Val) static_cast<Type> (Val) +# define YY_REINTERPRET_CAST(Type, Val) reinterpret_cast<Type> (Val) +# else +# define YY_CAST(Type, Val) ((Type) (Val)) +# define YY_REINTERPRET_CAST(Type, Val) ((Type) (Val)) +# endif +# endif +# ifndef YY_NULLPTR +# if defined __cplusplus +# if 201103L <= __cplusplus +# define YY_NULLPTR nullptr +# else +# define YY_NULLPTR 0 +# endif +# else +# define YY_NULLPTR ((void*)0) +# endif +# endif + +#include "syncrep_gram.h" +/* Symbol kind. */ +enum yysymbol_kind_t +{ + YYSYMBOL_YYEMPTY = -2, + YYSYMBOL_YYEOF = 0, /* "end of file" */ + YYSYMBOL_YYerror = 1, /* error */ + YYSYMBOL_YYUNDEF = 2, /* "invalid token" */ + YYSYMBOL_NAME = 3, /* NAME */ + YYSYMBOL_NUM = 4, /* NUM */ + YYSYMBOL_JUNK = 5, /* JUNK */ + YYSYMBOL_ANY = 6, /* ANY */ + YYSYMBOL_FIRST = 7, /* FIRST */ + YYSYMBOL_8_ = 8, /* '(' */ + YYSYMBOL_9_ = 9, /* ')' */ + YYSYMBOL_10_ = 10, /* ',' */ + YYSYMBOL_YYACCEPT = 11, /* $accept */ + YYSYMBOL_result = 12, /* result */ + YYSYMBOL_standby_config = 13, /* standby_config */ + YYSYMBOL_standby_list = 14, /* standby_list */ + YYSYMBOL_standby_name = 15 /* standby_name */ +}; +typedef enum yysymbol_kind_t yysymbol_kind_t; + + + + +#ifdef short +# undef short +#endif + +/* On compilers that do not define __PTRDIFF_MAX__ etc., make sure + <limits.h> and (if available) <stdint.h> are included + so that the code can choose integer types of a good width. */ + +#ifndef __PTRDIFF_MAX__ +# include <limits.h> /* INFRINGES ON USER NAME SPACE */ +# if defined __STDC_VERSION__ && 199901 <= __STDC_VERSION__ +# include <stdint.h> /* INFRINGES ON USER NAME SPACE */ +# define YY_STDINT_H +# endif +#endif + +/* Narrow types that promote to a signed type and that can represent a + signed or unsigned integer of at least N bits. In tables they can + save space and decrease cache pressure. Promoting to a signed type + helps avoid bugs in integer arithmetic. */ + +#ifdef __INT_LEAST8_MAX__ +typedef __INT_LEAST8_TYPE__ yytype_int8; +#elif defined YY_STDINT_H +typedef int_least8_t yytype_int8; +#else +typedef signed char yytype_int8; +#endif + +#ifdef __INT_LEAST16_MAX__ +typedef __INT_LEAST16_TYPE__ yytype_int16; +#elif defined YY_STDINT_H +typedef int_least16_t yytype_int16; +#else +typedef short yytype_int16; +#endif + +/* Work around bug in HP-UX 11.23, which defines these macros + incorrectly for preprocessor constants. This workaround can likely + be removed in 2023, as HPE has promised support for HP-UX 11.23 + (aka HP-UX 11i v2) only through the end of 2022; see Table 2 of + <https://h20195.www2.hpe.com/V2/getpdf.aspx/4AA4-7673ENW.pdf>. */ +#ifdef __hpux +# undef UINT_LEAST8_MAX +# undef UINT_LEAST16_MAX +# define UINT_LEAST8_MAX 255 +# define UINT_LEAST16_MAX 65535 +#endif + +#if defined __UINT_LEAST8_MAX__ && __UINT_LEAST8_MAX__ <= __INT_MAX__ +typedef __UINT_LEAST8_TYPE__ yytype_uint8; +#elif (!defined __UINT_LEAST8_MAX__ && defined YY_STDINT_H \ + && UINT_LEAST8_MAX <= INT_MAX) +typedef uint_least8_t yytype_uint8; +#elif !defined __UINT_LEAST8_MAX__ && UCHAR_MAX <= INT_MAX +typedef unsigned char yytype_uint8; +#else +typedef short yytype_uint8; +#endif + +#if defined __UINT_LEAST16_MAX__ && __UINT_LEAST16_MAX__ <= __INT_MAX__ +typedef __UINT_LEAST16_TYPE__ yytype_uint16; +#elif (!defined __UINT_LEAST16_MAX__ && defined YY_STDINT_H \ + && UINT_LEAST16_MAX <= INT_MAX) +typedef uint_least16_t yytype_uint16; +#elif !defined __UINT_LEAST16_MAX__ && USHRT_MAX <= INT_MAX +typedef unsigned short yytype_uint16; +#else +typedef int yytype_uint16; +#endif + +#ifndef YYPTRDIFF_T +# if defined __PTRDIFF_TYPE__ && defined __PTRDIFF_MAX__ +# define YYPTRDIFF_T __PTRDIFF_TYPE__ +# define YYPTRDIFF_MAXIMUM __PTRDIFF_MAX__ +# elif defined PTRDIFF_MAX +# ifndef ptrdiff_t +# include <stddef.h> /* INFRINGES ON USER NAME SPACE */ +# endif +# define YYPTRDIFF_T ptrdiff_t +# define YYPTRDIFF_MAXIMUM PTRDIFF_MAX +# else +# define YYPTRDIFF_T long +# define YYPTRDIFF_MAXIMUM LONG_MAX +# endif +#endif + +#ifndef YYSIZE_T +# ifdef __SIZE_TYPE__ +# define YYSIZE_T __SIZE_TYPE__ +# elif defined size_t +# define YYSIZE_T size_t +# elif defined __STDC_VERSION__ && 199901 <= __STDC_VERSION__ +# include <stddef.h> /* INFRINGES ON USER NAME SPACE */ +# define YYSIZE_T size_t +# else +# define YYSIZE_T unsigned +# endif +#endif + +#define YYSIZE_MAXIMUM \ + YY_CAST (YYPTRDIFF_T, \ + (YYPTRDIFF_MAXIMUM < YY_CAST (YYSIZE_T, -1) \ + ? YYPTRDIFF_MAXIMUM \ + : YY_CAST (YYSIZE_T, -1))) + +#define YYSIZEOF(X) YY_CAST (YYPTRDIFF_T, sizeof (X)) + + +/* Stored state numbers (used for stacks). */ +typedef yytype_int8 yy_state_t; + +/* State numbers in computations. */ +typedef int yy_state_fast_t; + +#ifndef YY_ +# if defined YYENABLE_NLS && YYENABLE_NLS +# if ENABLE_NLS +# include <libintl.h> /* INFRINGES ON USER NAME SPACE */ +# define YY_(Msgid) dgettext ("bison-runtime", Msgid) +# endif +# endif +# ifndef YY_ +# define YY_(Msgid) Msgid +# endif +#endif + + +#ifndef YY_ATTRIBUTE_PURE +# if defined __GNUC__ && 2 < __GNUC__ + (96 <= __GNUC_MINOR__) +# define YY_ATTRIBUTE_PURE __attribute__ ((__pure__)) +# else +# define YY_ATTRIBUTE_PURE +# endif +#endif + +#ifndef YY_ATTRIBUTE_UNUSED +# if defined __GNUC__ && 2 < __GNUC__ + (7 <= __GNUC_MINOR__) +# define YY_ATTRIBUTE_UNUSED __attribute__ ((__unused__)) +# else +# define YY_ATTRIBUTE_UNUSED +# endif +#endif + +/* Suppress unused-variable warnings by "using" E. */ +#if ! defined lint || defined __GNUC__ +# define YY_USE(E) ((void) (E)) +#else +# define YY_USE(E) /* empty */ +#endif + +#if defined __GNUC__ && ! defined __ICC && 407 <= __GNUC__ * 100 + __GNUC_MINOR__ +/* Suppress an incorrect diagnostic about yylval being uninitialized. */ +# define YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN \ + _Pragma ("GCC diagnostic push") \ + _Pragma ("GCC diagnostic ignored \"-Wuninitialized\"") \ + _Pragma ("GCC diagnostic ignored \"-Wmaybe-uninitialized\"") +# define YY_IGNORE_MAYBE_UNINITIALIZED_END \ + _Pragma ("GCC diagnostic pop") +#else +# define YY_INITIAL_VALUE(Value) Value +#endif +#ifndef YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN +# define YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN +# define YY_IGNORE_MAYBE_UNINITIALIZED_END +#endif +#ifndef YY_INITIAL_VALUE +# define YY_INITIAL_VALUE(Value) /* Nothing. */ +#endif + +#if defined __cplusplus && defined __GNUC__ && ! defined __ICC && 6 <= __GNUC__ +# define YY_IGNORE_USELESS_CAST_BEGIN \ + _Pragma ("GCC diagnostic push") \ + _Pragma ("GCC diagnostic ignored \"-Wuseless-cast\"") +# define YY_IGNORE_USELESS_CAST_END \ + _Pragma ("GCC diagnostic pop") +#endif +#ifndef YY_IGNORE_USELESS_CAST_BEGIN +# define YY_IGNORE_USELESS_CAST_BEGIN +# define YY_IGNORE_USELESS_CAST_END +#endif + + +#define YY_ASSERT(E) ((void) (0 && (E))) + +#if !defined yyoverflow + +/* The parser invokes alloca or malloc; define the necessary symbols. */ + +# ifdef YYSTACK_USE_ALLOCA +# if YYSTACK_USE_ALLOCA +# ifdef __GNUC__ +# define YYSTACK_ALLOC __builtin_alloca +# elif defined __BUILTIN_VA_ARG_INCR +# include <alloca.h> /* INFRINGES ON USER NAME SPACE */ +# elif defined _AIX +# define YYSTACK_ALLOC __alloca +# elif defined _MSC_VER +# include <malloc.h> /* INFRINGES ON USER NAME SPACE */ +# define alloca _alloca +# else +# define YYSTACK_ALLOC alloca +# if ! defined _ALLOCA_H && ! defined EXIT_SUCCESS +# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */ + /* Use EXIT_SUCCESS as a witness for stdlib.h. */ +# ifndef EXIT_SUCCESS +# define EXIT_SUCCESS 0 +# endif +# endif +# endif +# endif +# endif + +# ifdef YYSTACK_ALLOC + /* Pacify GCC's 'empty if-body' warning. */ +# define YYSTACK_FREE(Ptr) do { /* empty */; } while (0) +# ifndef YYSTACK_ALLOC_MAXIMUM + /* The OS might guarantee only one guard page at the bottom of the stack, + and a page size can be as small as 4096 bytes. So we cannot safely + invoke alloca (N) if N exceeds 4096. Use a slightly smaller number + to allow for a few compiler-allocated temporary stack slots. */ +# define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */ +# endif +# else +# define YYSTACK_ALLOC YYMALLOC +# define YYSTACK_FREE YYFREE +# ifndef YYSTACK_ALLOC_MAXIMUM +# define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM +# endif +# if (defined __cplusplus && ! defined EXIT_SUCCESS \ + && ! ((defined YYMALLOC || defined malloc) \ + && (defined YYFREE || defined free))) +# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */ +# ifndef EXIT_SUCCESS +# define EXIT_SUCCESS 0 +# endif +# endif +# ifndef YYMALLOC +# define YYMALLOC malloc +# if ! defined malloc && ! defined EXIT_SUCCESS +void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# ifndef YYFREE +# define YYFREE free +# if ! defined free && ! defined EXIT_SUCCESS +void free (void *); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# endif +#endif /* !defined yyoverflow */ + +#if (! defined yyoverflow \ + && (! defined __cplusplus \ + || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL))) + +/* A type that is properly aligned for any stack member. */ +union yyalloc +{ + yy_state_t yyss_alloc; + YYSTYPE yyvs_alloc; +}; + +/* The size of the maximum gap between one aligned stack and the next. */ +# define YYSTACK_GAP_MAXIMUM (YYSIZEOF (union yyalloc) - 1) + +/* The size of an array large to enough to hold all stacks, each with + N elements. */ +# define YYSTACK_BYTES(N) \ + ((N) * (YYSIZEOF (yy_state_t) + YYSIZEOF (YYSTYPE)) \ + + YYSTACK_GAP_MAXIMUM) + +# define YYCOPY_NEEDED 1 + +/* Relocate STACK from its old location to the new one. The + local variables YYSIZE and YYSTACKSIZE give the old and new number of + elements in the stack, and YYPTR gives the new location of the + stack. Advance YYPTR to a properly aligned location for the next + stack. */ +# define YYSTACK_RELOCATE(Stack_alloc, Stack) \ + do \ + { \ + YYPTRDIFF_T yynewbytes; \ + YYCOPY (&yyptr->Stack_alloc, Stack, yysize); \ + Stack = &yyptr->Stack_alloc; \ + yynewbytes = yystacksize * YYSIZEOF (*Stack) + YYSTACK_GAP_MAXIMUM; \ + yyptr += yynewbytes / YYSIZEOF (*yyptr); \ + } \ + while (0) + +#endif + +#if defined YYCOPY_NEEDED && YYCOPY_NEEDED +/* Copy COUNT objects from SRC to DST. The source and destination do + not overlap. */ +# ifndef YYCOPY +# if defined __GNUC__ && 1 < __GNUC__ +# define YYCOPY(Dst, Src, Count) \ + __builtin_memcpy (Dst, Src, YY_CAST (YYSIZE_T, (Count)) * sizeof (*(Src))) +# else +# define YYCOPY(Dst, Src, Count) \ + do \ + { \ + YYPTRDIFF_T yyi; \ + for (yyi = 0; yyi < (Count); yyi++) \ + (Dst)[yyi] = (Src)[yyi]; \ + } \ + while (0) +# endif +# endif +#endif /* !YYCOPY_NEEDED */ + +/* YYFINAL -- State number of the termination state. */ +#define YYFINAL 12 +/* YYLAST -- Last index in YYTABLE. */ +#define YYLAST 22 + +/* YYNTOKENS -- Number of terminals. */ +#define YYNTOKENS 11 +/* YYNNTS -- Number of nonterminals. */ +#define YYNNTS 5 +/* YYNRULES -- Number of rules. */ +#define YYNRULES 10 +/* YYNSTATES -- Number of states. */ +#define YYNSTATES 24 + +/* YYMAXUTOK -- Last valid token kind. */ +#define YYMAXUTOK 262 + + +/* YYTRANSLATE(TOKEN-NUM) -- Symbol number corresponding to TOKEN-NUM + as returned by yylex, with out-of-bounds checking. */ +#define YYTRANSLATE(YYX) \ + (0 <= (YYX) && (YYX) <= YYMAXUTOK \ + ? YY_CAST (yysymbol_kind_t, yytranslate[YYX]) \ + : YYSYMBOL_YYUNDEF) + +/* YYTRANSLATE[TOKEN-NUM] -- Symbol number corresponding to TOKEN-NUM + as returned by yylex. */ +static const yytype_int8 yytranslate[] = +{ + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 8, 9, 2, 2, 10, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 1, 2, 3, 4, + 5, 6, 7 +}; + +#if YYDEBUG + /* YYRLINE[YYN] -- Source line where rule number YYN was defined. */ +static const yytype_int8 yyrline[] = +{ + 0, 57, 57, 61, 62, 63, 64, 68, 69, 73, + 74 +}; +#endif + +/** Accessing symbol of state STATE. */ +#define YY_ACCESSING_SYMBOL(State) YY_CAST (yysymbol_kind_t, yystos[State]) + +#if YYDEBUG || 0 +/* The user-facing name of the symbol whose (internal) number is + YYSYMBOL. No bounds checking. */ +static const char *yysymbol_name (yysymbol_kind_t yysymbol) YY_ATTRIBUTE_UNUSED; + +/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM. + First, the terminals, then, starting at YYNTOKENS, nonterminals. */ +static const char *const yytname[] = +{ + "\"end of file\"", "error", "\"invalid token\"", "NAME", "NUM", "JUNK", + "ANY", "FIRST", "'('", "')'", "','", "$accept", "result", + "standby_config", "standby_list", "standby_name", YY_NULLPTR +}; + +static const char * +yysymbol_name (yysymbol_kind_t yysymbol) +{ + return yytname[yysymbol]; +} +#endif + +#ifdef YYPRINT +/* YYTOKNUM[NUM] -- (External) token number corresponding to the + (internal) symbol number NUM (which must be that of a token). */ +static const yytype_int16 yytoknum[] = +{ + 0, 256, 257, 258, 259, 260, 261, 262, 40, 41, + 44 +}; +#endif + +#define YYPACT_NINF (-10) + +#define yypact_value_is_default(Yyn) \ + ((Yyn) == YYPACT_NINF) + +#define YYTABLE_NINF (-1) + +#define yytable_value_is_error(Yyn) \ + 0 + + /* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing + STATE-NUM. */ +static const yytype_int8 yypact[] = +{ + -2, -10, -5, 11, 14, 19, -10, -4, -10, 6, + 12, 13, -10, 6, -10, 2, 6, 6, -10, -10, + 4, 7, -10, -10 +}; + + /* YYDEFACT[STATE-NUM] -- Default reduction number in state STATE-NUM. + Performed when YYTABLE does not specify something else to do. Zero + means the default is an error. */ +static const yytype_int8 yydefact[] = +{ + 0, 9, 10, 0, 0, 0, 2, 3, 7, 0, + 0, 0, 1, 0, 10, 0, 0, 0, 8, 4, + 0, 0, 5, 6 +}; + + /* YYPGOTO[NTERM-NUM]. */ +static const yytype_int8 yypgoto[] = +{ + -10, -10, -10, -9, 9 +}; + + /* YYDEFGOTO[NTERM-NUM]. */ +static const yytype_int8 yydefgoto[] = +{ + 0, 5, 6, 7, 8 +}; + + /* YYTABLE[YYPACT[STATE-NUM]] -- What to do in state STATE-NUM. If + positive, shift that token. If negative, reduce the rule whose + number is the opposite. If YYTABLE_NINF, syntax error. */ +static const yytype_int8 yytable[] = +{ + 15, 1, 2, 9, 3, 4, 13, 20, 21, 1, + 14, 19, 13, 22, 13, 10, 23, 13, 11, 12, + 16, 17, 18 +}; + +static const yytype_int8 yycheck[] = +{ + 9, 3, 4, 8, 6, 7, 10, 16, 17, 3, + 4, 9, 10, 9, 10, 4, 9, 10, 4, 0, + 8, 8, 13 +}; + + /* YYSTOS[STATE-NUM] -- The (internal number of the) accessing + symbol of state STATE-NUM. */ +static const yytype_int8 yystos[] = +{ + 0, 3, 4, 6, 7, 12, 13, 14, 15, 8, + 4, 4, 0, 10, 4, 14, 8, 8, 15, 9, + 14, 14, 9, 9 +}; + + /* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. */ +static const yytype_int8 yyr1[] = +{ + 0, 11, 12, 13, 13, 13, 13, 14, 14, 15, + 15 +}; + + /* YYR2[YYN] -- Number of symbols on the right hand side of rule YYN. */ +static const yytype_int8 yyr2[] = +{ + 0, 2, 1, 1, 4, 5, 5, 1, 3, 1, + 1 +}; + + +enum { YYENOMEM = -2 }; + +#define yyerrok (yyerrstatus = 0) +#define yyclearin (yychar = YYEMPTY) + +#define YYACCEPT goto yyacceptlab +#define YYABORT goto yyabortlab +#define YYERROR goto yyerrorlab + + +#define YYRECOVERING() (!!yyerrstatus) + +#define YYBACKUP(Token, Value) \ + do \ + if (yychar == YYEMPTY) \ + { \ + yychar = (Token); \ + yylval = (Value); \ + YYPOPSTACK (yylen); \ + yystate = *yyssp; \ + goto yybackup; \ + } \ + else \ + { \ + yyerror (YY_("syntax error: cannot back up")); \ + YYERROR; \ + } \ + while (0) + +/* Backward compatibility with an undocumented macro. + Use YYerror or YYUNDEF. */ +#define YYERRCODE YYUNDEF + + +/* Enable debugging if requested. */ +#if YYDEBUG + +# ifndef YYFPRINTF +# include <stdio.h> /* INFRINGES ON USER NAME SPACE */ +# define YYFPRINTF fprintf +# endif + +# define YYDPRINTF(Args) \ +do { \ + if (yydebug) \ + YYFPRINTF Args; \ +} while (0) + +/* This macro is provided for backward compatibility. */ +# ifndef YY_LOCATION_PRINT +# define YY_LOCATION_PRINT(File, Loc) ((void) 0) +# endif + + +# define YY_SYMBOL_PRINT(Title, Kind, Value, Location) \ +do { \ + if (yydebug) \ + { \ + YYFPRINTF (stderr, "%s ", Title); \ + yy_symbol_print (stderr, \ + Kind, Value); \ + YYFPRINTF (stderr, "\n"); \ + } \ +} while (0) + + +/*-----------------------------------. +| Print this symbol's value on YYO. | +`-----------------------------------*/ + +static void +yy_symbol_value_print (FILE *yyo, + yysymbol_kind_t yykind, YYSTYPE const * const yyvaluep) +{ + FILE *yyoutput = yyo; + YY_USE (yyoutput); + if (!yyvaluep) + return; +# ifdef YYPRINT + if (yykind < YYNTOKENS) + YYPRINT (yyo, yytoknum[yykind], *yyvaluep); +# endif + YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN + YY_USE (yykind); + YY_IGNORE_MAYBE_UNINITIALIZED_END +} + + +/*---------------------------. +| Print this symbol on YYO. | +`---------------------------*/ + +static void +yy_symbol_print (FILE *yyo, + yysymbol_kind_t yykind, YYSTYPE const * const yyvaluep) +{ + YYFPRINTF (yyo, "%s %s (", + yykind < YYNTOKENS ? "token" : "nterm", yysymbol_name (yykind)); + + yy_symbol_value_print (yyo, yykind, yyvaluep); + YYFPRINTF (yyo, ")"); +} + +/*------------------------------------------------------------------. +| yy_stack_print -- Print the state stack from its BOTTOM up to its | +| TOP (included). | +`------------------------------------------------------------------*/ + +static void +yy_stack_print (yy_state_t *yybottom, yy_state_t *yytop) +{ + YYFPRINTF (stderr, "Stack now"); + for (; yybottom <= yytop; yybottom++) + { + int yybot = *yybottom; + YYFPRINTF (stderr, " %d", yybot); + } + YYFPRINTF (stderr, "\n"); +} + +# define YY_STACK_PRINT(Bottom, Top) \ +do { \ + if (yydebug) \ + yy_stack_print ((Bottom), (Top)); \ +} while (0) + + +/*------------------------------------------------. +| Report that the YYRULE is going to be reduced. | +`------------------------------------------------*/ + +static void +yy_reduce_print (yy_state_t *yyssp, YYSTYPE *yyvsp, + int yyrule) +{ + int yylno = yyrline[yyrule]; + int yynrhs = yyr2[yyrule]; + int yyi; + YYFPRINTF (stderr, "Reducing stack by rule %d (line %d):\n", + yyrule - 1, yylno); + /* The symbols being reduced. */ + for (yyi = 0; yyi < yynrhs; yyi++) + { + YYFPRINTF (stderr, " $%d = ", yyi + 1); + yy_symbol_print (stderr, + YY_ACCESSING_SYMBOL (+yyssp[yyi + 1 - yynrhs]), + &yyvsp[(yyi + 1) - (yynrhs)]); + YYFPRINTF (stderr, "\n"); + } +} + +# define YY_REDUCE_PRINT(Rule) \ +do { \ + if (yydebug) \ + yy_reduce_print (yyssp, yyvsp, Rule); \ +} while (0) + +/* Nonzero means print parse trace. It is left uninitialized so that + multiple parsers can coexist. */ +int yydebug; +#else /* !YYDEBUG */ +# define YYDPRINTF(Args) ((void) 0) +# define YY_SYMBOL_PRINT(Title, Kind, Value, Location) +# define YY_STACK_PRINT(Bottom, Top) +# define YY_REDUCE_PRINT(Rule) +#endif /* !YYDEBUG */ + + +/* YYINITDEPTH -- initial size of the parser's stacks. */ +#ifndef YYINITDEPTH +# define YYINITDEPTH 200 +#endif + +/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only + if the built-in stack extension method is used). + + Do not make this value too large; the results are undefined if + YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH) + evaluated with infinite-precision integer arithmetic. */ + +#ifndef YYMAXDEPTH +# define YYMAXDEPTH 10000 +#endif + + + + + + +/*-----------------------------------------------. +| Release the memory associated to this symbol. | +`-----------------------------------------------*/ + +static void +yydestruct (const char *yymsg, + yysymbol_kind_t yykind, YYSTYPE *yyvaluep) +{ + YY_USE (yyvaluep); + if (!yymsg) + yymsg = "Deleting"; + YY_SYMBOL_PRINT (yymsg, yykind, yyvaluep, yylocationp); + + YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN + YY_USE (yykind); + YY_IGNORE_MAYBE_UNINITIALIZED_END +} + + +/* Lookahead token kind. */ +int yychar; + +/* The semantic value of the lookahead symbol. */ +YYSTYPE yylval; +/* Number of syntax errors so far. */ +int yynerrs; + + + + +/*----------. +| yyparse. | +`----------*/ + +int +yyparse (void) +{ + yy_state_fast_t yystate = 0; + /* Number of tokens to shift before error messages enabled. */ + int yyerrstatus = 0; + + /* Refer to the stacks through separate pointers, to allow yyoverflow + to reallocate them elsewhere. */ + + /* Their size. */ + YYPTRDIFF_T yystacksize = YYINITDEPTH; + + /* The state stack: array, bottom, top. */ + yy_state_t yyssa[YYINITDEPTH]; + yy_state_t *yyss = yyssa; + yy_state_t *yyssp = yyss; + + /* The semantic value stack: array, bottom, top. */ + YYSTYPE yyvsa[YYINITDEPTH]; + YYSTYPE *yyvs = yyvsa; + YYSTYPE *yyvsp = yyvs; + + int yyn; + /* The return value of yyparse. */ + int yyresult; + /* Lookahead symbol kind. */ + yysymbol_kind_t yytoken = YYSYMBOL_YYEMPTY; + /* The variables used to return semantic value and location from the + action routines. */ + YYSTYPE yyval; + + + +#define YYPOPSTACK(N) (yyvsp -= (N), yyssp -= (N)) + + /* The number of symbols on the RHS of the reduced rule. + Keep to zero when no symbol should be popped. */ + int yylen = 0; + + YYDPRINTF ((stderr, "Starting parse\n")); + + yychar = YYEMPTY; /* Cause a token to be read. */ + goto yysetstate; + + +/*------------------------------------------------------------. +| yynewstate -- push a new state, which is found in yystate. | +`------------------------------------------------------------*/ +yynewstate: + /* In all cases, when you get here, the value and location stacks + have just been pushed. So pushing a state here evens the stacks. */ + yyssp++; + + +/*--------------------------------------------------------------------. +| yysetstate -- set current state (the top of the stack) to yystate. | +`--------------------------------------------------------------------*/ +yysetstate: + YYDPRINTF ((stderr, "Entering state %d\n", yystate)); + YY_ASSERT (0 <= yystate && yystate < YYNSTATES); + YY_IGNORE_USELESS_CAST_BEGIN + *yyssp = YY_CAST (yy_state_t, yystate); + YY_IGNORE_USELESS_CAST_END + YY_STACK_PRINT (yyss, yyssp); + + if (yyss + yystacksize - 1 <= yyssp) +#if !defined yyoverflow && !defined YYSTACK_RELOCATE + goto yyexhaustedlab; +#else + { + /* Get the current used size of the three stacks, in elements. */ + YYPTRDIFF_T yysize = yyssp - yyss + 1; + +# if defined yyoverflow + { + /* Give user a chance to reallocate the stack. Use copies of + these so that the &'s don't force the real ones into + memory. */ + yy_state_t *yyss1 = yyss; + YYSTYPE *yyvs1 = yyvs; + + /* Each stack pointer address is followed by the size of the + data in use in that stack, in bytes. This used to be a + conditional around just the two extra args, but that might + be undefined if yyoverflow is a macro. */ + yyoverflow (YY_("memory exhausted"), + &yyss1, yysize * YYSIZEOF (*yyssp), + &yyvs1, yysize * YYSIZEOF (*yyvsp), + &yystacksize); + yyss = yyss1; + yyvs = yyvs1; + } +# else /* defined YYSTACK_RELOCATE */ + /* Extend the stack our own way. */ + if (YYMAXDEPTH <= yystacksize) + goto yyexhaustedlab; + yystacksize *= 2; + if (YYMAXDEPTH < yystacksize) + yystacksize = YYMAXDEPTH; + + { + yy_state_t *yyss1 = yyss; + union yyalloc *yyptr = + YY_CAST (union yyalloc *, + YYSTACK_ALLOC (YY_CAST (YYSIZE_T, YYSTACK_BYTES (yystacksize)))); + if (! yyptr) + goto yyexhaustedlab; + YYSTACK_RELOCATE (yyss_alloc, yyss); + YYSTACK_RELOCATE (yyvs_alloc, yyvs); +# undef YYSTACK_RELOCATE + if (yyss1 != yyssa) + YYSTACK_FREE (yyss1); + } +# endif + + yyssp = yyss + yysize - 1; + yyvsp = yyvs + yysize - 1; + + YY_IGNORE_USELESS_CAST_BEGIN + YYDPRINTF ((stderr, "Stack size increased to %ld\n", + YY_CAST (long, yystacksize))); + YY_IGNORE_USELESS_CAST_END + + if (yyss + yystacksize - 1 <= yyssp) + YYABORT; + } +#endif /* !defined yyoverflow && !defined YYSTACK_RELOCATE */ + + if (yystate == YYFINAL) + YYACCEPT; + + goto yybackup; + + +/*-----------. +| yybackup. | +`-----------*/ +yybackup: + /* Do appropriate processing given the current state. Read a + lookahead token if we need one and don't already have one. */ + + /* First try to decide what to do without reference to lookahead token. */ + yyn = yypact[yystate]; + if (yypact_value_is_default (yyn)) + goto yydefault; + + /* Not known => get a lookahead token if don't already have one. */ + + /* YYCHAR is either empty, or end-of-input, or a valid lookahead. */ + if (yychar == YYEMPTY) + { + YYDPRINTF ((stderr, "Reading a token\n")); + yychar = yylex (); + } + + if (yychar <= YYEOF) + { + yychar = YYEOF; + yytoken = YYSYMBOL_YYEOF; + YYDPRINTF ((stderr, "Now at end of input.\n")); + } + else if (yychar == YYerror) + { + /* The scanner already issued an error message, process directly + to error recovery. But do not keep the error token as + lookahead, it is too special and may lead us to an endless + loop in error recovery. */ + yychar = YYUNDEF; + yytoken = YYSYMBOL_YYerror; + goto yyerrlab1; + } + else + { + yytoken = YYTRANSLATE (yychar); + YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc); + } + + /* If the proper action on seeing token YYTOKEN is to reduce or to + detect an error, take that action. */ + yyn += yytoken; + if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken) + goto yydefault; + yyn = yytable[yyn]; + if (yyn <= 0) + { + if (yytable_value_is_error (yyn)) + goto yyerrlab; + yyn = -yyn; + goto yyreduce; + } + + /* Count tokens shifted since error; after three, turn off error + status. */ + if (yyerrstatus) + yyerrstatus--; + + /* Shift the lookahead token. */ + YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc); + yystate = yyn; + YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN + *++yyvsp = yylval; + YY_IGNORE_MAYBE_UNINITIALIZED_END + + /* Discard the shifted token. */ + yychar = YYEMPTY; + goto yynewstate; + + +/*-----------------------------------------------------------. +| yydefault -- do the default action for the current state. | +`-----------------------------------------------------------*/ +yydefault: + yyn = yydefact[yystate]; + if (yyn == 0) + goto yyerrlab; + goto yyreduce; + + +/*-----------------------------. +| yyreduce -- do a reduction. | +`-----------------------------*/ +yyreduce: + /* yyn is the number of a rule to reduce with. */ + yylen = yyr2[yyn]; + + /* If YYLEN is nonzero, implement the default value of the action: + '$$ = $1'. + + Otherwise, the following line sets YYVAL to garbage. + This behavior is undocumented and Bison + users should not rely upon it. Assigning to YYVAL + unconditionally makes the parser a bit smaller, and it avoids a + GCC warning that YYVAL may be used uninitialized. */ + yyval = yyvsp[1-yylen]; + + + YY_REDUCE_PRINT (yyn); + switch (yyn) + { + case 2: /* result: standby_config */ +#line 57 "syncrep_gram.y" + { syncrep_parse_result = (yyvsp[0].config); } +#line 1125 "syncrep_gram.c" + break; + + case 3: /* standby_config: standby_list */ +#line 61 "syncrep_gram.y" + { (yyval.config) = create_syncrep_config("1", (yyvsp[0].list), SYNC_REP_PRIORITY); } +#line 1131 "syncrep_gram.c" + break; + + case 4: /* standby_config: NUM '(' standby_list ')' */ +#line 62 "syncrep_gram.y" + { (yyval.config) = create_syncrep_config((yyvsp[-3].str), (yyvsp[-1].list), SYNC_REP_PRIORITY); } +#line 1137 "syncrep_gram.c" + break; + + case 5: /* standby_config: ANY NUM '(' standby_list ')' */ +#line 63 "syncrep_gram.y" + { (yyval.config) = create_syncrep_config((yyvsp[-3].str), (yyvsp[-1].list), SYNC_REP_QUORUM); } +#line 1143 "syncrep_gram.c" + break; + + case 6: /* standby_config: FIRST NUM '(' standby_list ')' */ +#line 64 "syncrep_gram.y" + { (yyval.config) = create_syncrep_config((yyvsp[-3].str), (yyvsp[-1].list), SYNC_REP_PRIORITY); } +#line 1149 "syncrep_gram.c" + break; + + case 7: /* standby_list: standby_name */ +#line 68 "syncrep_gram.y" + { (yyval.list) = list_make1((yyvsp[0].str)); } +#line 1155 "syncrep_gram.c" + break; + + case 8: /* standby_list: standby_list ',' standby_name */ +#line 69 "syncrep_gram.y" + { (yyval.list) = lappend((yyvsp[-2].list), (yyvsp[0].str)); } +#line 1161 "syncrep_gram.c" + break; + + case 9: /* standby_name: NAME */ +#line 73 "syncrep_gram.y" + { (yyval.str) = (yyvsp[0].str); } +#line 1167 "syncrep_gram.c" + break; + + case 10: /* standby_name: NUM */ +#line 74 "syncrep_gram.y" + { (yyval.str) = (yyvsp[0].str); } +#line 1173 "syncrep_gram.c" + break; + + +#line 1177 "syncrep_gram.c" + + default: break; + } + /* User semantic actions sometimes alter yychar, and that requires + that yytoken be updated with the new translation. We take the + approach of translating immediately before every use of yytoken. + One alternative is translating here after every semantic action, + but that translation would be missed if the semantic action invokes + YYABORT, YYACCEPT, or YYERROR immediately after altering yychar or + if it invokes YYBACKUP. In the case of YYABORT or YYACCEPT, an + incorrect destructor might then be invoked immediately. In the + case of YYERROR or YYBACKUP, subsequent parser actions might lead + to an incorrect destructor call or verbose syntax error message + before the lookahead is translated. */ + YY_SYMBOL_PRINT ("-> $$ =", YY_CAST (yysymbol_kind_t, yyr1[yyn]), &yyval, &yyloc); + + YYPOPSTACK (yylen); + yylen = 0; + + *++yyvsp = yyval; + + /* Now 'shift' the result of the reduction. Determine what state + that goes to, based on the state we popped back to and the rule + number reduced by. */ + { + const int yylhs = yyr1[yyn] - YYNTOKENS; + const int yyi = yypgoto[yylhs] + *yyssp; + yystate = (0 <= yyi && yyi <= YYLAST && yycheck[yyi] == *yyssp + ? yytable[yyi] + : yydefgoto[yylhs]); + } + + goto yynewstate; + + +/*--------------------------------------. +| yyerrlab -- here on detecting error. | +`--------------------------------------*/ +yyerrlab: + /* Make sure we have latest lookahead translation. See comments at + user semantic actions for why this is necessary. */ + yytoken = yychar == YYEMPTY ? YYSYMBOL_YYEMPTY : YYTRANSLATE (yychar); + /* If not already recovering from an error, report this error. */ + if (!yyerrstatus) + { + ++yynerrs; + yyerror (YY_("syntax error")); + } + + if (yyerrstatus == 3) + { + /* If just tried and failed to reuse lookahead token after an + error, discard it. */ + + if (yychar <= YYEOF) + { + /* Return failure if at end of input. */ + if (yychar == YYEOF) + YYABORT; + } + else + { + yydestruct ("Error: discarding", + yytoken, &yylval); + yychar = YYEMPTY; + } + } + + /* Else will try to reuse lookahead token after shifting the error + token. */ + goto yyerrlab1; + + +/*---------------------------------------------------. +| yyerrorlab -- error raised explicitly by YYERROR. | +`---------------------------------------------------*/ +yyerrorlab: + /* Pacify compilers when the user code never invokes YYERROR and the + label yyerrorlab therefore never appears in user code. */ + if (0) + YYERROR; + + /* Do not reclaim the symbols of the rule whose action triggered + this YYERROR. */ + YYPOPSTACK (yylen); + yylen = 0; + YY_STACK_PRINT (yyss, yyssp); + yystate = *yyssp; + goto yyerrlab1; + + +/*-------------------------------------------------------------. +| yyerrlab1 -- common code for both syntax error and YYERROR. | +`-------------------------------------------------------------*/ +yyerrlab1: + yyerrstatus = 3; /* Each real token shifted decrements this. */ + + /* Pop stack until we find a state that shifts the error token. */ + for (;;) + { + yyn = yypact[yystate]; + if (!yypact_value_is_default (yyn)) + { + yyn += YYSYMBOL_YYerror; + if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYSYMBOL_YYerror) + { + yyn = yytable[yyn]; + if (0 < yyn) + break; + } + } + + /* Pop the current state because it cannot handle the error token. */ + if (yyssp == yyss) + YYABORT; + + + yydestruct ("Error: popping", + YY_ACCESSING_SYMBOL (yystate), yyvsp); + YYPOPSTACK (1); + yystate = *yyssp; + YY_STACK_PRINT (yyss, yyssp); + } + + YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN + *++yyvsp = yylval; + YY_IGNORE_MAYBE_UNINITIALIZED_END + + + /* Shift the error token. */ + YY_SYMBOL_PRINT ("Shifting", YY_ACCESSING_SYMBOL (yyn), yyvsp, yylsp); + + yystate = yyn; + goto yynewstate; + + +/*-------------------------------------. +| yyacceptlab -- YYACCEPT comes here. | +`-------------------------------------*/ +yyacceptlab: + yyresult = 0; + goto yyreturn; + + +/*-----------------------------------. +| yyabortlab -- YYABORT comes here. | +`-----------------------------------*/ +yyabortlab: + yyresult = 1; + goto yyreturn; + + +#if !defined yyoverflow +/*-------------------------------------------------. +| yyexhaustedlab -- memory exhaustion comes here. | +`-------------------------------------------------*/ +yyexhaustedlab: + yyerror (YY_("memory exhausted")); + yyresult = 2; + goto yyreturn; +#endif + + +/*-------------------------------------------------------. +| yyreturn -- parsing is finished, clean up and return. | +`-------------------------------------------------------*/ +yyreturn: + if (yychar != YYEMPTY) + { + /* Make sure we have latest lookahead translation. See comments at + user semantic actions for why this is necessary. */ + yytoken = YYTRANSLATE (yychar); + yydestruct ("Cleanup: discarding lookahead", + yytoken, &yylval); + } + /* Do not reclaim the symbols of the rule whose action triggered + this YYABORT or YYACCEPT. */ + YYPOPSTACK (yylen); + YY_STACK_PRINT (yyss, yyssp); + while (yyssp != yyss) + { + yydestruct ("Cleanup: popping", + YY_ACCESSING_SYMBOL (+*yyssp), yyvsp); + YYPOPSTACK (1); + } +#ifndef yyoverflow + if (yyss != yyssa) + YYSTACK_FREE (yyss); +#endif + + return yyresult; +} + +#line 76 "syncrep_gram.y" + + +static SyncRepConfigData * +create_syncrep_config(const char *num_sync, List *members, uint8 syncrep_method) +{ + SyncRepConfigData *config; + int size; + ListCell *lc; + char *ptr; + + /* Compute space needed for flat representation */ + size = offsetof(SyncRepConfigData, member_names); + foreach(lc, members) + { + char *standby_name = (char *) lfirst(lc); + + size += strlen(standby_name) + 1; + } + + /* And transform the data into flat representation */ + config = (SyncRepConfigData *) palloc(size); + + config->config_size = size; + config->num_sync = atoi(num_sync); + config->syncrep_method = syncrep_method; + config->nmembers = list_length(members); + ptr = config->member_names; + foreach(lc, members) + { + char *standby_name = (char *) lfirst(lc); + + strcpy(ptr, standby_name); + ptr += strlen(standby_name) + 1; + } + + return config; +} diff --git a/src/backend/replication/syncrep_gram.h b/src/backend/replication/syncrep_gram.h new file mode 100644 index 0000000..2a59890 --- /dev/null +++ b/src/backend/replication/syncrep_gram.h @@ -0,0 +1,89 @@ +/* A Bison parser, made by GNU Bison 3.7.5. */ + +/* Bison interface for Yacc-like parsers in C + + Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2021 Free Software Foundation, + Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. + + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + +/* DO NOT RELY ON FEATURES THAT ARE NOT DOCUMENTED in the manual, + especially those whose name start with YY_ or yy_. They are + private implementation details that can be changed or removed. */ + +#ifndef YY_SYNCREP_YY_SYNCREP_GRAM_H_INCLUDED +# define YY_SYNCREP_YY_SYNCREP_GRAM_H_INCLUDED +/* Debug traces. */ +#ifndef YYDEBUG +# define YYDEBUG 0 +#endif +#if YYDEBUG +extern int syncrep_yydebug; +#endif + +/* Token kinds. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + enum yytokentype + { + YYEMPTY = -2, + YYEOF = 0, /* "end of file" */ + YYerror = 256, /* error */ + YYUNDEF = 257, /* "invalid token" */ + NAME = 258, /* NAME */ + NUM = 259, /* NUM */ + JUNK = 260, /* JUNK */ + ANY = 261, /* ANY */ + FIRST = 262 /* FIRST */ + }; + typedef enum yytokentype yytoken_kind_t; +#endif + +/* Value type. */ +#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED +union YYSTYPE +{ +#line 41 "syncrep_gram.y" + + char *str; + List *list; + SyncRepConfigData *config; + +#line 77 "syncrep_gram.h" + +}; +typedef union YYSTYPE YYSTYPE; +# define YYSTYPE_IS_TRIVIAL 1 +# define YYSTYPE_IS_DECLARED 1 +#endif + + +extern YYSTYPE syncrep_yylval; + +int syncrep_yyparse (void); + +#endif /* !YY_SYNCREP_YY_SYNCREP_GRAM_H_INCLUDED */ diff --git a/src/backend/replication/syncrep_gram.y b/src/backend/replication/syncrep_gram.y new file mode 100644 index 0000000..7f31d08 --- /dev/null +++ b/src/backend/replication/syncrep_gram.y @@ -0,0 +1,112 @@ +%{ +/*------------------------------------------------------------------------- + * + * syncrep_gram.y - Parser for synchronous_standby_names + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/replication/syncrep_gram.y + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "nodes/pg_list.h" +#include "replication/syncrep.h" + +/* Result of parsing is returned in one of these two variables */ +SyncRepConfigData *syncrep_parse_result; +char *syncrep_parse_error_msg; + +static SyncRepConfigData *create_syncrep_config(const char *num_sync, + List *members, uint8 syncrep_method); + +/* + * Bison doesn't allocate anything that needs to live across parser calls, + * so we can easily have it use palloc instead of malloc. This prevents + * memory leaks if we error out during parsing. + */ +#define YYMALLOC palloc +#define YYFREE pfree + +%} + +%expect 0 +%name-prefix="syncrep_yy" + +%union +{ + char *str; + List *list; + SyncRepConfigData *config; +} + +%token <str> NAME NUM JUNK ANY FIRST + +%type <config> result standby_config +%type <list> standby_list +%type <str> standby_name + +%start result + +%% +result: + standby_config { syncrep_parse_result = $1; } + ; + +standby_config: + standby_list { $$ = create_syncrep_config("1", $1, SYNC_REP_PRIORITY); } + | NUM '(' standby_list ')' { $$ = create_syncrep_config($1, $3, SYNC_REP_PRIORITY); } + | ANY NUM '(' standby_list ')' { $$ = create_syncrep_config($2, $4, SYNC_REP_QUORUM); } + | FIRST NUM '(' standby_list ')' { $$ = create_syncrep_config($2, $4, SYNC_REP_PRIORITY); } + ; + +standby_list: + standby_name { $$ = list_make1($1); } + | standby_list ',' standby_name { $$ = lappend($1, $3); } + ; + +standby_name: + NAME { $$ = $1; } + | NUM { $$ = $1; } + ; +%% + +static SyncRepConfigData * +create_syncrep_config(const char *num_sync, List *members, uint8 syncrep_method) +{ + SyncRepConfigData *config; + int size; + ListCell *lc; + char *ptr; + + /* Compute space needed for flat representation */ + size = offsetof(SyncRepConfigData, member_names); + foreach(lc, members) + { + char *standby_name = (char *) lfirst(lc); + + size += strlen(standby_name) + 1; + } + + /* And transform the data into flat representation */ + config = (SyncRepConfigData *) palloc(size); + + config->config_size = size; + config->num_sync = atoi(num_sync); + config->syncrep_method = syncrep_method; + config->nmembers = list_length(members); + ptr = config->member_names; + foreach(lc, members) + { + char *standby_name = (char *) lfirst(lc); + + strcpy(ptr, standby_name); + ptr += strlen(standby_name) + 1; + } + + return config; +} diff --git a/src/backend/replication/syncrep_scanner.c b/src/backend/replication/syncrep_scanner.c new file mode 100644 index 0000000..225afd4 --- /dev/null +++ b/src/backend/replication/syncrep_scanner.c @@ -0,0 +1,2171 @@ +#line 2 "syncrep_scanner.c" +/*------------------------------------------------------------------------- + * + * syncrep_scanner.l + * a lexical scanner for synchronous_standby_names + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/replication/syncrep_scanner.l + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "lib/stringinfo.h" +#include "nodes/pg_list.h" + +/* + * NB: include syncrep_gram.h only AFTER including syncrep.h, because syncrep.h + * includes node definitions needed for YYSTYPE. + */ +#include "replication/syncrep.h" +#include "syncrep_gram.h" + +#line 29 "syncrep_scanner.c" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define yy_create_buffer syncrep_yy_create_buffer +#define yy_delete_buffer syncrep_yy_delete_buffer +#define yy_scan_buffer syncrep_yy_scan_buffer +#define yy_scan_string syncrep_yy_scan_string +#define yy_scan_bytes syncrep_yy_scan_bytes +#define yy_init_buffer syncrep_yy_init_buffer +#define yy_flush_buffer syncrep_yy_flush_buffer +#define yy_load_buffer_state syncrep_yy_load_buffer_state +#define yy_switch_to_buffer syncrep_yy_switch_to_buffer +#define yypush_buffer_state syncrep_yypush_buffer_state +#define yypop_buffer_state syncrep_yypop_buffer_state +#define yyensure_buffer_stack syncrep_yyensure_buffer_stack +#define yy_flex_debug syncrep_yy_flex_debug +#define yyin syncrep_yyin +#define yyleng syncrep_yyleng +#define yylex syncrep_yylex +#define yylineno syncrep_yylineno +#define yyout syncrep_yyout +#define yyrestart syncrep_yyrestart +#define yytext syncrep_yytext +#define yywrap syncrep_yywrap +#define yyalloc syncrep_yyalloc +#define yyrealloc syncrep_yyrealloc +#define yyfree syncrep_yyfree + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 6 +#define YY_FLEX_SUBMINOR_VERSION 4 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +#ifdef yy_create_buffer +#define syncrep_yy_create_buffer_ALREADY_DEFINED +#else +#define yy_create_buffer syncrep_yy_create_buffer +#endif + +#ifdef yy_delete_buffer +#define syncrep_yy_delete_buffer_ALREADY_DEFINED +#else +#define yy_delete_buffer syncrep_yy_delete_buffer +#endif + +#ifdef yy_scan_buffer +#define syncrep_yy_scan_buffer_ALREADY_DEFINED +#else +#define yy_scan_buffer syncrep_yy_scan_buffer +#endif + +#ifdef yy_scan_string +#define syncrep_yy_scan_string_ALREADY_DEFINED +#else +#define yy_scan_string syncrep_yy_scan_string +#endif + +#ifdef yy_scan_bytes +#define syncrep_yy_scan_bytes_ALREADY_DEFINED +#else +#define yy_scan_bytes syncrep_yy_scan_bytes +#endif + +#ifdef yy_init_buffer +#define syncrep_yy_init_buffer_ALREADY_DEFINED +#else +#define yy_init_buffer syncrep_yy_init_buffer +#endif + +#ifdef yy_flush_buffer +#define syncrep_yy_flush_buffer_ALREADY_DEFINED +#else +#define yy_flush_buffer syncrep_yy_flush_buffer +#endif + +#ifdef yy_load_buffer_state +#define syncrep_yy_load_buffer_state_ALREADY_DEFINED +#else +#define yy_load_buffer_state syncrep_yy_load_buffer_state +#endif + +#ifdef yy_switch_to_buffer +#define syncrep_yy_switch_to_buffer_ALREADY_DEFINED +#else +#define yy_switch_to_buffer syncrep_yy_switch_to_buffer +#endif + +#ifdef yypush_buffer_state +#define syncrep_yypush_buffer_state_ALREADY_DEFINED +#else +#define yypush_buffer_state syncrep_yypush_buffer_state +#endif + +#ifdef yypop_buffer_state +#define syncrep_yypop_buffer_state_ALREADY_DEFINED +#else +#define yypop_buffer_state syncrep_yypop_buffer_state +#endif + +#ifdef yyensure_buffer_stack +#define syncrep_yyensure_buffer_stack_ALREADY_DEFINED +#else +#define yyensure_buffer_stack syncrep_yyensure_buffer_stack +#endif + +#ifdef yylex +#define syncrep_yylex_ALREADY_DEFINED +#else +#define yylex syncrep_yylex +#endif + +#ifdef yyrestart +#define syncrep_yyrestart_ALREADY_DEFINED +#else +#define yyrestart syncrep_yyrestart +#endif + +#ifdef yylex_init +#define syncrep_yylex_init_ALREADY_DEFINED +#else +#define yylex_init syncrep_yylex_init +#endif + +#ifdef yylex_init_extra +#define syncrep_yylex_init_extra_ALREADY_DEFINED +#else +#define yylex_init_extra syncrep_yylex_init_extra +#endif + +#ifdef yylex_destroy +#define syncrep_yylex_destroy_ALREADY_DEFINED +#else +#define yylex_destroy syncrep_yylex_destroy +#endif + +#ifdef yyget_debug +#define syncrep_yyget_debug_ALREADY_DEFINED +#else +#define yyget_debug syncrep_yyget_debug +#endif + +#ifdef yyset_debug +#define syncrep_yyset_debug_ALREADY_DEFINED +#else +#define yyset_debug syncrep_yyset_debug +#endif + +#ifdef yyget_extra +#define syncrep_yyget_extra_ALREADY_DEFINED +#else +#define yyget_extra syncrep_yyget_extra +#endif + +#ifdef yyset_extra +#define syncrep_yyset_extra_ALREADY_DEFINED +#else +#define yyset_extra syncrep_yyset_extra +#endif + +#ifdef yyget_in +#define syncrep_yyget_in_ALREADY_DEFINED +#else +#define yyget_in syncrep_yyget_in +#endif + +#ifdef yyset_in +#define syncrep_yyset_in_ALREADY_DEFINED +#else +#define yyset_in syncrep_yyset_in +#endif + +#ifdef yyget_out +#define syncrep_yyget_out_ALREADY_DEFINED +#else +#define yyget_out syncrep_yyget_out +#endif + +#ifdef yyset_out +#define syncrep_yyset_out_ALREADY_DEFINED +#else +#define yyset_out syncrep_yyset_out +#endif + +#ifdef yyget_leng +#define syncrep_yyget_leng_ALREADY_DEFINED +#else +#define yyget_leng syncrep_yyget_leng +#endif + +#ifdef yyget_text +#define syncrep_yyget_text_ALREADY_DEFINED +#else +#define yyget_text syncrep_yyget_text +#endif + +#ifdef yyget_lineno +#define syncrep_yyget_lineno_ALREADY_DEFINED +#else +#define yyget_lineno syncrep_yyget_lineno +#endif + +#ifdef yyset_lineno +#define syncrep_yyset_lineno_ALREADY_DEFINED +#else +#define yyset_lineno syncrep_yyset_lineno +#endif + +#ifdef yywrap +#define syncrep_yywrap_ALREADY_DEFINED +#else +#define yywrap syncrep_yywrap +#endif + +#ifdef yyalloc +#define syncrep_yyalloc_ALREADY_DEFINED +#else +#define yyalloc syncrep_yyalloc +#endif + +#ifdef yyrealloc +#define syncrep_yyrealloc_ALREADY_DEFINED +#else +#define yyrealloc syncrep_yyrealloc +#endif + +#ifdef yyfree +#define syncrep_yyfree_ALREADY_DEFINED +#else +#define yyfree syncrep_yyfree +#endif + +#ifdef yytext +#define syncrep_yytext_ALREADY_DEFINED +#else +#define yytext syncrep_yytext +#endif + +#ifdef yyleng +#define syncrep_yyleng_ALREADY_DEFINED +#else +#define yyleng syncrep_yyleng +#endif + +#ifdef yyin +#define syncrep_yyin_ALREADY_DEFINED +#else +#define yyin syncrep_yyin +#endif + +#ifdef yyout +#define syncrep_yyout_ALREADY_DEFINED +#else +#define yyout syncrep_yyout +#endif + +#ifdef yy_flex_debug +#define syncrep_yy_flex_debug_ALREADY_DEFINED +#else +#define yy_flex_debug syncrep_yy_flex_debug +#endif + +#ifdef yylineno +#define syncrep_yylineno_ALREADY_DEFINED +#else +#define yylineno syncrep_yylineno +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include <inttypes.h> +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#ifndef SIZE_MAX +#define SIZE_MAX (~(size_t)0) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +/* begin standard C++ headers. */ + +/* TODO: this is always defined, so inline it */ +#define yyconst const + +#if defined(__GNUC__) && __GNUC__ >= 3 +#define yynoreturn __attribute__((__noreturn__)) +#else +#define yynoreturn +#endif + +/* Returned upon end-of-file. */ +#define YY_NULL 0 + +/* Promotes a possibly negative, possibly signed char to an + * integer in range [0..255] for use as an array index. + */ +#define YY_SC_TO_UI(c) ((YY_CHAR) (c)) + +/* Enter a start condition. This macro really ought to take a parameter, + * but we do it the disgusting crufty way forced on us by the ()-less + * definition of BEGIN. + */ +#define BEGIN (yy_start) = 1 + 2 * +/* Translate the current start state into a value that can be later handed + * to BEGIN to return to the state. The YYSTATE alias is for lex + * compatibility. + */ +#define YY_START (((yy_start) - 1) / 2) +#define YYSTATE YY_START +/* Action number for EOF rule of a given start state. */ +#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) +/* Special action meaning "start processing a new file". */ +#define YY_NEW_FILE yyrestart( yyin ) +#define YY_END_OF_BUFFER_CHAR 0 + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +/* The state buf must be large enough to hold one state per character in the main buffer. + */ +#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type)) + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +extern int yyleng; + +extern FILE *yyin, *yyout; + +#define EOB_ACT_CONTINUE_SCAN 0 +#define EOB_ACT_END_OF_FILE 1 +#define EOB_ACT_LAST_MATCH 2 + + #define YY_LESS_LINENO(n) + #define YY_LINENO_REWIND_TO(ptr) + +/* Return all but the first "n" matched characters back to the input stream. */ +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + *yy_cp = (yy_hold_char); \ + YY_RESTORE_YY_MORE_OFFSET \ + (yy_c_buf_p) = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ + YY_DO_BEFORE_ACTION; /* set up yytext again */ \ + } \ + while ( 0 ) +#define unput(c) yyunput( c, (yytext_ptr) ) + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + int yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + +#define YY_BUFFER_NEW 0 +#define YY_BUFFER_NORMAL 1 + /* When an EOF's been seen but there's still some text to process + * then we mark the buffer as YY_EOF_PENDING, to indicate that we + * shouldn't try reading from the input source any more. We might + * still have a bunch of tokens to match, though, because of + * possible backing-up. + * + * When we actually see the EOF, we change the status to "new" + * (via yyrestart()), so that the user can continue scanning by + * just pointing yyin at a new input file. + */ +#define YY_BUFFER_EOF_PENDING 2 + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +/* Stack of input buffers. */ +static size_t yy_buffer_stack_top = 0; /**< index of top of stack. */ +static size_t yy_buffer_stack_max = 0; /**< capacity of stack. */ +static YY_BUFFER_STATE * yy_buffer_stack = NULL; /**< Stack as an array. */ + +/* We provide macros for accessing buffer states in case in the + * future we want to put the buffer states in a more general + * "scanner state". + * + * Returns the top of the stack, or NULL. + */ +#define YY_CURRENT_BUFFER ( (yy_buffer_stack) \ + ? (yy_buffer_stack)[(yy_buffer_stack_top)] \ + : NULL) +/* Same as previous macro, but useful when we know that the buffer stack is not + * NULL or when we need an lvalue. For internal use only. + */ +#define YY_CURRENT_BUFFER_LVALUE (yy_buffer_stack)[(yy_buffer_stack_top)] + +/* yy_hold_char holds the character lost when yytext is formed. */ +static char yy_hold_char; +static int yy_n_chars; /* number of characters read into yy_ch_buf */ +int yyleng; + +/* Points to current character in buffer. */ +static char *yy_c_buf_p = NULL; +static int yy_init = 0; /* whether we need to initialize */ +static int yy_start = 0; /* start state number */ + +/* Flag which is used to allow yywrap()'s to do buffer switches + * instead of setting up a fresh yyin. A bit of a hack ... + */ +static int yy_did_buffer_switch_on_eof; + +void yyrestart ( FILE *input_file ); +void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer ); +YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size ); +void yy_delete_buffer ( YY_BUFFER_STATE b ); +void yy_flush_buffer ( YY_BUFFER_STATE b ); +void yypush_buffer_state ( YY_BUFFER_STATE new_buffer ); +void yypop_buffer_state ( void ); + +static void yyensure_buffer_stack ( void ); +static void yy_load_buffer_state ( void ); +static void yy_init_buffer ( YY_BUFFER_STATE b, FILE *file ); +#define YY_FLUSH_BUFFER yy_flush_buffer( YY_CURRENT_BUFFER ) + +YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size ); +YY_BUFFER_STATE yy_scan_string ( const char *yy_str ); +YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len ); + +void *yyalloc ( yy_size_t ); +void *yyrealloc ( void *, yy_size_t ); +void yyfree ( void * ); + +#define yy_new_buffer yy_create_buffer +#define yy_set_interactive(is_interactive) \ + { \ + if ( ! YY_CURRENT_BUFFER ){ \ + yyensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer( yyin, YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ + } +#define yy_set_bol(at_bol) \ + { \ + if ( ! YY_CURRENT_BUFFER ){\ + yyensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer( yyin, YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ + } +#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) + +/* Begin user sect3 */ + +#define syncrep_yywrap() (/*CONSTCOND*/1) +#define YY_SKIP_YYWRAP +typedef flex_uint8_t YY_CHAR; + +FILE *yyin = NULL, *yyout = NULL; + +typedef int yy_state_type; + +extern int yylineno; +int yylineno = 1; + +extern char *yytext; +#ifdef yytext_ptr +#undef yytext_ptr +#endif +#define yytext_ptr yytext + +static yy_state_type yy_get_previous_state ( void ); +static yy_state_type yy_try_NUL_trans ( yy_state_type current_state ); +static int yy_get_next_buffer ( void ); +static void yynoreturn yy_fatal_error ( const char* msg ); + +/* Done after the current pattern has been matched and before the + * corresponding action - sets up yytext. + */ +#define YY_DO_BEFORE_ACTION \ + (yytext_ptr) = yy_bp; \ + yyleng = (int) (yy_cp - yy_bp); \ + (yy_hold_char) = *yy_cp; \ + *yy_cp = '\0'; \ + (yy_c_buf_p) = yy_cp; +#define YY_NUM_RULES 15 +#define YY_END_OF_BUFFER 16 +/* This struct is not used in this scanner, + but its presence is necessary. */ +struct yy_trans_info + { + flex_int32_t yy_verify; + flex_int32_t yy_nxt; + }; +static const flex_int16_t yy_accept[32] = + { 0, + 0, 0, 0, 0, 16, 14, 1, 1, 4, 12, + 13, 10, 11, 9, 8, 8, 8, 6, 7, 1, + 9, 8, 8, 8, 6, 5, 2, 8, 8, 3, + 0 + } ; + +static const YY_CHAR yy_ec[256] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 4, 1, 5, 1, 1, 1, 6, + 7, 8, 1, 9, 1, 1, 1, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 1, 1, 1, + 1, 1, 1, 1, 11, 12, 12, 12, 12, 13, + 12, 12, 14, 12, 12, 12, 12, 15, 12, 12, + 12, 16, 17, 18, 12, 12, 12, 12, 19, 12, + 1, 1, 1, 1, 12, 1, 11, 12, 12, 12, + + 12, 13, 12, 12, 14, 12, 12, 12, 12, 15, + 12, 12, 12, 16, 17, 18, 12, 12, 12, 12, + 19, 12, 1, 1, 1, 1, 1, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12 + } ; + +static const YY_CHAR yy_meta[20] = + { 0, + 1, 1, 1, 2, 3, 1, 1, 1, 1, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3 + } ; + +static const flex_int16_t yy_base[35] = + { 0, + 0, 0, 39, 38, 41, 44, 18, 20, 44, 44, + 44, 44, 44, 30, 24, 0, 24, 0, 33, 22, + 26, 0, 16, 18, 0, 44, 0, 16, 14, 0, + 44, 25, 27, 28 + } ; + +static const flex_int16_t yy_def[35] = + { 0, + 31, 1, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 33, 33, 33, 34, 31, 31, + 31, 33, 33, 33, 34, 31, 33, 33, 33, 33, + 0, 31, 31, 31 + } ; + +static const flex_int16_t yy_nxt[64] = + { 0, + 6, 7, 8, 9, 6, 10, 11, 12, 13, 14, + 15, 16, 17, 16, 16, 16, 16, 16, 16, 20, + 20, 20, 20, 20, 20, 18, 18, 18, 25, 22, + 25, 30, 29, 28, 27, 21, 26, 24, 23, 21, + 31, 19, 19, 5, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31 + } ; + +static const flex_int16_t yy_chk[64] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, + 7, 8, 8, 20, 20, 32, 32, 32, 34, 33, + 34, 29, 28, 24, 23, 21, 19, 17, 15, 14, + 5, 4, 3, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31 + } ; + +static yy_state_type yy_last_accepting_state; +static char *yy_last_accepting_cpos; + +extern int yy_flex_debug; +int yy_flex_debug = 0; + +/* The intent behind this definition is that it'll catch + * any uses of REJECT which flex missed. + */ +#define REJECT reject_used_but_not_detected +#define yymore() yymore_used_but_not_detected +#define YY_MORE_ADJ 0 +#define YY_RESTORE_YY_MORE_OFFSET +char *yytext; +#line 1 "syncrep_scanner.l" + +#line 30 "syncrep_scanner.l" +/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */ +#undef fprintf +#define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg) + +static void +fprintf_to_ereport(const char *fmt, const char *msg) +{ + ereport(ERROR, (errmsg_internal("%s", msg))); +} + +/* Handles to the buffer that the lexer uses internally */ +static YY_BUFFER_STATE scanbufhandle; + +static StringInfoData xdbuf; + +/* LCOV_EXCL_START */ + +#line 770 "syncrep_scanner.c" +#define YY_NO_INPUT 1 +/* + * <xd> delimited identifiers (double-quoted identifiers) + */ + +#line 776 "syncrep_scanner.c" + +#define INITIAL 0 +#define xd 1 + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include <unistd.h> +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +static int yy_init_globals ( void ); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. */ + +int yylex_destroy ( void ); + +int yyget_debug ( void ); + +void yyset_debug ( int debug_flag ); + +YY_EXTRA_TYPE yyget_extra ( void ); + +void yyset_extra ( YY_EXTRA_TYPE user_defined ); + +FILE *yyget_in ( void ); + +void yyset_in ( FILE * _in_str ); + +FILE *yyget_out ( void ); + +void yyset_out ( FILE * _out_str ); + + int yyget_leng ( void ); + +char *yyget_text ( void ); + +int yyget_lineno ( void ); + +void yyset_lineno ( int _line_number ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. + */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int yywrap ( void ); +#else +extern int yywrap ( void ); +#endif +#endif + +#ifndef YY_NO_UNPUT + +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy ( char *, const char *, int ); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen ( const char * ); +#endif + +#ifndef YY_NO_INPUT +#ifdef __cplusplus +static int yyinput ( void ); +#else +static int input ( void ); +#endif + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else +#define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ +#endif + +/* Copy whatever the last rule matched to the standard output. */ +#ifndef ECHO +/* This used to be an fputs(), but since the string might contain NUL's, + * we now use fwrite(). + */ +#define ECHO do { if (fwrite( yytext, (size_t) yyleng, 1, yyout )) {} } while (0) +#endif + +/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL, + * is returned in "result". + */ +#ifndef YY_INPUT +#define YY_INPUT(buf,result,max_size) \ + if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \ + { \ + int c = '*'; \ + int n; \ + for ( n = 0; n < max_size && \ + (c = getc( yyin )) != EOF && c != '\n'; ++n ) \ + buf[n] = (char) c; \ + if ( c == '\n' ) \ + buf[n++] = (char) c; \ + if ( c == EOF && ferror( yyin ) ) \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + result = n; \ + } \ + else \ + { \ + errno=0; \ + while ( (result = (int) fread(buf, 1, (yy_size_t) max_size, yyin)) == 0 && ferror(yyin)) \ + { \ + if( errno != EINTR) \ + { \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + break; \ + } \ + errno=0; \ + clearerr(yyin); \ + } \ + }\ +\ + +#endif + +/* No semi-colon after return; correct usage is to write "yyterminate();" - + * we don't want an extra ';' after the "return" because that will cause + * some compilers to complain about unreachable statements. + */ +#ifndef yyterminate +#define yyterminate() return YY_NULL +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Report a fatal error. */ +#ifndef YY_FATAL_ERROR +#define YY_FATAL_ERROR(msg) yy_fatal_error( msg ) +#endif + +/* end tables serialization structures and prototypes */ + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. + */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int yylex (void); + +#define YY_DECL int yylex (void) +#endif /* !YY_DECL */ + +/* Code executed at the beginning of each rule, after yytext and yyleng + * have been set up. + */ +#ifndef YY_USER_ACTION +#define YY_USER_ACTION +#endif + +/* Code executed at the end of each rule. */ +#ifndef YY_BREAK +#define YY_BREAK /*LINTED*/break; +#endif + +#define YY_RULE_SETUP \ + YY_USER_ACTION + +/** The main scanner function which does all the work. + */ +YY_DECL +{ + yy_state_type yy_current_state; + char *yy_cp, *yy_bp; + int yy_act; + + if ( !(yy_init) ) + { + (yy_init) = 1; + +#ifdef YY_USER_INIT + YY_USER_INIT; +#endif + + if ( ! (yy_start) ) + (yy_start) = 1; /* first start state */ + + if ( ! yyin ) + yyin = stdin; + + if ( ! yyout ) + yyout = stdout; + + if ( ! YY_CURRENT_BUFFER ) { + yyensure_buffer_stack (); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer( yyin, YY_BUF_SIZE ); + } + + yy_load_buffer_state( ); + } + + { +#line 76 "syncrep_scanner.l" + +#line 994 "syncrep_scanner.c" + + while ( /*CONSTCOND*/1 ) /* loops until end-of-file is reached */ + { + yy_cp = (yy_c_buf_p); + + /* Support of yytext. */ + *yy_cp = (yy_hold_char); + + /* yy_bp points to the position in yy_ch_buf of the start of + * the current run. + */ + yy_bp = yy_cp; + + yy_current_state = (yy_start); +yy_match: + do + { + YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)] ; + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 32 ) + yy_c = yy_meta[yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; + ++yy_cp; + } + while ( yy_current_state != 31 ); + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + +yy_find_action: + yy_act = yy_accept[yy_current_state]; + + YY_DO_BEFORE_ACTION; + +do_action: /* This label is used only to access EOF actions. */ + + switch ( yy_act ) + { /* beginning of action switch */ + case 0: /* must back up */ + /* undo the effects of YY_DO_BEFORE_ACTION */ + *yy_cp = (yy_hold_char); + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + goto yy_find_action; + +case 1: +/* rule 1 can match eol */ +YY_RULE_SETUP +#line 77 "syncrep_scanner.l" +{ /* ignore */ } + YY_BREAK +/* brute-force case insensitivity is safer than relying on flex -i */ +case 2: +YY_RULE_SETUP +#line 81 "syncrep_scanner.l" +{ return ANY; } + YY_BREAK +case 3: +YY_RULE_SETUP +#line 82 "syncrep_scanner.l" +{ return FIRST; } + YY_BREAK +case 4: +YY_RULE_SETUP +#line 84 "syncrep_scanner.l" +{ + initStringInfo(&xdbuf); + BEGIN(xd); + } + YY_BREAK +case 5: +YY_RULE_SETUP +#line 88 "syncrep_scanner.l" +{ + appendStringInfoChar(&xdbuf, '"'); + } + YY_BREAK +case 6: +/* rule 6 can match eol */ +YY_RULE_SETUP +#line 91 "syncrep_scanner.l" +{ + appendStringInfoString(&xdbuf, yytext); + } + YY_BREAK +case 7: +YY_RULE_SETUP +#line 94 "syncrep_scanner.l" +{ + syncrep_yylval.str = xdbuf.data; + xdbuf.data = NULL; + BEGIN(INITIAL); + return NAME; + } + YY_BREAK +case YY_STATE_EOF(xd): +#line 100 "syncrep_scanner.l" +{ + syncrep_yyerror("unterminated quoted identifier"); + return JUNK; + } + YY_BREAK +case 8: +YY_RULE_SETUP +#line 105 "syncrep_scanner.l" +{ + syncrep_yylval.str = pstrdup(yytext); + return NAME; + } + YY_BREAK +case 9: +YY_RULE_SETUP +#line 110 "syncrep_scanner.l" +{ + syncrep_yylval.str = pstrdup(yytext); + return NUM; + } + YY_BREAK +case 10: +YY_RULE_SETUP +#line 115 "syncrep_scanner.l" +{ + syncrep_yylval.str = "*"; + return NAME; + } + YY_BREAK +case 11: +YY_RULE_SETUP +#line 120 "syncrep_scanner.l" +{ return ','; } + YY_BREAK +case 12: +YY_RULE_SETUP +#line 121 "syncrep_scanner.l" +{ return '('; } + YY_BREAK +case 13: +YY_RULE_SETUP +#line 122 "syncrep_scanner.l" +{ return ')'; } + YY_BREAK +case 14: +YY_RULE_SETUP +#line 124 "syncrep_scanner.l" +{ return JUNK; } + YY_BREAK +case 15: +YY_RULE_SETUP +#line 125 "syncrep_scanner.l" +YY_FATAL_ERROR( "flex scanner jammed" ); + YY_BREAK +#line 1153 "syncrep_scanner.c" +case YY_STATE_EOF(INITIAL): + yyterminate(); + + case YY_END_OF_BUFFER: + { + /* Amount of text matched not including the EOB char. */ + int yy_amount_of_matched_text = (int) (yy_cp - (yytext_ptr)) - 1; + + /* Undo the effects of YY_DO_BEFORE_ACTION. */ + *yy_cp = (yy_hold_char); + YY_RESTORE_YY_MORE_OFFSET + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) + { + /* We're scanning a new file or input source. It's + * possible that this happened because the user + * just pointed yyin at a new source and called + * yylex(). If so, then we have to assure + * consistency between YY_CURRENT_BUFFER and our + * globals. Here is the right place to do so, because + * this is the first action (other than possibly a + * back-up) that will match for the new input source. + */ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; + } + + /* Note that here we test for yy_c_buf_p "<=" to the position + * of the first EOB in the buffer, since yy_c_buf_p will + * already have been incremented past the NUL character + * (since all states make transitions on EOB to the + * end-of-buffer state). Contrast this with the test + * in input(). + */ + if ( (yy_c_buf_p) <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + { /* This was really a NUL. */ + yy_state_type yy_next_state; + + (yy_c_buf_p) = (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + /* Okay, we're now positioned to make the NUL + * transition. We couldn't have + * yy_get_previous_state() go ahead and do it + * for us because it doesn't know how to deal + * with the possibility of jamming (and we don't + * want to build jamming into it because then it + * will run more slowly). + */ + + yy_next_state = yy_try_NUL_trans( yy_current_state ); + + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + + if ( yy_next_state ) + { + /* Consume the NUL. */ + yy_cp = ++(yy_c_buf_p); + yy_current_state = yy_next_state; + goto yy_match; + } + + else + { + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + goto yy_find_action; + } + } + + else switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_END_OF_FILE: + { + (yy_did_buffer_switch_on_eof) = 0; + + if ( yywrap( ) ) + { + /* Note: because we've taken care in + * yy_get_next_buffer() to have set up + * yytext, we can now set up + * yy_c_buf_p so that if some total + * hoser (like flex itself) wants to + * call the scanner after we return the + * YY_NULL, it'll still work - another + * YY_NULL will get returned. + */ + (yy_c_buf_p) = (yytext_ptr) + YY_MORE_ADJ; + + yy_act = YY_STATE_EOF(YY_START); + goto do_action; + } + + else + { + if ( ! (yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; + } + break; + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = + (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_match; + + case EOB_ACT_LAST_MATCH: + (yy_c_buf_p) = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)]; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_find_action; + } + break; + } + + default: + YY_FATAL_ERROR( + "fatal flex scanner internal error--no action found" ); + } /* end of action switch */ + } /* end of scanning one token */ + } /* end of user's declarations */ +} /* end of yylex */ + +/* yy_get_next_buffer - try to read in a new buffer + * + * Returns a code representing an action: + * EOB_ACT_LAST_MATCH - + * EOB_ACT_CONTINUE_SCAN - continue scanning from current position + * EOB_ACT_END_OF_FILE - end of file + */ +static int yy_get_next_buffer (void) +{ + char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; + char *source = (yytext_ptr); + int number_to_move, i; + int ret_val; + + if ( (yy_c_buf_p) > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] ) + YY_FATAL_ERROR( + "fatal flex scanner internal error--end of buffer missed" ); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) + { /* Don't try to fill the buffer, so this is an EOF. */ + if ( (yy_c_buf_p) - (yytext_ptr) - YY_MORE_ADJ == 1 ) + { + /* We matched a single character, the EOB, so + * treat this as a final EOF. + */ + return EOB_ACT_END_OF_FILE; + } + + else + { + /* We matched some text prior to the EOB, first + * process it. + */ + return EOB_ACT_LAST_MATCH; + } + } + + /* Try to read more data. */ + + /* First move last chars to start of buffer. */ + number_to_move = (int) ((yy_c_buf_p) - (yytext_ptr) - 1); + + for ( i = 0; i < number_to_move; ++i ) + *(dest++) = *(source++); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) + /* don't do the read, it's not guaranteed to return an EOF, + * just force an EOF + */ + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars) = 0; + + else + { + int num_to_read = + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1; + + while ( num_to_read <= 0 ) + { /* Not enough room in the buffer - grow it. */ + + /* just a shorter name for the current buffer */ + YY_BUFFER_STATE b = YY_CURRENT_BUFFER_LVALUE; + + int yy_c_buf_p_offset = + (int) ((yy_c_buf_p) - b->yy_ch_buf); + + if ( b->yy_is_our_buffer ) + { + int new_size = b->yy_buf_size * 2; + + if ( new_size <= 0 ) + b->yy_buf_size += b->yy_buf_size / 8; + else + b->yy_buf_size *= 2; + + b->yy_ch_buf = (char *) + /* Include room in for 2 EOB chars. */ + yyrealloc( (void *) b->yy_ch_buf, + (yy_size_t) (b->yy_buf_size + 2) ); + } + else + /* Can't grow it, we don't own it. */ + b->yy_ch_buf = NULL; + + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( + "fatal error - scanner input buffer overflow" ); + + (yy_c_buf_p) = &b->yy_ch_buf[yy_c_buf_p_offset]; + + num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size - + number_to_move - 1; + + } + + if ( num_to_read > YY_READ_BUF_SIZE ) + num_to_read = YY_READ_BUF_SIZE; + + /* Read in more data. */ + YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), + (yy_n_chars), num_to_read ); + + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + if ( (yy_n_chars) == 0 ) + { + if ( number_to_move == YY_MORE_ADJ ) + { + ret_val = EOB_ACT_END_OF_FILE; + yyrestart( yyin ); + } + + else + { + ret_val = EOB_ACT_LAST_MATCH; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = + YY_BUFFER_EOF_PENDING; + } + } + + else + ret_val = EOB_ACT_CONTINUE_SCAN; + + if (((yy_n_chars) + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) { + /* Extend the array by 50%, plus the number we really need. */ + int new_size = (yy_n_chars) + number_to_move + ((yy_n_chars) >> 1); + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) yyrealloc( + (void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf, (yy_size_t) new_size ); + if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); + /* "- 2" to take care of EOB's */ + YY_CURRENT_BUFFER_LVALUE->yy_buf_size = (int) (new_size - 2); + } + + (yy_n_chars) += number_to_move; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] = YY_END_OF_BUFFER_CHAR; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] = YY_END_OF_BUFFER_CHAR; + + (yytext_ptr) = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; + + return ret_val; +} + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + + static yy_state_type yy_get_previous_state (void) +{ + yy_state_type yy_current_state; + char *yy_cp; + + yy_current_state = (yy_start); + + for ( yy_cp = (yytext_ptr) + YY_MORE_ADJ; yy_cp < (yy_c_buf_p); ++yy_cp ) + { + YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 32 ) + yy_c = yy_meta[yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; + } + + return yy_current_state; +} + +/* yy_try_NUL_trans - try to make a transition on the NUL character + * + * synopsis + * next_state = yy_try_NUL_trans( current_state ); + */ + static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state ) +{ + int yy_is_jam; + char *yy_cp = (yy_c_buf_p); + + YY_CHAR yy_c = 1; + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 32 ) + yy_c = yy_meta[yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; + yy_is_jam = (yy_current_state == 31); + + return yy_is_jam ? 0 : yy_current_state; +} + +#ifndef YY_NO_UNPUT + +#endif + +#ifndef YY_NO_INPUT +#ifdef __cplusplus + static int yyinput (void) +#else + static int input (void) +#endif + +{ + int c; + + *(yy_c_buf_p) = (yy_hold_char); + + if ( *(yy_c_buf_p) == YY_END_OF_BUFFER_CHAR ) + { + /* yy_c_buf_p now points to the character we want to return. + * If this occurs *before* the EOB characters, then it's a + * valid NUL; if not, then we've hit the end of the buffer. + */ + if ( (yy_c_buf_p) < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + /* This was really a NUL. */ + *(yy_c_buf_p) = '\0'; + + else + { /* need more input */ + int offset = (int) ((yy_c_buf_p) - (yytext_ptr)); + ++(yy_c_buf_p); + + switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_LAST_MATCH: + /* This happens because yy_g_n_b() + * sees that we've accumulated a + * token and flags that we need to + * try matching the token before + * proceeding. But for input(), + * there's no matching to consider. + * So convert the EOB_ACT_LAST_MATCH + * to EOB_ACT_END_OF_FILE. + */ + + /* Reset buffer status. */ + yyrestart( yyin ); + + /*FALLTHROUGH*/ + + case EOB_ACT_END_OF_FILE: + { + if ( yywrap( ) ) + return 0; + + if ( ! (yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; +#ifdef __cplusplus + return yyinput(); +#else + return input(); +#endif + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = (yytext_ptr) + offset; + break; + } + } + } + + c = *(unsigned char *) (yy_c_buf_p); /* cast for 8-bit char's */ + *(yy_c_buf_p) = '\0'; /* preserve yytext */ + (yy_hold_char) = *++(yy_c_buf_p); + + return c; +} +#endif /* ifndef YY_NO_INPUT */ + +/** Immediately switch to a different input stream. + * @param input_file A readable stream. + * + * @note This function does not reset the start condition to @c INITIAL . + */ + void yyrestart (FILE * input_file ) +{ + + if ( ! YY_CURRENT_BUFFER ){ + yyensure_buffer_stack (); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer( yyin, YY_BUF_SIZE ); + } + + yy_init_buffer( YY_CURRENT_BUFFER, input_file ); + yy_load_buffer_state( ); +} + +/** Switch to a different input buffer. + * @param new_buffer The new input buffer. + * + */ + void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ) +{ + + /* TODO. We should be able to replace this entire function body + * with + * yypop_buffer_state(); + * yypush_buffer_state(new_buffer); + */ + yyensure_buffer_stack (); + if ( YY_CURRENT_BUFFER == new_buffer ) + return; + + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + YY_CURRENT_BUFFER_LVALUE = new_buffer; + yy_load_buffer_state( ); + + /* We don't actually know whether we did this switch during + * EOF (yywrap()) processing, but the only time this flag + * is looked at is after yywrap() is called, so it's safe + * to go ahead and always set it. + */ + (yy_did_buffer_switch_on_eof) = 1; +} + +static void yy_load_buffer_state (void) +{ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + (yytext_ptr) = (yy_c_buf_p) = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; + yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; + (yy_hold_char) = *(yy_c_buf_p); +} + +/** Allocate and initialize an input buffer state. + * @param file A readable stream. + * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE. + * + * @return the allocated buffer state. + */ + YY_BUFFER_STATE yy_create_buffer (FILE * file, int size ) +{ + YY_BUFFER_STATE b; + + b = (YY_BUFFER_STATE) yyalloc( sizeof( struct yy_buffer_state ) ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_buf_size = size; + + /* yy_ch_buf has to be 2 characters longer than the size given because + * we need to put in 2 end-of-buffer characters. + */ + b->yy_ch_buf = (char *) yyalloc( (yy_size_t) (b->yy_buf_size + 2) ); + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_is_our_buffer = 1; + + yy_init_buffer( b, file ); + + return b; +} + +/** Destroy the buffer. + * @param b a buffer created with yy_create_buffer() + * + */ + void yy_delete_buffer (YY_BUFFER_STATE b ) +{ + + if ( ! b ) + return; + + if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */ + YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; + + if ( b->yy_is_our_buffer ) + yyfree( (void *) b->yy_ch_buf ); + + yyfree( (void *) b ); +} + +/* Initializes or reinitializes a buffer. + * This function is sometimes called more than once on the same buffer, + * such as during a yyrestart() or at EOF. + */ + static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file ) + +{ + int oerrno = errno; + + yy_flush_buffer( b ); + + b->yy_input_file = file; + b->yy_fill_buffer = 1; + + /* If b is the current buffer, then yy_init_buffer was _probably_ + * called from yyrestart() or through yy_get_next_buffer. + * In that case, we don't want to reset the lineno or column. + */ + if (b != YY_CURRENT_BUFFER){ + b->yy_bs_lineno = 1; + b->yy_bs_column = 0; + } + + b->yy_is_interactive = 0; + + errno = oerrno; +} + +/** Discard all buffered characters. On the next scan, YY_INPUT will be called. + * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. + * + */ + void yy_flush_buffer (YY_BUFFER_STATE b ) +{ + if ( ! b ) + return; + + b->yy_n_chars = 0; + + /* We always need two end-of-buffer characters. The first causes + * a transition to the end-of-buffer state. The second causes + * a jam in that state. + */ + b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; + b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; + + b->yy_buf_pos = &b->yy_ch_buf[0]; + + b->yy_at_bol = 1; + b->yy_buffer_status = YY_BUFFER_NEW; + + if ( b == YY_CURRENT_BUFFER ) + yy_load_buffer_state( ); +} + +/** Pushes the new state onto the stack. The new state becomes + * the current state. This function will allocate the stack + * if necessary. + * @param new_buffer The new state. + * + */ +void yypush_buffer_state (YY_BUFFER_STATE new_buffer ) +{ + if (new_buffer == NULL) + return; + + yyensure_buffer_stack(); + + /* This block is copied from yy_switch_to_buffer. */ + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + /* Only push if top exists. Otherwise, replace top. */ + if (YY_CURRENT_BUFFER) + (yy_buffer_stack_top)++; + YY_CURRENT_BUFFER_LVALUE = new_buffer; + + /* copied from yy_switch_to_buffer. */ + yy_load_buffer_state( ); + (yy_did_buffer_switch_on_eof) = 1; +} + +/** Removes and deletes the top of the stack, if present. + * The next element becomes the new top. + * + */ +void yypop_buffer_state (void) +{ + if (!YY_CURRENT_BUFFER) + return; + + yy_delete_buffer(YY_CURRENT_BUFFER ); + YY_CURRENT_BUFFER_LVALUE = NULL; + if ((yy_buffer_stack_top) > 0) + --(yy_buffer_stack_top); + + if (YY_CURRENT_BUFFER) { + yy_load_buffer_state( ); + (yy_did_buffer_switch_on_eof) = 1; + } +} + +/* Allocates the stack if it does not exist. + * Guarantees space for at least one push. + */ +static void yyensure_buffer_stack (void) +{ + yy_size_t num_to_alloc; + + if (!(yy_buffer_stack)) { + + /* First allocation is just for 2 elements, since we don't know if this + * scanner will even need a stack. We use 2 instead of 1 to avoid an + * immediate realloc on the next call. + */ + num_to_alloc = 1; /* After all that talk, this was set to 1 anyways... */ + (yy_buffer_stack) = (struct yy_buffer_state**)yyalloc + (num_to_alloc * sizeof(struct yy_buffer_state*) + ); + if ( ! (yy_buffer_stack) ) + YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); + + memset((yy_buffer_stack), 0, num_to_alloc * sizeof(struct yy_buffer_state*)); + + (yy_buffer_stack_max) = num_to_alloc; + (yy_buffer_stack_top) = 0; + return; + } + + if ((yy_buffer_stack_top) >= ((yy_buffer_stack_max)) - 1){ + + /* Increase the buffer to prepare for a possible push. */ + yy_size_t grow_size = 8 /* arbitrary grow size */; + + num_to_alloc = (yy_buffer_stack_max) + grow_size; + (yy_buffer_stack) = (struct yy_buffer_state**)yyrealloc + ((yy_buffer_stack), + num_to_alloc * sizeof(struct yy_buffer_state*) + ); + if ( ! (yy_buffer_stack) ) + YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); + + /* zero only the new slots.*/ + memset((yy_buffer_stack) + (yy_buffer_stack_max), 0, grow_size * sizeof(struct yy_buffer_state*)); + (yy_buffer_stack_max) = num_to_alloc; + } +} + +/** Setup the input buffer state to scan directly from a user-specified character buffer. + * @param base the character buffer + * @param size the size in bytes of the character buffer + * + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE yy_scan_buffer (char * base, yy_size_t size ) +{ + YY_BUFFER_STATE b; + + if ( size < 2 || + base[size-2] != YY_END_OF_BUFFER_CHAR || + base[size-1] != YY_END_OF_BUFFER_CHAR ) + /* They forgot to leave room for the EOB's. */ + return NULL; + + b = (YY_BUFFER_STATE) yyalloc( sizeof( struct yy_buffer_state ) ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_buffer()" ); + + b->yy_buf_size = (int) (size - 2); /* "- 2" to take care of EOB's */ + b->yy_buf_pos = b->yy_ch_buf = base; + b->yy_is_our_buffer = 0; + b->yy_input_file = NULL; + b->yy_n_chars = b->yy_buf_size; + b->yy_is_interactive = 0; + b->yy_at_bol = 1; + b->yy_fill_buffer = 0; + b->yy_buffer_status = YY_BUFFER_NEW; + + yy_switch_to_buffer( b ); + + return b; +} + +/** Setup the input buffer state to scan a string. The next call to yylex() will + * scan from a @e copy of @a str. + * @param yystr a NUL-terminated string to scan + * + * @return the newly allocated buffer state object. + * @note If you want to scan bytes that may contain NUL values, then use + * yy_scan_bytes() instead. + */ +YY_BUFFER_STATE yy_scan_string (const char * yystr ) +{ + + return yy_scan_bytes( yystr, (int) strlen(yystr) ); +} + +/** Setup the input buffer state to scan the given bytes. The next call to yylex() will + * scan from a @e copy of @a bytes. + * @param yybytes the byte buffer to scan + * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes. + * + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE yy_scan_bytes (const char * yybytes, int _yybytes_len ) +{ + YY_BUFFER_STATE b; + char *buf; + yy_size_t n; + int i; + + /* Get memory for full buffer, including space for trailing EOB's. */ + n = (yy_size_t) (_yybytes_len + 2); + buf = (char *) yyalloc( n ); + if ( ! buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_bytes()" ); + + for ( i = 0; i < _yybytes_len; ++i ) + buf[i] = yybytes[i]; + + buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR; + + b = yy_scan_buffer( buf, n ); + if ( ! b ) + YY_FATAL_ERROR( "bad buffer in yy_scan_bytes()" ); + + /* It's okay to grow etc. this buffer, and we should throw it + * away when we're done. + */ + b->yy_is_our_buffer = 1; + + return b; +} + +#ifndef YY_EXIT_FAILURE +#define YY_EXIT_FAILURE 2 +#endif + +static void yynoreturn yy_fatal_error (const char* msg ) +{ + fprintf( stderr, "%s\n", msg ); + exit( YY_EXIT_FAILURE ); +} + +/* Redefine yyless() so it works in section 3 code. */ + +#undef yyless +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + yytext[yyleng] = (yy_hold_char); \ + (yy_c_buf_p) = yytext + yyless_macro_arg; \ + (yy_hold_char) = *(yy_c_buf_p); \ + *(yy_c_buf_p) = '\0'; \ + yyleng = yyless_macro_arg; \ + } \ + while ( 0 ) + +/* Accessor methods (get/set functions) to struct members. */ + +/** Get the current line number. + * + */ +int yyget_lineno (void) +{ + + return yylineno; +} + +/** Get the input stream. + * + */ +FILE *yyget_in (void) +{ + return yyin; +} + +/** Get the output stream. + * + */ +FILE *yyget_out (void) +{ + return yyout; +} + +/** Get the length of the current token. + * + */ +int yyget_leng (void) +{ + return yyleng; +} + +/** Get the current token. + * + */ + +char *yyget_text (void) +{ + return yytext; +} + +/** Set the current line number. + * @param _line_number line number + * + */ +void yyset_lineno (int _line_number ) +{ + + yylineno = _line_number; +} + +/** Set the input stream. This does not discard the current + * input buffer. + * @param _in_str A readable stream. + * + * @see yy_switch_to_buffer + */ +void yyset_in (FILE * _in_str ) +{ + yyin = _in_str ; +} + +void yyset_out (FILE * _out_str ) +{ + yyout = _out_str ; +} + +int yyget_debug (void) +{ + return yy_flex_debug; +} + +void yyset_debug (int _bdebug ) +{ + yy_flex_debug = _bdebug ; +} + +static int yy_init_globals (void) +{ + /* Initialization is the same as for the non-reentrant scanner. + * This function is called from yylex_destroy(), so don't allocate here. + */ + + (yy_buffer_stack) = NULL; + (yy_buffer_stack_top) = 0; + (yy_buffer_stack_max) = 0; + (yy_c_buf_p) = NULL; + (yy_init) = 0; + (yy_start) = 0; + +/* Defined in main.c */ +#ifdef YY_STDINIT + yyin = stdin; + yyout = stdout; +#else + yyin = NULL; + yyout = NULL; +#endif + + /* For future reference: Set errno on error, since we are called by + * yylex_init() + */ + return 0; +} + +/* yylex_destroy is for both reentrant and non-reentrant scanners. */ +int yylex_destroy (void) +{ + + /* Pop the buffer stack, destroying each element. */ + while(YY_CURRENT_BUFFER){ + yy_delete_buffer( YY_CURRENT_BUFFER ); + YY_CURRENT_BUFFER_LVALUE = NULL; + yypop_buffer_state(); + } + + /* Destroy the stack itself. */ + yyfree((yy_buffer_stack) ); + (yy_buffer_stack) = NULL; + + /* Reset the globals. This is important in a non-reentrant scanner so the next time + * yylex() is called, initialization will occur. */ + yy_init_globals( ); + + return 0; +} + +/* + * Internal utility routines. + */ + +#ifndef yytext_ptr +static void yy_flex_strncpy (char* s1, const char * s2, int n ) +{ + + int i; + for ( i = 0; i < n; ++i ) + s1[i] = s2[i]; +} +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (const char * s ) +{ + int n; + for ( n = 0; s[n]; ++n ) + ; + + return n; +} +#endif + +void *yyalloc (yy_size_t size ) +{ + return malloc(size); +} + +void *yyrealloc (void * ptr, yy_size_t size ) +{ + + /* The cast to (char *) in the following accommodates both + * implementations that use char* generic pointers, and those + * that use void* generic pointers. It works with the latter + * because both ANSI C and C++ allow castless assignment from + * any pointer type to void*, and deal with argument conversions + * as though doing an assignment. + */ + return realloc(ptr, size); +} + +void yyfree (void * ptr ) +{ + free( (char *) ptr ); /* see yyrealloc() for (char *) cast */ +} + +#define YYTABLES_NAME "yytables" + +#line 125 "syncrep_scanner.l" + + +/* LCOV_EXCL_STOP */ + +/* Needs to be here for access to yytext */ +void +syncrep_yyerror(const char *message) +{ + /* report only the first error in a parse operation */ + if (syncrep_parse_error_msg) + return; + if (yytext[0]) + syncrep_parse_error_msg = psprintf("%s at or near \"%s\"", + message, yytext); + else + syncrep_parse_error_msg = psprintf("%s at end of input", + message); +} + +void +syncrep_scanner_init(const char *str) +{ + Size slen = strlen(str); + char *scanbuf; + + /* + * Might be left over after ereport() + */ + if (YY_CURRENT_BUFFER) + yy_delete_buffer(YY_CURRENT_BUFFER); + + /* + * Make a scan buffer with special termination needed by flex. + */ + scanbuf = (char *) palloc(slen + 2); + memcpy(scanbuf, str, slen); + scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR; + scanbufhandle = yy_scan_buffer(scanbuf, slen + 2); + + /* Make sure we start in proper state */ + BEGIN(INITIAL); +} + +void +syncrep_scanner_finish(void) +{ + yy_delete_buffer(scanbufhandle); + scanbufhandle = NULL; +} + diff --git a/src/backend/replication/syncrep_scanner.l b/src/backend/replication/syncrep_scanner.l new file mode 100644 index 0000000..97c88f8 --- /dev/null +++ b/src/backend/replication/syncrep_scanner.l @@ -0,0 +1,173 @@ +%top{ +/*------------------------------------------------------------------------- + * + * syncrep_scanner.l + * a lexical scanner for synchronous_standby_names + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/replication/syncrep_scanner.l + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "lib/stringinfo.h" +#include "nodes/pg_list.h" + +/* + * NB: include syncrep_gram.h only AFTER including syncrep.h, because syncrep.h + * includes node definitions needed for YYSTYPE. + */ +#include "replication/syncrep.h" +#include "syncrep_gram.h" +} + +%{ +/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */ +#undef fprintf +#define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg) + +static void +fprintf_to_ereport(const char *fmt, const char *msg) +{ + ereport(ERROR, (errmsg_internal("%s", msg))); +} + +/* Handles to the buffer that the lexer uses internally */ +static YY_BUFFER_STATE scanbufhandle; + +static StringInfoData xdbuf; + +/* LCOV_EXCL_START */ + +%} + +%option 8bit +%option never-interactive +%option nodefault +%option noinput +%option nounput +%option noyywrap +%option warn +%option prefix="syncrep_yy" + +/* + * <xd> delimited identifiers (double-quoted identifiers) + */ +%x xd + +space [ \t\n\r\f\v] + +digit [0-9] +ident_start [A-Za-z\200-\377_] +ident_cont [A-Za-z\200-\377_0-9\$] +identifier {ident_start}{ident_cont}* + +dquote \" +xdstart {dquote} +xdstop {dquote} +xddouble {dquote}{dquote} +xdinside [^"]+ + +%% +{space}+ { /* ignore */ } + + /* brute-force case insensitivity is safer than relying on flex -i */ + +[Aa][Nn][Yy] { return ANY; } +[Ff][Ii][Rr][Ss][Tt] { return FIRST; } + +{xdstart} { + initStringInfo(&xdbuf); + BEGIN(xd); + } +<xd>{xddouble} { + appendStringInfoChar(&xdbuf, '"'); + } +<xd>{xdinside} { + appendStringInfoString(&xdbuf, yytext); + } +<xd>{xdstop} { + syncrep_yylval.str = xdbuf.data; + xdbuf.data = NULL; + BEGIN(INITIAL); + return NAME; + } +<xd><<EOF>> { + syncrep_yyerror("unterminated quoted identifier"); + return JUNK; + } + +{identifier} { + syncrep_yylval.str = pstrdup(yytext); + return NAME; + } + +{digit}+ { + syncrep_yylval.str = pstrdup(yytext); + return NUM; + } + +"*" { + syncrep_yylval.str = "*"; + return NAME; + } + +"," { return ','; } +"(" { return '('; } +")" { return ')'; } + +. { return JUNK; } +%% + +/* LCOV_EXCL_STOP */ + +/* Needs to be here for access to yytext */ +void +syncrep_yyerror(const char *message) +{ + /* report only the first error in a parse operation */ + if (syncrep_parse_error_msg) + return; + if (yytext[0]) + syncrep_parse_error_msg = psprintf("%s at or near \"%s\"", + message, yytext); + else + syncrep_parse_error_msg = psprintf("%s at end of input", + message); +} + +void +syncrep_scanner_init(const char *str) +{ + Size slen = strlen(str); + char *scanbuf; + + /* + * Might be left over after ereport() + */ + if (YY_CURRENT_BUFFER) + yy_delete_buffer(YY_CURRENT_BUFFER); + + /* + * Make a scan buffer with special termination needed by flex. + */ + scanbuf = (char *) palloc(slen + 2); + memcpy(scanbuf, str, slen); + scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR; + scanbufhandle = yy_scan_buffer(scanbuf, slen + 2); + + /* Make sure we start in proper state */ + BEGIN(INITIAL); +} + +void +syncrep_scanner_finish(void) +{ + yy_delete_buffer(scanbufhandle); + scanbufhandle = NULL; +} diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c new file mode 100644 index 0000000..feff709 --- /dev/null +++ b/src/backend/replication/walreceiver.c @@ -0,0 +1,1525 @@ +/*------------------------------------------------------------------------- + * + * walreceiver.c + * + * The WAL receiver process (walreceiver) is new as of Postgres 9.0. It + * is the process in the standby server that takes charge of receiving + * XLOG records from a primary server during streaming replication. + * + * When the startup process determines that it's time to start streaming, + * it instructs postmaster to start walreceiver. Walreceiver first connects + * to the primary server (it will be served by a walsender process + * in the primary server), and then keeps receiving XLOG records and + * writing them to the disk as long as the connection is alive. As XLOG + * records are received and flushed to disk, it updates the + * WalRcv->flushedUpto variable in shared memory, to inform the startup + * process of how far it can proceed with XLOG replay. + * + * A WAL receiver cannot directly load GUC parameters used when establishing + * its connection to the primary. Instead it relies on parameter values + * that are passed down by the startup process when streaming is requested. + * This applies, for example, to the replication slot and the connection + * string to be used for the connection with the primary. + * + * If the primary server ends streaming, but doesn't disconnect, walreceiver + * goes into "waiting" mode, and waits for the startup process to give new + * instructions. The startup process will treat that the same as + * disconnection, and will rescan the archive/pg_wal directory. But when the + * startup process wants to try streaming replication again, it will just + * nudge the existing walreceiver process that's waiting, instead of launching + * a new one. + * + * Normal termination is by SIGTERM, which instructs the walreceiver to + * exit(0). Emergency termination is by SIGQUIT; like any postmaster child + * process, the walreceiver will simply abort and exit on SIGQUIT. A close + * of the connection and a FATAL error are treated not as a crash but as + * normal operation. + * + * This file contains the server-facing parts of walreceiver. The libpq- + * specific parts are in the libpqwalreceiver module. It's loaded + * dynamically to avoid linking the server with libpq. + * + * Portions Copyright (c) 2010-2023, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/replication/walreceiver.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <unistd.h> + +#include "access/htup_details.h" +#include "access/timeline.h" +#include "access/transam.h" +#include "access/xlog_internal.h" +#include "access/xlogarchive.h" +#include "access/xlogrecovery.h" +#include "catalog/pg_authid.h" +#include "catalog/pg_type.h" +#include "common/ip.h" +#include "funcapi.h" +#include "libpq/pqformat.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/interrupt.h" +#include "replication/walreceiver.h" +#include "replication/walsender.h" +#include "storage/ipc.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/procsignal.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/pg_lsn.h" +#include "utils/ps_status.h" +#include "utils/resowner.h" +#include "utils/timestamp.h" + + +/* + * GUC variables. (Other variables that affect walreceiver are in xlog.c + * because they're passed down from the startup process, for better + * synchronization.) + */ +int wal_receiver_status_interval; +int wal_receiver_timeout; +bool hot_standby_feedback; + +/* libpqwalreceiver connection */ +static WalReceiverConn *wrconn = NULL; +WalReceiverFunctionsType *WalReceiverFunctions = NULL; + +/* + * These variables are used similarly to openLogFile/SegNo, + * but for walreceiver to write the XLOG. recvFileTLI is the TimeLineID + * corresponding the filename of recvFile. + */ +static int recvFile = -1; +static TimeLineID recvFileTLI = 0; +static XLogSegNo recvSegNo = 0; + +/* + * LogstreamResult indicates the byte positions that we have already + * written/fsynced. + */ +static struct +{ + XLogRecPtr Write; /* last byte + 1 written out in the standby */ + XLogRecPtr Flush; /* last byte + 1 flushed in the standby */ +} LogstreamResult; + +/* + * Reasons to wake up and perform periodic tasks. + */ +typedef enum WalRcvWakeupReason +{ + WALRCV_WAKEUP_TERMINATE, + WALRCV_WAKEUP_PING, + WALRCV_WAKEUP_REPLY, + WALRCV_WAKEUP_HSFEEDBACK +#define NUM_WALRCV_WAKEUPS (WALRCV_WAKEUP_HSFEEDBACK + 1) +} WalRcvWakeupReason; + +/* + * Wake up times for periodic tasks. + */ +static TimestampTz wakeup[NUM_WALRCV_WAKEUPS]; + +static StringInfoData reply_message; +static StringInfoData incoming_message; + +/* Prototypes for private functions */ +static void WalRcvFetchTimeLineHistoryFiles(TimeLineID first, TimeLineID last); +static void WalRcvWaitForStartPosition(XLogRecPtr *startpoint, TimeLineID *startpointTLI); +static void WalRcvDie(int code, Datum arg); +static void XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len, + TimeLineID tli); +static void XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr, + TimeLineID tli); +static void XLogWalRcvFlush(bool dying, TimeLineID tli); +static void XLogWalRcvClose(XLogRecPtr recptr, TimeLineID tli); +static void XLogWalRcvSendReply(bool force, bool requestReply); +static void XLogWalRcvSendHSFeedback(bool immed); +static void ProcessWalSndrMessage(XLogRecPtr walEnd, TimestampTz sendTime); +static void WalRcvComputeNextWakeup(WalRcvWakeupReason reason, TimestampTz now); + +/* + * Process any interrupts the walreceiver process may have received. + * This should be called any time the process's latch has become set. + * + * Currently, only SIGTERM is of interest. We can't just exit(1) within the + * SIGTERM signal handler, because the signal might arrive in the middle of + * some critical operation, like while we're holding a spinlock. Instead, the + * signal handler sets a flag variable as well as setting the process's latch. + * We must check the flag (by calling ProcessWalRcvInterrupts) anytime the + * latch has become set. Operations that could block for a long time, such as + * reading from a remote server, must pay attention to the latch too; see + * libpqrcv_PQgetResult for example. + */ +void +ProcessWalRcvInterrupts(void) +{ + /* + * Although walreceiver interrupt handling doesn't use the same scheme as + * regular backends, call CHECK_FOR_INTERRUPTS() to make sure we receive + * any incoming signals on Win32, and also to make sure we process any + * barrier events. + */ + CHECK_FOR_INTERRUPTS(); + + if (ShutdownRequestPending) + { + ereport(FATAL, + (errcode(ERRCODE_ADMIN_SHUTDOWN), + errmsg("terminating walreceiver process due to administrator command"))); + } +} + + +/* Main entry point for walreceiver process */ +void +WalReceiverMain(void) +{ + char conninfo[MAXCONNINFO]; + char *tmp_conninfo; + char slotname[NAMEDATALEN]; + bool is_temp_slot; + XLogRecPtr startpoint; + TimeLineID startpointTLI; + TimeLineID primaryTLI; + bool first_stream; + WalRcvData *walrcv = WalRcv; + TimestampTz now; + char *err; + char *sender_host = NULL; + int sender_port = 0; + + /* + * WalRcv should be set up already (if we are a backend, we inherit this + * by fork() or EXEC_BACKEND mechanism from the postmaster). + */ + Assert(walrcv != NULL); + + /* + * Mark walreceiver as running in shared memory. + * + * Do this as early as possible, so that if we fail later on, we'll set + * state to STOPPED. If we die before this, the startup process will keep + * waiting for us to start up, until it times out. + */ + SpinLockAcquire(&walrcv->mutex); + Assert(walrcv->pid == 0); + switch (walrcv->walRcvState) + { + case WALRCV_STOPPING: + /* If we've already been requested to stop, don't start up. */ + walrcv->walRcvState = WALRCV_STOPPED; + /* fall through */ + + case WALRCV_STOPPED: + SpinLockRelease(&walrcv->mutex); + ConditionVariableBroadcast(&walrcv->walRcvStoppedCV); + proc_exit(1); + break; + + case WALRCV_STARTING: + /* The usual case */ + break; + + case WALRCV_WAITING: + case WALRCV_STREAMING: + case WALRCV_RESTARTING: + default: + /* Shouldn't happen */ + SpinLockRelease(&walrcv->mutex); + elog(PANIC, "walreceiver still running according to shared memory state"); + } + /* Advertise our PID so that the startup process can kill us */ + walrcv->pid = MyProcPid; + walrcv->walRcvState = WALRCV_STREAMING; + + /* Fetch information required to start streaming */ + walrcv->ready_to_display = false; + strlcpy(conninfo, (char *) walrcv->conninfo, MAXCONNINFO); + strlcpy(slotname, (char *) walrcv->slotname, NAMEDATALEN); + is_temp_slot = walrcv->is_temp_slot; + startpoint = walrcv->receiveStart; + startpointTLI = walrcv->receiveStartTLI; + + /* + * At most one of is_temp_slot and slotname can be set; otherwise, + * RequestXLogStreaming messed up. + */ + Assert(!is_temp_slot || (slotname[0] == '\0')); + + /* Initialise to a sanish value */ + now = GetCurrentTimestamp(); + walrcv->lastMsgSendTime = + walrcv->lastMsgReceiptTime = walrcv->latestWalEndTime = now; + + /* Report the latch to use to awaken this process */ + walrcv->latch = &MyProc->procLatch; + + SpinLockRelease(&walrcv->mutex); + + pg_atomic_write_u64(&WalRcv->writtenUpto, 0); + + /* Arrange to clean up at walreceiver exit */ + on_shmem_exit(WalRcvDie, PointerGetDatum(&startpointTLI)); + + /* Properly accept or ignore signals the postmaster might send us */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); /* set flag to read config + * file */ + pqsignal(SIGINT, SIG_IGN); + pqsignal(SIGTERM, SignalHandlerForShutdownRequest); /* request shutdown */ + /* SIGQUIT handler was already set up by InitPostmasterChild */ + pqsignal(SIGALRM, SIG_IGN); + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, SIG_IGN); + + /* Reset some signals that are accepted by postmaster but not here */ + pqsignal(SIGCHLD, SIG_DFL); + + /* Load the libpq-specific functions */ + load_file("libpqwalreceiver", false); + if (WalReceiverFunctions == NULL) + elog(ERROR, "libpqwalreceiver didn't initialize correctly"); + + /* Unblock signals (they were blocked when the postmaster forked us) */ + sigprocmask(SIG_SETMASK, &UnBlockSig, NULL); + + /* Establish the connection to the primary for XLOG streaming */ + wrconn = walrcv_connect(conninfo, false, false, + cluster_name[0] ? cluster_name : "walreceiver", + &err); + if (!wrconn) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not connect to the primary server: %s", err))); + + /* + * Save user-visible connection string. This clobbers the original + * conninfo, for security. Also save host and port of the sender server + * this walreceiver is connected to. + */ + tmp_conninfo = walrcv_get_conninfo(wrconn); + walrcv_get_senderinfo(wrconn, &sender_host, &sender_port); + SpinLockAcquire(&walrcv->mutex); + memset(walrcv->conninfo, 0, MAXCONNINFO); + if (tmp_conninfo) + strlcpy((char *) walrcv->conninfo, tmp_conninfo, MAXCONNINFO); + + memset(walrcv->sender_host, 0, NI_MAXHOST); + if (sender_host) + strlcpy((char *) walrcv->sender_host, sender_host, NI_MAXHOST); + + walrcv->sender_port = sender_port; + walrcv->ready_to_display = true; + SpinLockRelease(&walrcv->mutex); + + if (tmp_conninfo) + pfree(tmp_conninfo); + + if (sender_host) + pfree(sender_host); + + first_stream = true; + for (;;) + { + char *primary_sysid; + char standby_sysid[32]; + WalRcvStreamOptions options; + + /* + * Check that we're connected to a valid server using the + * IDENTIFY_SYSTEM replication command. + */ + primary_sysid = walrcv_identify_system(wrconn, &primaryTLI); + + snprintf(standby_sysid, sizeof(standby_sysid), UINT64_FORMAT, + GetSystemIdentifier()); + if (strcmp(primary_sysid, standby_sysid) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("database system identifier differs between the primary and standby"), + errdetail("The primary's identifier is %s, the standby's identifier is %s.", + primary_sysid, standby_sysid))); + } + + /* + * Confirm that the current timeline of the primary is the same or + * ahead of ours. + */ + if (primaryTLI < startpointTLI) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("highest timeline %u of the primary is behind recovery timeline %u", + primaryTLI, startpointTLI))); + + /* + * Get any missing history files. We do this always, even when we're + * not interested in that timeline, so that if we're promoted to + * become the primary later on, we don't select the same timeline that + * was already used in the current primary. This isn't bullet-proof - + * you'll need some external software to manage your cluster if you + * need to ensure that a unique timeline id is chosen in every case, + * but let's avoid the confusion of timeline id collisions where we + * can. + */ + WalRcvFetchTimeLineHistoryFiles(startpointTLI, primaryTLI); + + /* + * Create temporary replication slot if requested, and update slot + * name in shared memory. (Note the slot name cannot already be set + * in this case.) + */ + if (is_temp_slot) + { + snprintf(slotname, sizeof(slotname), + "pg_walreceiver_%lld", + (long long int) walrcv_get_backend_pid(wrconn)); + + walrcv_create_slot(wrconn, slotname, true, false, 0, NULL); + + SpinLockAcquire(&walrcv->mutex); + strlcpy(walrcv->slotname, slotname, NAMEDATALEN); + SpinLockRelease(&walrcv->mutex); + } + + /* + * Start streaming. + * + * We'll try to start at the requested starting point and timeline, + * even if it's different from the server's latest timeline. In case + * we've already reached the end of the old timeline, the server will + * finish the streaming immediately, and we will go back to await + * orders from the startup process. If recovery_target_timeline is + * 'latest', the startup process will scan pg_wal and find the new + * history file, bump recovery target timeline, and ask us to restart + * on the new timeline. + */ + options.logical = false; + options.startpoint = startpoint; + options.slotname = slotname[0] != '\0' ? slotname : NULL; + options.proto.physical.startpointTLI = startpointTLI; + if (walrcv_startstreaming(wrconn, &options)) + { + if (first_stream) + ereport(LOG, + (errmsg("started streaming WAL from primary at %X/%X on timeline %u", + LSN_FORMAT_ARGS(startpoint), startpointTLI))); + else + ereport(LOG, + (errmsg("restarted WAL streaming at %X/%X on timeline %u", + LSN_FORMAT_ARGS(startpoint), startpointTLI))); + first_stream = false; + + /* Initialize LogstreamResult and buffers for processing messages */ + LogstreamResult.Write = LogstreamResult.Flush = GetXLogReplayRecPtr(NULL); + initStringInfo(&reply_message); + initStringInfo(&incoming_message); + + /* Initialize nap wakeup times. */ + now = GetCurrentTimestamp(); + for (int i = 0; i < NUM_WALRCV_WAKEUPS; ++i) + WalRcvComputeNextWakeup(i, now); + + /* Send initial reply/feedback messages. */ + XLogWalRcvSendReply(true, false); + XLogWalRcvSendHSFeedback(true); + + /* Loop until end-of-streaming or error */ + for (;;) + { + char *buf; + int len; + bool endofwal = false; + pgsocket wait_fd = PGINVALID_SOCKET; + int rc; + TimestampTz nextWakeup; + long nap; + + /* + * Exit walreceiver if we're not in recovery. This should not + * happen, but cross-check the status here. + */ + if (!RecoveryInProgress()) + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot continue WAL streaming, recovery has already ended"))); + + /* Process any requests or signals received recently */ + ProcessWalRcvInterrupts(); + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + /* recompute wakeup times */ + now = GetCurrentTimestamp(); + for (int i = 0; i < NUM_WALRCV_WAKEUPS; ++i) + WalRcvComputeNextWakeup(i, now); + XLogWalRcvSendHSFeedback(true); + } + + /* See if we can read data immediately */ + len = walrcv_receive(wrconn, &buf, &wait_fd); + if (len != 0) + { + /* + * Process the received data, and any subsequent data we + * can read without blocking. + */ + for (;;) + { + if (len > 0) + { + /* + * Something was received from primary, so adjust + * the ping and terminate wakeup times. + */ + now = GetCurrentTimestamp(); + WalRcvComputeNextWakeup(WALRCV_WAKEUP_TERMINATE, + now); + WalRcvComputeNextWakeup(WALRCV_WAKEUP_PING, now); + XLogWalRcvProcessMsg(buf[0], &buf[1], len - 1, + startpointTLI); + } + else if (len == 0) + break; + else if (len < 0) + { + ereport(LOG, + (errmsg("replication terminated by primary server"), + errdetail("End of WAL reached on timeline %u at %X/%X.", + startpointTLI, + LSN_FORMAT_ARGS(LogstreamResult.Write)))); + endofwal = true; + break; + } + len = walrcv_receive(wrconn, &buf, &wait_fd); + } + + /* Let the primary know that we received some data. */ + XLogWalRcvSendReply(false, false); + + /* + * If we've written some records, flush them to disk and + * let the startup process and primary server know about + * them. + */ + XLogWalRcvFlush(false, startpointTLI); + } + + /* Check if we need to exit the streaming loop. */ + if (endofwal) + break; + + /* Find the soonest wakeup time, to limit our nap. */ + nextWakeup = TIMESTAMP_INFINITY; + for (int i = 0; i < NUM_WALRCV_WAKEUPS; ++i) + nextWakeup = Min(wakeup[i], nextWakeup); + + /* Calculate the nap time, clamping as necessary. */ + now = GetCurrentTimestamp(); + nap = TimestampDifferenceMilliseconds(now, nextWakeup); + + /* + * Ideally we would reuse a WaitEventSet object repeatedly + * here to avoid the overheads of WaitLatchOrSocket on epoll + * systems, but we can't be sure that libpq (or any other + * walreceiver implementation) has the same socket (even if + * the fd is the same number, it may have been closed and + * reopened since the last time). In future, if there is a + * function for removing sockets from WaitEventSet, then we + * could add and remove just the socket each time, potentially + * avoiding some system calls. + */ + Assert(wait_fd != PGINVALID_SOCKET); + rc = WaitLatchOrSocket(MyLatch, + WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE | + WL_TIMEOUT | WL_LATCH_SET, + wait_fd, + nap, + WAIT_EVENT_WAL_RECEIVER_MAIN); + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + ProcessWalRcvInterrupts(); + + if (walrcv->force_reply) + { + /* + * The recovery process has asked us to send apply + * feedback now. Make sure the flag is really set to + * false in shared memory before sending the reply, so + * we don't miss a new request for a reply. + */ + walrcv->force_reply = false; + pg_memory_barrier(); + XLogWalRcvSendReply(true, false); + } + } + if (rc & WL_TIMEOUT) + { + /* + * We didn't receive anything new. If we haven't heard + * anything from the server for more than + * wal_receiver_timeout / 2, ping the server. Also, if + * it's been longer than wal_receiver_status_interval + * since the last update we sent, send a status update to + * the primary anyway, to report any progress in applying + * WAL. + */ + bool requestReply = false; + + /* + * Check if time since last receive from primary has + * reached the configured limit. + */ + now = GetCurrentTimestamp(); + if (now >= wakeup[WALRCV_WAKEUP_TERMINATE]) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("terminating walreceiver due to timeout"))); + + /* + * If we didn't receive anything new for half of receiver + * replication timeout, then ping the server. + */ + if (now >= wakeup[WALRCV_WAKEUP_PING]) + { + requestReply = true; + wakeup[WALRCV_WAKEUP_PING] = TIMESTAMP_INFINITY; + } + + XLogWalRcvSendReply(requestReply, requestReply); + XLogWalRcvSendHSFeedback(false); + } + } + + /* + * The backend finished streaming. Exit streaming COPY-mode from + * our side, too. + */ + walrcv_endstreaming(wrconn, &primaryTLI); + + /* + * If the server had switched to a new timeline that we didn't + * know about when we began streaming, fetch its timeline history + * file now. + */ + WalRcvFetchTimeLineHistoryFiles(startpointTLI, primaryTLI); + } + else + ereport(LOG, + (errmsg("primary server contains no more WAL on requested timeline %u", + startpointTLI))); + + /* + * End of WAL reached on the requested timeline. Close the last + * segment, and await for new orders from the startup process. + */ + if (recvFile >= 0) + { + char xlogfname[MAXFNAMELEN]; + + XLogWalRcvFlush(false, startpointTLI); + XLogFileName(xlogfname, recvFileTLI, recvSegNo, wal_segment_size); + if (close(recvFile) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close WAL segment %s: %m", + xlogfname))); + + /* + * Create .done file forcibly to prevent the streamed segment from + * being archived later. + */ + if (XLogArchiveMode != ARCHIVE_MODE_ALWAYS) + XLogArchiveForceDone(xlogfname); + else + XLogArchiveNotify(xlogfname); + } + recvFile = -1; + + elog(DEBUG1, "walreceiver ended streaming and awaits new instructions"); + WalRcvWaitForStartPosition(&startpoint, &startpointTLI); + } + /* not reached */ +} + +/* + * Wait for startup process to set receiveStart and receiveStartTLI. + */ +static void +WalRcvWaitForStartPosition(XLogRecPtr *startpoint, TimeLineID *startpointTLI) +{ + WalRcvData *walrcv = WalRcv; + int state; + + SpinLockAcquire(&walrcv->mutex); + state = walrcv->walRcvState; + if (state != WALRCV_STREAMING) + { + SpinLockRelease(&walrcv->mutex); + if (state == WALRCV_STOPPING) + proc_exit(0); + else + elog(FATAL, "unexpected walreceiver state"); + } + walrcv->walRcvState = WALRCV_WAITING; + walrcv->receiveStart = InvalidXLogRecPtr; + walrcv->receiveStartTLI = 0; + SpinLockRelease(&walrcv->mutex); + + set_ps_display("idle"); + + /* + * nudge startup process to notice that we've stopped streaming and are + * now waiting for instructions. + */ + WakeupRecovery(); + for (;;) + { + ResetLatch(MyLatch); + + ProcessWalRcvInterrupts(); + + SpinLockAcquire(&walrcv->mutex); + Assert(walrcv->walRcvState == WALRCV_RESTARTING || + walrcv->walRcvState == WALRCV_WAITING || + walrcv->walRcvState == WALRCV_STOPPING); + if (walrcv->walRcvState == WALRCV_RESTARTING) + { + /* + * No need to handle changes in primary_conninfo or + * primary_slot_name here. Startup process will signal us to + * terminate in case those change. + */ + *startpoint = walrcv->receiveStart; + *startpointTLI = walrcv->receiveStartTLI; + walrcv->walRcvState = WALRCV_STREAMING; + SpinLockRelease(&walrcv->mutex); + break; + } + if (walrcv->walRcvState == WALRCV_STOPPING) + { + /* + * We should've received SIGTERM if the startup process wants us + * to die, but might as well check it here too. + */ + SpinLockRelease(&walrcv->mutex); + exit(1); + } + SpinLockRelease(&walrcv->mutex); + + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0, + WAIT_EVENT_WAL_RECEIVER_WAIT_START); + } + + if (update_process_title) + { + char activitymsg[50]; + + snprintf(activitymsg, sizeof(activitymsg), "restarting at %X/%X", + LSN_FORMAT_ARGS(*startpoint)); + set_ps_display(activitymsg); + } +} + +/* + * Fetch any missing timeline history files between 'first' and 'last' + * (inclusive) from the server. + */ +static void +WalRcvFetchTimeLineHistoryFiles(TimeLineID first, TimeLineID last) +{ + TimeLineID tli; + + for (tli = first; tli <= last; tli++) + { + /* there's no history file for timeline 1 */ + if (tli != 1 && !existsTimeLineHistory(tli)) + { + char *fname; + char *content; + int len; + char expectedfname[MAXFNAMELEN]; + + ereport(LOG, + (errmsg("fetching timeline history file for timeline %u from primary server", + tli))); + + walrcv_readtimelinehistoryfile(wrconn, tli, &fname, &content, &len); + + /* + * Check that the filename on the primary matches what we + * calculated ourselves. This is just a sanity check, it should + * always match. + */ + TLHistoryFileName(expectedfname, tli); + if (strcmp(fname, expectedfname) != 0) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("primary reported unexpected file name for timeline history file of timeline %u", + tli))); + + /* + * Write the file to pg_wal. + */ + writeTimeLineHistoryFile(tli, content, len); + + /* + * Mark the streamed history file as ready for archiving if + * archive_mode is always. + */ + if (XLogArchiveMode != ARCHIVE_MODE_ALWAYS) + XLogArchiveForceDone(fname); + else + XLogArchiveNotify(fname); + + pfree(fname); + pfree(content); + } + } +} + +/* + * Mark us as STOPPED in shared memory at exit. + */ +static void +WalRcvDie(int code, Datum arg) +{ + WalRcvData *walrcv = WalRcv; + TimeLineID *startpointTLI_p = (TimeLineID *) DatumGetPointer(arg); + + Assert(*startpointTLI_p != 0); + + /* Ensure that all WAL records received are flushed to disk */ + XLogWalRcvFlush(true, *startpointTLI_p); + + /* Mark ourselves inactive in shared memory */ + SpinLockAcquire(&walrcv->mutex); + Assert(walrcv->walRcvState == WALRCV_STREAMING || + walrcv->walRcvState == WALRCV_RESTARTING || + walrcv->walRcvState == WALRCV_STARTING || + walrcv->walRcvState == WALRCV_WAITING || + walrcv->walRcvState == WALRCV_STOPPING); + Assert(walrcv->pid == MyProcPid); + walrcv->walRcvState = WALRCV_STOPPED; + walrcv->pid = 0; + walrcv->ready_to_display = false; + walrcv->latch = NULL; + SpinLockRelease(&walrcv->mutex); + + ConditionVariableBroadcast(&walrcv->walRcvStoppedCV); + + /* Terminate the connection gracefully. */ + if (wrconn != NULL) + walrcv_disconnect(wrconn); + + /* Wake up the startup process to notice promptly that we're gone */ + WakeupRecovery(); +} + +/* + * Accept the message from XLOG stream, and process it. + */ +static void +XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len, TimeLineID tli) +{ + int hdrlen; + XLogRecPtr dataStart; + XLogRecPtr walEnd; + TimestampTz sendTime; + bool replyRequested; + + resetStringInfo(&incoming_message); + + switch (type) + { + case 'w': /* WAL records */ + { + /* copy message to StringInfo */ + hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64); + if (len < hdrlen) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("invalid WAL message received from primary"))); + appendBinaryStringInfo(&incoming_message, buf, hdrlen); + + /* read the fields */ + dataStart = pq_getmsgint64(&incoming_message); + walEnd = pq_getmsgint64(&incoming_message); + sendTime = pq_getmsgint64(&incoming_message); + ProcessWalSndrMessage(walEnd, sendTime); + + buf += hdrlen; + len -= hdrlen; + XLogWalRcvWrite(buf, len, dataStart, tli); + break; + } + case 'k': /* Keepalive */ + { + /* copy message to StringInfo */ + hdrlen = sizeof(int64) + sizeof(int64) + sizeof(char); + if (len != hdrlen) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("invalid keepalive message received from primary"))); + appendBinaryStringInfo(&incoming_message, buf, hdrlen); + + /* read the fields */ + walEnd = pq_getmsgint64(&incoming_message); + sendTime = pq_getmsgint64(&incoming_message); + replyRequested = pq_getmsgbyte(&incoming_message); + + ProcessWalSndrMessage(walEnd, sendTime); + + /* If the primary requested a reply, send one immediately */ + if (replyRequested) + XLogWalRcvSendReply(true, false); + break; + } + default: + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg_internal("invalid replication message type %d", + type))); + } +} + +/* + * Write XLOG data to disk. + */ +static void +XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr, TimeLineID tli) +{ + int startoff; + int byteswritten; + + Assert(tli != 0); + + while (nbytes > 0) + { + int segbytes; + + /* Close the current segment if it's completed */ + if (recvFile >= 0 && !XLByteInSeg(recptr, recvSegNo, wal_segment_size)) + XLogWalRcvClose(recptr, tli); + + if (recvFile < 0) + { + /* Create/use new log file */ + XLByteToSeg(recptr, recvSegNo, wal_segment_size); + recvFile = XLogFileInit(recvSegNo, tli); + recvFileTLI = tli; + } + + /* Calculate the start offset of the received logs */ + startoff = XLogSegmentOffset(recptr, wal_segment_size); + + if (startoff + nbytes > wal_segment_size) + segbytes = wal_segment_size - startoff; + else + segbytes = nbytes; + + /* OK to write the logs */ + errno = 0; + + byteswritten = pg_pwrite(recvFile, buf, segbytes, (off_t) startoff); + if (byteswritten <= 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno; + + /* if write didn't set errno, assume no disk space */ + if (errno == 0) + errno = ENOSPC; + + save_errno = errno; + XLogFileName(xlogfname, recvFileTLI, recvSegNo, wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to WAL segment %s " + "at offset %u, length %lu: %m", + xlogfname, startoff, (unsigned long) segbytes))); + } + + /* Update state for write */ + recptr += byteswritten; + + nbytes -= byteswritten; + buf += byteswritten; + + LogstreamResult.Write = recptr; + } + + /* Update shared-memory status */ + pg_atomic_write_u64(&WalRcv->writtenUpto, LogstreamResult.Write); + + /* + * Close the current segment if it's fully written up in the last cycle of + * the loop, to create its archive notification file soon. Otherwise WAL + * archiving of the segment will be delayed until any data in the next + * segment is received and written. + */ + if (recvFile >= 0 && !XLByteInSeg(recptr, recvSegNo, wal_segment_size)) + XLogWalRcvClose(recptr, tli); +} + +/* + * Flush the log to disk. + * + * If we're in the midst of dying, it's unwise to do anything that might throw + * an error, so we skip sending a reply in that case. + */ +static void +XLogWalRcvFlush(bool dying, TimeLineID tli) +{ + Assert(tli != 0); + + if (LogstreamResult.Flush < LogstreamResult.Write) + { + WalRcvData *walrcv = WalRcv; + + issue_xlog_fsync(recvFile, recvSegNo, tli); + + LogstreamResult.Flush = LogstreamResult.Write; + + /* Update shared-memory status */ + SpinLockAcquire(&walrcv->mutex); + if (walrcv->flushedUpto < LogstreamResult.Flush) + { + walrcv->latestChunkStart = walrcv->flushedUpto; + walrcv->flushedUpto = LogstreamResult.Flush; + walrcv->receivedTLI = tli; + } + SpinLockRelease(&walrcv->mutex); + + /* Signal the startup process and walsender that new WAL has arrived */ + WakeupRecovery(); + if (AllowCascadeReplication()) + WalSndWakeup(true, false); + + /* Report XLOG streaming progress in PS display */ + if (update_process_title) + { + char activitymsg[50]; + + snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X", + LSN_FORMAT_ARGS(LogstreamResult.Write)); + set_ps_display(activitymsg); + } + + /* Also let the primary know that we made some progress */ + if (!dying) + { + XLogWalRcvSendReply(false, false); + XLogWalRcvSendHSFeedback(false); + } + } +} + +/* + * Close the current segment. + * + * Flush the segment to disk before closing it. Otherwise we have to + * reopen and fsync it later. + * + * Create an archive notification file since the segment is known completed. + */ +static void +XLogWalRcvClose(XLogRecPtr recptr, TimeLineID tli) +{ + char xlogfname[MAXFNAMELEN]; + + Assert(recvFile >= 0 && !XLByteInSeg(recptr, recvSegNo, wal_segment_size)); + Assert(tli != 0); + + /* + * fsync() and close current file before we switch to next one. We would + * otherwise have to reopen this file to fsync it later + */ + XLogWalRcvFlush(false, tli); + + XLogFileName(xlogfname, recvFileTLI, recvSegNo, wal_segment_size); + + /* + * XLOG segment files will be re-read by recovery in startup process soon, + * so we don't advise the OS to release cache pages associated with the + * file like XLogFileClose() does. + */ + if (close(recvFile) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close WAL segment %s: %m", + xlogfname))); + + /* + * Create .done file forcibly to prevent the streamed segment from being + * archived later. + */ + if (XLogArchiveMode != ARCHIVE_MODE_ALWAYS) + XLogArchiveForceDone(xlogfname); + else + XLogArchiveNotify(xlogfname); + + recvFile = -1; +} + +/* + * Send reply message to primary, indicating our current WAL locations, oldest + * xmin and the current time. + * + * If 'force' is not set, the message is only sent if enough time has + * passed since last status update to reach wal_receiver_status_interval. + * If wal_receiver_status_interval is disabled altogether and 'force' is + * false, this is a no-op. + * + * If 'requestReply' is true, requests the server to reply immediately upon + * receiving this message. This is used for heartbeats, when approaching + * wal_receiver_timeout. + */ +static void +XLogWalRcvSendReply(bool force, bool requestReply) +{ + static XLogRecPtr writePtr = 0; + static XLogRecPtr flushPtr = 0; + XLogRecPtr applyPtr; + TimestampTz now; + + /* + * If the user doesn't want status to be reported to the primary, be sure + * to exit before doing anything at all. + */ + if (!force && wal_receiver_status_interval <= 0) + return; + + /* Get current timestamp. */ + now = GetCurrentTimestamp(); + + /* + * We can compare the write and flush positions to the last message we + * sent without taking any lock, but the apply position requires a spin + * lock, so we don't check that unless something else has changed or 10 + * seconds have passed. This means that the apply WAL location will + * appear, from the primary's point of view, to lag slightly, but since + * this is only for reporting purposes and only on idle systems, that's + * probably OK. + */ + if (!force + && writePtr == LogstreamResult.Write + && flushPtr == LogstreamResult.Flush + && now < wakeup[WALRCV_WAKEUP_REPLY]) + return; + + /* Make sure we wake up when it's time to send another reply. */ + WalRcvComputeNextWakeup(WALRCV_WAKEUP_REPLY, now); + + /* Construct a new message */ + writePtr = LogstreamResult.Write; + flushPtr = LogstreamResult.Flush; + applyPtr = GetXLogReplayRecPtr(NULL); + + resetStringInfo(&reply_message); + pq_sendbyte(&reply_message, 'r'); + pq_sendint64(&reply_message, writePtr); + pq_sendint64(&reply_message, flushPtr); + pq_sendint64(&reply_message, applyPtr); + pq_sendint64(&reply_message, GetCurrentTimestamp()); + pq_sendbyte(&reply_message, requestReply ? 1 : 0); + + /* Send it */ + elog(DEBUG2, "sending write %X/%X flush %X/%X apply %X/%X%s", + LSN_FORMAT_ARGS(writePtr), + LSN_FORMAT_ARGS(flushPtr), + LSN_FORMAT_ARGS(applyPtr), + requestReply ? " (reply requested)" : ""); + + walrcv_send(wrconn, reply_message.data, reply_message.len); +} + +/* + * Send hot standby feedback message to primary, plus the current time, + * in case they don't have a watch. + * + * If the user disables feedback, send one final message to tell sender + * to forget about the xmin on this standby. We also send this message + * on first connect because a previous connection might have set xmin + * on a replication slot. (If we're not using a slot it's harmless to + * send a feedback message explicitly setting InvalidTransactionId). + */ +static void +XLogWalRcvSendHSFeedback(bool immed) +{ + TimestampTz now; + FullTransactionId nextFullXid; + TransactionId nextXid; + uint32 xmin_epoch, + catalog_xmin_epoch; + TransactionId xmin, + catalog_xmin; + + /* initially true so we always send at least one feedback message */ + static bool primary_has_standby_xmin = true; + + /* + * If the user doesn't want status to be reported to the primary, be sure + * to exit before doing anything at all. + */ + if ((wal_receiver_status_interval <= 0 || !hot_standby_feedback) && + !primary_has_standby_xmin) + return; + + /* Get current timestamp. */ + now = GetCurrentTimestamp(); + + /* Send feedback at most once per wal_receiver_status_interval. */ + if (!immed && now < wakeup[WALRCV_WAKEUP_HSFEEDBACK]) + return; + + /* Make sure we wake up when it's time to send feedback again. */ + WalRcvComputeNextWakeup(WALRCV_WAKEUP_HSFEEDBACK, now); + + /* + * If Hot Standby is not yet accepting connections there is nothing to + * send. Check this after the interval has expired to reduce number of + * calls. + * + * Bailing out here also ensures that we don't send feedback until we've + * read our own replication slot state, so we don't tell the primary to + * discard needed xmin or catalog_xmin from any slots that may exist on + * this replica. + */ + if (!HotStandbyActive()) + return; + + /* + * Make the expensive call to get the oldest xmin once we are certain + * everything else has been checked. + */ + if (hot_standby_feedback) + { + GetReplicationHorizons(&xmin, &catalog_xmin); + } + else + { + xmin = InvalidTransactionId; + catalog_xmin = InvalidTransactionId; + } + + /* + * Get epoch and adjust if nextXid and oldestXmin are different sides of + * the epoch boundary. + */ + nextFullXid = ReadNextFullTransactionId(); + nextXid = XidFromFullTransactionId(nextFullXid); + xmin_epoch = EpochFromFullTransactionId(nextFullXid); + catalog_xmin_epoch = xmin_epoch; + if (nextXid < xmin) + xmin_epoch--; + if (nextXid < catalog_xmin) + catalog_xmin_epoch--; + + elog(DEBUG2, "sending hot standby feedback xmin %u epoch %u catalog_xmin %u catalog_xmin_epoch %u", + xmin, xmin_epoch, catalog_xmin, catalog_xmin_epoch); + + /* Construct the message and send it. */ + resetStringInfo(&reply_message); + pq_sendbyte(&reply_message, 'h'); + pq_sendint64(&reply_message, GetCurrentTimestamp()); + pq_sendint32(&reply_message, xmin); + pq_sendint32(&reply_message, xmin_epoch); + pq_sendint32(&reply_message, catalog_xmin); + pq_sendint32(&reply_message, catalog_xmin_epoch); + walrcv_send(wrconn, reply_message.data, reply_message.len); + if (TransactionIdIsValid(xmin) || TransactionIdIsValid(catalog_xmin)) + primary_has_standby_xmin = true; + else + primary_has_standby_xmin = false; +} + +/* + * Update shared memory status upon receiving a message from primary. + * + * 'walEnd' and 'sendTime' are the end-of-WAL and timestamp of the latest + * message, reported by primary. + */ +static void +ProcessWalSndrMessage(XLogRecPtr walEnd, TimestampTz sendTime) +{ + WalRcvData *walrcv = WalRcv; + TimestampTz lastMsgReceiptTime = GetCurrentTimestamp(); + + /* Update shared-memory status */ + SpinLockAcquire(&walrcv->mutex); + if (walrcv->latestWalEnd < walEnd) + walrcv->latestWalEndTime = sendTime; + walrcv->latestWalEnd = walEnd; + walrcv->lastMsgSendTime = sendTime; + walrcv->lastMsgReceiptTime = lastMsgReceiptTime; + SpinLockRelease(&walrcv->mutex); + + if (message_level_is_interesting(DEBUG2)) + { + char *sendtime; + char *receipttime; + int applyDelay; + + /* Copy because timestamptz_to_str returns a static buffer */ + sendtime = pstrdup(timestamptz_to_str(sendTime)); + receipttime = pstrdup(timestamptz_to_str(lastMsgReceiptTime)); + applyDelay = GetReplicationApplyDelay(); + + /* apply delay is not available */ + if (applyDelay == -1) + elog(DEBUG2, "sendtime %s receipttime %s replication apply delay (N/A) transfer latency %d ms", + sendtime, + receipttime, + GetReplicationTransferLatency()); + else + elog(DEBUG2, "sendtime %s receipttime %s replication apply delay %d ms transfer latency %d ms", + sendtime, + receipttime, + applyDelay, + GetReplicationTransferLatency()); + + pfree(sendtime); + pfree(receipttime); + } +} + +/* + * Compute the next wakeup time for a given wakeup reason. Can be called to + * initialize a wakeup time, to adjust it for the next wakeup, or to + * reinitialize it when GUCs have changed. We ask the caller to pass in the + * value of "now" because this frequently avoids multiple calls of + * GetCurrentTimestamp(). It had better be a reasonably up-to-date value + * though. + */ +static void +WalRcvComputeNextWakeup(WalRcvWakeupReason reason, TimestampTz now) +{ + switch (reason) + { + case WALRCV_WAKEUP_TERMINATE: + if (wal_receiver_timeout <= 0) + wakeup[reason] = TIMESTAMP_INFINITY; + else + wakeup[reason] = TimestampTzPlusMilliseconds(now, wal_receiver_timeout); + break; + case WALRCV_WAKEUP_PING: + if (wal_receiver_timeout <= 0) + wakeup[reason] = TIMESTAMP_INFINITY; + else + wakeup[reason] = TimestampTzPlusMilliseconds(now, wal_receiver_timeout / 2); + break; + case WALRCV_WAKEUP_HSFEEDBACK: + if (!hot_standby_feedback || wal_receiver_status_interval <= 0) + wakeup[reason] = TIMESTAMP_INFINITY; + else + wakeup[reason] = TimestampTzPlusSeconds(now, wal_receiver_status_interval); + break; + case WALRCV_WAKEUP_REPLY: + if (wal_receiver_status_interval <= 0) + wakeup[reason] = TIMESTAMP_INFINITY; + else + wakeup[reason] = TimestampTzPlusSeconds(now, wal_receiver_status_interval); + break; + /* there's intentionally no default: here */ + } +} + +/* + * Wake up the walreceiver main loop. + * + * This is called by the startup process whenever interesting xlog records + * are applied, so that walreceiver can check if it needs to send an apply + * notification back to the primary which may be waiting in a COMMIT with + * synchronous_commit = remote_apply. + */ +void +WalRcvForceReply(void) +{ + Latch *latch; + + WalRcv->force_reply = true; + /* fetching the latch pointer might not be atomic, so use spinlock */ + SpinLockAcquire(&WalRcv->mutex); + latch = WalRcv->latch; + SpinLockRelease(&WalRcv->mutex); + if (latch) + SetLatch(latch); +} + +/* + * Return a string constant representing the state. This is used + * in system functions and views, and should *not* be translated. + */ +static const char * +WalRcvGetStateString(WalRcvState state) +{ + switch (state) + { + case WALRCV_STOPPED: + return "stopped"; + case WALRCV_STARTING: + return "starting"; + case WALRCV_STREAMING: + return "streaming"; + case WALRCV_WAITING: + return "waiting"; + case WALRCV_RESTARTING: + return "restarting"; + case WALRCV_STOPPING: + return "stopping"; + } + return "UNKNOWN"; +} + +/* + * Returns activity of WAL receiver, including pid, state and xlog locations + * received from the WAL sender of another server. + */ +Datum +pg_stat_get_wal_receiver(PG_FUNCTION_ARGS) +{ + TupleDesc tupdesc; + Datum *values; + bool *nulls; + int pid; + bool ready_to_display; + WalRcvState state; + XLogRecPtr receive_start_lsn; + TimeLineID receive_start_tli; + XLogRecPtr written_lsn; + XLogRecPtr flushed_lsn; + TimeLineID received_tli; + TimestampTz last_send_time; + TimestampTz last_receipt_time; + XLogRecPtr latest_end_lsn; + TimestampTz latest_end_time; + char sender_host[NI_MAXHOST]; + int sender_port = 0; + char slotname[NAMEDATALEN]; + char conninfo[MAXCONNINFO]; + + /* Take a lock to ensure value consistency */ + SpinLockAcquire(&WalRcv->mutex); + pid = (int) WalRcv->pid; + ready_to_display = WalRcv->ready_to_display; + state = WalRcv->walRcvState; + receive_start_lsn = WalRcv->receiveStart; + receive_start_tli = WalRcv->receiveStartTLI; + flushed_lsn = WalRcv->flushedUpto; + received_tli = WalRcv->receivedTLI; + last_send_time = WalRcv->lastMsgSendTime; + last_receipt_time = WalRcv->lastMsgReceiptTime; + latest_end_lsn = WalRcv->latestWalEnd; + latest_end_time = WalRcv->latestWalEndTime; + strlcpy(slotname, (char *) WalRcv->slotname, sizeof(slotname)); + strlcpy(sender_host, (char *) WalRcv->sender_host, sizeof(sender_host)); + sender_port = WalRcv->sender_port; + strlcpy(conninfo, (char *) WalRcv->conninfo, sizeof(conninfo)); + SpinLockRelease(&WalRcv->mutex); + + /* + * No WAL receiver (or not ready yet), just return a tuple with NULL + * values + */ + if (pid == 0 || !ready_to_display) + PG_RETURN_NULL(); + + /* + * Read "writtenUpto" without holding a spinlock. Note that it may not be + * consistent with the other shared variables of the WAL receiver + * protected by a spinlock, but this should not be used for data integrity + * checks. + */ + written_lsn = pg_atomic_read_u64(&WalRcv->writtenUpto); + + /* determine result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + values = palloc0(sizeof(Datum) * tupdesc->natts); + nulls = palloc0(sizeof(bool) * tupdesc->natts); + + /* Fetch values */ + values[0] = Int32GetDatum(pid); + + if (!has_privs_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS)) + { + /* + * Only superusers and roles with privileges of pg_read_all_stats can + * see details. Other users only get the pid value to know whether it + * is a WAL receiver, but no details. + */ + memset(&nulls[1], true, sizeof(bool) * (tupdesc->natts - 1)); + } + else + { + values[1] = CStringGetTextDatum(WalRcvGetStateString(state)); + + if (XLogRecPtrIsInvalid(receive_start_lsn)) + nulls[2] = true; + else + values[2] = LSNGetDatum(receive_start_lsn); + values[3] = Int32GetDatum(receive_start_tli); + if (XLogRecPtrIsInvalid(written_lsn)) + nulls[4] = true; + else + values[4] = LSNGetDatum(written_lsn); + if (XLogRecPtrIsInvalid(flushed_lsn)) + nulls[5] = true; + else + values[5] = LSNGetDatum(flushed_lsn); + values[6] = Int32GetDatum(received_tli); + if (last_send_time == 0) + nulls[7] = true; + else + values[7] = TimestampTzGetDatum(last_send_time); + if (last_receipt_time == 0) + nulls[8] = true; + else + values[8] = TimestampTzGetDatum(last_receipt_time); + if (XLogRecPtrIsInvalid(latest_end_lsn)) + nulls[9] = true; + else + values[9] = LSNGetDatum(latest_end_lsn); + if (latest_end_time == 0) + nulls[10] = true; + else + values[10] = TimestampTzGetDatum(latest_end_time); + if (*slotname == '\0') + nulls[11] = true; + else + values[11] = CStringGetTextDatum(slotname); + if (*sender_host == '\0') + nulls[12] = true; + else + values[12] = CStringGetTextDatum(sender_host); + if (sender_port == 0) + nulls[13] = true; + else + values[13] = Int32GetDatum(sender_port); + if (*conninfo == '\0') + nulls[14] = true; + else + values[14] = CStringGetTextDatum(conninfo); + } + + /* Returns the record as Datum */ + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} diff --git a/src/backend/replication/walreceiverfuncs.c b/src/backend/replication/walreceiverfuncs.c new file mode 100644 index 0000000..8305022 --- /dev/null +++ b/src/backend/replication/walreceiverfuncs.c @@ -0,0 +1,408 @@ +/*------------------------------------------------------------------------- + * + * walreceiverfuncs.c + * + * This file contains functions used by the startup process to communicate + * with the walreceiver process. Functions implementing walreceiver itself + * are in walreceiver.c. + * + * Portions Copyright (c) 2010-2023, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/replication/walreceiverfuncs.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <sys/stat.h> +#include <sys/time.h> +#include <time.h> +#include <unistd.h> +#include <signal.h> + +#include "access/xlog_internal.h" +#include "access/xlogrecovery.h" +#include "pgstat.h" +#include "postmaster/startup.h" +#include "replication/walreceiver.h" +#include "storage/pmsignal.h" +#include "storage/shmem.h" +#include "utils/timestamp.h" + +WalRcvData *WalRcv = NULL; + +/* + * How long to wait for walreceiver to start up after requesting + * postmaster to launch it. In seconds. + */ +#define WALRCV_STARTUP_TIMEOUT 10 + +/* Report shared memory space needed by WalRcvShmemInit */ +Size +WalRcvShmemSize(void) +{ + Size size = 0; + + size = add_size(size, sizeof(WalRcvData)); + + return size; +} + +/* Allocate and initialize walreceiver-related shared memory */ +void +WalRcvShmemInit(void) +{ + bool found; + + WalRcv = (WalRcvData *) + ShmemInitStruct("Wal Receiver Ctl", WalRcvShmemSize(), &found); + + if (!found) + { + /* First time through, so initialize */ + MemSet(WalRcv, 0, WalRcvShmemSize()); + WalRcv->walRcvState = WALRCV_STOPPED; + ConditionVariableInit(&WalRcv->walRcvStoppedCV); + SpinLockInit(&WalRcv->mutex); + pg_atomic_init_u64(&WalRcv->writtenUpto, 0); + WalRcv->latch = NULL; + } +} + +/* Is walreceiver running (or starting up)? */ +bool +WalRcvRunning(void) +{ + WalRcvData *walrcv = WalRcv; + WalRcvState state; + pg_time_t startTime; + + SpinLockAcquire(&walrcv->mutex); + + state = walrcv->walRcvState; + startTime = walrcv->startTime; + + SpinLockRelease(&walrcv->mutex); + + /* + * If it has taken too long for walreceiver to start up, give up. Setting + * the state to STOPPED ensures that if walreceiver later does start up + * after all, it will see that it's not supposed to be running and die + * without doing anything. + */ + if (state == WALRCV_STARTING) + { + pg_time_t now = (pg_time_t) time(NULL); + + if ((now - startTime) > WALRCV_STARTUP_TIMEOUT) + { + bool stopped = false; + + SpinLockAcquire(&walrcv->mutex); + if (walrcv->walRcvState == WALRCV_STARTING) + { + state = walrcv->walRcvState = WALRCV_STOPPED; + stopped = true; + } + SpinLockRelease(&walrcv->mutex); + + if (stopped) + ConditionVariableBroadcast(&walrcv->walRcvStoppedCV); + } + } + + if (state != WALRCV_STOPPED) + return true; + else + return false; +} + +/* + * Is walreceiver running and streaming (or at least attempting to connect, + * or starting up)? + */ +bool +WalRcvStreaming(void) +{ + WalRcvData *walrcv = WalRcv; + WalRcvState state; + pg_time_t startTime; + + SpinLockAcquire(&walrcv->mutex); + + state = walrcv->walRcvState; + startTime = walrcv->startTime; + + SpinLockRelease(&walrcv->mutex); + + /* + * If it has taken too long for walreceiver to start up, give up. Setting + * the state to STOPPED ensures that if walreceiver later does start up + * after all, it will see that it's not supposed to be running and die + * without doing anything. + */ + if (state == WALRCV_STARTING) + { + pg_time_t now = (pg_time_t) time(NULL); + + if ((now - startTime) > WALRCV_STARTUP_TIMEOUT) + { + bool stopped = false; + + SpinLockAcquire(&walrcv->mutex); + if (walrcv->walRcvState == WALRCV_STARTING) + { + state = walrcv->walRcvState = WALRCV_STOPPED; + stopped = true; + } + SpinLockRelease(&walrcv->mutex); + + if (stopped) + ConditionVariableBroadcast(&walrcv->walRcvStoppedCV); + } + } + + if (state == WALRCV_STREAMING || state == WALRCV_STARTING || + state == WALRCV_RESTARTING) + return true; + else + return false; +} + +/* + * Stop walreceiver (if running) and wait for it to die. + * Executed by the Startup process. + */ +void +ShutdownWalRcv(void) +{ + WalRcvData *walrcv = WalRcv; + pid_t walrcvpid = 0; + bool stopped = false; + + /* + * Request walreceiver to stop. Walreceiver will switch to WALRCV_STOPPED + * mode once it's finished, and will also request postmaster to not + * restart itself. + */ + SpinLockAcquire(&walrcv->mutex); + switch (walrcv->walRcvState) + { + case WALRCV_STOPPED: + break; + case WALRCV_STARTING: + walrcv->walRcvState = WALRCV_STOPPED; + stopped = true; + break; + + case WALRCV_STREAMING: + case WALRCV_WAITING: + case WALRCV_RESTARTING: + walrcv->walRcvState = WALRCV_STOPPING; + /* fall through */ + case WALRCV_STOPPING: + walrcvpid = walrcv->pid; + break; + } + SpinLockRelease(&walrcv->mutex); + + /* Unnecessary but consistent. */ + if (stopped) + ConditionVariableBroadcast(&walrcv->walRcvStoppedCV); + + /* + * Signal walreceiver process if it was still running. + */ + if (walrcvpid != 0) + kill(walrcvpid, SIGTERM); + + /* + * Wait for walreceiver to acknowledge its death by setting state to + * WALRCV_STOPPED. + */ + ConditionVariablePrepareToSleep(&walrcv->walRcvStoppedCV); + while (WalRcvRunning()) + ConditionVariableSleep(&walrcv->walRcvStoppedCV, + WAIT_EVENT_WAL_RECEIVER_EXIT); + ConditionVariableCancelSleep(); +} + +/* + * Request postmaster to start walreceiver. + * + * "recptr" indicates the position where streaming should begin. "conninfo" + * is a libpq connection string to use. "slotname" is, optionally, the name + * of a replication slot to acquire. "create_temp_slot" indicates to create + * a temporary slot when no "slotname" is given. + * + * WAL receivers do not directly load GUC parameters used for the connection + * to the primary, and rely on the values passed down by the caller of this + * routine instead. Hence, the addition of any new parameters should happen + * through this code path. + */ +void +RequestXLogStreaming(TimeLineID tli, XLogRecPtr recptr, const char *conninfo, + const char *slotname, bool create_temp_slot) +{ + WalRcvData *walrcv = WalRcv; + bool launch = false; + pg_time_t now = (pg_time_t) time(NULL); + Latch *latch; + + /* + * We always start at the beginning of the segment. That prevents a broken + * segment (i.e., with no records in the first half of a segment) from + * being created by XLOG streaming, which might cause trouble later on if + * the segment is e.g archived. + */ + if (XLogSegmentOffset(recptr, wal_segment_size) != 0) + recptr -= XLogSegmentOffset(recptr, wal_segment_size); + + SpinLockAcquire(&walrcv->mutex); + + /* It better be stopped if we try to restart it */ + Assert(walrcv->walRcvState == WALRCV_STOPPED || + walrcv->walRcvState == WALRCV_WAITING); + + if (conninfo != NULL) + strlcpy((char *) walrcv->conninfo, conninfo, MAXCONNINFO); + else + walrcv->conninfo[0] = '\0'; + + /* + * Use configured replication slot if present, and ignore the value of + * create_temp_slot as the slot name should be persistent. Otherwise, use + * create_temp_slot to determine whether this WAL receiver should create a + * temporary slot by itself and use it, or not. + */ + if (slotname != NULL && slotname[0] != '\0') + { + strlcpy((char *) walrcv->slotname, slotname, NAMEDATALEN); + walrcv->is_temp_slot = false; + } + else + { + walrcv->slotname[0] = '\0'; + walrcv->is_temp_slot = create_temp_slot; + } + + if (walrcv->walRcvState == WALRCV_STOPPED) + { + launch = true; + walrcv->walRcvState = WALRCV_STARTING; + } + else + walrcv->walRcvState = WALRCV_RESTARTING; + walrcv->startTime = now; + + /* + * If this is the first startup of walreceiver (on this timeline), + * initialize flushedUpto and latestChunkStart to the starting point. + */ + if (walrcv->receiveStart == 0 || walrcv->receivedTLI != tli) + { + walrcv->flushedUpto = recptr; + walrcv->receivedTLI = tli; + walrcv->latestChunkStart = recptr; + } + walrcv->receiveStart = recptr; + walrcv->receiveStartTLI = tli; + + latch = walrcv->latch; + + SpinLockRelease(&walrcv->mutex); + + if (launch) + SendPostmasterSignal(PMSIGNAL_START_WALRECEIVER); + else if (latch) + SetLatch(latch); +} + +/* + * Returns the last+1 byte position that walreceiver has flushed. + * + * Optionally, returns the previous chunk start, that is the first byte + * written in the most recent walreceiver flush cycle. Callers not + * interested in that value may pass NULL for latestChunkStart. Same for + * receiveTLI. + */ +XLogRecPtr +GetWalRcvFlushRecPtr(XLogRecPtr *latestChunkStart, TimeLineID *receiveTLI) +{ + WalRcvData *walrcv = WalRcv; + XLogRecPtr recptr; + + SpinLockAcquire(&walrcv->mutex); + recptr = walrcv->flushedUpto; + if (latestChunkStart) + *latestChunkStart = walrcv->latestChunkStart; + if (receiveTLI) + *receiveTLI = walrcv->receivedTLI; + SpinLockRelease(&walrcv->mutex); + + return recptr; +} + +/* + * Returns the last+1 byte position that walreceiver has written. + * This returns a recently written value without taking a lock. + */ +XLogRecPtr +GetWalRcvWriteRecPtr(void) +{ + WalRcvData *walrcv = WalRcv; + + return pg_atomic_read_u64(&walrcv->writtenUpto); +} + +/* + * Returns the replication apply delay in ms or -1 + * if the apply delay info is not available + */ +int +GetReplicationApplyDelay(void) +{ + WalRcvData *walrcv = WalRcv; + XLogRecPtr receivePtr; + XLogRecPtr replayPtr; + TimestampTz chunkReplayStartTime; + + SpinLockAcquire(&walrcv->mutex); + receivePtr = walrcv->flushedUpto; + SpinLockRelease(&walrcv->mutex); + + replayPtr = GetXLogReplayRecPtr(NULL); + + if (receivePtr == replayPtr) + return 0; + + chunkReplayStartTime = GetCurrentChunkReplayStartTime(); + + if (chunkReplayStartTime == 0) + return -1; + + return TimestampDifferenceMilliseconds(chunkReplayStartTime, + GetCurrentTimestamp()); +} + +/* + * Returns the network latency in ms, note that this includes any + * difference in clock settings between the servers, as well as timezone. + */ +int +GetReplicationTransferLatency(void) +{ + WalRcvData *walrcv = WalRcv; + TimestampTz lastMsgSendTime; + TimestampTz lastMsgReceiptTime; + + SpinLockAcquire(&walrcv->mutex); + lastMsgSendTime = walrcv->lastMsgSendTime; + lastMsgReceiptTime = walrcv->lastMsgReceiptTime; + SpinLockRelease(&walrcv->mutex); + + return TimestampDifferenceMilliseconds(lastMsgSendTime, + lastMsgReceiptTime); +} diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c new file mode 100644 index 0000000..4c53de0 --- /dev/null +++ b/src/backend/replication/walsender.c @@ -0,0 +1,3917 @@ +/*------------------------------------------------------------------------- + * + * walsender.c + * + * The WAL sender process (walsender) is new as of Postgres 9.0. It takes + * care of sending XLOG from the primary server to a single recipient. + * (Note that there can be more than one walsender process concurrently.) + * It is started by the postmaster when the walreceiver of a standby server + * connects to the primary server and requests XLOG streaming replication. + * + * A walsender is similar to a regular backend, ie. there is a one-to-one + * relationship between a connection and a walsender process, but instead + * of processing SQL queries, it understands a small set of special + * replication-mode commands. The START_REPLICATION command begins streaming + * WAL to the client. While streaming, the walsender keeps reading XLOG + * records from the disk and sends them to the standby server over the + * COPY protocol, until either side ends the replication by exiting COPY + * mode (or until the connection is closed). + * + * Normal termination is by SIGTERM, which instructs the walsender to + * close the connection and exit(0) at the next convenient moment. Emergency + * termination is by SIGQUIT; like any backend, the walsender will simply + * abort and exit on SIGQUIT. A close of the connection and a FATAL error + * are treated as not a crash but approximately normal termination; + * the walsender will exit quickly without sending any more XLOG records. + * + * If the server is shut down, checkpointer sends us + * PROCSIG_WALSND_INIT_STOPPING after all regular backends have exited. If + * the backend is idle or runs an SQL query this causes the backend to + * shutdown, if logical replication is in progress all existing WAL records + * are processed followed by a shutdown. Otherwise this causes the walsender + * to switch to the "stopping" state. In this state, the walsender will reject + * any further replication commands. The checkpointer begins the shutdown + * checkpoint once all walsenders are confirmed as stopping. When the shutdown + * checkpoint finishes, the postmaster sends us SIGUSR2. This instructs + * walsender to send any outstanding WAL, including the shutdown checkpoint + * record, wait for it to be replicated to the standby, and then exit. + * + * + * Portions Copyright (c) 2010-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/walsender.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> +#include <unistd.h> + +#include "access/printtup.h" +#include "access/timeline.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "access/xlogreader.h" +#include "access/xlogrecovery.h" +#include "access/xlogutils.h" +#include "backup/basebackup.h" +#include "catalog/pg_authid.h" +#include "catalog/pg_type.h" +#include "commands/dbcommands.h" +#include "commands/defrem.h" +#include "funcapi.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "miscadmin.h" +#include "nodes/replnodes.h" +#include "pgstat.h" +#include "postmaster/interrupt.h" +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/slot.h" +#include "replication/snapbuild.h" +#include "replication/syncrep.h" +#include "replication/walreceiver.h" +#include "replication/walsender.h" +#include "replication/walsender_private.h" +#include "storage/condition_variable.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "tcop/dest.h" +#include "tcop/tcopprot.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/pg_lsn.h" +#include "utils/portal.h" +#include "utils/ps_status.h" +#include "utils/timeout.h" +#include "utils/timestamp.h" + +/* + * Maximum data payload in a WAL data message. Must be >= XLOG_BLCKSZ. + * + * We don't have a good idea of what a good value would be; there's some + * overhead per message in both walsender and walreceiver, but on the other + * hand sending large batches makes walsender less responsive to signals + * because signals are checked only between messages. 128kB (with + * default 8k blocks) seems like a reasonable guess for now. + */ +#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) + +/* Array of WalSnds in shared memory */ +WalSndCtlData *WalSndCtl = NULL; + +/* My slot in the shared memory array */ +WalSnd *MyWalSnd = NULL; + +/* Global state */ +bool am_walsender = false; /* Am I a walsender process? */ +bool am_cascading_walsender = false; /* Am I cascading WAL to another + * standby? */ +bool am_db_walsender = false; /* Connected to a database? */ + +/* GUC variables */ +int max_wal_senders = 10; /* the maximum number of concurrent + * walsenders */ +int wal_sender_timeout = 60 * 1000; /* maximum time to send one WAL + * data message */ +bool log_replication_commands = false; + +/* + * State for WalSndWakeupRequest + */ +bool wake_wal_senders = false; + +/* + * xlogreader used for replication. Note that a WAL sender doing physical + * replication does not need xlogreader to read WAL, but it needs one to + * keep a state of its work. + */ +static XLogReaderState *xlogreader = NULL; + +/* + * These variables keep track of the state of the timeline we're currently + * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric, + * the timeline is not the latest timeline on this server, and the server's + * history forked off from that timeline at sendTimeLineValidUpto. + */ +static TimeLineID sendTimeLine = 0; +static TimeLineID sendTimeLineNextTLI = 0; +static bool sendTimeLineIsHistoric = false; +static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr; + +/* + * How far have we sent WAL already? This is also advertised in + * MyWalSnd->sentPtr. (Actually, this is the next WAL location to send.) + */ +static XLogRecPtr sentPtr = InvalidXLogRecPtr; + +/* Buffers for constructing outgoing messages and processing reply messages. */ +static StringInfoData output_message; +static StringInfoData reply_message; +static StringInfoData tmpbuf; + +/* Timestamp of last ProcessRepliesIfAny(). */ +static TimestampTz last_processing = 0; + +/* + * Timestamp of last ProcessRepliesIfAny() that saw a reply from the + * standby. Set to 0 if wal_sender_timeout doesn't need to be active. + */ +static TimestampTz last_reply_timestamp = 0; + +/* Have we sent a heartbeat message asking for reply, since last reply? */ +static bool waiting_for_ping_response = false; + +/* + * While streaming WAL in Copy mode, streamingDoneSending is set to true + * after we have sent CopyDone. We should not send any more CopyData messages + * after that. streamingDoneReceiving is set to true when we receive CopyDone + * from the other end. When both become true, it's time to exit Copy mode. + */ +static bool streamingDoneSending; +static bool streamingDoneReceiving; + +/* Are we there yet? */ +static bool WalSndCaughtUp = false; + +/* Flags set by signal handlers for later service in main loop */ +static volatile sig_atomic_t got_SIGUSR2 = false; +static volatile sig_atomic_t got_STOPPING = false; + +/* + * This is set while we are streaming. When not set + * PROCSIG_WALSND_INIT_STOPPING signal will be handled like SIGTERM. When set, + * the main loop is responsible for checking got_STOPPING and terminating when + * it's set (after streaming any remaining WAL). + */ +static volatile sig_atomic_t replication_active = false; + +static LogicalDecodingContext *logical_decoding_ctx = NULL; + +/* A sample associating a WAL location with the time it was written. */ +typedef struct +{ + XLogRecPtr lsn; + TimestampTz time; +} WalTimeSample; + +/* The size of our buffer of time samples. */ +#define LAG_TRACKER_BUFFER_SIZE 8192 + +/* A mechanism for tracking replication lag. */ +typedef struct +{ + XLogRecPtr last_lsn; + WalTimeSample buffer[LAG_TRACKER_BUFFER_SIZE]; + int write_head; + int read_heads[NUM_SYNC_REP_WAIT_MODE]; + WalTimeSample last_read[NUM_SYNC_REP_WAIT_MODE]; +} LagTracker; + +static LagTracker *lag_tracker; + +/* Signal handlers */ +static void WalSndLastCycleHandler(SIGNAL_ARGS); + +/* Prototypes for private functions */ +typedef void (*WalSndSendDataCallback) (void); +static void WalSndLoop(WalSndSendDataCallback send_data); +static void InitWalSenderSlot(void); +static void WalSndKill(int code, Datum arg); +static void WalSndShutdown(void) pg_attribute_noreturn(); +static void XLogSendPhysical(void); +static void XLogSendLogical(void); +static void WalSndDone(WalSndSendDataCallback send_data); +static XLogRecPtr GetStandbyFlushRecPtr(TimeLineID *tli); +static void IdentifySystem(void); +static void ReadReplicationSlot(ReadReplicationSlotCmd *cmd); +static void CreateReplicationSlot(CreateReplicationSlotCmd *cmd); +static void DropReplicationSlot(DropReplicationSlotCmd *cmd); +static void StartReplication(StartReplicationCmd *cmd); +static void StartLogicalReplication(StartReplicationCmd *cmd); +static void ProcessStandbyMessage(void); +static void ProcessStandbyReplyMessage(void); +static void ProcessStandbyHSFeedbackMessage(void); +static void ProcessRepliesIfAny(void); +static void ProcessPendingWrites(void); +static void WalSndKeepalive(bool requestReply, XLogRecPtr writePtr); +static void WalSndKeepaliveIfNecessary(void); +static void WalSndCheckTimeOut(void); +static long WalSndComputeSleeptime(TimestampTz now); +static void WalSndWait(uint32 socket_events, long timeout, uint32 wait_event); +static void WalSndPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, bool last_write); +static void WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, bool last_write); +static void WalSndUpdateProgress(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, + bool skipped_xact); +static XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); +static void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time); +static TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now); +static bool TransactionIdInRecentPast(TransactionId xid, uint32 epoch); + +static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, + TimeLineID *tli_p); + + +/* Initialize walsender process before entering the main command loop */ +void +InitWalSender(void) +{ + am_cascading_walsender = RecoveryInProgress(); + + /* Create a per-walsender data structure in shared memory */ + InitWalSenderSlot(); + + /* + * We don't currently need any ResourceOwner in a walsender process, but + * if we did, we could call CreateAuxProcessResourceOwner here. + */ + + /* + * Let postmaster know that we're a WAL sender. Once we've declared us as + * a WAL sender process, postmaster will let us outlive the bgwriter and + * kill us last in the shutdown sequence, so we get a chance to stream all + * remaining WAL at shutdown, including the shutdown checkpoint. Note that + * there's no going back, and we mustn't write any WAL records after this. + */ + MarkPostmasterChildWalSender(); + SendPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE); + + /* + * If the client didn't specify a database to connect to, show in PGPROC + * that our advertised xmin should affect vacuum horizons in all + * databases. This allows physical replication clients to send hot + * standby feedback that will delay vacuum cleanup in all databases. + */ + if (MyDatabaseId == InvalidOid) + { + Assert(MyProc->xmin == InvalidTransactionId); + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + MyProc->statusFlags |= PROC_AFFECTS_ALL_HORIZONS; + ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags; + LWLockRelease(ProcArrayLock); + } + + /* Initialize empty timestamp buffer for lag tracking. */ + lag_tracker = MemoryContextAllocZero(TopMemoryContext, sizeof(LagTracker)); +} + +/* + * Clean up after an error. + * + * WAL sender processes don't use transactions like regular backends do. + * This function does any cleanup required after an error in a WAL sender + * process, similar to what transaction abort does in a regular backend. + */ +void +WalSndErrorCleanup(void) +{ + LWLockReleaseAll(); + ConditionVariableCancelSleep(); + pgstat_report_wait_end(); + + if (xlogreader != NULL && xlogreader->seg.ws_file >= 0) + wal_segment_close(xlogreader); + + if (MyReplicationSlot != NULL) + ReplicationSlotRelease(); + + ReplicationSlotCleanup(); + + replication_active = false; + + /* + * If there is a transaction in progress, it will clean up our + * ResourceOwner, but if a replication command set up a resource owner + * without a transaction, we've got to clean that up now. + */ + if (!IsTransactionOrTransactionBlock()) + WalSndResourceCleanup(false); + + if (got_STOPPING || got_SIGUSR2) + proc_exit(0); + + /* Revert back to startup state */ + WalSndSetState(WALSNDSTATE_STARTUP); +} + +/* + * Clean up any ResourceOwner we created. + */ +void +WalSndResourceCleanup(bool isCommit) +{ + ResourceOwner resowner; + + if (CurrentResourceOwner == NULL) + return; + + /* + * Deleting CurrentResourceOwner is not allowed, so we must save a pointer + * in a local variable and clear it first. + */ + resowner = CurrentResourceOwner; + CurrentResourceOwner = NULL; + + /* Now we can release resources and delete it. */ + ResourceOwnerRelease(resowner, + RESOURCE_RELEASE_BEFORE_LOCKS, isCommit, true); + ResourceOwnerRelease(resowner, + RESOURCE_RELEASE_LOCKS, isCommit, true); + ResourceOwnerRelease(resowner, + RESOURCE_RELEASE_AFTER_LOCKS, isCommit, true); + ResourceOwnerDelete(resowner); +} + +/* + * Handle a client's connection abort in an orderly manner. + */ +static void +WalSndShutdown(void) +{ + /* + * Reset whereToSendOutput to prevent ereport from attempting to send any + * more messages to the standby. + */ + if (whereToSendOutput == DestRemote) + whereToSendOutput = DestNone; + + proc_exit(0); + abort(); /* keep the compiler quiet */ +} + +/* + * Handle the IDENTIFY_SYSTEM command. + */ +static void +IdentifySystem(void) +{ + char sysid[32]; + char xloc[MAXFNAMELEN]; + XLogRecPtr logptr; + char *dbname = NULL; + DestReceiver *dest; + TupOutputState *tstate; + TupleDesc tupdesc; + Datum values[4]; + bool nulls[4] = {0}; + TimeLineID currTLI; + + /* + * Reply with a result set with one row, four columns. First col is system + * ID, second is timeline ID, third is current xlog location and the + * fourth contains the database name if we are connected to one. + */ + + snprintf(sysid, sizeof(sysid), UINT64_FORMAT, + GetSystemIdentifier()); + + am_cascading_walsender = RecoveryInProgress(); + if (am_cascading_walsender) + logptr = GetStandbyFlushRecPtr(&currTLI); + else + logptr = GetFlushRecPtr(&currTLI); + + snprintf(xloc, sizeof(xloc), "%X/%X", LSN_FORMAT_ARGS(logptr)); + + if (MyDatabaseId != InvalidOid) + { + MemoryContext cur = CurrentMemoryContext; + + /* syscache access needs a transaction env. */ + StartTransactionCommand(); + /* make dbname live outside TX context */ + MemoryContextSwitchTo(cur); + dbname = get_database_name(MyDatabaseId); + CommitTransactionCommand(); + /* CommitTransactionCommand switches to TopMemoryContext */ + MemoryContextSwitchTo(cur); + } + + dest = CreateDestReceiver(DestRemoteSimple); + + /* need a tuple descriptor representing four columns */ + tupdesc = CreateTemplateTupleDesc(4); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "systemid", + TEXTOID, -1, 0); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "timeline", + INT8OID, -1, 0); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 3, "xlogpos", + TEXTOID, -1, 0); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 4, "dbname", + TEXTOID, -1, 0); + + /* prepare for projection of tuples */ + tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual); + + /* column 1: system identifier */ + values[0] = CStringGetTextDatum(sysid); + + /* column 2: timeline */ + values[1] = Int64GetDatum(currTLI); + + /* column 3: wal location */ + values[2] = CStringGetTextDatum(xloc); + + /* column 4: database name, or NULL if none */ + if (dbname) + values[3] = CStringGetTextDatum(dbname); + else + nulls[3] = true; + + /* send it to dest */ + do_tup_output(tstate, values, nulls); + + end_tup_output(tstate); +} + +/* Handle READ_REPLICATION_SLOT command */ +static void +ReadReplicationSlot(ReadReplicationSlotCmd *cmd) +{ +#define READ_REPLICATION_SLOT_COLS 3 + ReplicationSlot *slot; + DestReceiver *dest; + TupOutputState *tstate; + TupleDesc tupdesc; + Datum values[READ_REPLICATION_SLOT_COLS] = {0}; + bool nulls[READ_REPLICATION_SLOT_COLS]; + + tupdesc = CreateTemplateTupleDesc(READ_REPLICATION_SLOT_COLS); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "slot_type", + TEXTOID, -1, 0); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "restart_lsn", + TEXTOID, -1, 0); + /* TimeLineID is unsigned, so int4 is not wide enough. */ + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 3, "restart_tli", + INT8OID, -1, 0); + + memset(nulls, true, READ_REPLICATION_SLOT_COLS * sizeof(bool)); + + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + slot = SearchNamedReplicationSlot(cmd->slotname, false); + if (slot == NULL || !slot->in_use) + { + LWLockRelease(ReplicationSlotControlLock); + } + else + { + ReplicationSlot slot_contents; + int i = 0; + + /* Copy slot contents while holding spinlock */ + SpinLockAcquire(&slot->mutex); + slot_contents = *slot; + SpinLockRelease(&slot->mutex); + LWLockRelease(ReplicationSlotControlLock); + + if (OidIsValid(slot_contents.data.database)) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot use %s with a logical replication slot", + "READ_REPLICATION_SLOT")); + + /* slot type */ + values[i] = CStringGetTextDatum("physical"); + nulls[i] = false; + i++; + + /* start LSN */ + if (!XLogRecPtrIsInvalid(slot_contents.data.restart_lsn)) + { + char xloc[64]; + + snprintf(xloc, sizeof(xloc), "%X/%X", + LSN_FORMAT_ARGS(slot_contents.data.restart_lsn)); + values[i] = CStringGetTextDatum(xloc); + nulls[i] = false; + } + i++; + + /* timeline this WAL was produced on */ + if (!XLogRecPtrIsInvalid(slot_contents.data.restart_lsn)) + { + TimeLineID slots_position_timeline; + TimeLineID current_timeline; + List *timeline_history = NIL; + + /* + * While in recovery, use as timeline the currently-replaying one + * to get the LSN position's history. + */ + if (RecoveryInProgress()) + (void) GetXLogReplayRecPtr(¤t_timeline); + else + current_timeline = GetWALInsertionTimeLine(); + + timeline_history = readTimeLineHistory(current_timeline); + slots_position_timeline = tliOfPointInHistory(slot_contents.data.restart_lsn, + timeline_history); + values[i] = Int64GetDatum((int64) slots_position_timeline); + nulls[i] = false; + } + i++; + + Assert(i == READ_REPLICATION_SLOT_COLS); + } + + dest = CreateDestReceiver(DestRemoteSimple); + tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual); + do_tup_output(tstate, values, nulls); + end_tup_output(tstate); +} + + +/* + * Handle TIMELINE_HISTORY command. + */ +static void +SendTimeLineHistory(TimeLineHistoryCmd *cmd) +{ + DestReceiver *dest; + TupleDesc tupdesc; + StringInfoData buf; + char histfname[MAXFNAMELEN]; + char path[MAXPGPATH]; + int fd; + off_t histfilelen; + off_t bytesleft; + Size len; + + dest = CreateDestReceiver(DestRemoteSimple); + + /* + * Reply with a result set with one row, and two columns. The first col is + * the name of the history file, 2nd is the contents. + */ + tupdesc = CreateTemplateTupleDesc(2); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "filename", TEXTOID, -1, 0); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "content", TEXTOID, -1, 0); + + TLHistoryFileName(histfname, cmd->timeline); + TLHistoryFilePath(path, cmd->timeline); + + /* Send a RowDescription message */ + dest->rStartup(dest, CMD_SELECT, tupdesc); + + /* Send a DataRow message */ + pq_beginmessage(&buf, 'D'); + pq_sendint16(&buf, 2); /* # of columns */ + len = strlen(histfname); + pq_sendint32(&buf, len); /* col1 len */ + pq_sendbytes(&buf, histfname, len); + + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + + /* Determine file length and send it to client */ + histfilelen = lseek(fd, 0, SEEK_END); + if (histfilelen < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not seek to end of file \"%s\": %m", path))); + if (lseek(fd, 0, SEEK_SET) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not seek to beginning of file \"%s\": %m", path))); + + pq_sendint32(&buf, histfilelen); /* col2 len */ + + bytesleft = histfilelen; + while (bytesleft > 0) + { + PGAlignedBlock rbuf; + int nread; + + pgstat_report_wait_start(WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ); + nread = read(fd, rbuf.data, sizeof(rbuf)); + pgstat_report_wait_end(); + if (nread < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + path))); + else if (nread == 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %zu", + path, nread, (Size) bytesleft))); + + pq_sendbytes(&buf, rbuf.data, nread); + bytesleft -= nread; + } + + if (CloseTransientFile(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + + pq_endmessage(&buf); +} + +/* + * Handle START_REPLICATION command. + * + * At the moment, this never returns, but an ereport(ERROR) will take us back + * to the main loop. + */ +static void +StartReplication(StartReplicationCmd *cmd) +{ + StringInfoData buf; + XLogRecPtr FlushPtr; + TimeLineID FlushTLI; + + /* create xlogreader for physical replication */ + xlogreader = + XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(.segment_open = WalSndSegmentOpen, + .segment_close = wal_segment_close), + NULL); + + if (!xlogreader) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed while allocating a WAL reading processor."))); + + /* + * We assume here that we're logging enough information in the WAL for + * log-shipping, since this is checked in PostmasterMain(). + * + * NOTE: wal_level can only change at shutdown, so in most cases it is + * difficult for there to be WAL data that we can still see that was + * written at wal_level='minimal'. + */ + + if (cmd->slotname) + { + ReplicationSlotAcquire(cmd->slotname, true); + if (SlotIsLogical(MyReplicationSlot)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot use a logical replication slot for physical replication"))); + + /* + * We don't need to verify the slot's restart_lsn here; instead we + * rely on the caller requesting the starting point to use. If the + * WAL segment doesn't exist, we'll fail later. + */ + } + + /* + * Select the timeline. If it was given explicitly by the client, use + * that. Otherwise use the timeline of the last replayed record. + */ + am_cascading_walsender = RecoveryInProgress(); + if (am_cascading_walsender) + FlushPtr = GetStandbyFlushRecPtr(&FlushTLI); + else + FlushPtr = GetFlushRecPtr(&FlushTLI); + + if (cmd->timeline != 0) + { + XLogRecPtr switchpoint; + + sendTimeLine = cmd->timeline; + if (sendTimeLine == FlushTLI) + { + sendTimeLineIsHistoric = false; + sendTimeLineValidUpto = InvalidXLogRecPtr; + } + else + { + List *timeLineHistory; + + sendTimeLineIsHistoric = true; + + /* + * Check that the timeline the client requested exists, and the + * requested start location is on that timeline. + */ + timeLineHistory = readTimeLineHistory(FlushTLI); + switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory, + &sendTimeLineNextTLI); + list_free_deep(timeLineHistory); + + /* + * Found the requested timeline in the history. Check that + * requested startpoint is on that timeline in our history. + * + * This is quite loose on purpose. We only check that we didn't + * fork off the requested timeline before the switchpoint. We + * don't check that we switched *to* it before the requested + * starting point. This is because the client can legitimately + * request to start replication from the beginning of the WAL + * segment that contains switchpoint, but on the new timeline, so + * that it doesn't end up with a partial segment. If you ask for + * too old a starting point, you'll get an error later when we + * fail to find the requested WAL segment in pg_wal. + * + * XXX: we could be more strict here and only allow a startpoint + * that's older than the switchpoint, if it's still in the same + * WAL segment. + */ + if (!XLogRecPtrIsInvalid(switchpoint) && + switchpoint < cmd->startpoint) + { + ereport(ERROR, + (errmsg("requested starting point %X/%X on timeline %u is not in this server's history", + LSN_FORMAT_ARGS(cmd->startpoint), + cmd->timeline), + errdetail("This server's history forked from timeline %u at %X/%X.", + cmd->timeline, + LSN_FORMAT_ARGS(switchpoint)))); + } + sendTimeLineValidUpto = switchpoint; + } + } + else + { + sendTimeLine = FlushTLI; + sendTimeLineValidUpto = InvalidXLogRecPtr; + sendTimeLineIsHistoric = false; + } + + streamingDoneSending = streamingDoneReceiving = false; + + /* If there is nothing to stream, don't even enter COPY mode */ + if (!sendTimeLineIsHistoric || cmd->startpoint < sendTimeLineValidUpto) + { + /* + * When we first start replication the standby will be behind the + * primary. For some applications, for example synchronous + * replication, it is important to have a clear state for this initial + * catchup mode, so we can trigger actions when we change streaming + * state later. We may stay in this state for a long time, which is + * exactly why we want to be able to monitor whether or not we are + * still here. + */ + WalSndSetState(WALSNDSTATE_CATCHUP); + + /* Send a CopyBothResponse message, and start streaming */ + pq_beginmessage(&buf, 'W'); + pq_sendbyte(&buf, 0); + pq_sendint16(&buf, 0); + pq_endmessage(&buf); + pq_flush(); + + /* + * Don't allow a request to stream from a future point in WAL that + * hasn't been flushed to disk in this server yet. + */ + if (FlushPtr < cmd->startpoint) + { + ereport(ERROR, + (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X", + LSN_FORMAT_ARGS(cmd->startpoint), + LSN_FORMAT_ARGS(FlushPtr)))); + } + + /* Start streaming from the requested point */ + sentPtr = cmd->startpoint; + + /* Initialize shared memory status, too */ + SpinLockAcquire(&MyWalSnd->mutex); + MyWalSnd->sentPtr = sentPtr; + SpinLockRelease(&MyWalSnd->mutex); + + SyncRepInitConfig(); + + /* Main loop of walsender */ + replication_active = true; + + WalSndLoop(XLogSendPhysical); + + replication_active = false; + if (got_STOPPING) + proc_exit(0); + WalSndSetState(WALSNDSTATE_STARTUP); + + Assert(streamingDoneSending && streamingDoneReceiving); + } + + if (cmd->slotname) + ReplicationSlotRelease(); + + /* + * Copy is finished now. Send a single-row result set indicating the next + * timeline. + */ + if (sendTimeLineIsHistoric) + { + char startpos_str[8 + 1 + 8 + 1]; + DestReceiver *dest; + TupOutputState *tstate; + TupleDesc tupdesc; + Datum values[2]; + bool nulls[2] = {0}; + + snprintf(startpos_str, sizeof(startpos_str), "%X/%X", + LSN_FORMAT_ARGS(sendTimeLineValidUpto)); + + dest = CreateDestReceiver(DestRemoteSimple); + + /* + * Need a tuple descriptor representing two columns. int8 may seem + * like a surprising data type for this, but in theory int4 would not + * be wide enough for this, as TimeLineID is unsigned. + */ + tupdesc = CreateTemplateTupleDesc(2); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "next_tli", + INT8OID, -1, 0); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "next_tli_startpos", + TEXTOID, -1, 0); + + /* prepare for projection of tuple */ + tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual); + + values[0] = Int64GetDatum((int64) sendTimeLineNextTLI); + values[1] = CStringGetTextDatum(startpos_str); + + /* send it to dest */ + do_tup_output(tstate, values, nulls); + + end_tup_output(tstate); + } + + /* Send CommandComplete message */ + EndReplicationCommand("START_STREAMING"); +} + +/* + * XLogReaderRoutine->page_read callback for logical decoding contexts, as a + * walsender process. + * + * Inside the walsender we can do better than read_local_xlog_page, + * which has to do a plain sleep/busy loop, because the walsender's latch gets + * set every time WAL is flushed. + */ +static int +logical_read_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, + XLogRecPtr targetRecPtr, char *cur_page) +{ + XLogRecPtr flushptr; + int count; + WALReadError errinfo; + XLogSegNo segno; + TimeLineID currTLI; + + /* + * Make sure we have enough WAL available before retrieving the current + * timeline. This is needed to determine am_cascading_walsender accurately + * which is needed to determine the current timeline. + */ + flushptr = WalSndWaitForWal(targetPagePtr + reqLen); + + /* + * Since logical decoding is also permitted on a standby server, we need + * to check if the server is in recovery to decide how to get the current + * timeline ID (so that it also cover the promotion or timeline change + * cases). + */ + am_cascading_walsender = RecoveryInProgress(); + + if (am_cascading_walsender) + GetXLogReplayRecPtr(&currTLI); + else + currTLI = GetWALInsertionTimeLine(); + + XLogReadDetermineTimeline(state, targetPagePtr, reqLen, currTLI); + sendTimeLineIsHistoric = (state->currTLI != currTLI); + sendTimeLine = state->currTLI; + sendTimeLineValidUpto = state->currTLIValidUntil; + sendTimeLineNextTLI = state->nextTLI; + + /* fail if not (implies we are going to shut down) */ + if (flushptr < targetPagePtr + reqLen) + return -1; + + if (targetPagePtr + XLOG_BLCKSZ <= flushptr) + count = XLOG_BLCKSZ; /* more than one block available */ + else + count = flushptr - targetPagePtr; /* part of the page available */ + + /* now actually read the data, we know it's there */ + if (!WALRead(state, + cur_page, + targetPagePtr, + XLOG_BLCKSZ, + currTLI, /* Pass the current TLI because only + * WalSndSegmentOpen controls whether new TLI + * is needed. */ + &errinfo)) + WALReadRaiseError(&errinfo); + + /* + * After reading into the buffer, check that what we read was valid. We do + * this after reading, because even though the segment was present when we + * opened it, it might get recycled or removed while we read it. The + * read() succeeds in that case, but the data we tried to read might + * already have been overwritten with new WAL records. + */ + XLByteToSeg(targetPagePtr, segno, state->segcxt.ws_segsize); + CheckXLogRemoved(segno, state->seg.ws_tli); + + return count; +} + +/* + * Process extra options given to CREATE_REPLICATION_SLOT. + */ +static void +parseCreateReplSlotOptions(CreateReplicationSlotCmd *cmd, + bool *reserve_wal, + CRSSnapshotAction *snapshot_action, + bool *two_phase) +{ + ListCell *lc; + bool snapshot_action_given = false; + bool reserve_wal_given = false; + bool two_phase_given = false; + + /* Parse options */ + foreach(lc, cmd->options) + { + DefElem *defel = (DefElem *) lfirst(lc); + + if (strcmp(defel->defname, "snapshot") == 0) + { + char *action; + + if (snapshot_action_given || cmd->kind != REPLICATION_KIND_LOGICAL) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + + action = defGetString(defel); + snapshot_action_given = true; + + if (strcmp(action, "export") == 0) + *snapshot_action = CRS_EXPORT_SNAPSHOT; + else if (strcmp(action, "nothing") == 0) + *snapshot_action = CRS_NOEXPORT_SNAPSHOT; + else if (strcmp(action, "use") == 0) + *snapshot_action = CRS_USE_SNAPSHOT; + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized value for CREATE_REPLICATION_SLOT option \"%s\": \"%s\"", + defel->defname, action))); + } + else if (strcmp(defel->defname, "reserve_wal") == 0) + { + if (reserve_wal_given || cmd->kind != REPLICATION_KIND_PHYSICAL) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + + reserve_wal_given = true; + *reserve_wal = defGetBoolean(defel); + } + else if (strcmp(defel->defname, "two_phase") == 0) + { + if (two_phase_given || cmd->kind != REPLICATION_KIND_LOGICAL) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + two_phase_given = true; + *two_phase = defGetBoolean(defel); + } + else + elog(ERROR, "unrecognized option: %s", defel->defname); + } +} + +/* + * Create a new replication slot. + */ +static void +CreateReplicationSlot(CreateReplicationSlotCmd *cmd) +{ + const char *snapshot_name = NULL; + char xloc[MAXFNAMELEN]; + char *slot_name; + bool reserve_wal = false; + bool two_phase = false; + CRSSnapshotAction snapshot_action = CRS_EXPORT_SNAPSHOT; + DestReceiver *dest; + TupOutputState *tstate; + TupleDesc tupdesc; + Datum values[4]; + bool nulls[4] = {0}; + + Assert(!MyReplicationSlot); + + parseCreateReplSlotOptions(cmd, &reserve_wal, &snapshot_action, &two_phase); + + if (cmd->kind == REPLICATION_KIND_PHYSICAL) + { + ReplicationSlotCreate(cmd->slotname, false, + cmd->temporary ? RS_TEMPORARY : RS_PERSISTENT, + false); + } + else + { + CheckLogicalDecodingRequirements(); + + /* + * Initially create persistent slot as ephemeral - that allows us to + * nicely handle errors during initialization because it'll get + * dropped if this transaction fails. We'll make it persistent at the + * end. Temporary slots can be created as temporary from beginning as + * they get dropped on error as well. + */ + ReplicationSlotCreate(cmd->slotname, true, + cmd->temporary ? RS_TEMPORARY : RS_EPHEMERAL, + two_phase); + } + + if (cmd->kind == REPLICATION_KIND_LOGICAL) + { + LogicalDecodingContext *ctx; + bool need_full_snapshot = false; + + /* + * Do options check early so that we can bail before calling the + * DecodingContextFindStartpoint which can take long time. + */ + if (snapshot_action == CRS_EXPORT_SNAPSHOT) + { + if (IsTransactionBlock()) + ereport(ERROR, + /*- translator: %s is a CREATE_REPLICATION_SLOT statement */ + (errmsg("%s must not be called inside a transaction", + "CREATE_REPLICATION_SLOT ... (SNAPSHOT 'export')"))); + + need_full_snapshot = true; + } + else if (snapshot_action == CRS_USE_SNAPSHOT) + { + if (!IsTransactionBlock()) + ereport(ERROR, + /*- translator: %s is a CREATE_REPLICATION_SLOT statement */ + (errmsg("%s must be called inside a transaction", + "CREATE_REPLICATION_SLOT ... (SNAPSHOT 'use')"))); + + if (XactIsoLevel != XACT_REPEATABLE_READ) + ereport(ERROR, + /*- translator: %s is a CREATE_REPLICATION_SLOT statement */ + (errmsg("%s must be called in REPEATABLE READ isolation mode transaction", + "CREATE_REPLICATION_SLOT ... (SNAPSHOT 'use')"))); + if (!XactReadOnly) + ereport(ERROR, + /*- translator: %s is a CREATE_REPLICATION_SLOT statement */ + (errmsg("%s must be called in a read-only transaction", + "CREATE_REPLICATION_SLOT ... (SNAPSHOT 'use')"))); + + if (FirstSnapshotSet) + ereport(ERROR, + /*- translator: %s is a CREATE_REPLICATION_SLOT statement */ + (errmsg("%s must be called before any query", + "CREATE_REPLICATION_SLOT ... (SNAPSHOT 'use')"))); + + if (IsSubTransaction()) + ereport(ERROR, + /*- translator: %s is a CREATE_REPLICATION_SLOT statement */ + (errmsg("%s must not be called in a subtransaction", + "CREATE_REPLICATION_SLOT ... (SNAPSHOT 'use')"))); + + need_full_snapshot = true; + } + + ctx = CreateInitDecodingContext(cmd->plugin, NIL, need_full_snapshot, + InvalidXLogRecPtr, + XL_ROUTINE(.page_read = logical_read_xlog_page, + .segment_open = WalSndSegmentOpen, + .segment_close = wal_segment_close), + WalSndPrepareWrite, WalSndWriteData, + WalSndUpdateProgress); + + /* + * Signal that we don't need the timeout mechanism. We're just + * creating the replication slot and don't yet accept feedback + * messages or send keepalives. As we possibly need to wait for + * further WAL the walsender would otherwise possibly be killed too + * soon. + */ + last_reply_timestamp = 0; + + /* build initial snapshot, might take a while */ + DecodingContextFindStartpoint(ctx); + + /* + * Export or use the snapshot if we've been asked to do so. + * + * NB. We will convert the snapbuild.c kind of snapshot to normal + * snapshot when doing this. + */ + if (snapshot_action == CRS_EXPORT_SNAPSHOT) + { + snapshot_name = SnapBuildExportSnapshot(ctx->snapshot_builder); + } + else if (snapshot_action == CRS_USE_SNAPSHOT) + { + Snapshot snap; + + snap = SnapBuildInitialSnapshot(ctx->snapshot_builder); + RestoreTransactionSnapshot(snap, MyProc); + } + + /* don't need the decoding context anymore */ + FreeDecodingContext(ctx); + + if (!cmd->temporary) + ReplicationSlotPersist(); + } + else if (cmd->kind == REPLICATION_KIND_PHYSICAL && reserve_wal) + { + ReplicationSlotReserveWal(); + + ReplicationSlotMarkDirty(); + + /* Write this slot to disk if it's a permanent one. */ + if (!cmd->temporary) + ReplicationSlotSave(); + } + + snprintf(xloc, sizeof(xloc), "%X/%X", + LSN_FORMAT_ARGS(MyReplicationSlot->data.confirmed_flush)); + + dest = CreateDestReceiver(DestRemoteSimple); + + /*---------- + * Need a tuple descriptor representing four columns: + * - first field: the slot name + * - second field: LSN at which we became consistent + * - third field: exported snapshot's name + * - fourth field: output plugin + *---------- + */ + tupdesc = CreateTemplateTupleDesc(4); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "slot_name", + TEXTOID, -1, 0); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "consistent_point", + TEXTOID, -1, 0); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 3, "snapshot_name", + TEXTOID, -1, 0); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 4, "output_plugin", + TEXTOID, -1, 0); + + /* prepare for projection of tuples */ + tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual); + + /* slot_name */ + slot_name = NameStr(MyReplicationSlot->data.name); + values[0] = CStringGetTextDatum(slot_name); + + /* consistent wal location */ + values[1] = CStringGetTextDatum(xloc); + + /* snapshot name, or NULL if none */ + if (snapshot_name != NULL) + values[2] = CStringGetTextDatum(snapshot_name); + else + nulls[2] = true; + + /* plugin, or NULL if none */ + if (cmd->plugin != NULL) + values[3] = CStringGetTextDatum(cmd->plugin); + else + nulls[3] = true; + + /* send it to dest */ + do_tup_output(tstate, values, nulls); + end_tup_output(tstate); + + ReplicationSlotRelease(); +} + +/* + * Get rid of a replication slot that is no longer wanted. + */ +static void +DropReplicationSlot(DropReplicationSlotCmd *cmd) +{ + ReplicationSlotDrop(cmd->slotname, !cmd->wait); +} + +/* + * Load previously initiated logical slot and prepare for sending data (via + * WalSndLoop). + */ +static void +StartLogicalReplication(StartReplicationCmd *cmd) +{ + StringInfoData buf; + QueryCompletion qc; + + /* make sure that our requirements are still fulfilled */ + CheckLogicalDecodingRequirements(); + + Assert(!MyReplicationSlot); + + ReplicationSlotAcquire(cmd->slotname, true); + + /* + * Force a disconnect, so that the decoding code doesn't need to care + * about an eventual switch from running in recovery, to running in a + * normal environment. Client code is expected to handle reconnects. + */ + if (am_cascading_walsender && !RecoveryInProgress()) + { + ereport(LOG, + (errmsg("terminating walsender process after promotion"))); + got_STOPPING = true; + } + + /* + * Create our decoding context, making it start at the previously ack'ed + * position. + * + * Do this before sending a CopyBothResponse message, so that any errors + * are reported early. + */ + logical_decoding_ctx = + CreateDecodingContext(cmd->startpoint, cmd->options, false, + XL_ROUTINE(.page_read = logical_read_xlog_page, + .segment_open = WalSndSegmentOpen, + .segment_close = wal_segment_close), + WalSndPrepareWrite, WalSndWriteData, + WalSndUpdateProgress); + xlogreader = logical_decoding_ctx->reader; + + WalSndSetState(WALSNDSTATE_CATCHUP); + + /* Send a CopyBothResponse message, and start streaming */ + pq_beginmessage(&buf, 'W'); + pq_sendbyte(&buf, 0); + pq_sendint16(&buf, 0); + pq_endmessage(&buf); + pq_flush(); + + /* Start reading WAL from the oldest required WAL. */ + XLogBeginRead(logical_decoding_ctx->reader, + MyReplicationSlot->data.restart_lsn); + + /* + * Report the location after which we'll send out further commits as the + * current sentPtr. + */ + sentPtr = MyReplicationSlot->data.confirmed_flush; + + /* Also update the sent position status in shared memory */ + SpinLockAcquire(&MyWalSnd->mutex); + MyWalSnd->sentPtr = MyReplicationSlot->data.restart_lsn; + SpinLockRelease(&MyWalSnd->mutex); + + replication_active = true; + + SyncRepInitConfig(); + + /* Main loop of walsender */ + WalSndLoop(XLogSendLogical); + + FreeDecodingContext(logical_decoding_ctx); + ReplicationSlotRelease(); + + replication_active = false; + if (got_STOPPING) + proc_exit(0); + WalSndSetState(WALSNDSTATE_STARTUP); + + /* Get out of COPY mode (CommandComplete). */ + SetQueryCompletion(&qc, CMDTAG_COPY, 0); + EndCommand(&qc, DestRemote, false); +} + +/* + * LogicalDecodingContext 'prepare_write' callback. + * + * Prepare a write into a StringInfo. + * + * Don't do anything lasting in here, it's quite possible that nothing will be done + * with the data. + */ +static void +WalSndPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, bool last_write) +{ + /* can't have sync rep confused by sending the same LSN several times */ + if (!last_write) + lsn = InvalidXLogRecPtr; + + resetStringInfo(ctx->out); + + pq_sendbyte(ctx->out, 'w'); + pq_sendint64(ctx->out, lsn); /* dataStart */ + pq_sendint64(ctx->out, lsn); /* walEnd */ + + /* + * Fill out the sendtime later, just as it's done in XLogSendPhysical, but + * reserve space here. + */ + pq_sendint64(ctx->out, 0); /* sendtime */ +} + +/* + * LogicalDecodingContext 'write' callback. + * + * Actually write out data previously prepared by WalSndPrepareWrite out to + * the network. Take as long as needed, but process replies from the other + * side and check timeouts during that. + */ +static void +WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, + bool last_write) +{ + TimestampTz now; + + /* + * Fill the send timestamp last, so that it is taken as late as possible. + * This is somewhat ugly, but the protocol is set as it's already used for + * several releases by streaming physical replication. + */ + resetStringInfo(&tmpbuf); + now = GetCurrentTimestamp(); + pq_sendint64(&tmpbuf, now); + memcpy(&ctx->out->data[1 + sizeof(int64) + sizeof(int64)], + tmpbuf.data, sizeof(int64)); + + /* output previously gathered data in a CopyData packet */ + pq_putmessage_noblock('d', ctx->out->data, ctx->out->len); + + CHECK_FOR_INTERRUPTS(); + + /* Try to flush pending output to the client */ + if (pq_flush_if_writable() != 0) + WalSndShutdown(); + + /* Try taking fast path unless we get too close to walsender timeout. */ + if (now < TimestampTzPlusMilliseconds(last_reply_timestamp, + wal_sender_timeout / 2) && + !pq_is_send_pending()) + { + return; + } + + /* If we have pending write here, go to slow path */ + ProcessPendingWrites(); +} + +/* + * Wait until there is no pending write. Also process replies from the other + * side and check timeouts during that. + */ +static void +ProcessPendingWrites(void) +{ + for (;;) + { + long sleeptime; + + /* Check for input from the client */ + ProcessRepliesIfAny(); + + /* die if timeout was reached */ + WalSndCheckTimeOut(); + + /* Send keepalive if the time has come */ + WalSndKeepaliveIfNecessary(); + + if (!pq_is_send_pending()) + break; + + sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp()); + + /* Sleep until something happens or we time out */ + WalSndWait(WL_SOCKET_WRITEABLE | WL_SOCKET_READABLE, sleeptime, + WAIT_EVENT_WAL_SENDER_WRITE_DATA); + + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Process any requests or signals received recently */ + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + SyncRepInitConfig(); + } + + /* Try to flush pending output to the client */ + if (pq_flush_if_writable() != 0) + WalSndShutdown(); + } + + /* reactivate latch so WalSndLoop knows to continue */ + SetLatch(MyLatch); +} + +/* + * LogicalDecodingContext 'update_progress' callback. + * + * Write the current position to the lag tracker (see XLogSendPhysical). + * + * When skipping empty transactions, send a keepalive message if necessary. + */ +static void +WalSndUpdateProgress(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, + bool skipped_xact) +{ + static TimestampTz sendTime = 0; + TimestampTz now = GetCurrentTimestamp(); + bool pending_writes = false; + bool end_xact = ctx->end_xact; + + /* + * Track lag no more than once per WALSND_LOGICAL_LAG_TRACK_INTERVAL_MS to + * avoid flooding the lag tracker when we commit frequently. + * + * We don't have a mechanism to get the ack for any LSN other than end + * xact LSN from the downstream. So, we track lag only for end of + * transaction LSN. + */ +#define WALSND_LOGICAL_LAG_TRACK_INTERVAL_MS 1000 + if (end_xact && TimestampDifferenceExceeds(sendTime, now, + WALSND_LOGICAL_LAG_TRACK_INTERVAL_MS)) + { + LagTrackerWrite(lsn, now); + sendTime = now; + } + + /* + * When skipping empty transactions in synchronous replication, we send a + * keepalive message to avoid delaying such transactions. + * + * It is okay to check sync_standbys_defined flag without lock here as in + * the worst case we will just send an extra keepalive message when it is + * really not required. + */ + if (skipped_xact && + SyncRepRequested() && + ((volatile WalSndCtlData *) WalSndCtl)->sync_standbys_defined) + { + WalSndKeepalive(false, lsn); + + /* Try to flush pending output to the client */ + if (pq_flush_if_writable() != 0) + WalSndShutdown(); + + /* If we have pending write here, make sure it's actually flushed */ + if (pq_is_send_pending()) + pending_writes = true; + } + + /* + * Process pending writes if any or try to send a keepalive if required. + * We don't need to try sending keep alive messages at the transaction end + * as that will be done at a later point in time. This is required only + * for large transactions where we don't send any changes to the + * downstream and the receiver can timeout due to that. + */ + if (pending_writes || (!end_xact && + now >= TimestampTzPlusMilliseconds(last_reply_timestamp, + wal_sender_timeout / 2))) + ProcessPendingWrites(); +} + +/* + * Wait till WAL < loc is flushed to disk so it can be safely sent to client. + * + * Returns end LSN of flushed WAL. Normally this will be >= loc, but + * if we detect a shutdown request (either from postmaster or client) + * we will return early, so caller must always check. + */ +static XLogRecPtr +WalSndWaitForWal(XLogRecPtr loc) +{ + int wakeEvents; + static XLogRecPtr RecentFlushPtr = InvalidXLogRecPtr; + + /* + * Fast path to avoid acquiring the spinlock in case we already know we + * have enough WAL available. This is particularly interesting if we're + * far behind. + */ + if (RecentFlushPtr != InvalidXLogRecPtr && + loc <= RecentFlushPtr) + return RecentFlushPtr; + + /* Get a more recent flush pointer. */ + if (!RecoveryInProgress()) + RecentFlushPtr = GetFlushRecPtr(NULL); + else + RecentFlushPtr = GetXLogReplayRecPtr(NULL); + + for (;;) + { + long sleeptime; + + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Process any requests or signals received recently */ + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + SyncRepInitConfig(); + } + + /* Check for input from the client */ + ProcessRepliesIfAny(); + + /* + * If we're shutting down, trigger pending WAL to be written out, + * otherwise we'd possibly end up waiting for WAL that never gets + * written, because walwriter has shut down already. + */ + if (got_STOPPING) + XLogBackgroundFlush(); + + /* Update our idea of the currently flushed position. */ + if (!RecoveryInProgress()) + RecentFlushPtr = GetFlushRecPtr(NULL); + else + RecentFlushPtr = GetXLogReplayRecPtr(NULL); + + /* + * If postmaster asked us to stop, don't wait anymore. + * + * It's important to do this check after the recomputation of + * RecentFlushPtr, so we can send all remaining data before shutting + * down. + */ + if (got_STOPPING) + break; + + /* + * We only send regular messages to the client for full decoded + * transactions, but a synchronous replication and walsender shutdown + * possibly are waiting for a later location. So, before sleeping, we + * send a ping containing the flush location. If the receiver is + * otherwise idle, this keepalive will trigger a reply. Processing the + * reply will update these MyWalSnd locations. + */ + if (MyWalSnd->flush < sentPtr && + MyWalSnd->write < sentPtr && + !waiting_for_ping_response) + WalSndKeepalive(false, InvalidXLogRecPtr); + + /* check whether we're done */ + if (loc <= RecentFlushPtr) + break; + + /* Waiting for new WAL. Since we need to wait, we're now caught up. */ + WalSndCaughtUp = true; + + /* + * Try to flush any pending output to the client. + */ + if (pq_flush_if_writable() != 0) + WalSndShutdown(); + + /* + * If we have received CopyDone from the client, sent CopyDone + * ourselves, and the output buffer is empty, it's time to exit + * streaming, so fail the current WAL fetch request. + */ + if (streamingDoneReceiving && streamingDoneSending && + !pq_is_send_pending()) + break; + + /* die if timeout was reached */ + WalSndCheckTimeOut(); + + /* Send keepalive if the time has come */ + WalSndKeepaliveIfNecessary(); + + /* + * Sleep until something happens or we time out. Also wait for the + * socket becoming writable, if there's still pending output. + * Otherwise we might sit on sendable output data while waiting for + * new WAL to be generated. (But if we have nothing to send, we don't + * want to wake on socket-writable.) + */ + sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp()); + + wakeEvents = WL_SOCKET_READABLE; + + if (pq_is_send_pending()) + wakeEvents |= WL_SOCKET_WRITEABLE; + + WalSndWait(wakeEvents, sleeptime, WAIT_EVENT_WAL_SENDER_WAIT_WAL); + } + + /* reactivate latch so WalSndLoop knows to continue */ + SetLatch(MyLatch); + return RecentFlushPtr; +} + +/* + * Execute an incoming replication command. + * + * Returns true if the cmd_string was recognized as WalSender command, false + * if not. + */ +bool +exec_replication_command(const char *cmd_string) +{ + int parse_rc; + Node *cmd_node; + const char *cmdtag; + MemoryContext cmd_context; + MemoryContext old_context; + + /* + * If WAL sender has been told that shutdown is getting close, switch its + * status accordingly to handle the next replication commands correctly. + */ + if (got_STOPPING) + WalSndSetState(WALSNDSTATE_STOPPING); + + /* + * Throw error if in stopping mode. We need prevent commands that could + * generate WAL while the shutdown checkpoint is being written. To be + * safe, we just prohibit all new commands. + */ + if (MyWalSnd->state == WALSNDSTATE_STOPPING) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot execute new commands while WAL sender is in stopping mode"))); + + /* + * CREATE_REPLICATION_SLOT ... LOGICAL exports a snapshot until the next + * command arrives. Clean up the old stuff if there's anything. + */ + SnapBuildClearExportedSnapshot(); + + CHECK_FOR_INTERRUPTS(); + + /* + * Prepare to parse and execute the command. + */ + cmd_context = AllocSetContextCreate(CurrentMemoryContext, + "Replication command context", + ALLOCSET_DEFAULT_SIZES); + old_context = MemoryContextSwitchTo(cmd_context); + + replication_scanner_init(cmd_string); + + /* + * Is it a WalSender command? + */ + if (!replication_scanner_is_replication_command()) + { + /* Nope; clean up and get out. */ + replication_scanner_finish(); + + MemoryContextSwitchTo(old_context); + MemoryContextDelete(cmd_context); + + /* XXX this is a pretty random place to make this check */ + if (MyDatabaseId == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot execute SQL commands in WAL sender for physical replication"))); + + /* Tell the caller that this wasn't a WalSender command. */ + return false; + } + + /* + * Looks like a WalSender command, so parse it. + */ + parse_rc = replication_yyparse(); + if (parse_rc != 0) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg_internal("replication command parser returned %d", + parse_rc))); + replication_scanner_finish(); + + cmd_node = replication_parse_result; + + /* + * Report query to various monitoring facilities. For this purpose, we + * report replication commands just like SQL commands. + */ + debug_query_string = cmd_string; + + pgstat_report_activity(STATE_RUNNING, cmd_string); + + /* + * Log replication command if log_replication_commands is enabled. Even + * when it's disabled, log the command with DEBUG1 level for backward + * compatibility. + */ + ereport(log_replication_commands ? LOG : DEBUG1, + (errmsg("received replication command: %s", cmd_string))); + + /* + * Disallow replication commands in aborted transaction blocks. + */ + if (IsAbortedTransactionBlockState()) + ereport(ERROR, + (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), + errmsg("current transaction is aborted, " + "commands ignored until end of transaction block"))); + + CHECK_FOR_INTERRUPTS(); + + /* + * Allocate buffers that will be used for each outgoing and incoming + * message. We do this just once per command to reduce palloc overhead. + */ + initStringInfo(&output_message); + initStringInfo(&reply_message); + initStringInfo(&tmpbuf); + + switch (cmd_node->type) + { + case T_IdentifySystemCmd: + cmdtag = "IDENTIFY_SYSTEM"; + set_ps_display(cmdtag); + IdentifySystem(); + EndReplicationCommand(cmdtag); + break; + + case T_ReadReplicationSlotCmd: + cmdtag = "READ_REPLICATION_SLOT"; + set_ps_display(cmdtag); + ReadReplicationSlot((ReadReplicationSlotCmd *) cmd_node); + EndReplicationCommand(cmdtag); + break; + + case T_BaseBackupCmd: + cmdtag = "BASE_BACKUP"; + set_ps_display(cmdtag); + PreventInTransactionBlock(true, cmdtag); + SendBaseBackup((BaseBackupCmd *) cmd_node); + EndReplicationCommand(cmdtag); + break; + + case T_CreateReplicationSlotCmd: + cmdtag = "CREATE_REPLICATION_SLOT"; + set_ps_display(cmdtag); + CreateReplicationSlot((CreateReplicationSlotCmd *) cmd_node); + EndReplicationCommand(cmdtag); + break; + + case T_DropReplicationSlotCmd: + cmdtag = "DROP_REPLICATION_SLOT"; + set_ps_display(cmdtag); + DropReplicationSlot((DropReplicationSlotCmd *) cmd_node); + EndReplicationCommand(cmdtag); + break; + + case T_StartReplicationCmd: + { + StartReplicationCmd *cmd = (StartReplicationCmd *) cmd_node; + + cmdtag = "START_REPLICATION"; + set_ps_display(cmdtag); + PreventInTransactionBlock(true, cmdtag); + + if (cmd->kind == REPLICATION_KIND_PHYSICAL) + StartReplication(cmd); + else + StartLogicalReplication(cmd); + + /* dupe, but necessary per libpqrcv_endstreaming */ + EndReplicationCommand(cmdtag); + + Assert(xlogreader != NULL); + break; + } + + case T_TimeLineHistoryCmd: + cmdtag = "TIMELINE_HISTORY"; + set_ps_display(cmdtag); + PreventInTransactionBlock(true, cmdtag); + SendTimeLineHistory((TimeLineHistoryCmd *) cmd_node); + EndReplicationCommand(cmdtag); + break; + + case T_VariableShowStmt: + { + DestReceiver *dest = CreateDestReceiver(DestRemoteSimple); + VariableShowStmt *n = (VariableShowStmt *) cmd_node; + + cmdtag = "SHOW"; + set_ps_display(cmdtag); + + /* syscache access needs a transaction environment */ + StartTransactionCommand(); + GetPGVariable(n->name, dest); + CommitTransactionCommand(); + EndReplicationCommand(cmdtag); + } + break; + + default: + elog(ERROR, "unrecognized replication command node tag: %u", + cmd_node->type); + } + + /* done */ + MemoryContextSwitchTo(old_context); + MemoryContextDelete(cmd_context); + + /* + * We need not update ps display or pg_stat_activity, because PostgresMain + * will reset those to "idle". But we must reset debug_query_string to + * ensure it doesn't become a dangling pointer. + */ + debug_query_string = NULL; + + return true; +} + +/* + * Process any incoming messages while streaming. Also checks if the remote + * end has closed the connection. + */ +static void +ProcessRepliesIfAny(void) +{ + unsigned char firstchar; + int maxmsglen; + int r; + bool received = false; + + last_processing = GetCurrentTimestamp(); + + /* + * If we already received a CopyDone from the frontend, any subsequent + * message is the beginning of a new command, and should be processed in + * the main processing loop. + */ + while (!streamingDoneReceiving) + { + pq_startmsgread(); + r = pq_getbyte_if_available(&firstchar); + if (r < 0) + { + /* unexpected error or EOF */ + ereport(COMMERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected EOF on standby connection"))); + proc_exit(0); + } + if (r == 0) + { + /* no data available without blocking */ + pq_endmsgread(); + break; + } + + /* Validate message type and set packet size limit */ + switch (firstchar) + { + case 'd': + maxmsglen = PQ_LARGE_MESSAGE_LIMIT; + break; + case 'c': + case 'X': + maxmsglen = PQ_SMALL_MESSAGE_LIMIT; + break; + default: + ereport(FATAL, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid standby message type \"%c\"", + firstchar))); + maxmsglen = 0; /* keep compiler quiet */ + break; + } + + /* Read the message contents */ + resetStringInfo(&reply_message); + if (pq_getmessage(&reply_message, maxmsglen)) + { + ereport(COMMERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected EOF on standby connection"))); + proc_exit(0); + } + + /* ... and process it */ + switch (firstchar) + { + /* + * 'd' means a standby reply wrapped in a CopyData packet. + */ + case 'd': + ProcessStandbyMessage(); + received = true; + break; + + /* + * CopyDone means the standby requested to finish streaming. + * Reply with CopyDone, if we had not sent that already. + */ + case 'c': + if (!streamingDoneSending) + { + pq_putmessage_noblock('c', NULL, 0); + streamingDoneSending = true; + } + + streamingDoneReceiving = true; + received = true; + break; + + /* + * 'X' means that the standby is closing down the socket. + */ + case 'X': + proc_exit(0); + + default: + Assert(false); /* NOT REACHED */ + } + } + + /* + * Save the last reply timestamp if we've received at least one reply. + */ + if (received) + { + last_reply_timestamp = last_processing; + waiting_for_ping_response = false; + } +} + +/* + * Process a status update message received from standby. + */ +static void +ProcessStandbyMessage(void) +{ + char msgtype; + + /* + * Check message type from the first byte. + */ + msgtype = pq_getmsgbyte(&reply_message); + + switch (msgtype) + { + case 'r': + ProcessStandbyReplyMessage(); + break; + + case 'h': + ProcessStandbyHSFeedbackMessage(); + break; + + default: + ereport(COMMERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected message type \"%c\"", msgtype))); + proc_exit(0); + } +} + +/* + * Remember that a walreceiver just confirmed receipt of lsn `lsn`. + */ +static void +PhysicalConfirmReceivedLocation(XLogRecPtr lsn) +{ + bool changed = false; + ReplicationSlot *slot = MyReplicationSlot; + + Assert(lsn != InvalidXLogRecPtr); + SpinLockAcquire(&slot->mutex); + if (slot->data.restart_lsn != lsn) + { + changed = true; + slot->data.restart_lsn = lsn; + } + SpinLockRelease(&slot->mutex); + + if (changed) + { + ReplicationSlotMarkDirty(); + ReplicationSlotsComputeRequiredLSN(); + } + + /* + * One could argue that the slot should be saved to disk now, but that'd + * be energy wasted - the worst thing lost information could cause here is + * to give wrong information in a statistics view - we'll just potentially + * be more conservative in removing files. + */ +} + +/* + * Regular reply from standby advising of WAL locations on standby server. + */ +static void +ProcessStandbyReplyMessage(void) +{ + XLogRecPtr writePtr, + flushPtr, + applyPtr; + bool replyRequested; + TimeOffset writeLag, + flushLag, + applyLag; + bool clearLagTimes; + TimestampTz now; + TimestampTz replyTime; + + static bool fullyAppliedLastTime = false; + + /* the caller already consumed the msgtype byte */ + writePtr = pq_getmsgint64(&reply_message); + flushPtr = pq_getmsgint64(&reply_message); + applyPtr = pq_getmsgint64(&reply_message); + replyTime = pq_getmsgint64(&reply_message); + replyRequested = pq_getmsgbyte(&reply_message); + + if (message_level_is_interesting(DEBUG2)) + { + char *replyTimeStr; + + /* Copy because timestamptz_to_str returns a static buffer */ + replyTimeStr = pstrdup(timestamptz_to_str(replyTime)); + + elog(DEBUG2, "write %X/%X flush %X/%X apply %X/%X%s reply_time %s", + LSN_FORMAT_ARGS(writePtr), + LSN_FORMAT_ARGS(flushPtr), + LSN_FORMAT_ARGS(applyPtr), + replyRequested ? " (reply requested)" : "", + replyTimeStr); + + pfree(replyTimeStr); + } + + /* See if we can compute the round-trip lag for these positions. */ + now = GetCurrentTimestamp(); + writeLag = LagTrackerRead(SYNC_REP_WAIT_WRITE, writePtr, now); + flushLag = LagTrackerRead(SYNC_REP_WAIT_FLUSH, flushPtr, now); + applyLag = LagTrackerRead(SYNC_REP_WAIT_APPLY, applyPtr, now); + + /* + * If the standby reports that it has fully replayed the WAL in two + * consecutive reply messages, then the second such message must result + * from wal_receiver_status_interval expiring on the standby. This is a + * convenient time to forget the lag times measured when it last + * wrote/flushed/applied a WAL record, to avoid displaying stale lag data + * until more WAL traffic arrives. + */ + clearLagTimes = false; + if (applyPtr == sentPtr) + { + if (fullyAppliedLastTime) + clearLagTimes = true; + fullyAppliedLastTime = true; + } + else + fullyAppliedLastTime = false; + + /* Send a reply if the standby requested one. */ + if (replyRequested) + WalSndKeepalive(false, InvalidXLogRecPtr); + + /* + * Update shared state for this WalSender process based on reply data from + * standby. + */ + { + WalSnd *walsnd = MyWalSnd; + + SpinLockAcquire(&walsnd->mutex); + walsnd->write = writePtr; + walsnd->flush = flushPtr; + walsnd->apply = applyPtr; + if (writeLag != -1 || clearLagTimes) + walsnd->writeLag = writeLag; + if (flushLag != -1 || clearLagTimes) + walsnd->flushLag = flushLag; + if (applyLag != -1 || clearLagTimes) + walsnd->applyLag = applyLag; + walsnd->replyTime = replyTime; + SpinLockRelease(&walsnd->mutex); + } + + if (!am_cascading_walsender) + SyncRepReleaseWaiters(); + + /* + * Advance our local xmin horizon when the client confirmed a flush. + */ + if (MyReplicationSlot && flushPtr != InvalidXLogRecPtr) + { + if (SlotIsLogical(MyReplicationSlot)) + LogicalConfirmReceivedLocation(flushPtr); + else + PhysicalConfirmReceivedLocation(flushPtr); + } +} + +/* compute new replication slot xmin horizon if needed */ +static void +PhysicalReplicationSlotNewXmin(TransactionId feedbackXmin, TransactionId feedbackCatalogXmin) +{ + bool changed = false; + ReplicationSlot *slot = MyReplicationSlot; + + SpinLockAcquire(&slot->mutex); + MyProc->xmin = InvalidTransactionId; + + /* + * For physical replication we don't need the interlock provided by xmin + * and effective_xmin since the consequences of a missed increase are + * limited to query cancellations, so set both at once. + */ + if (!TransactionIdIsNormal(slot->data.xmin) || + !TransactionIdIsNormal(feedbackXmin) || + TransactionIdPrecedes(slot->data.xmin, feedbackXmin)) + { + changed = true; + slot->data.xmin = feedbackXmin; + slot->effective_xmin = feedbackXmin; + } + if (!TransactionIdIsNormal(slot->data.catalog_xmin) || + !TransactionIdIsNormal(feedbackCatalogXmin) || + TransactionIdPrecedes(slot->data.catalog_xmin, feedbackCatalogXmin)) + { + changed = true; + slot->data.catalog_xmin = feedbackCatalogXmin; + slot->effective_catalog_xmin = feedbackCatalogXmin; + } + SpinLockRelease(&slot->mutex); + + if (changed) + { + ReplicationSlotMarkDirty(); + ReplicationSlotsComputeRequiredXmin(false); + } +} + +/* + * Check that the provided xmin/epoch are sane, that is, not in the future + * and not so far back as to be already wrapped around. + * + * Epoch of nextXid should be same as standby, or if the counter has + * wrapped, then one greater than standby. + * + * This check doesn't care about whether clog exists for these xids + * at all. + */ +static bool +TransactionIdInRecentPast(TransactionId xid, uint32 epoch) +{ + FullTransactionId nextFullXid; + TransactionId nextXid; + uint32 nextEpoch; + + nextFullXid = ReadNextFullTransactionId(); + nextXid = XidFromFullTransactionId(nextFullXid); + nextEpoch = EpochFromFullTransactionId(nextFullXid); + + if (xid <= nextXid) + { + if (epoch != nextEpoch) + return false; + } + else + { + if (epoch + 1 != nextEpoch) + return false; + } + + if (!TransactionIdPrecedesOrEquals(xid, nextXid)) + return false; /* epoch OK, but it's wrapped around */ + + return true; +} + +/* + * Hot Standby feedback + */ +static void +ProcessStandbyHSFeedbackMessage(void) +{ + TransactionId feedbackXmin; + uint32 feedbackEpoch; + TransactionId feedbackCatalogXmin; + uint32 feedbackCatalogEpoch; + TimestampTz replyTime; + + /* + * Decipher the reply message. The caller already consumed the msgtype + * byte. See XLogWalRcvSendHSFeedback() in walreceiver.c for the creation + * of this message. + */ + replyTime = pq_getmsgint64(&reply_message); + feedbackXmin = pq_getmsgint(&reply_message, 4); + feedbackEpoch = pq_getmsgint(&reply_message, 4); + feedbackCatalogXmin = pq_getmsgint(&reply_message, 4); + feedbackCatalogEpoch = pq_getmsgint(&reply_message, 4); + + if (message_level_is_interesting(DEBUG2)) + { + char *replyTimeStr; + + /* Copy because timestamptz_to_str returns a static buffer */ + replyTimeStr = pstrdup(timestamptz_to_str(replyTime)); + + elog(DEBUG2, "hot standby feedback xmin %u epoch %u, catalog_xmin %u epoch %u reply_time %s", + feedbackXmin, + feedbackEpoch, + feedbackCatalogXmin, + feedbackCatalogEpoch, + replyTimeStr); + + pfree(replyTimeStr); + } + + /* + * Update shared state for this WalSender process based on reply data from + * standby. + */ + { + WalSnd *walsnd = MyWalSnd; + + SpinLockAcquire(&walsnd->mutex); + walsnd->replyTime = replyTime; + SpinLockRelease(&walsnd->mutex); + } + + /* + * Unset WalSender's xmins if the feedback message values are invalid. + * This happens when the downstream turned hot_standby_feedback off. + */ + if (!TransactionIdIsNormal(feedbackXmin) + && !TransactionIdIsNormal(feedbackCatalogXmin)) + { + MyProc->xmin = InvalidTransactionId; + if (MyReplicationSlot != NULL) + PhysicalReplicationSlotNewXmin(feedbackXmin, feedbackCatalogXmin); + return; + } + + /* + * Check that the provided xmin/epoch are sane, that is, not in the future + * and not so far back as to be already wrapped around. Ignore if not. + */ + if (TransactionIdIsNormal(feedbackXmin) && + !TransactionIdInRecentPast(feedbackXmin, feedbackEpoch)) + return; + + if (TransactionIdIsNormal(feedbackCatalogXmin) && + !TransactionIdInRecentPast(feedbackCatalogXmin, feedbackCatalogEpoch)) + return; + + /* + * Set the WalSender's xmin equal to the standby's requested xmin, so that + * the xmin will be taken into account by GetSnapshotData() / + * ComputeXidHorizons(). This will hold back the removal of dead rows and + * thereby prevent the generation of cleanup conflicts on the standby + * server. + * + * There is a small window for a race condition here: although we just + * checked that feedbackXmin precedes nextXid, the nextXid could have + * gotten advanced between our fetching it and applying the xmin below, + * perhaps far enough to make feedbackXmin wrap around. In that case the + * xmin we set here would be "in the future" and have no effect. No point + * in worrying about this since it's too late to save the desired data + * anyway. Assuming that the standby sends us an increasing sequence of + * xmins, this could only happen during the first reply cycle, else our + * own xmin would prevent nextXid from advancing so far. + * + * We don't bother taking the ProcArrayLock here. Setting the xmin field + * is assumed atomic, and there's no real need to prevent concurrent + * horizon determinations. (If we're moving our xmin forward, this is + * obviously safe, and if we're moving it backwards, well, the data is at + * risk already since a VACUUM could already have determined the horizon.) + * + * If we're using a replication slot we reserve the xmin via that, + * otherwise via the walsender's PGPROC entry. We can only track the + * catalog xmin separately when using a slot, so we store the least of the + * two provided when not using a slot. + * + * XXX: It might make sense to generalize the ephemeral slot concept and + * always use the slot mechanism to handle the feedback xmin. + */ + if (MyReplicationSlot != NULL) /* XXX: persistency configurable? */ + PhysicalReplicationSlotNewXmin(feedbackXmin, feedbackCatalogXmin); + else + { + if (TransactionIdIsNormal(feedbackCatalogXmin) + && TransactionIdPrecedes(feedbackCatalogXmin, feedbackXmin)) + MyProc->xmin = feedbackCatalogXmin; + else + MyProc->xmin = feedbackXmin; + } +} + +/* + * Compute how long send/receive loops should sleep. + * + * If wal_sender_timeout is enabled we want to wake up in time to send + * keepalives and to abort the connection if wal_sender_timeout has been + * reached. + */ +static long +WalSndComputeSleeptime(TimestampTz now) +{ + long sleeptime = 10000; /* 10 s */ + + if (wal_sender_timeout > 0 && last_reply_timestamp > 0) + { + TimestampTz wakeup_time; + + /* + * At the latest stop sleeping once wal_sender_timeout has been + * reached. + */ + wakeup_time = TimestampTzPlusMilliseconds(last_reply_timestamp, + wal_sender_timeout); + + /* + * If no ping has been sent yet, wakeup when it's time to do so. + * WalSndKeepaliveIfNecessary() wants to send a keepalive once half of + * the timeout passed without a response. + */ + if (!waiting_for_ping_response) + wakeup_time = TimestampTzPlusMilliseconds(last_reply_timestamp, + wal_sender_timeout / 2); + + /* Compute relative time until wakeup. */ + sleeptime = TimestampDifferenceMilliseconds(now, wakeup_time); + } + + return sleeptime; +} + +/* + * Check whether there have been responses by the client within + * wal_sender_timeout and shutdown if not. Using last_processing as the + * reference point avoids counting server-side stalls against the client. + * However, a long server-side stall can make WalSndKeepaliveIfNecessary() + * postdate last_processing by more than wal_sender_timeout. If that happens, + * the client must reply almost immediately to avoid a timeout. This rarely + * affects the default configuration, under which clients spontaneously send a + * message every standby_message_timeout = wal_sender_timeout/6 = 10s. We + * could eliminate that problem by recognizing timeout expiration at + * wal_sender_timeout/2 after the keepalive. + */ +static void +WalSndCheckTimeOut(void) +{ + TimestampTz timeout; + + /* don't bail out if we're doing something that doesn't require timeouts */ + if (last_reply_timestamp <= 0) + return; + + timeout = TimestampTzPlusMilliseconds(last_reply_timestamp, + wal_sender_timeout); + + if (wal_sender_timeout > 0 && last_processing >= timeout) + { + /* + * Since typically expiration of replication timeout means + * communication problem, we don't send the error message to the + * standby. + */ + ereport(COMMERROR, + (errmsg("terminating walsender process due to replication timeout"))); + + WalSndShutdown(); + } +} + +/* Main loop of walsender process that streams the WAL over Copy messages. */ +static void +WalSndLoop(WalSndSendDataCallback send_data) +{ + /* + * Initialize the last reply timestamp. That enables timeout processing + * from hereon. + */ + last_reply_timestamp = GetCurrentTimestamp(); + waiting_for_ping_response = false; + + /* + * Loop until we reach the end of this timeline or the client requests to + * stop streaming. + */ + for (;;) + { + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Process any requests or signals received recently */ + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + SyncRepInitConfig(); + } + + /* Check for input from the client */ + ProcessRepliesIfAny(); + + /* + * If we have received CopyDone from the client, sent CopyDone + * ourselves, and the output buffer is empty, it's time to exit + * streaming. + */ + if (streamingDoneReceiving && streamingDoneSending && + !pq_is_send_pending()) + break; + + /* + * If we don't have any pending data in the output buffer, try to send + * some more. If there is some, we don't bother to call send_data + * again until we've flushed it ... but we'd better assume we are not + * caught up. + */ + if (!pq_is_send_pending()) + send_data(); + else + WalSndCaughtUp = false; + + /* Try to flush pending output to the client */ + if (pq_flush_if_writable() != 0) + WalSndShutdown(); + + /* If nothing remains to be sent right now ... */ + if (WalSndCaughtUp && !pq_is_send_pending()) + { + /* + * If we're in catchup state, move to streaming. This is an + * important state change for users to know about, since before + * this point data loss might occur if the primary dies and we + * need to failover to the standby. The state change is also + * important for synchronous replication, since commits that + * started to wait at that point might wait for some time. + */ + if (MyWalSnd->state == WALSNDSTATE_CATCHUP) + { + ereport(DEBUG1, + (errmsg_internal("\"%s\" has now caught up with upstream server", + application_name))); + WalSndSetState(WALSNDSTATE_STREAMING); + } + + /* + * When SIGUSR2 arrives, we send any outstanding logs up to the + * shutdown checkpoint record (i.e., the latest record), wait for + * them to be replicated to the standby, and exit. This may be a + * normal termination at shutdown, or a promotion, the walsender + * is not sure which. + */ + if (got_SIGUSR2) + WalSndDone(send_data); + } + + /* Check for replication timeout. */ + WalSndCheckTimeOut(); + + /* Send keepalive if the time has come */ + WalSndKeepaliveIfNecessary(); + + /* + * Block if we have unsent data. XXX For logical replication, let + * WalSndWaitForWal() handle any other blocking; idle receivers need + * its additional actions. For physical replication, also block if + * caught up; its send_data does not block. + */ + if ((WalSndCaughtUp && send_data != XLogSendLogical && + !streamingDoneSending) || + pq_is_send_pending()) + { + long sleeptime; + int wakeEvents; + + if (!streamingDoneReceiving) + wakeEvents = WL_SOCKET_READABLE; + else + wakeEvents = 0; + + /* + * Use fresh timestamp, not last_processing, to reduce the chance + * of reaching wal_sender_timeout before sending a keepalive. + */ + sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp()); + + if (pq_is_send_pending()) + wakeEvents |= WL_SOCKET_WRITEABLE; + + /* Sleep until something happens or we time out */ + WalSndWait(wakeEvents, sleeptime, WAIT_EVENT_WAL_SENDER_MAIN); + } + } +} + +/* Initialize a per-walsender data structure for this walsender process */ +static void +InitWalSenderSlot(void) +{ + int i; + + /* + * WalSndCtl should be set up already (we inherit this by fork() or + * EXEC_BACKEND mechanism from the postmaster). + */ + Assert(WalSndCtl != NULL); + Assert(MyWalSnd == NULL); + + /* + * Find a free walsender slot and reserve it. This must not fail due to + * the prior check for free WAL senders in InitProcess(). + */ + for (i = 0; i < max_wal_senders; i++) + { + WalSnd *walsnd = &WalSndCtl->walsnds[i]; + + SpinLockAcquire(&walsnd->mutex); + + if (walsnd->pid != 0) + { + SpinLockRelease(&walsnd->mutex); + continue; + } + else + { + /* + * Found a free slot. Reserve it for us. + */ + walsnd->pid = MyProcPid; + walsnd->state = WALSNDSTATE_STARTUP; + walsnd->sentPtr = InvalidXLogRecPtr; + walsnd->needreload = false; + walsnd->write = InvalidXLogRecPtr; + walsnd->flush = InvalidXLogRecPtr; + walsnd->apply = InvalidXLogRecPtr; + walsnd->writeLag = -1; + walsnd->flushLag = -1; + walsnd->applyLag = -1; + walsnd->sync_standby_priority = 0; + walsnd->latch = &MyProc->procLatch; + walsnd->replyTime = 0; + + /* + * The kind assignment is done here and not in StartReplication() + * and StartLogicalReplication(). Indeed, the logical walsender + * needs to read WAL records (like snapshot of running + * transactions) during the slot creation. So it needs to be woken + * up based on its kind. + * + * The kind assignment could also be done in StartReplication(), + * StartLogicalReplication() and CREATE_REPLICATION_SLOT but it + * seems better to set it on one place. + */ + if (MyDatabaseId == InvalidOid) + walsnd->kind = REPLICATION_KIND_PHYSICAL; + else + walsnd->kind = REPLICATION_KIND_LOGICAL; + + SpinLockRelease(&walsnd->mutex); + /* don't need the lock anymore */ + MyWalSnd = (WalSnd *) walsnd; + + break; + } + } + + Assert(MyWalSnd != NULL); + + /* Arrange to clean up at walsender exit */ + on_shmem_exit(WalSndKill, 0); +} + +/* Destroy the per-walsender data structure for this walsender process */ +static void +WalSndKill(int code, Datum arg) +{ + WalSnd *walsnd = MyWalSnd; + + Assert(walsnd != NULL); + + MyWalSnd = NULL; + + SpinLockAcquire(&walsnd->mutex); + /* clear latch while holding the spinlock, so it can safely be read */ + walsnd->latch = NULL; + /* Mark WalSnd struct as no longer being in use. */ + walsnd->pid = 0; + SpinLockRelease(&walsnd->mutex); +} + +/* XLogReaderRoutine->segment_open callback */ +static void +WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, + TimeLineID *tli_p) +{ + char path[MAXPGPATH]; + + /*------- + * When reading from a historic timeline, and there is a timeline switch + * within this segment, read from the WAL segment belonging to the new + * timeline. + * + * For example, imagine that this server is currently on timeline 5, and + * we're streaming timeline 4. The switch from timeline 4 to 5 happened at + * 0/13002088. In pg_wal, we have these files: + * + * ... + * 000000040000000000000012 + * 000000040000000000000013 + * 000000050000000000000013 + * 000000050000000000000014 + * ... + * + * In this situation, when requested to send the WAL from segment 0x13, on + * timeline 4, we read the WAL from file 000000050000000000000013. Archive + * recovery prefers files from newer timelines, so if the segment was + * restored from the archive on this server, the file belonging to the old + * timeline, 000000040000000000000013, might not exist. Their contents are + * equal up to the switchpoint, because at a timeline switch, the used + * portion of the old segment is copied to the new file. ------- + */ + *tli_p = sendTimeLine; + if (sendTimeLineIsHistoric) + { + XLogSegNo endSegNo; + + XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize); + if (nextSegNo == endSegNo) + *tli_p = sendTimeLineNextTLI; + } + + XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize); + state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); + if (state->seg.ws_file >= 0) + return; + + /* + * If the file is not found, assume it's because the standby asked for a + * too old WAL segment that has already been removed or recycled. + */ + if (errno == ENOENT) + { + char xlogfname[MAXFNAMELEN]; + int save_errno = errno; + + XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("requested WAL segment %s has already been removed", + xlogfname))); + } + else + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); +} + +/* + * Send out the WAL in its normal physical/stored form. + * + * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk, + * but not yet sent to the client, and buffer it in the libpq output + * buffer. + * + * If there is no unsent WAL remaining, WalSndCaughtUp is set to true, + * otherwise WalSndCaughtUp is set to false. + */ +static void +XLogSendPhysical(void) +{ + XLogRecPtr SendRqstPtr; + XLogRecPtr startptr; + XLogRecPtr endptr; + Size nbytes; + XLogSegNo segno; + WALReadError errinfo; + + /* If requested switch the WAL sender to the stopping state. */ + if (got_STOPPING) + WalSndSetState(WALSNDSTATE_STOPPING); + + if (streamingDoneSending) + { + WalSndCaughtUp = true; + return; + } + + /* Figure out how far we can safely send the WAL. */ + if (sendTimeLineIsHistoric) + { + /* + * Streaming an old timeline that's in this server's history, but is + * not the one we're currently inserting or replaying. It can be + * streamed up to the point where we switched off that timeline. + */ + SendRqstPtr = sendTimeLineValidUpto; + } + else if (am_cascading_walsender) + { + TimeLineID SendRqstTLI; + + /* + * Streaming the latest timeline on a standby. + * + * Attempt to send all WAL that has already been replayed, so that we + * know it's valid. If we're receiving WAL through streaming + * replication, it's also OK to send any WAL that has been received + * but not replayed. + * + * The timeline we're recovering from can change, or we can be + * promoted. In either case, the current timeline becomes historic. We + * need to detect that so that we don't try to stream past the point + * where we switched to another timeline. We check for promotion or + * timeline switch after calculating FlushPtr, to avoid a race + * condition: if the timeline becomes historic just after we checked + * that it was still current, it's still be OK to stream it up to the + * FlushPtr that was calculated before it became historic. + */ + bool becameHistoric = false; + + SendRqstPtr = GetStandbyFlushRecPtr(&SendRqstTLI); + + if (!RecoveryInProgress()) + { + /* We have been promoted. */ + SendRqstTLI = GetWALInsertionTimeLine(); + am_cascading_walsender = false; + becameHistoric = true; + } + else + { + /* + * Still a cascading standby. But is the timeline we're sending + * still the one recovery is recovering from? + */ + if (sendTimeLine != SendRqstTLI) + becameHistoric = true; + } + + if (becameHistoric) + { + /* + * The timeline we were sending has become historic. Read the + * timeline history file of the new timeline to see where exactly + * we forked off from the timeline we were sending. + */ + List *history; + + history = readTimeLineHistory(SendRqstTLI); + sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI); + + Assert(sendTimeLine < sendTimeLineNextTLI); + list_free_deep(history); + + sendTimeLineIsHistoric = true; + + SendRqstPtr = sendTimeLineValidUpto; + } + } + else + { + /* + * Streaming the current timeline on a primary. + * + * Attempt to send all data that's already been written out and + * fsync'd to disk. We cannot go further than what's been written out + * given the current implementation of WALRead(). And in any case + * it's unsafe to send WAL that is not securely down to disk on the + * primary: if the primary subsequently crashes and restarts, standbys + * must not have applied any WAL that got lost on the primary. + */ + SendRqstPtr = GetFlushRecPtr(NULL); + } + + /* + * Record the current system time as an approximation of the time at which + * this WAL location was written for the purposes of lag tracking. + * + * In theory we could make XLogFlush() record a time in shmem whenever WAL + * is flushed and we could get that time as well as the LSN when we call + * GetFlushRecPtr() above (and likewise for the cascading standby + * equivalent), but rather than putting any new code into the hot WAL path + * it seems good enough to capture the time here. We should reach this + * after XLogFlush() runs WalSndWakeupProcessRequests(), and although that + * may take some time, we read the WAL flush pointer and take the time + * very close to together here so that we'll get a later position if it is + * still moving. + * + * Because LagTrackerWrite ignores samples when the LSN hasn't advanced, + * this gives us a cheap approximation for the WAL flush time for this + * LSN. + * + * Note that the LSN is not necessarily the LSN for the data contained in + * the present message; it's the end of the WAL, which might be further + * ahead. All the lag tracking machinery cares about is finding out when + * that arbitrary LSN is eventually reported as written, flushed and + * applied, so that it can measure the elapsed time. + */ + LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp()); + + /* + * If this is a historic timeline and we've reached the point where we + * forked to the next timeline, stop streaming. + * + * Note: We might already have sent WAL > sendTimeLineValidUpto. The + * startup process will normally replay all WAL that has been received + * from the primary, before promoting, but if the WAL streaming is + * terminated at a WAL page boundary, the valid portion of the timeline + * might end in the middle of a WAL record. We might've already sent the + * first half of that partial WAL record to the cascading standby, so that + * sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby can't + * replay the partial WAL record either, so it can still follow our + * timeline switch. + */ + if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr) + { + /* close the current file. */ + if (xlogreader->seg.ws_file >= 0) + wal_segment_close(xlogreader); + + /* Send CopyDone */ + pq_putmessage_noblock('c', NULL, 0); + streamingDoneSending = true; + + WalSndCaughtUp = true; + + elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)", + LSN_FORMAT_ARGS(sendTimeLineValidUpto), + LSN_FORMAT_ARGS(sentPtr)); + return; + } + + /* Do we have any work to do? */ + Assert(sentPtr <= SendRqstPtr); + if (SendRqstPtr <= sentPtr) + { + WalSndCaughtUp = true; + return; + } + + /* + * Figure out how much to send in one message. If there's no more than + * MAX_SEND_SIZE bytes to send, send everything. Otherwise send + * MAX_SEND_SIZE bytes, but round back to logfile or page boundary. + * + * The rounding is not only for performance reasons. Walreceiver relies on + * the fact that we never split a WAL record across two messages. Since a + * long WAL record is split at page boundary into continuation records, + * page boundary is always a safe cut-off point. We also assume that + * SendRqstPtr never points to the middle of a WAL record. + */ + startptr = sentPtr; + endptr = startptr; + endptr += MAX_SEND_SIZE; + + /* if we went beyond SendRqstPtr, back off */ + if (SendRqstPtr <= endptr) + { + endptr = SendRqstPtr; + if (sendTimeLineIsHistoric) + WalSndCaughtUp = false; + else + WalSndCaughtUp = true; + } + else + { + /* round down to page boundary. */ + endptr -= (endptr % XLOG_BLCKSZ); + WalSndCaughtUp = false; + } + + nbytes = endptr - startptr; + Assert(nbytes <= MAX_SEND_SIZE); + + /* + * OK to read and send the slice. + */ + resetStringInfo(&output_message); + pq_sendbyte(&output_message, 'w'); + + pq_sendint64(&output_message, startptr); /* dataStart */ + pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ + pq_sendint64(&output_message, 0); /* sendtime, filled in last */ + + /* + * Read the log directly into the output buffer to avoid extra memcpy + * calls. + */ + enlargeStringInfo(&output_message, nbytes); + +retry: + if (!WALRead(xlogreader, + &output_message.data[output_message.len], + startptr, + nbytes, + xlogreader->seg.ws_tli, /* Pass the current TLI because + * only WalSndSegmentOpen controls + * whether new TLI is needed. */ + &errinfo)) + WALReadRaiseError(&errinfo); + + /* See logical_read_xlog_page(). */ + XLByteToSeg(startptr, segno, xlogreader->segcxt.ws_segsize); + CheckXLogRemoved(segno, xlogreader->seg.ws_tli); + + /* + * During recovery, the currently-open WAL file might be replaced with the + * file of the same name retrieved from archive. So we always need to + * check what we read was valid after reading into the buffer. If it's + * invalid, we try to open and read the file again. + */ + if (am_cascading_walsender) + { + WalSnd *walsnd = MyWalSnd; + bool reload; + + SpinLockAcquire(&walsnd->mutex); + reload = walsnd->needreload; + walsnd->needreload = false; + SpinLockRelease(&walsnd->mutex); + + if (reload && xlogreader->seg.ws_file >= 0) + { + wal_segment_close(xlogreader); + + goto retry; + } + } + + output_message.len += nbytes; + output_message.data[output_message.len] = '\0'; + + /* + * Fill the send timestamp last, so that it is taken as late as possible. + */ + resetStringInfo(&tmpbuf); + pq_sendint64(&tmpbuf, GetCurrentTimestamp()); + memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], + tmpbuf.data, sizeof(int64)); + + pq_putmessage_noblock('d', output_message.data, output_message.len); + + sentPtr = endptr; + + /* Update shared memory status */ + { + WalSnd *walsnd = MyWalSnd; + + SpinLockAcquire(&walsnd->mutex); + walsnd->sentPtr = sentPtr; + SpinLockRelease(&walsnd->mutex); + } + + /* Report progress of XLOG streaming in PS display */ + if (update_process_title) + { + char activitymsg[50]; + + snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X", + LSN_FORMAT_ARGS(sentPtr)); + set_ps_display(activitymsg); + } +} + +/* + * Stream out logically decoded data. + */ +static void +XLogSendLogical(void) +{ + XLogRecord *record; + char *errm; + + /* + * We'll use the current flush point to determine whether we've caught up. + * This variable is static in order to cache it across calls. Caching is + * helpful because GetFlushRecPtr() needs to acquire a heavily-contended + * spinlock. + */ + static XLogRecPtr flushPtr = InvalidXLogRecPtr; + + /* + * Don't know whether we've caught up yet. We'll set WalSndCaughtUp to + * true in WalSndWaitForWal, if we're actually waiting. We also set to + * true if XLogReadRecord() had to stop reading but WalSndWaitForWal + * didn't wait - i.e. when we're shutting down. + */ + WalSndCaughtUp = false; + + record = XLogReadRecord(logical_decoding_ctx->reader, &errm); + + /* xlog record was invalid */ + if (errm != NULL) + elog(ERROR, "could not find record while sending logically-decoded data: %s", + errm); + + if (record != NULL) + { + /* + * Note the lack of any call to LagTrackerWrite() which is handled by + * WalSndUpdateProgress which is called by output plugin through + * logical decoding write api. + */ + LogicalDecodingProcessRecord(logical_decoding_ctx, logical_decoding_ctx->reader); + + sentPtr = logical_decoding_ctx->reader->EndRecPtr; + } + + /* + * If first time through in this session, initialize flushPtr. Otherwise, + * we only need to update flushPtr if EndRecPtr is past it. + */ + if (flushPtr == InvalidXLogRecPtr || + logical_decoding_ctx->reader->EndRecPtr >= flushPtr) + { + if (am_cascading_walsender) + flushPtr = GetStandbyFlushRecPtr(NULL); + else + flushPtr = GetFlushRecPtr(NULL); + } + + /* If EndRecPtr is still past our flushPtr, it means we caught up. */ + if (logical_decoding_ctx->reader->EndRecPtr >= flushPtr) + WalSndCaughtUp = true; + + /* + * If we're caught up and have been requested to stop, have WalSndLoop() + * terminate the connection in an orderly manner, after writing out all + * the pending data. + */ + if (WalSndCaughtUp && got_STOPPING) + got_SIGUSR2 = true; + + /* Update shared memory status */ + { + WalSnd *walsnd = MyWalSnd; + + SpinLockAcquire(&walsnd->mutex); + walsnd->sentPtr = sentPtr; + SpinLockRelease(&walsnd->mutex); + } +} + +/* + * Shutdown if the sender is caught up. + * + * NB: This should only be called when the shutdown signal has been received + * from postmaster. + * + * Note that if we determine that there's still more data to send, this + * function will return control to the caller. + */ +static void +WalSndDone(WalSndSendDataCallback send_data) +{ + XLogRecPtr replicatedPtr; + + /* ... let's just be real sure we're caught up ... */ + send_data(); + + /* + * To figure out whether all WAL has successfully been replicated, check + * flush location if valid, write otherwise. Tools like pg_receivewal will + * usually (unless in synchronous mode) return an invalid flush location. + */ + replicatedPtr = XLogRecPtrIsInvalid(MyWalSnd->flush) ? + MyWalSnd->write : MyWalSnd->flush; + + if (WalSndCaughtUp && sentPtr == replicatedPtr && + !pq_is_send_pending()) + { + QueryCompletion qc; + + /* Inform the standby that XLOG streaming is done */ + SetQueryCompletion(&qc, CMDTAG_COPY, 0); + EndCommand(&qc, DestRemote, false); + pq_flush(); + + proc_exit(0); + } + if (!waiting_for_ping_response) + WalSndKeepalive(true, InvalidXLogRecPtr); +} + +/* + * Returns the latest point in WAL that has been safely flushed to disk, and + * can be sent to the standby. This should only be called when in recovery, + * ie. we're streaming to a cascaded standby. + * + * As a side-effect, *tli is updated to the TLI of the last + * replayed WAL record. + */ +static XLogRecPtr +GetStandbyFlushRecPtr(TimeLineID *tli) +{ + XLogRecPtr replayPtr; + TimeLineID replayTLI; + XLogRecPtr receivePtr; + TimeLineID receiveTLI; + XLogRecPtr result; + + /* + * We can safely send what's already been replayed. Also, if walreceiver + * is streaming WAL from the same timeline, we can send anything that it + * has streamed, but hasn't been replayed yet. + */ + + receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI); + replayPtr = GetXLogReplayRecPtr(&replayTLI); + + if (tli) + *tli = replayTLI; + + result = replayPtr; + if (receiveTLI == replayTLI && receivePtr > replayPtr) + result = receivePtr; + + return result; +} + +/* + * Request walsenders to reload the currently-open WAL file + */ +void +WalSndRqstFileReload(void) +{ + int i; + + for (i = 0; i < max_wal_senders; i++) + { + WalSnd *walsnd = &WalSndCtl->walsnds[i]; + + SpinLockAcquire(&walsnd->mutex); + if (walsnd->pid == 0) + { + SpinLockRelease(&walsnd->mutex); + continue; + } + walsnd->needreload = true; + SpinLockRelease(&walsnd->mutex); + } +} + +/* + * Handle PROCSIG_WALSND_INIT_STOPPING signal. + */ +void +HandleWalSndInitStopping(void) +{ + Assert(am_walsender); + + /* + * If replication has not yet started, die like with SIGTERM. If + * replication is active, only set a flag and wake up the main loop. It + * will send any outstanding WAL, wait for it to be replicated to the + * standby, and then exit gracefully. + */ + if (!replication_active) + kill(MyProcPid, SIGTERM); + else + got_STOPPING = true; +} + +/* + * SIGUSR2: set flag to do a last cycle and shut down afterwards. The WAL + * sender should already have been switched to WALSNDSTATE_STOPPING at + * this point. + */ +static void +WalSndLastCycleHandler(SIGNAL_ARGS) +{ + int save_errno = errno; + + got_SIGUSR2 = true; + SetLatch(MyLatch); + + errno = save_errno; +} + +/* Set up signal handlers */ +void +WalSndSignals(void) +{ + /* Set up signal handlers */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGINT, StatementCancelHandler); /* query cancel */ + pqsignal(SIGTERM, die); /* request shutdown */ + /* SIGQUIT handler was already set up by InitPostmasterChild */ + InitializeTimeouts(); /* establishes SIGALRM handler */ + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, WalSndLastCycleHandler); /* request a last cycle and + * shutdown */ + + /* Reset some signals that are accepted by postmaster but not here */ + pqsignal(SIGCHLD, SIG_DFL); +} + +/* Report shared-memory space needed by WalSndShmemInit */ +Size +WalSndShmemSize(void) +{ + Size size = 0; + + size = offsetof(WalSndCtlData, walsnds); + size = add_size(size, mul_size(max_wal_senders, sizeof(WalSnd))); + + return size; +} + +/* Allocate and initialize walsender-related shared memory */ +void +WalSndShmemInit(void) +{ + bool found; + int i; + + WalSndCtl = (WalSndCtlData *) + ShmemInitStruct("Wal Sender Ctl", WalSndShmemSize(), &found); + + if (!found) + { + /* First time through, so initialize */ + MemSet(WalSndCtl, 0, WalSndShmemSize()); + + for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; i++) + dlist_init(&(WalSndCtl->SyncRepQueue[i])); + + for (i = 0; i < max_wal_senders; i++) + { + WalSnd *walsnd = &WalSndCtl->walsnds[i]; + + SpinLockInit(&walsnd->mutex); + } + + ConditionVariableInit(&WalSndCtl->wal_flush_cv); + ConditionVariableInit(&WalSndCtl->wal_replay_cv); + } +} + +/* + * Wake up physical, logical or both kinds of walsenders + * + * The distinction between physical and logical walsenders is done, because: + * - physical walsenders can't send data until it's been flushed + * - logical walsenders on standby can't decode and send data until it's been + * applied + * + * For cascading replication we need to wake up physical walsenders separately + * from logical walsenders (see the comment before calling WalSndWakeup() in + * ApplyWalRecord() for more details). + * + * This will be called inside critical sections, so throwing an error is not + * advisable. + */ +void +WalSndWakeup(bool physical, bool logical) +{ + /* + * Wake up all the walsenders waiting on WAL being flushed or replayed + * respectively. Note that waiting walsender would have prepared to sleep + * on the CV (i.e., added itself to the CV's waitlist) in WalSndWait() + * before actually waiting. + */ + if (physical) + ConditionVariableBroadcast(&WalSndCtl->wal_flush_cv); + + if (logical) + ConditionVariableBroadcast(&WalSndCtl->wal_replay_cv); +} + +/* + * Wait for readiness on the FeBe socket, or a timeout. The mask should be + * composed of optional WL_SOCKET_WRITEABLE and WL_SOCKET_READABLE flags. Exit + * on postmaster death. + */ +static void +WalSndWait(uint32 socket_events, long timeout, uint32 wait_event) +{ + WaitEvent event; + + ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, socket_events, NULL); + + /* + * We use a condition variable to efficiently wake up walsenders in + * WalSndWakeup(). + * + * Every walsender prepares to sleep on a shared memory CV. Note that it + * just prepares to sleep on the CV (i.e., adds itself to the CV's + * waitlist), but does not actually wait on the CV (IOW, it never calls + * ConditionVariableSleep()). It still uses WaitEventSetWait() for + * waiting, because we also need to wait for socket events. The processes + * (startup process, walreceiver etc.) wanting to wake up walsenders use + * ConditionVariableBroadcast(), which in turn calls SetLatch(), helping + * walsenders come out of WaitEventSetWait(). + * + * This approach is simple and efficient because, one doesn't have to loop + * through all the walsenders slots, with a spinlock acquisition and + * release for every iteration, just to wake up only the waiting + * walsenders. It makes WalSndWakeup() callers' life easy. + * + * XXX: A desirable future improvement would be to add support for CVs + * into WaitEventSetWait(). + * + * And, we use separate shared memory CVs for physical and logical + * walsenders for selective wake ups, see WalSndWakeup() for more details. + */ + if (MyWalSnd->kind == REPLICATION_KIND_PHYSICAL) + ConditionVariablePrepareToSleep(&WalSndCtl->wal_flush_cv); + else if (MyWalSnd->kind == REPLICATION_KIND_LOGICAL) + ConditionVariablePrepareToSleep(&WalSndCtl->wal_replay_cv); + + if (WaitEventSetWait(FeBeWaitSet, timeout, &event, 1, wait_event) == 1 && + (event.events & WL_POSTMASTER_DEATH)) + { + ConditionVariableCancelSleep(); + proc_exit(1); + } + + ConditionVariableCancelSleep(); +} + +/* + * Signal all walsenders to move to stopping state. + * + * This will trigger walsenders to move to a state where no further WAL can be + * generated. See this file's header for details. + */ +void +WalSndInitStopping(void) +{ + int i; + + for (i = 0; i < max_wal_senders; i++) + { + WalSnd *walsnd = &WalSndCtl->walsnds[i]; + pid_t pid; + + SpinLockAcquire(&walsnd->mutex); + pid = walsnd->pid; + SpinLockRelease(&walsnd->mutex); + + if (pid == 0) + continue; + + SendProcSignal(pid, PROCSIG_WALSND_INIT_STOPPING, InvalidBackendId); + } +} + +/* + * Wait that all the WAL senders have quit or reached the stopping state. This + * is used by the checkpointer to control when the shutdown checkpoint can + * safely be performed. + */ +void +WalSndWaitStopping(void) +{ + for (;;) + { + int i; + bool all_stopped = true; + + for (i = 0; i < max_wal_senders; i++) + { + WalSnd *walsnd = &WalSndCtl->walsnds[i]; + + SpinLockAcquire(&walsnd->mutex); + + if (walsnd->pid == 0) + { + SpinLockRelease(&walsnd->mutex); + continue; + } + + if (walsnd->state != WALSNDSTATE_STOPPING) + { + all_stopped = false; + SpinLockRelease(&walsnd->mutex); + break; + } + SpinLockRelease(&walsnd->mutex); + } + + /* safe to leave if confirmation is done for all WAL senders */ + if (all_stopped) + return; + + pg_usleep(10000L); /* wait for 10 msec */ + } +} + +/* Set state for current walsender (only called in walsender) */ +void +WalSndSetState(WalSndState state) +{ + WalSnd *walsnd = MyWalSnd; + + Assert(am_walsender); + + if (walsnd->state == state) + return; + + SpinLockAcquire(&walsnd->mutex); + walsnd->state = state; + SpinLockRelease(&walsnd->mutex); +} + +/* + * Return a string constant representing the state. This is used + * in system views, and should *not* be translated. + */ +static const char * +WalSndGetStateString(WalSndState state) +{ + switch (state) + { + case WALSNDSTATE_STARTUP: + return "startup"; + case WALSNDSTATE_BACKUP: + return "backup"; + case WALSNDSTATE_CATCHUP: + return "catchup"; + case WALSNDSTATE_STREAMING: + return "streaming"; + case WALSNDSTATE_STOPPING: + return "stopping"; + } + return "UNKNOWN"; +} + +static Interval * +offset_to_interval(TimeOffset offset) +{ + Interval *result = palloc(sizeof(Interval)); + + result->month = 0; + result->day = 0; + result->time = offset; + + return result; +} + +/* + * Returns activity of walsenders, including pids and xlog locations sent to + * standby servers. + */ +Datum +pg_stat_get_wal_senders(PG_FUNCTION_ARGS) +{ +#define PG_STAT_GET_WAL_SENDERS_COLS 12 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + SyncRepStandbyData *sync_standbys; + int num_standbys; + int i; + + InitMaterializedSRF(fcinfo, 0); + + /* + * Get the currently active synchronous standbys. This could be out of + * date before we're done, but we'll use the data anyway. + */ + num_standbys = SyncRepGetCandidateStandbys(&sync_standbys); + + for (i = 0; i < max_wal_senders; i++) + { + WalSnd *walsnd = &WalSndCtl->walsnds[i]; + XLogRecPtr sentPtr; + XLogRecPtr write; + XLogRecPtr flush; + XLogRecPtr apply; + TimeOffset writeLag; + TimeOffset flushLag; + TimeOffset applyLag; + int priority; + int pid; + WalSndState state; + TimestampTz replyTime; + bool is_sync_standby; + Datum values[PG_STAT_GET_WAL_SENDERS_COLS]; + bool nulls[PG_STAT_GET_WAL_SENDERS_COLS] = {0}; + int j; + + /* Collect data from shared memory */ + SpinLockAcquire(&walsnd->mutex); + if (walsnd->pid == 0) + { + SpinLockRelease(&walsnd->mutex); + continue; + } + pid = walsnd->pid; + sentPtr = walsnd->sentPtr; + state = walsnd->state; + write = walsnd->write; + flush = walsnd->flush; + apply = walsnd->apply; + writeLag = walsnd->writeLag; + flushLag = walsnd->flushLag; + applyLag = walsnd->applyLag; + priority = walsnd->sync_standby_priority; + replyTime = walsnd->replyTime; + SpinLockRelease(&walsnd->mutex); + + /* + * Detect whether walsender is/was considered synchronous. We can + * provide some protection against stale data by checking the PID + * along with walsnd_index. + */ + is_sync_standby = false; + for (j = 0; j < num_standbys; j++) + { + if (sync_standbys[j].walsnd_index == i && + sync_standbys[j].pid == pid) + { + is_sync_standby = true; + break; + } + } + + values[0] = Int32GetDatum(pid); + + if (!has_privs_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS)) + { + /* + * Only superusers and roles with privileges of pg_read_all_stats + * can see details. Other users only get the pid value to know + * it's a walsender, but no details. + */ + MemSet(&nulls[1], true, PG_STAT_GET_WAL_SENDERS_COLS - 1); + } + else + { + values[1] = CStringGetTextDatum(WalSndGetStateString(state)); + + if (XLogRecPtrIsInvalid(sentPtr)) + nulls[2] = true; + values[2] = LSNGetDatum(sentPtr); + + if (XLogRecPtrIsInvalid(write)) + nulls[3] = true; + values[3] = LSNGetDatum(write); + + if (XLogRecPtrIsInvalid(flush)) + nulls[4] = true; + values[4] = LSNGetDatum(flush); + + if (XLogRecPtrIsInvalid(apply)) + nulls[5] = true; + values[5] = LSNGetDatum(apply); + + /* + * Treat a standby such as a pg_basebackup background process + * which always returns an invalid flush location, as an + * asynchronous standby. + */ + priority = XLogRecPtrIsInvalid(flush) ? 0 : priority; + + if (writeLag < 0) + nulls[6] = true; + else + values[6] = IntervalPGetDatum(offset_to_interval(writeLag)); + + if (flushLag < 0) + nulls[7] = true; + else + values[7] = IntervalPGetDatum(offset_to_interval(flushLag)); + + if (applyLag < 0) + nulls[8] = true; + else + values[8] = IntervalPGetDatum(offset_to_interval(applyLag)); + + values[9] = Int32GetDatum(priority); + + /* + * More easily understood version of standby state. This is purely + * informational. + * + * In quorum-based sync replication, the role of each standby + * listed in synchronous_standby_names can be changing very + * frequently. Any standbys considered as "sync" at one moment can + * be switched to "potential" ones at the next moment. So, it's + * basically useless to report "sync" or "potential" as their sync + * states. We report just "quorum" for them. + */ + if (priority == 0) + values[10] = CStringGetTextDatum("async"); + else if (is_sync_standby) + values[10] = SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY ? + CStringGetTextDatum("sync") : CStringGetTextDatum("quorum"); + else + values[10] = CStringGetTextDatum("potential"); + + if (replyTime == 0) + nulls[11] = true; + else + values[11] = TimestampTzGetDatum(replyTime); + } + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + + return (Datum) 0; +} + +/* + * Send a keepalive message to standby. + * + * If requestReply is set, the message requests the other party to send + * a message back to us, for heartbeat purposes. We also set a flag to + * let nearby code know that we're waiting for that response, to avoid + * repeated requests. + * + * writePtr is the location up to which the WAL is sent. It is essentially + * the same as sentPtr but in some cases, we need to send keep alive before + * sentPtr is updated like when skipping empty transactions. + */ +static void +WalSndKeepalive(bool requestReply, XLogRecPtr writePtr) +{ + elog(DEBUG2, "sending replication keepalive"); + + /* construct the message... */ + resetStringInfo(&output_message); + pq_sendbyte(&output_message, 'k'); + pq_sendint64(&output_message, XLogRecPtrIsInvalid(writePtr) ? sentPtr : writePtr); + pq_sendint64(&output_message, GetCurrentTimestamp()); + pq_sendbyte(&output_message, requestReply ? 1 : 0); + + /* ... and send it wrapped in CopyData */ + pq_putmessage_noblock('d', output_message.data, output_message.len); + + /* Set local flag */ + if (requestReply) + waiting_for_ping_response = true; +} + +/* + * Send keepalive message if too much time has elapsed. + */ +static void +WalSndKeepaliveIfNecessary(void) +{ + TimestampTz ping_time; + + /* + * Don't send keepalive messages if timeouts are globally disabled or + * we're doing something not partaking in timeouts. + */ + if (wal_sender_timeout <= 0 || last_reply_timestamp <= 0) + return; + + if (waiting_for_ping_response) + return; + + /* + * If half of wal_sender_timeout has lapsed without receiving any reply + * from the standby, send a keep-alive message to the standby requesting + * an immediate reply. + */ + ping_time = TimestampTzPlusMilliseconds(last_reply_timestamp, + wal_sender_timeout / 2); + if (last_processing >= ping_time) + { + WalSndKeepalive(true, InvalidXLogRecPtr); + + /* Try to flush pending output to the client */ + if (pq_flush_if_writable() != 0) + WalSndShutdown(); + } +} + +/* + * Record the end of the WAL and the time it was flushed locally, so that + * LagTrackerRead can compute the elapsed time (lag) when this WAL location is + * eventually reported to have been written, flushed and applied by the + * standby in a reply message. + */ +static void +LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time) +{ + bool buffer_full; + int new_write_head; + int i; + + if (!am_walsender) + return; + + /* + * If the lsn hasn't advanced since last time, then do nothing. This way + * we only record a new sample when new WAL has been written. + */ + if (lag_tracker->last_lsn == lsn) + return; + lag_tracker->last_lsn = lsn; + + /* + * If advancing the write head of the circular buffer would crash into any + * of the read heads, then the buffer is full. In other words, the + * slowest reader (presumably apply) is the one that controls the release + * of space. + */ + new_write_head = (lag_tracker->write_head + 1) % LAG_TRACKER_BUFFER_SIZE; + buffer_full = false; + for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; ++i) + { + if (new_write_head == lag_tracker->read_heads[i]) + buffer_full = true; + } + + /* + * If the buffer is full, for now we just rewind by one slot and overwrite + * the last sample, as a simple (if somewhat uneven) way to lower the + * sampling rate. There may be better adaptive compaction algorithms. + */ + if (buffer_full) + { + new_write_head = lag_tracker->write_head; + if (lag_tracker->write_head > 0) + lag_tracker->write_head--; + else + lag_tracker->write_head = LAG_TRACKER_BUFFER_SIZE - 1; + } + + /* Store a sample at the current write head position. */ + lag_tracker->buffer[lag_tracker->write_head].lsn = lsn; + lag_tracker->buffer[lag_tracker->write_head].time = local_flush_time; + lag_tracker->write_head = new_write_head; +} + +/* + * Find out how much time has elapsed between the moment WAL location 'lsn' + * (or the highest known earlier LSN) was flushed locally and the time 'now'. + * We have a separate read head for each of the reported LSN locations we + * receive in replies from standby; 'head' controls which read head is + * used. Whenever a read head crosses an LSN which was written into the + * lag buffer with LagTrackerWrite, we can use the associated timestamp to + * find out the time this LSN (or an earlier one) was flushed locally, and + * therefore compute the lag. + * + * Return -1 if no new sample data is available, and otherwise the elapsed + * time in microseconds. + */ +static TimeOffset +LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now) +{ + TimestampTz time = 0; + + /* Read all unread samples up to this LSN or end of buffer. */ + while (lag_tracker->read_heads[head] != lag_tracker->write_head && + lag_tracker->buffer[lag_tracker->read_heads[head]].lsn <= lsn) + { + time = lag_tracker->buffer[lag_tracker->read_heads[head]].time; + lag_tracker->last_read[head] = + lag_tracker->buffer[lag_tracker->read_heads[head]]; + lag_tracker->read_heads[head] = + (lag_tracker->read_heads[head] + 1) % LAG_TRACKER_BUFFER_SIZE; + } + + /* + * If the lag tracker is empty, that means the standby has processed + * everything we've ever sent so we should now clear 'last_read'. If we + * didn't do that, we'd risk using a stale and irrelevant sample for + * interpolation at the beginning of the next burst of WAL after a period + * of idleness. + */ + if (lag_tracker->read_heads[head] == lag_tracker->write_head) + lag_tracker->last_read[head].time = 0; + + if (time > now) + { + /* If the clock somehow went backwards, treat as not found. */ + return -1; + } + else if (time == 0) + { + /* + * We didn't cross a time. If there is a future sample that we + * haven't reached yet, and we've already reached at least one sample, + * let's interpolate the local flushed time. This is mainly useful + * for reporting a completely stuck apply position as having + * increasing lag, since otherwise we'd have to wait for it to + * eventually start moving again and cross one of our samples before + * we can show the lag increasing. + */ + if (lag_tracker->read_heads[head] == lag_tracker->write_head) + { + /* There are no future samples, so we can't interpolate. */ + return -1; + } + else if (lag_tracker->last_read[head].time != 0) + { + /* We can interpolate between last_read and the next sample. */ + double fraction; + WalTimeSample prev = lag_tracker->last_read[head]; + WalTimeSample next = lag_tracker->buffer[lag_tracker->read_heads[head]]; + + if (lsn < prev.lsn) + { + /* + * Reported LSNs shouldn't normally go backwards, but it's + * possible when there is a timeline change. Treat as not + * found. + */ + return -1; + } + + Assert(prev.lsn < next.lsn); + + if (prev.time > next.time) + { + /* If the clock somehow went backwards, treat as not found. */ + return -1; + } + + /* See how far we are between the previous and next samples. */ + fraction = + (double) (lsn - prev.lsn) / (double) (next.lsn - prev.lsn); + + /* Scale the local flush time proportionally. */ + time = (TimestampTz) + ((double) prev.time + (next.time - prev.time) * fraction); + } + else + { + /* + * We have only a future sample, implying that we were entirely + * caught up but and now there is a new burst of WAL and the + * standby hasn't processed the first sample yet. Until the + * standby reaches the future sample the best we can do is report + * the hypothetical lag if that sample were to be replayed now. + */ + time = lag_tracker->buffer[lag_tracker->read_heads[head]].time; + } + } + + /* Return the elapsed time since local flush time in microseconds. */ + Assert(time != 0); + return now - time; +} |