summaryrefslogtreecommitdiffstats
path: root/src/backend/replication/basebackup.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
commit46651ce6fe013220ed397add242004d764fc0153 (patch)
tree6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/replication/basebackup.c
parentInitial commit. (diff)
downloadpostgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz
postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip
Adding upstream version 14.5.upstream/14.5upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/replication/basebackup.c')
-rw-r--r--src/backend/replication/basebackup.c2034
1 files changed, 2034 insertions, 0 deletions
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
new file mode 100644
index 0000000..d142cc2
--- /dev/null
+++ b/src/backend/replication/basebackup.c
@@ -0,0 +1,2034 @@
+/*-------------------------------------------------------------------------
+ *
+ * basebackup.c
+ * code for taking a base backup and streaming it to a standby
+ *
+ * Portions Copyright (c) 2010-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/replication/basebackup.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <sys/stat.h>
+#include <unistd.h>
+#include <time.h>
+
+#include "access/xlog_internal.h" /* for pg_start/stop_backup */
+#include "catalog/pg_type.h"
+#include "common/file_perm.h"
+#include "commands/progress.h"
+#include "lib/stringinfo.h"
+#include "libpq/libpq.h"
+#include "libpq/pqformat.h"
+#include "miscadmin.h"
+#include "nodes/pg_list.h"
+#include "pgstat.h"
+#include "pgtar.h"
+#include "port.h"
+#include "postmaster/syslogger.h"
+#include "replication/basebackup.h"
+#include "replication/backup_manifest.h"
+#include "replication/walsender.h"
+#include "replication/walsender_private.h"
+#include "storage/bufpage.h"
+#include "storage/checksum.h"
+#include "storage/dsm_impl.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/reinit.h"
+#include "utils/builtins.h"
+#include "utils/ps_status.h"
+#include "utils/relcache.h"
+#include "utils/resowner.h"
+#include "utils/timestamp.h"
+
+typedef struct
+{
+ const char *label;
+ bool progress;
+ bool fastcheckpoint;
+ bool nowait;
+ bool includewal;
+ uint32 maxrate;
+ bool sendtblspcmapfile;
+ backup_manifest_option manifest;
+ pg_checksum_type manifest_checksum_type;
+} basebackup_options;
+
+static int64 sendTablespace(char *path, char *oid, bool sizeonly,
+ struct backup_manifest_info *manifest);
+static int64 sendDir(const char *path, int basepathlen, bool sizeonly,
+ List *tablespaces, bool sendtblspclinks,
+ backup_manifest_info *manifest, const char *spcoid);
+static bool sendFile(const char *readfilename, const char *tarfilename,
+ struct stat *statbuf, bool missing_ok, Oid dboid,
+ backup_manifest_info *manifest, const char *spcoid);
+static void sendFileWithContent(const char *filename, const char *content,
+ backup_manifest_info *manifest);
+static int64 _tarWriteHeader(const char *filename, const char *linktarget,
+ struct stat *statbuf, bool sizeonly);
+static int64 _tarWriteDir(const char *pathbuf, int basepathlen, struct stat *statbuf,
+ bool sizeonly);
+static void send_int8_string(StringInfoData *buf, int64 intval);
+static void SendBackupHeader(List *tablespaces);
+static void perform_base_backup(basebackup_options *opt);
+static void parse_basebackup_options(List *options, basebackup_options *opt);
+static void SendXlogRecPtrResult(XLogRecPtr ptr, TimeLineID tli);
+static int compareWalFileNames(const ListCell *a, const ListCell *b);
+static void throttle(size_t increment);
+static void update_basebackup_progress(int64 delta);
+static bool is_checksummed_file(const char *fullpath, const char *filename);
+static int basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset,
+ const char *filename, bool partial_read_ok);
+
+/* Was the backup currently in-progress initiated in recovery mode? */
+static bool backup_started_in_recovery = false;
+
+/* Relative path of temporary statistics directory */
+static char *statrelpath = NULL;
+
+/*
+ * Size of each block sent into the tar stream for larger files.
+ */
+#define TAR_SEND_SIZE 32768
+
+/*
+ * How frequently to throttle, as a fraction of the specified rate-second.
+ */
+#define THROTTLING_FREQUENCY 8
+
+/* The actual number of bytes, transfer of which may cause sleep. */
+static uint64 throttling_sample;
+
+/* Amount of data already transferred but not yet throttled. */
+static int64 throttling_counter;
+
+/* The minimum time required to transfer throttling_sample bytes. */
+static TimeOffset elapsed_min_unit;
+
+/* The last check of the transfer rate. */
+static TimestampTz throttled_last;
+
+/* The starting XLOG position of the base backup. */
+static XLogRecPtr startptr;
+
+/* Total number of checksum failures during base backup. */
+static long long int total_checksum_failures;
+
+/* Do not verify checksums. */
+static bool noverify_checksums = false;
+
+/*
+ * Total amount of backup data that will be streamed.
+ * -1 means that the size is not estimated.
+ */
+static int64 backup_total = 0;
+
+/* Amount of backup data already streamed */
+static int64 backup_streamed = 0;
+
+/*
+ * Definition of one element part of an exclusion list, used for paths part
+ * of checksum validation or base backups. "name" is the name of the file
+ * or path to check for exclusion. If "match_prefix" is true, any items
+ * matching the name as prefix are excluded.
+ */
+struct exclude_list_item
+{
+ const char *name;
+ bool match_prefix;
+};
+
+/*
+ * The contents of these directories are removed or recreated during server
+ * start so they are not included in backups. The directories themselves are
+ * kept and included as empty to preserve access permissions.
+ *
+ * Note: this list should be kept in sync with the filter lists in pg_rewind's
+ * filemap.c.
+ */
+static const char *const excludeDirContents[] =
+{
+ /*
+ * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped even
+ * when stats_temp_directory is set because PGSS_TEXT_FILE is always
+ * created there.
+ */
+ PG_STAT_TMP_DIR,
+
+ /*
+ * It is generally not useful to backup the contents of this directory
+ * even if the intention is to restore to another primary. See backup.sgml
+ * for a more detailed description.
+ */
+ "pg_replslot",
+
+ /* Contents removed on startup, see dsm_cleanup_for_mmap(). */
+ PG_DYNSHMEM_DIR,
+
+ /* Contents removed on startup, see AsyncShmemInit(). */
+ "pg_notify",
+
+ /*
+ * Old contents are loaded for possible debugging but are not required for
+ * normal operation, see SerialInit().
+ */
+ "pg_serial",
+
+ /* Contents removed on startup, see DeleteAllExportedSnapshotFiles(). */
+ "pg_snapshots",
+
+ /* Contents zeroed on startup, see StartupSUBTRANS(). */
+ "pg_subtrans",
+
+ /* end of list */
+ NULL
+};
+
+/*
+ * List of files excluded from backups.
+ */
+static const struct exclude_list_item excludeFiles[] =
+{
+ /* Skip auto conf temporary file. */
+ {PG_AUTOCONF_FILENAME ".tmp", false},
+
+ /* Skip current log file temporary file */
+ {LOG_METAINFO_DATAFILE_TMP, false},
+
+ /*
+ * Skip relation cache because it is rebuilt on startup. This includes
+ * temporary files.
+ */
+ {RELCACHE_INIT_FILENAME, true},
+
+ /*
+ * If there's a backup_label or tablespace_map file, it belongs to a
+ * backup started by the user with pg_start_backup(). It is *not* correct
+ * for this backup. Our backup_label/tablespace_map is injected into the
+ * tar separately.
+ */
+ {BACKUP_LABEL_FILE, false},
+ {TABLESPACE_MAP, false},
+
+ /*
+ * If there's a backup_manifest, it belongs to a backup that was used to
+ * start this server. It is *not* correct for this backup. Our
+ * backup_manifest is injected into the backup separately if users want
+ * it.
+ */
+ {"backup_manifest", false},
+
+ {"postmaster.pid", false},
+ {"postmaster.opts", false},
+
+ /* end of list */
+ {NULL, false}
+};
+
+/*
+ * List of files excluded from checksum validation.
+ *
+ * Note: this list should be kept in sync with what pg_checksums.c
+ * includes.
+ */
+static const struct exclude_list_item noChecksumFiles[] = {
+ {"pg_control", false},
+ {"pg_filenode.map", false},
+ {"pg_internal.init", true},
+ {"PG_VERSION", false},
+#ifdef EXEC_BACKEND
+ {"config_exec_params", true},
+#endif
+ {NULL, false}
+};
+
+/*
+ * Actually do a base backup for the specified tablespaces.
+ *
+ * This is split out mainly to avoid complaints about "variable might be
+ * clobbered by longjmp" from stupider versions of gcc.
+ */
+static void
+perform_base_backup(basebackup_options *opt)
+{
+ TimeLineID starttli;
+ XLogRecPtr endptr;
+ TimeLineID endtli;
+ StringInfo labelfile;
+ StringInfo tblspc_map_file;
+ backup_manifest_info manifest;
+ int datadirpathlen;
+ List *tablespaces = NIL;
+
+ backup_total = 0;
+ backup_streamed = 0;
+ pgstat_progress_start_command(PROGRESS_COMMAND_BASEBACKUP, InvalidOid);
+
+ /*
+ * If the estimation of the total backup size is disabled, make the
+ * backup_total column in the view return NULL by setting the parameter to
+ * -1.
+ */
+ if (!opt->progress)
+ {
+ backup_total = -1;
+ pgstat_progress_update_param(PROGRESS_BASEBACKUP_BACKUP_TOTAL,
+ backup_total);
+ }
+
+ /* we're going to use a BufFile, so we need a ResourceOwner */
+ Assert(CurrentResourceOwner == NULL);
+ CurrentResourceOwner = ResourceOwnerCreate(NULL, "base backup");
+
+ datadirpathlen = strlen(DataDir);
+
+ backup_started_in_recovery = RecoveryInProgress();
+
+ labelfile = makeStringInfo();
+ tblspc_map_file = makeStringInfo();
+ InitializeBackupManifest(&manifest, opt->manifest,
+ opt->manifest_checksum_type);
+
+ total_checksum_failures = 0;
+
+ pgstat_progress_update_param(PROGRESS_BASEBACKUP_PHASE,
+ PROGRESS_BASEBACKUP_PHASE_WAIT_CHECKPOINT);
+ startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint, &starttli,
+ labelfile, &tablespaces,
+ tblspc_map_file);
+
+ /*
+ * Once do_pg_start_backup has been called, ensure that any failure causes
+ * us to abort the backup so we don't "leak" a backup counter. For this
+ * reason, *all* functionality between do_pg_start_backup() and the end of
+ * do_pg_stop_backup() should be inside the error cleanup block!
+ */
+
+ PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));
+ {
+ ListCell *lc;
+ tablespaceinfo *ti;
+ int tblspc_streamed = 0;
+
+ /*
+ * Calculate the relative path of temporary statistics directory in
+ * order to skip the files which are located in that directory later.
+ */
+ if (is_absolute_path(pgstat_stat_directory) &&
+ strncmp(pgstat_stat_directory, DataDir, datadirpathlen) == 0)
+ statrelpath = psprintf("./%s", pgstat_stat_directory + datadirpathlen + 1);
+ else if (strncmp(pgstat_stat_directory, "./", 2) != 0)
+ statrelpath = psprintf("./%s", pgstat_stat_directory);
+ else
+ statrelpath = pgstat_stat_directory;
+
+ /* Add a node for the base directory at the end */
+ ti = palloc0(sizeof(tablespaceinfo));
+ ti->size = -1;
+ tablespaces = lappend(tablespaces, ti);
+
+ /*
+ * Calculate the total backup size by summing up the size of each
+ * tablespace
+ */
+ if (opt->progress)
+ {
+ pgstat_progress_update_param(PROGRESS_BASEBACKUP_PHASE,
+ PROGRESS_BASEBACKUP_PHASE_ESTIMATE_BACKUP_SIZE);
+
+ foreach(lc, tablespaces)
+ {
+ tablespaceinfo *tmp = (tablespaceinfo *) lfirst(lc);
+
+ if (tmp->path == NULL)
+ tmp->size = sendDir(".", 1, true, tablespaces, true, NULL,
+ NULL);
+ else
+ tmp->size = sendTablespace(tmp->path, tmp->oid, true,
+ NULL);
+ backup_total += tmp->size;
+ }
+ }
+
+ /* Report that we are now streaming database files as a base backup */
+ {
+ const int index[] = {
+ PROGRESS_BASEBACKUP_PHASE,
+ PROGRESS_BASEBACKUP_BACKUP_TOTAL,
+ PROGRESS_BASEBACKUP_TBLSPC_TOTAL
+ };
+ const int64 val[] = {
+ PROGRESS_BASEBACKUP_PHASE_STREAM_BACKUP,
+ backup_total, list_length(tablespaces)
+ };
+
+ pgstat_progress_update_multi_param(3, index, val);
+ }
+
+ /* Send the starting position of the backup */
+ SendXlogRecPtrResult(startptr, starttli);
+
+ /* Send tablespace header */
+ SendBackupHeader(tablespaces);
+
+ /* Setup and activate network throttling, if client requested it */
+ if (opt->maxrate > 0)
+ {
+ throttling_sample =
+ (int64) opt->maxrate * (int64) 1024 / THROTTLING_FREQUENCY;
+
+ /*
+ * The minimum amount of time for throttling_sample bytes to be
+ * transferred.
+ */
+ elapsed_min_unit = USECS_PER_SEC / THROTTLING_FREQUENCY;
+
+ /* Enable throttling. */
+ throttling_counter = 0;
+
+ /* The 'real data' starts now (header was ignored). */
+ throttled_last = GetCurrentTimestamp();
+ }
+ else
+ {
+ /* Disable throttling. */
+ throttling_counter = -1;
+ }
+
+ /* Send off our tablespaces one by one */
+ foreach(lc, tablespaces)
+ {
+ tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
+ StringInfoData buf;
+
+ /* Send CopyOutResponse message */
+ pq_beginmessage(&buf, 'H');
+ pq_sendbyte(&buf, 0); /* overall format */
+ pq_sendint16(&buf, 0); /* natts */
+ pq_endmessage(&buf);
+
+ if (ti->path == NULL)
+ {
+ struct stat statbuf;
+ bool sendtblspclinks = true;
+
+ /* In the main tar, include the backup_label first... */
+ sendFileWithContent(BACKUP_LABEL_FILE, labelfile->data,
+ &manifest);
+
+ /* Then the tablespace_map file, if required... */
+ if (opt->sendtblspcmapfile)
+ {
+ sendFileWithContent(TABLESPACE_MAP, tblspc_map_file->data,
+ &manifest);
+ sendtblspclinks = false;
+ }
+
+ /* Then the bulk of the files... */
+ sendDir(".", 1, false, tablespaces, sendtblspclinks,
+ &manifest, NULL);
+
+ /* ... and pg_control after everything else. */
+ if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m",
+ XLOG_CONTROL_FILE)));
+ sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf,
+ false, InvalidOid, &manifest, NULL);
+ }
+ else
+ sendTablespace(ti->path, ti->oid, false, &manifest);
+
+ /*
+ * If we're including WAL, and this is the main data directory we
+ * don't terminate the tar stream here. Instead, we will append
+ * the xlog files below and terminate it then. This is safe since
+ * the main data directory is always sent *last*.
+ */
+ if (opt->includewal && ti->path == NULL)
+ {
+ Assert(lnext(tablespaces, lc) == NULL);
+ }
+ else
+ pq_putemptymessage('c'); /* CopyDone */
+
+ tblspc_streamed++;
+ pgstat_progress_update_param(PROGRESS_BASEBACKUP_TBLSPC_STREAMED,
+ tblspc_streamed);
+ }
+
+ pgstat_progress_update_param(PROGRESS_BASEBACKUP_PHASE,
+ PROGRESS_BASEBACKUP_PHASE_WAIT_WAL_ARCHIVE);
+ endptr = do_pg_stop_backup(labelfile->data, !opt->nowait, &endtli);
+ }
+ PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));
+
+
+ if (opt->includewal)
+ {
+ /*
+ * We've left the last tar file "open", so we can now append the
+ * required WAL files to it.
+ */
+ char pathbuf[MAXPGPATH];
+ XLogSegNo segno;
+ XLogSegNo startsegno;
+ XLogSegNo endsegno;
+ struct stat statbuf;
+ List *historyFileList = NIL;
+ List *walFileList = NIL;
+ char firstoff[MAXFNAMELEN];
+ char lastoff[MAXFNAMELEN];
+ DIR *dir;
+ struct dirent *de;
+ ListCell *lc;
+ TimeLineID tli;
+
+ pgstat_progress_update_param(PROGRESS_BASEBACKUP_PHASE,
+ PROGRESS_BASEBACKUP_PHASE_TRANSFER_WAL);
+
+ /*
+ * I'd rather not worry about timelines here, so scan pg_wal and
+ * include all WAL files in the range between 'startptr' and 'endptr',
+ * regardless of the timeline the file is stamped with. If there are
+ * some spurious WAL files belonging to timelines that don't belong in
+ * this server's history, they will be included too. Normally there
+ * shouldn't be such files, but if there are, there's little harm in
+ * including them.
+ */
+ XLByteToSeg(startptr, startsegno, wal_segment_size);
+ XLogFileName(firstoff, ThisTimeLineID, startsegno, wal_segment_size);
+ XLByteToPrevSeg(endptr, endsegno, wal_segment_size);
+ XLogFileName(lastoff, ThisTimeLineID, endsegno, wal_segment_size);
+
+ dir = AllocateDir("pg_wal");
+ while ((de = ReadDir(dir, "pg_wal")) != NULL)
+ {
+ /* Does it look like a WAL segment, and is it in the range? */
+ if (IsXLogFileName(de->d_name) &&
+ strcmp(de->d_name + 8, firstoff + 8) >= 0 &&
+ strcmp(de->d_name + 8, lastoff + 8) <= 0)
+ {
+ walFileList = lappend(walFileList, pstrdup(de->d_name));
+ }
+ /* Does it look like a timeline history file? */
+ else if (IsTLHistoryFileName(de->d_name))
+ {
+ historyFileList = lappend(historyFileList, pstrdup(de->d_name));
+ }
+ }
+ FreeDir(dir);
+
+ /*
+ * Before we go any further, check that none of the WAL segments we
+ * need were removed.
+ */
+ CheckXLogRemoved(startsegno, ThisTimeLineID);
+
+ /*
+ * Sort the WAL filenames. We want to send the files in order from
+ * oldest to newest, to reduce the chance that a file is recycled
+ * before we get a chance to send it over.
+ */
+ list_sort(walFileList, compareWalFileNames);
+
+ /*
+ * There must be at least one xlog file in the pg_wal directory, since
+ * we are doing backup-including-xlog.
+ */
+ if (walFileList == NIL)
+ ereport(ERROR,
+ (errmsg("could not find any WAL files")));
+
+ /*
+ * Sanity check: the first and last segment should cover startptr and
+ * endptr, with no gaps in between.
+ */
+ XLogFromFileName((char *) linitial(walFileList),
+ &tli, &segno, wal_segment_size);
+ if (segno != startsegno)
+ {
+ char startfname[MAXFNAMELEN];
+
+ XLogFileName(startfname, ThisTimeLineID, startsegno,
+ wal_segment_size);
+ ereport(ERROR,
+ (errmsg("could not find WAL file \"%s\"", startfname)));
+ }
+ foreach(lc, walFileList)
+ {
+ char *walFileName = (char *) lfirst(lc);
+ XLogSegNo currsegno = segno;
+ XLogSegNo nextsegno = segno + 1;
+
+ XLogFromFileName(walFileName, &tli, &segno, wal_segment_size);
+ if (!(nextsegno == segno || currsegno == segno))
+ {
+ char nextfname[MAXFNAMELEN];
+
+ XLogFileName(nextfname, ThisTimeLineID, nextsegno,
+ wal_segment_size);
+ ereport(ERROR,
+ (errmsg("could not find WAL file \"%s\"", nextfname)));
+ }
+ }
+ if (segno != endsegno)
+ {
+ char endfname[MAXFNAMELEN];
+
+ XLogFileName(endfname, ThisTimeLineID, endsegno, wal_segment_size);
+ ereport(ERROR,
+ (errmsg("could not find WAL file \"%s\"", endfname)));
+ }
+
+ /* Ok, we have everything we need. Send the WAL files. */
+ foreach(lc, walFileList)
+ {
+ char *walFileName = (char *) lfirst(lc);
+ int fd;
+ char buf[TAR_SEND_SIZE];
+ size_t cnt;
+ pgoff_t len = 0;
+
+ snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", walFileName);
+ XLogFromFileName(walFileName, &tli, &segno, wal_segment_size);
+
+ fd = OpenTransientFile(pathbuf, O_RDONLY | PG_BINARY);
+ if (fd < 0)
+ {
+ int save_errno = errno;
+
+ /*
+ * Most likely reason for this is that the file was already
+ * removed by a checkpoint, so check for that to get a better
+ * error message.
+ */
+ CheckXLogRemoved(segno, tli);
+
+ errno = save_errno;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", pathbuf)));
+ }
+
+ if (fstat(fd, &statbuf) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m",
+ pathbuf)));
+ if (statbuf.st_size != wal_segment_size)
+ {
+ CheckXLogRemoved(segno, tli);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("unexpected WAL file size \"%s\"", walFileName)));
+ }
+
+ /* send the WAL file itself */
+ _tarWriteHeader(pathbuf, NULL, &statbuf, false);
+
+ while ((cnt = basebackup_read_file(fd, buf,
+ Min(sizeof(buf),
+ wal_segment_size - len),
+ len, pathbuf, true)) > 0)
+ {
+ CheckXLogRemoved(segno, tli);
+ /* Send the chunk as a CopyData message */
+ if (pq_putmessage('d', buf, cnt))
+ ereport(ERROR,
+ (errmsg("base backup could not send data, aborting backup")));
+ update_basebackup_progress(cnt);
+
+ len += cnt;
+ throttle(cnt);
+
+ if (len == wal_segment_size)
+ break;
+ }
+
+ if (len != wal_segment_size)
+ {
+ CheckXLogRemoved(segno, tli);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("unexpected WAL file size \"%s\"", walFileName)));
+ }
+
+ /*
+ * wal_segment_size is a multiple of TAR_BLOCK_SIZE, so no need
+ * for padding.
+ */
+ Assert(wal_segment_size % TAR_BLOCK_SIZE == 0);
+
+ CloseTransientFile(fd);
+
+ /*
+ * Mark file as archived, otherwise files can get archived again
+ * after promotion of a new node. This is in line with
+ * walreceiver.c always doing an XLogArchiveForceDone() after a
+ * complete segment.
+ */
+ StatusFilePath(pathbuf, walFileName, ".done");
+ sendFileWithContent(pathbuf, "", &manifest);
+ }
+
+ /*
+ * Send timeline history files too. Only the latest timeline history
+ * file is required for recovery, and even that only if there happens
+ * to be a timeline switch in the first WAL segment that contains the
+ * checkpoint record, or if we're taking a base backup from a standby
+ * server and the target timeline changes while the backup is taken.
+ * But they are small and highly useful for debugging purposes, so
+ * better include them all, always.
+ */
+ foreach(lc, historyFileList)
+ {
+ char *fname = lfirst(lc);
+
+ snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", fname);
+
+ if (lstat(pathbuf, &statbuf) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m", pathbuf)));
+
+ sendFile(pathbuf, pathbuf, &statbuf, false, InvalidOid,
+ &manifest, NULL);
+
+ /* unconditionally mark file as archived */
+ StatusFilePath(pathbuf, fname, ".done");
+ sendFileWithContent(pathbuf, "", &manifest);
+ }
+
+ /* Send CopyDone message for the last tar file */
+ pq_putemptymessage('c');
+ }
+
+ AddWALInfoToBackupManifest(&manifest, startptr, starttli, endptr, endtli);
+
+ SendBackupManifest(&manifest);
+
+ SendXlogRecPtrResult(endptr, endtli);
+
+ if (total_checksum_failures)
+ {
+ if (total_checksum_failures > 1)
+ ereport(WARNING,
+ (errmsg_plural("%lld total checksum verification failure",
+ "%lld total checksum verification failures",
+ total_checksum_failures,
+ total_checksum_failures)));
+
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("checksum verification failure during base backup")));
+ }
+
+ /*
+ * Make sure to free the manifest before the resource owners as manifests
+ * use cryptohash contexts that may depend on resource owners (like
+ * OpenSSL).
+ */
+ FreeBackupManifest(&manifest);
+
+ /* clean up the resource owner we created */
+ WalSndResourceCleanup(true);
+
+ pgstat_progress_end_command();
+}
+
+/*
+ * list_sort comparison function, to compare log/seg portion of WAL segment
+ * filenames, ignoring the timeline portion.
+ */
+static int
+compareWalFileNames(const ListCell *a, const ListCell *b)
+{
+ char *fna = (char *) lfirst(a);
+ char *fnb = (char *) lfirst(b);
+
+ return strcmp(fna + 8, fnb + 8);
+}
+
+/*
+ * Parse the base backup options passed down by the parser
+ */
+static void
+parse_basebackup_options(List *options, basebackup_options *opt)
+{
+ ListCell *lopt;
+ bool o_label = false;
+ bool o_progress = false;
+ bool o_fast = false;
+ bool o_nowait = false;
+ bool o_wal = false;
+ bool o_maxrate = false;
+ bool o_tablespace_map = false;
+ bool o_noverify_checksums = false;
+ bool o_manifest = false;
+ bool o_manifest_checksums = false;
+
+ MemSet(opt, 0, sizeof(*opt));
+ opt->manifest = MANIFEST_OPTION_NO;
+ opt->manifest_checksum_type = CHECKSUM_TYPE_CRC32C;
+
+ foreach(lopt, options)
+ {
+ DefElem *defel = (DefElem *) lfirst(lopt);
+
+ if (strcmp(defel->defname, "label") == 0)
+ {
+ if (o_label)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("duplicate option \"%s\"", defel->defname)));
+ opt->label = strVal(defel->arg);
+ o_label = true;
+ }
+ else if (strcmp(defel->defname, "progress") == 0)
+ {
+ if (o_progress)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("duplicate option \"%s\"", defel->defname)));
+ opt->progress = true;
+ o_progress = true;
+ }
+ else if (strcmp(defel->defname, "fast") == 0)
+ {
+ if (o_fast)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("duplicate option \"%s\"", defel->defname)));
+ opt->fastcheckpoint = true;
+ o_fast = true;
+ }
+ else if (strcmp(defel->defname, "nowait") == 0)
+ {
+ if (o_nowait)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("duplicate option \"%s\"", defel->defname)));
+ opt->nowait = true;
+ o_nowait = true;
+ }
+ else if (strcmp(defel->defname, "wal") == 0)
+ {
+ if (o_wal)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("duplicate option \"%s\"", defel->defname)));
+ opt->includewal = true;
+ o_wal = true;
+ }
+ else if (strcmp(defel->defname, "max_rate") == 0)
+ {
+ long maxrate;
+
+ if (o_maxrate)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("duplicate option \"%s\"", defel->defname)));
+
+ maxrate = intVal(defel->arg);
+ if (maxrate < MAX_RATE_LOWER || maxrate > MAX_RATE_UPPER)
+ ereport(ERROR,
+ (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+ errmsg("%d is outside the valid range for parameter \"%s\" (%d .. %d)",
+ (int) maxrate, "MAX_RATE", MAX_RATE_LOWER, MAX_RATE_UPPER)));
+
+ opt->maxrate = (uint32) maxrate;
+ o_maxrate = true;
+ }
+ else if (strcmp(defel->defname, "tablespace_map") == 0)
+ {
+ if (o_tablespace_map)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("duplicate option \"%s\"", defel->defname)));
+ opt->sendtblspcmapfile = true;
+ o_tablespace_map = true;
+ }
+ else if (strcmp(defel->defname, "noverify_checksums") == 0)
+ {
+ if (o_noverify_checksums)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("duplicate option \"%s\"", defel->defname)));
+ noverify_checksums = true;
+ o_noverify_checksums = true;
+ }
+ else if (strcmp(defel->defname, "manifest") == 0)
+ {
+ char *optval = strVal(defel->arg);
+ bool manifest_bool;
+
+ if (o_manifest)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("duplicate option \"%s\"", defel->defname)));
+ if (parse_bool(optval, &manifest_bool))
+ {
+ if (manifest_bool)
+ opt->manifest = MANIFEST_OPTION_YES;
+ else
+ opt->manifest = MANIFEST_OPTION_NO;
+ }
+ else if (pg_strcasecmp(optval, "force-encode") == 0)
+ opt->manifest = MANIFEST_OPTION_FORCE_ENCODE;
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("unrecognized manifest option: \"%s\"",
+ optval)));
+ o_manifest = true;
+ }
+ else if (strcmp(defel->defname, "manifest_checksums") == 0)
+ {
+ char *optval = strVal(defel->arg);
+
+ if (o_manifest_checksums)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("duplicate option \"%s\"", defel->defname)));
+ if (!pg_checksum_parse_type(optval,
+ &opt->manifest_checksum_type))
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("unrecognized checksum algorithm: \"%s\"",
+ optval)));
+ o_manifest_checksums = true;
+ }
+ else
+ elog(ERROR, "option \"%s\" not recognized",
+ defel->defname);
+ }
+ if (opt->label == NULL)
+ opt->label = "base backup";
+ if (opt->manifest == MANIFEST_OPTION_NO)
+ {
+ if (o_manifest_checksums)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("manifest checksums require a backup manifest")));
+ opt->manifest_checksum_type = CHECKSUM_TYPE_NONE;
+ }
+}
+
+
+/*
+ * SendBaseBackup() - send a complete base backup.
+ *
+ * The function will put the system into backup mode like pg_start_backup()
+ * does, so that the backup is consistent even though we read directly from
+ * the filesystem, bypassing the buffer cache.
+ */
+void
+SendBaseBackup(BaseBackupCmd *cmd)
+{
+ basebackup_options opt;
+ SessionBackupState status = get_backup_status();
+
+ if (status == SESSION_BACKUP_NON_EXCLUSIVE)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("a backup is already in progress in this session")));
+
+ parse_basebackup_options(cmd->options, &opt);
+
+ WalSndSetState(WALSNDSTATE_BACKUP);
+
+ if (update_process_title)
+ {
+ char activitymsg[50];
+
+ snprintf(activitymsg, sizeof(activitymsg), "sending backup \"%s\"",
+ opt.label);
+ set_ps_display(activitymsg);
+ }
+
+ perform_base_backup(&opt);
+}
+
+static void
+send_int8_string(StringInfoData *buf, int64 intval)
+{
+ char is[32];
+
+ sprintf(is, INT64_FORMAT, intval);
+ pq_sendint32(buf, strlen(is));
+ pq_sendbytes(buf, is, strlen(is));
+}
+
+static void
+SendBackupHeader(List *tablespaces)
+{
+ StringInfoData buf;
+ ListCell *lc;
+
+ /* Construct and send the directory information */
+ pq_beginmessage(&buf, 'T'); /* RowDescription */
+ pq_sendint16(&buf, 3); /* 3 fields */
+
+ /* First field - spcoid */
+ pq_sendstring(&buf, "spcoid");
+ pq_sendint32(&buf, 0); /* table oid */
+ pq_sendint16(&buf, 0); /* attnum */
+ pq_sendint32(&buf, OIDOID); /* type oid */
+ pq_sendint16(&buf, 4); /* typlen */
+ pq_sendint32(&buf, 0); /* typmod */
+ pq_sendint16(&buf, 0); /* format code */
+
+ /* Second field - spclocation */
+ pq_sendstring(&buf, "spclocation");
+ pq_sendint32(&buf, 0);
+ pq_sendint16(&buf, 0);
+ pq_sendint32(&buf, TEXTOID);
+ pq_sendint16(&buf, -1);
+ pq_sendint32(&buf, 0);
+ pq_sendint16(&buf, 0);
+
+ /* Third field - size */
+ pq_sendstring(&buf, "size");
+ pq_sendint32(&buf, 0);
+ pq_sendint16(&buf, 0);
+ pq_sendint32(&buf, INT8OID);
+ pq_sendint16(&buf, 8);
+ pq_sendint32(&buf, 0);
+ pq_sendint16(&buf, 0);
+ pq_endmessage(&buf);
+
+ foreach(lc, tablespaces)
+ {
+ tablespaceinfo *ti = lfirst(lc);
+
+ /* Send one datarow message */
+ pq_beginmessage(&buf, 'D');
+ pq_sendint16(&buf, 3); /* number of columns */
+ if (ti->path == NULL)
+ {
+ pq_sendint32(&buf, -1); /* Length = -1 ==> NULL */
+ pq_sendint32(&buf, -1);
+ }
+ else
+ {
+ Size len;
+
+ len = strlen(ti->oid);
+ pq_sendint32(&buf, len);
+ pq_sendbytes(&buf, ti->oid, len);
+
+ len = strlen(ti->path);
+ pq_sendint32(&buf, len);
+ pq_sendbytes(&buf, ti->path, len);
+ }
+ if (ti->size >= 0)
+ send_int8_string(&buf, ti->size / 1024);
+ else
+ pq_sendint32(&buf, -1); /* NULL */
+
+ pq_endmessage(&buf);
+ }
+
+ /* Send a CommandComplete message */
+ pq_puttextmessage('C', "SELECT");
+}
+
+/*
+ * Send a single resultset containing just a single
+ * XLogRecPtr record (in text format)
+ */
+static void
+SendXlogRecPtrResult(XLogRecPtr ptr, TimeLineID tli)
+{
+ StringInfoData buf;
+ char str[MAXFNAMELEN];
+ Size len;
+
+ pq_beginmessage(&buf, 'T'); /* RowDescription */
+ pq_sendint16(&buf, 2); /* 2 fields */
+
+ /* Field headers */
+ pq_sendstring(&buf, "recptr");
+ pq_sendint32(&buf, 0); /* table oid */
+ pq_sendint16(&buf, 0); /* attnum */
+ pq_sendint32(&buf, TEXTOID); /* type oid */
+ pq_sendint16(&buf, -1);
+ pq_sendint32(&buf, 0);
+ pq_sendint16(&buf, 0);
+
+ pq_sendstring(&buf, "tli");
+ pq_sendint32(&buf, 0); /* table oid */
+ pq_sendint16(&buf, 0); /* attnum */
+
+ /*
+ * int8 may seem like a surprising data type for this, but in theory int4
+ * would not be wide enough for this, as TimeLineID is unsigned.
+ */
+ pq_sendint32(&buf, INT8OID); /* type oid */
+ pq_sendint16(&buf, -1);
+ pq_sendint32(&buf, 0);
+ pq_sendint16(&buf, 0);
+ pq_endmessage(&buf);
+
+ /* Data row */
+ pq_beginmessage(&buf, 'D');
+ pq_sendint16(&buf, 2); /* number of columns */
+
+ len = snprintf(str, sizeof(str),
+ "%X/%X", LSN_FORMAT_ARGS(ptr));
+ pq_sendint32(&buf, len);
+ pq_sendbytes(&buf, str, len);
+
+ len = snprintf(str, sizeof(str), "%u", tli);
+ pq_sendint32(&buf, len);
+ pq_sendbytes(&buf, str, len);
+
+ pq_endmessage(&buf);
+
+ /* Send a CommandComplete message */
+ pq_puttextmessage('C', "SELECT");
+}
+
+/*
+ * Inject a file with given name and content in the output tar stream.
+ */
+static void
+sendFileWithContent(const char *filename, const char *content,
+ backup_manifest_info *manifest)
+{
+ struct stat statbuf;
+ int pad,
+ len;
+ pg_checksum_context checksum_ctx;
+
+ if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0)
+ elog(ERROR, "could not initialize checksum of file \"%s\"",
+ filename);
+
+ len = strlen(content);
+
+ /*
+ * Construct a stat struct for the backup_label file we're injecting in
+ * the tar.
+ */
+ /* Windows doesn't have the concept of uid and gid */
+#ifdef WIN32
+ statbuf.st_uid = 0;
+ statbuf.st_gid = 0;
+#else
+ statbuf.st_uid = geteuid();
+ statbuf.st_gid = getegid();
+#endif
+ statbuf.st_mtime = time(NULL);
+ statbuf.st_mode = pg_file_create_mode;
+ statbuf.st_size = len;
+
+ _tarWriteHeader(filename, NULL, &statbuf, false);
+ /* Send the contents as a CopyData message */
+ pq_putmessage('d', content, len);
+ update_basebackup_progress(len);
+
+ /* Pad to a multiple of the tar block size. */
+ pad = tarPaddingBytesRequired(len);
+ if (pad > 0)
+ {
+ char buf[TAR_BLOCK_SIZE];
+
+ MemSet(buf, 0, pad);
+ pq_putmessage('d', buf, pad);
+ update_basebackup_progress(pad);
+ }
+
+ if (pg_checksum_update(&checksum_ctx, (uint8 *) content, len) < 0)
+ elog(ERROR, "could not update checksum of file \"%s\"",
+ filename);
+
+ AddFileToBackupManifest(manifest, NULL, filename, len,
+ (pg_time_t) statbuf.st_mtime, &checksum_ctx);
+}
+
+/*
+ * Include the tablespace directory pointed to by 'path' in the output tar
+ * stream. If 'sizeonly' is true, we just calculate a total length and return
+ * it, without actually sending anything.
+ *
+ * Only used to send auxiliary tablespaces, not PGDATA.
+ */
+static int64
+sendTablespace(char *path, char *spcoid, bool sizeonly,
+ backup_manifest_info *manifest)
+{
+ int64 size;
+ char pathbuf[MAXPGPATH];
+ struct stat statbuf;
+
+ /*
+ * 'path' points to the tablespace location, but we only want to include
+ * the version directory in it that belongs to us.
+ */
+ snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path,
+ TABLESPACE_VERSION_DIRECTORY);
+
+ /*
+ * Store a directory entry in the tar file so we get the permissions
+ * right.
+ */
+ if (lstat(pathbuf, &statbuf) != 0)
+ {
+ if (errno != ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not stat file or directory \"%s\": %m",
+ pathbuf)));
+
+ /* If the tablespace went away while scanning, it's no error. */
+ return 0;
+ }
+
+ size = _tarWriteHeader(TABLESPACE_VERSION_DIRECTORY, NULL, &statbuf,
+ sizeonly);
+
+ /* Send all the files in the tablespace version directory */
+ size += sendDir(pathbuf, strlen(path), sizeonly, NIL, true, manifest,
+ spcoid);
+
+ return size;
+}
+
+/*
+ * Include all files from the given directory in the output tar stream. If
+ * 'sizeonly' is true, we just calculate a total length and return it, without
+ * actually sending anything.
+ *
+ * Omit any directory in the tablespaces list, to avoid backing up
+ * tablespaces twice when they were created inside PGDATA.
+ *
+ * If sendtblspclinks is true, we need to include symlink
+ * information in the tar file. If not, we can skip that
+ * as it will be sent separately in the tablespace_map file.
+ */
+static int64
+sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces,
+ bool sendtblspclinks, backup_manifest_info *manifest,
+ const char *spcoid)
+{
+ DIR *dir;
+ struct dirent *de;
+ char pathbuf[MAXPGPATH * 2];
+ struct stat statbuf;
+ int64 size = 0;
+ const char *lastDir; /* Split last dir from parent path. */
+ bool isDbDir = false; /* Does this directory contain relations? */
+
+ /*
+ * Determine if the current path is a database directory that can contain
+ * relations.
+ *
+ * Start by finding the location of the delimiter between the parent path
+ * and the current path.
+ */
+ lastDir = last_dir_separator(path);
+
+ /* Does this path look like a database path (i.e. all digits)? */
+ if (lastDir != NULL &&
+ strspn(lastDir + 1, "0123456789") == strlen(lastDir + 1))
+ {
+ /* Part of path that contains the parent directory. */
+ int parentPathLen = lastDir - path;
+
+ /*
+ * Mark path as a database directory if the parent path is either
+ * $PGDATA/base or a tablespace version path.
+ */
+ if (strncmp(path, "./base", parentPathLen) == 0 ||
+ (parentPathLen >= (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) &&
+ strncmp(lastDir - (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1),
+ TABLESPACE_VERSION_DIRECTORY,
+ sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) == 0))
+ isDbDir = true;
+ }
+
+ dir = AllocateDir(path);
+ while ((de = ReadDir(dir, path)) != NULL)
+ {
+ int excludeIdx;
+ bool excludeFound;
+ ForkNumber relForkNum; /* Type of fork if file is a relation */
+ int relOidChars; /* Chars in filename that are the rel oid */
+
+ /* Skip special stuff */
+ if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+ continue;
+
+ /* Skip temporary files */
+ if (strncmp(de->d_name,
+ PG_TEMP_FILE_PREFIX,
+ strlen(PG_TEMP_FILE_PREFIX)) == 0)
+ continue;
+
+ /*
+ * Check if the postmaster has signaled us to exit, and abort with an
+ * error in that case. The error handler further up will call
+ * do_pg_abort_backup() for us. Also check that if the backup was
+ * started while still in recovery, the server wasn't promoted.
+ * do_pg_stop_backup() will check that too, but it's better to stop
+ * the backup early than continue to the end and fail there.
+ */
+ CHECK_FOR_INTERRUPTS();
+ if (RecoveryInProgress() != backup_started_in_recovery)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("the standby was promoted during online backup"),
+ errhint("This means that the backup being taken is corrupt "
+ "and should not be used. "
+ "Try taking another online backup.")));
+
+ /* Scan for files that should be excluded */
+ excludeFound = false;
+ for (excludeIdx = 0; excludeFiles[excludeIdx].name != NULL; excludeIdx++)
+ {
+ int cmplen = strlen(excludeFiles[excludeIdx].name);
+
+ if (!excludeFiles[excludeIdx].match_prefix)
+ cmplen++;
+ if (strncmp(de->d_name, excludeFiles[excludeIdx].name, cmplen) == 0)
+ {
+ elog(DEBUG1, "file \"%s\" excluded from backup", de->d_name);
+ excludeFound = true;
+ break;
+ }
+ }
+
+ if (excludeFound)
+ continue;
+
+ /* Exclude all forks for unlogged tables except the init fork */
+ if (isDbDir &&
+ parse_filename_for_nontemp_relation(de->d_name, &relOidChars,
+ &relForkNum))
+ {
+ /* Never exclude init forks */
+ if (relForkNum != INIT_FORKNUM)
+ {
+ char initForkFile[MAXPGPATH];
+ char relOid[OIDCHARS + 1];
+
+ /*
+ * If any other type of fork, check if there is an init fork
+ * with the same OID. If so, the file can be excluded.
+ */
+ memcpy(relOid, de->d_name, relOidChars);
+ relOid[relOidChars] = '\0';
+ snprintf(initForkFile, sizeof(initForkFile), "%s/%s_init",
+ path, relOid);
+
+ if (lstat(initForkFile, &statbuf) == 0)
+ {
+ elog(DEBUG2,
+ "unlogged relation file \"%s\" excluded from backup",
+ de->d_name);
+
+ continue;
+ }
+ }
+ }
+
+ /* Exclude temporary relations */
+ if (isDbDir && looks_like_temp_rel_name(de->d_name))
+ {
+ elog(DEBUG2,
+ "temporary relation file \"%s\" excluded from backup",
+ de->d_name);
+
+ continue;
+ }
+
+ snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path, de->d_name);
+
+ /* Skip pg_control here to back up it last */
+ if (strcmp(pathbuf, "./global/pg_control") == 0)
+ continue;
+
+ if (lstat(pathbuf, &statbuf) != 0)
+ {
+ if (errno != ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not stat file or directory \"%s\": %m",
+ pathbuf)));
+
+ /* If the file went away while scanning, it's not an error. */
+ continue;
+ }
+
+ /* Scan for directories whose contents should be excluded */
+ excludeFound = false;
+ for (excludeIdx = 0; excludeDirContents[excludeIdx] != NULL; excludeIdx++)
+ {
+ if (strcmp(de->d_name, excludeDirContents[excludeIdx]) == 0)
+ {
+ elog(DEBUG1, "contents of directory \"%s\" excluded from backup", de->d_name);
+ size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly);
+ excludeFound = true;
+ break;
+ }
+ }
+
+ if (excludeFound)
+ continue;
+
+ /*
+ * Exclude contents of directory specified by statrelpath if not set
+ * to the default (pg_stat_tmp) which is caught in the loop above.
+ */
+ if (statrelpath != NULL && strcmp(pathbuf, statrelpath) == 0)
+ {
+ elog(DEBUG1, "contents of directory \"%s\" excluded from backup", statrelpath);
+ size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly);
+ continue;
+ }
+
+ /*
+ * We can skip pg_wal, the WAL segments need to be fetched from the
+ * WAL archive anyway. But include it as an empty directory anyway, so
+ * we get permissions right.
+ */
+ if (strcmp(pathbuf, "./pg_wal") == 0)
+ {
+ /* If pg_wal is a symlink, write it as a directory anyway */
+ size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly);
+
+ /*
+ * Also send archive_status directory (by hackishly reusing
+ * statbuf from above ...).
+ */
+ size += _tarWriteHeader("./pg_wal/archive_status", NULL, &statbuf,
+ sizeonly);
+
+ continue; /* don't recurse into pg_wal */
+ }
+
+ /* Allow symbolic links in pg_tblspc only */
+ if (strcmp(path, "./pg_tblspc") == 0 &&
+#ifndef WIN32
+ S_ISLNK(statbuf.st_mode)
+#else
+ pgwin32_is_junction(pathbuf)
+#endif
+ )
+ {
+#if defined(HAVE_READLINK) || defined(WIN32)
+ char linkpath[MAXPGPATH];
+ int rllen;
+
+ rllen = readlink(pathbuf, linkpath, sizeof(linkpath));
+ if (rllen < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read symbolic link \"%s\": %m",
+ pathbuf)));
+ if (rllen >= sizeof(linkpath))
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("symbolic link \"%s\" target is too long",
+ pathbuf)));
+ linkpath[rllen] = '\0';
+
+ size += _tarWriteHeader(pathbuf + basepathlen + 1, linkpath,
+ &statbuf, sizeonly);
+#else
+
+ /*
+ * If the platform does not have symbolic links, it should not be
+ * possible to have tablespaces - clearly somebody else created
+ * them. Warn about it and ignore.
+ */
+ ereport(WARNING,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("tablespaces are not supported on this platform")));
+ continue;
+#endif /* HAVE_READLINK */
+ }
+ else if (S_ISDIR(statbuf.st_mode))
+ {
+ bool skip_this_dir = false;
+ ListCell *lc;
+
+ /*
+ * Store a directory entry in the tar file so we can get the
+ * permissions right.
+ */
+ size += _tarWriteHeader(pathbuf + basepathlen + 1, NULL, &statbuf,
+ sizeonly);
+
+ /*
+ * Call ourselves recursively for a directory, unless it happens
+ * to be a separate tablespace located within PGDATA.
+ */
+ foreach(lc, tablespaces)
+ {
+ tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
+
+ /*
+ * ti->rpath is the tablespace relative path within PGDATA, or
+ * NULL if the tablespace has been properly located somewhere
+ * else.
+ *
+ * Skip past the leading "./" in pathbuf when comparing.
+ */
+ if (ti->rpath && strcmp(ti->rpath, pathbuf + 2) == 0)
+ {
+ skip_this_dir = true;
+ break;
+ }
+ }
+
+ /*
+ * skip sending directories inside pg_tblspc, if not required.
+ */
+ if (strcmp(pathbuf, "./pg_tblspc") == 0 && !sendtblspclinks)
+ skip_this_dir = true;
+
+ if (!skip_this_dir)
+ size += sendDir(pathbuf, basepathlen, sizeonly, tablespaces,
+ sendtblspclinks, manifest, spcoid);
+ }
+ else if (S_ISREG(statbuf.st_mode))
+ {
+ bool sent = false;
+
+ if (!sizeonly)
+ sent = sendFile(pathbuf, pathbuf + basepathlen + 1, &statbuf,
+ true, isDbDir ? atooid(lastDir + 1) : InvalidOid,
+ manifest, spcoid);
+
+ if (sent || sizeonly)
+ {
+ /* Add size. */
+ size += statbuf.st_size;
+
+ /* Pad to a multiple of the tar block size. */
+ size += tarPaddingBytesRequired(statbuf.st_size);
+
+ /* Size of the header for the file. */
+ size += TAR_BLOCK_SIZE;
+ }
+ }
+ else
+ ereport(WARNING,
+ (errmsg("skipping special file \"%s\"", pathbuf)));
+ }
+ FreeDir(dir);
+ return size;
+}
+
+/*
+ * Check if a file should have its checksum validated.
+ * We validate checksums on files in regular tablespaces
+ * (including global and default) only, and in those there
+ * are some files that are explicitly excluded.
+ */
+static bool
+is_checksummed_file(const char *fullpath, const char *filename)
+{
+ /* Check that the file is in a tablespace */
+ if (strncmp(fullpath, "./global/", 9) == 0 ||
+ strncmp(fullpath, "./base/", 7) == 0 ||
+ strncmp(fullpath, "/", 1) == 0)
+ {
+ int excludeIdx;
+
+ /* Compare file against noChecksumFiles skip list */
+ for (excludeIdx = 0; noChecksumFiles[excludeIdx].name != NULL; excludeIdx++)
+ {
+ int cmplen = strlen(noChecksumFiles[excludeIdx].name);
+
+ if (!noChecksumFiles[excludeIdx].match_prefix)
+ cmplen++;
+ if (strncmp(filename, noChecksumFiles[excludeIdx].name,
+ cmplen) == 0)
+ return false;
+ }
+
+ return true;
+ }
+ else
+ return false;
+}
+
+/*****
+ * Functions for handling tar file format
+ *
+ * Copied from pg_dump, but modified to work with libpq for sending
+ */
+
+
+/*
+ * Given the member, write the TAR header & send the file.
+ *
+ * If 'missing_ok' is true, will not throw an error if the file is not found.
+ *
+ * If dboid is anything other than InvalidOid then any checksum failures detected
+ * will get reported to the stats collector.
+ *
+ * Returns true if the file was successfully sent, false if 'missing_ok',
+ * and the file did not exist.
+ */
+static bool
+sendFile(const char *readfilename, const char *tarfilename,
+ struct stat *statbuf, bool missing_ok, Oid dboid,
+ backup_manifest_info *manifest, const char *spcoid)
+{
+ int fd;
+ BlockNumber blkno = 0;
+ bool block_retry = false;
+ char buf[TAR_SEND_SIZE];
+ uint16 checksum;
+ int checksum_failures = 0;
+ off_t cnt;
+ int i;
+ pgoff_t len = 0;
+ char *page;
+ size_t pad;
+ PageHeader phdr;
+ int segmentno = 0;
+ char *segmentpath;
+ bool verify_checksum = false;
+ pg_checksum_context checksum_ctx;
+
+ if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0)
+ elog(ERROR, "could not initialize checksum of file \"%s\"",
+ readfilename);
+
+ fd = OpenTransientFile(readfilename, O_RDONLY | PG_BINARY);
+ if (fd < 0)
+ {
+ if (errno == ENOENT && missing_ok)
+ return false;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", readfilename)));
+ }
+
+ _tarWriteHeader(tarfilename, NULL, statbuf, false);
+
+ if (!noverify_checksums && DataChecksumsEnabled())
+ {
+ char *filename;
+
+ /*
+ * Get the filename (excluding path). As last_dir_separator()
+ * includes the last directory separator, we chop that off by
+ * incrementing the pointer.
+ */
+ filename = last_dir_separator(readfilename) + 1;
+
+ if (is_checksummed_file(readfilename, filename))
+ {
+ verify_checksum = true;
+
+ /*
+ * Cut off at the segment boundary (".") to get the segment number
+ * in order to mix it into the checksum.
+ */
+ segmentpath = strstr(filename, ".");
+ if (segmentpath != NULL)
+ {
+ segmentno = atoi(segmentpath + 1);
+ if (segmentno == 0)
+ ereport(ERROR,
+ (errmsg("invalid segment number %d in file \"%s\"",
+ segmentno, filename)));
+ }
+ }
+ }
+
+ /*
+ * Loop until we read the amount of data the caller told us to expect. The
+ * file could be longer, if it was extended while we were sending it, but
+ * for a base backup we can ignore such extended data. It will be restored
+ * from WAL.
+ */
+ while (len < statbuf->st_size)
+ {
+ /* Try to read some more data. */
+ cnt = basebackup_read_file(fd, buf,
+ Min(sizeof(buf), statbuf->st_size - len),
+ len, readfilename, true);
+
+ /*
+ * If we hit end-of-file, a concurrent truncation must have occurred.
+ * That's not an error condition, because WAL replay will fix things
+ * up.
+ */
+ if (cnt == 0)
+ break;
+
+ /*
+ * The checksums are verified at block level, so we iterate over the
+ * buffer in chunks of BLCKSZ, after making sure that
+ * TAR_SEND_SIZE/buf is divisible by BLCKSZ and we read a multiple of
+ * BLCKSZ bytes.
+ */
+ Assert(TAR_SEND_SIZE % BLCKSZ == 0);
+
+ if (verify_checksum && (cnt % BLCKSZ != 0))
+ {
+ ereport(WARNING,
+ (errmsg("could not verify checksum in file \"%s\", block "
+ "%u: read buffer size %d and page size %d "
+ "differ",
+ readfilename, blkno, (int) cnt, BLCKSZ)));
+ verify_checksum = false;
+ }
+
+ if (verify_checksum)
+ {
+ for (i = 0; i < cnt / BLCKSZ; i++)
+ {
+ page = buf + BLCKSZ * i;
+
+ /*
+ * Only check pages which have not been modified since the
+ * start of the base backup. Otherwise, they might have been
+ * written only halfway and the checksum would not be valid.
+ * However, replaying WAL would reinstate the correct page in
+ * this case. We also skip completely new pages, since they
+ * don't have a checksum yet.
+ */
+ if (!PageIsNew(page) && PageGetLSN(page) < startptr)
+ {
+ checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE);
+ phdr = (PageHeader) page;
+ if (phdr->pd_checksum != checksum)
+ {
+ /*
+ * Retry the block on the first failure. It's
+ * possible that we read the first 4K page of the
+ * block just before postgres updated the entire block
+ * so it ends up looking torn to us. We only need to
+ * retry once because the LSN should be updated to
+ * something we can ignore on the next pass. If the
+ * error happens again then it is a true validation
+ * failure.
+ */
+ if (block_retry == false)
+ {
+ int reread_cnt;
+
+ /* Reread the failed block */
+ reread_cnt =
+ basebackup_read_file(fd, buf + BLCKSZ * i,
+ BLCKSZ, len + BLCKSZ * i,
+ readfilename,
+ false);
+ if (reread_cnt == 0)
+ {
+ /*
+ * If we hit end-of-file, a concurrent
+ * truncation must have occurred, so break out
+ * of this loop just as if the initial fread()
+ * returned 0. We'll drop through to the same
+ * code that handles that case. (We must fix
+ * up cnt first, though.)
+ */
+ cnt = BLCKSZ * i;
+ break;
+ }
+
+ /* Set flag so we know a retry was attempted */
+ block_retry = true;
+
+ /* Reset loop to validate the block again */
+ i--;
+ continue;
+ }
+
+ checksum_failures++;
+
+ if (checksum_failures <= 5)
+ ereport(WARNING,
+ (errmsg("checksum verification failed in "
+ "file \"%s\", block %u: calculated "
+ "%X but expected %X",
+ readfilename, blkno, checksum,
+ phdr->pd_checksum)));
+ if (checksum_failures == 5)
+ ereport(WARNING,
+ (errmsg("further checksum verification "
+ "failures in file \"%s\" will not "
+ "be reported", readfilename)));
+ }
+ }
+ block_retry = false;
+ blkno++;
+ }
+ }
+
+ /* Send the chunk as a CopyData message */
+ if (pq_putmessage('d', buf, cnt))
+ ereport(ERROR,
+ (errmsg("base backup could not send data, aborting backup")));
+ update_basebackup_progress(cnt);
+
+ /* Also feed it to the checksum machinery. */
+ if (pg_checksum_update(&checksum_ctx, (uint8 *) buf, cnt) < 0)
+ elog(ERROR, "could not update checksum of base backup");
+
+ len += cnt;
+ throttle(cnt);
+ }
+
+ /* If the file was truncated while we were sending it, pad it with zeros */
+ if (len < statbuf->st_size)
+ {
+ MemSet(buf, 0, sizeof(buf));
+ while (len < statbuf->st_size)
+ {
+ cnt = Min(sizeof(buf), statbuf->st_size - len);
+ pq_putmessage('d', buf, cnt);
+ if (pg_checksum_update(&checksum_ctx, (uint8 *) buf, cnt) < 0)
+ elog(ERROR, "could not update checksum of base backup");
+ update_basebackup_progress(cnt);
+ len += cnt;
+ throttle(cnt);
+ }
+ }
+
+ /*
+ * Pad to a block boundary, per tar format requirements. (This small piece
+ * of data is probably not worth throttling, and is not checksummed
+ * because it's not actually part of the file.)
+ */
+ pad = tarPaddingBytesRequired(len);
+ if (pad > 0)
+ {
+ MemSet(buf, 0, pad);
+ pq_putmessage('d', buf, pad);
+ update_basebackup_progress(pad);
+ }
+
+ CloseTransientFile(fd);
+
+ if (checksum_failures > 1)
+ {
+ ereport(WARNING,
+ (errmsg_plural("file \"%s\" has a total of %d checksum verification failure",
+ "file \"%s\" has a total of %d checksum verification failures",
+ checksum_failures,
+ readfilename, checksum_failures)));
+
+ pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
+ }
+
+ total_checksum_failures += checksum_failures;
+
+ AddFileToBackupManifest(manifest, spcoid, tarfilename, statbuf->st_size,
+ (pg_time_t) statbuf->st_mtime, &checksum_ctx);
+
+ return true;
+}
+
+
+static int64
+_tarWriteHeader(const char *filename, const char *linktarget,
+ struct stat *statbuf, bool sizeonly)
+{
+ char h[TAR_BLOCK_SIZE];
+ enum tarError rc;
+
+ if (!sizeonly)
+ {
+ rc = tarCreateHeader(h, filename, linktarget, statbuf->st_size,
+ statbuf->st_mode, statbuf->st_uid, statbuf->st_gid,
+ statbuf->st_mtime);
+
+ switch (rc)
+ {
+ case TAR_OK:
+ break;
+ case TAR_NAME_TOO_LONG:
+ ereport(ERROR,
+ (errmsg("file name too long for tar format: \"%s\"",
+ filename)));
+ break;
+ case TAR_SYMLINK_TOO_LONG:
+ ereport(ERROR,
+ (errmsg("symbolic link target too long for tar format: "
+ "file name \"%s\", target \"%s\"",
+ filename, linktarget)));
+ break;
+ default:
+ elog(ERROR, "unrecognized tar error: %d", rc);
+ }
+
+ pq_putmessage('d', h, sizeof(h));
+ update_basebackup_progress(sizeof(h));
+ }
+
+ return sizeof(h);
+}
+
+/*
+ * Write tar header for a directory. If the entry in statbuf is a link then
+ * write it as a directory anyway.
+ */
+static int64
+_tarWriteDir(const char *pathbuf, int basepathlen, struct stat *statbuf,
+ bool sizeonly)
+{
+ /* If symlink, write it as a directory anyway */
+#ifndef WIN32
+ if (S_ISLNK(statbuf->st_mode))
+#else
+ if (pgwin32_is_junction(pathbuf))
+#endif
+ statbuf->st_mode = S_IFDIR | pg_dir_create_mode;
+
+ return _tarWriteHeader(pathbuf + basepathlen + 1, NULL, statbuf, sizeonly);
+}
+
+/*
+ * Increment the network transfer counter by the given number of bytes,
+ * and sleep if necessary to comply with the requested network transfer
+ * rate.
+ */
+static void
+throttle(size_t increment)
+{
+ TimeOffset elapsed_min;
+
+ if (throttling_counter < 0)
+ return;
+
+ throttling_counter += increment;
+ if (throttling_counter < throttling_sample)
+ return;
+
+ /* How much time should have elapsed at minimum? */
+ elapsed_min = elapsed_min_unit *
+ (throttling_counter / throttling_sample);
+
+ /*
+ * Since the latch could be set repeatedly because of concurrently WAL
+ * activity, sleep in a loop to ensure enough time has passed.
+ */
+ for (;;)
+ {
+ TimeOffset elapsed,
+ sleep;
+ int wait_result;
+
+ /* Time elapsed since the last measurement (and possible wake up). */
+ elapsed = GetCurrentTimestamp() - throttled_last;
+
+ /* sleep if the transfer is faster than it should be */
+ sleep = elapsed_min - elapsed;
+ if (sleep <= 0)
+ break;
+
+ ResetLatch(MyLatch);
+
+ /* We're eating a potentially set latch, so check for interrupts */
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * (TAR_SEND_SIZE / throttling_sample * elapsed_min_unit) should be
+ * the maximum time to sleep. Thus the cast to long is safe.
+ */
+ wait_result = WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+ (long) (sleep / 1000),
+ WAIT_EVENT_BASE_BACKUP_THROTTLE);
+
+ if (wait_result & WL_LATCH_SET)
+ CHECK_FOR_INTERRUPTS();
+
+ /* Done waiting? */
+ if (wait_result & WL_TIMEOUT)
+ break;
+ }
+
+ /*
+ * As we work with integers, only whole multiple of throttling_sample was
+ * processed. The rest will be done during the next call of this function.
+ */
+ throttling_counter %= throttling_sample;
+
+ /*
+ * Time interval for the remaining amount and possible next increments
+ * starts now.
+ */
+ throttled_last = GetCurrentTimestamp();
+}
+
+/*
+ * Increment the counter for the amount of data already streamed
+ * by the given number of bytes, and update the progress report for
+ * pg_stat_progress_basebackup.
+ */
+static void
+update_basebackup_progress(int64 delta)
+{
+ const int index[] = {
+ PROGRESS_BASEBACKUP_BACKUP_STREAMED,
+ PROGRESS_BASEBACKUP_BACKUP_TOTAL
+ };
+ int64 val[2];
+ int nparam = 0;
+
+ backup_streamed += delta;
+ val[nparam++] = backup_streamed;
+
+ /*
+ * Avoid overflowing past 100% or the full size. This may make the total
+ * size number change as we approach the end of the backup (the estimate
+ * will always be wrong if WAL is included), but that's better than having
+ * the done column be bigger than the total.
+ */
+ if (backup_total > -1 && backup_streamed > backup_total)
+ {
+ backup_total = backup_streamed;
+ val[nparam++] = backup_total;
+ }
+
+ pgstat_progress_update_multi_param(nparam, index, val);
+}
+
+/*
+ * Read some data from a file, setting a wait event and reporting any error
+ * encountered.
+ *
+ * If partial_read_ok is false, also report an error if the number of bytes
+ * read is not equal to the number of bytes requested.
+ *
+ * Returns the number of bytes read.
+ */
+static int
+basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset,
+ const char *filename, bool partial_read_ok)
+{
+ int rc;
+
+ pgstat_report_wait_start(WAIT_EVENT_BASEBACKUP_READ);
+ rc = pg_pread(fd, buf, nbytes, offset);
+ pgstat_report_wait_end();
+
+ if (rc < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m", filename)));
+ if (!partial_read_ok && rc > 0 && rc != nbytes)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": read %d of %zu",
+ filename, rc, nbytes)));
+
+ return rc;
+}