diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
commit | 46651ce6fe013220ed397add242004d764fc0153 (patch) | |
tree | 6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/replication/basebackup.c | |
parent | Initial commit. (diff) | |
download | postgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip |
Adding upstream version 14.5.upstream/14.5upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/replication/basebackup.c')
-rw-r--r-- | src/backend/replication/basebackup.c | 2034 |
1 files changed, 2034 insertions, 0 deletions
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c new file mode 100644 index 0000000..d142cc2 --- /dev/null +++ b/src/backend/replication/basebackup.c @@ -0,0 +1,2034 @@ +/*------------------------------------------------------------------------- + * + * basebackup.c + * code for taking a base backup and streaming it to a standby + * + * Portions Copyright (c) 2010-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/basebackup.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <sys/stat.h> +#include <unistd.h> +#include <time.h> + +#include "access/xlog_internal.h" /* for pg_start/stop_backup */ +#include "catalog/pg_type.h" +#include "common/file_perm.h" +#include "commands/progress.h" +#include "lib/stringinfo.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "miscadmin.h" +#include "nodes/pg_list.h" +#include "pgstat.h" +#include "pgtar.h" +#include "port.h" +#include "postmaster/syslogger.h" +#include "replication/basebackup.h" +#include "replication/backup_manifest.h" +#include "replication/walsender.h" +#include "replication/walsender_private.h" +#include "storage/bufpage.h" +#include "storage/checksum.h" +#include "storage/dsm_impl.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/reinit.h" +#include "utils/builtins.h" +#include "utils/ps_status.h" +#include "utils/relcache.h" +#include "utils/resowner.h" +#include "utils/timestamp.h" + +typedef struct +{ + const char *label; + bool progress; + bool fastcheckpoint; + bool nowait; + bool includewal; + uint32 maxrate; + bool sendtblspcmapfile; + backup_manifest_option manifest; + pg_checksum_type manifest_checksum_type; +} basebackup_options; + +static int64 sendTablespace(char *path, char *oid, bool sizeonly, + struct backup_manifest_info *manifest); +static int64 sendDir(const char *path, int basepathlen, bool sizeonly, + List *tablespaces, bool sendtblspclinks, + backup_manifest_info *manifest, const char *spcoid); +static bool sendFile(const char *readfilename, const char *tarfilename, + struct stat *statbuf, bool missing_ok, Oid dboid, + backup_manifest_info *manifest, const char *spcoid); +static void sendFileWithContent(const char *filename, const char *content, + backup_manifest_info *manifest); +static int64 _tarWriteHeader(const char *filename, const char *linktarget, + struct stat *statbuf, bool sizeonly); +static int64 _tarWriteDir(const char *pathbuf, int basepathlen, struct stat *statbuf, + bool sizeonly); +static void send_int8_string(StringInfoData *buf, int64 intval); +static void SendBackupHeader(List *tablespaces); +static void perform_base_backup(basebackup_options *opt); +static void parse_basebackup_options(List *options, basebackup_options *opt); +static void SendXlogRecPtrResult(XLogRecPtr ptr, TimeLineID tli); +static int compareWalFileNames(const ListCell *a, const ListCell *b); +static void throttle(size_t increment); +static void update_basebackup_progress(int64 delta); +static bool is_checksummed_file(const char *fullpath, const char *filename); +static int basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset, + const char *filename, bool partial_read_ok); + +/* Was the backup currently in-progress initiated in recovery mode? */ +static bool backup_started_in_recovery = false; + +/* Relative path of temporary statistics directory */ +static char *statrelpath = NULL; + +/* + * Size of each block sent into the tar stream for larger files. + */ +#define TAR_SEND_SIZE 32768 + +/* + * How frequently to throttle, as a fraction of the specified rate-second. + */ +#define THROTTLING_FREQUENCY 8 + +/* The actual number of bytes, transfer of which may cause sleep. */ +static uint64 throttling_sample; + +/* Amount of data already transferred but not yet throttled. */ +static int64 throttling_counter; + +/* The minimum time required to transfer throttling_sample bytes. */ +static TimeOffset elapsed_min_unit; + +/* The last check of the transfer rate. */ +static TimestampTz throttled_last; + +/* The starting XLOG position of the base backup. */ +static XLogRecPtr startptr; + +/* Total number of checksum failures during base backup. */ +static long long int total_checksum_failures; + +/* Do not verify checksums. */ +static bool noverify_checksums = false; + +/* + * Total amount of backup data that will be streamed. + * -1 means that the size is not estimated. + */ +static int64 backup_total = 0; + +/* Amount of backup data already streamed */ +static int64 backup_streamed = 0; + +/* + * Definition of one element part of an exclusion list, used for paths part + * of checksum validation or base backups. "name" is the name of the file + * or path to check for exclusion. If "match_prefix" is true, any items + * matching the name as prefix are excluded. + */ +struct exclude_list_item +{ + const char *name; + bool match_prefix; +}; + +/* + * The contents of these directories are removed or recreated during server + * start so they are not included in backups. The directories themselves are + * kept and included as empty to preserve access permissions. + * + * Note: this list should be kept in sync with the filter lists in pg_rewind's + * filemap.c. + */ +static const char *const excludeDirContents[] = +{ + /* + * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped even + * when stats_temp_directory is set because PGSS_TEXT_FILE is always + * created there. + */ + PG_STAT_TMP_DIR, + + /* + * It is generally not useful to backup the contents of this directory + * even if the intention is to restore to another primary. See backup.sgml + * for a more detailed description. + */ + "pg_replslot", + + /* Contents removed on startup, see dsm_cleanup_for_mmap(). */ + PG_DYNSHMEM_DIR, + + /* Contents removed on startup, see AsyncShmemInit(). */ + "pg_notify", + + /* + * Old contents are loaded for possible debugging but are not required for + * normal operation, see SerialInit(). + */ + "pg_serial", + + /* Contents removed on startup, see DeleteAllExportedSnapshotFiles(). */ + "pg_snapshots", + + /* Contents zeroed on startup, see StartupSUBTRANS(). */ + "pg_subtrans", + + /* end of list */ + NULL +}; + +/* + * List of files excluded from backups. + */ +static const struct exclude_list_item excludeFiles[] = +{ + /* Skip auto conf temporary file. */ + {PG_AUTOCONF_FILENAME ".tmp", false}, + + /* Skip current log file temporary file */ + {LOG_METAINFO_DATAFILE_TMP, false}, + + /* + * Skip relation cache because it is rebuilt on startup. This includes + * temporary files. + */ + {RELCACHE_INIT_FILENAME, true}, + + /* + * If there's a backup_label or tablespace_map file, it belongs to a + * backup started by the user with pg_start_backup(). It is *not* correct + * for this backup. Our backup_label/tablespace_map is injected into the + * tar separately. + */ + {BACKUP_LABEL_FILE, false}, + {TABLESPACE_MAP, false}, + + /* + * If there's a backup_manifest, it belongs to a backup that was used to + * start this server. It is *not* correct for this backup. Our + * backup_manifest is injected into the backup separately if users want + * it. + */ + {"backup_manifest", false}, + + {"postmaster.pid", false}, + {"postmaster.opts", false}, + + /* end of list */ + {NULL, false} +}; + +/* + * List of files excluded from checksum validation. + * + * Note: this list should be kept in sync with what pg_checksums.c + * includes. + */ +static const struct exclude_list_item noChecksumFiles[] = { + {"pg_control", false}, + {"pg_filenode.map", false}, + {"pg_internal.init", true}, + {"PG_VERSION", false}, +#ifdef EXEC_BACKEND + {"config_exec_params", true}, +#endif + {NULL, false} +}; + +/* + * Actually do a base backup for the specified tablespaces. + * + * This is split out mainly to avoid complaints about "variable might be + * clobbered by longjmp" from stupider versions of gcc. + */ +static void +perform_base_backup(basebackup_options *opt) +{ + TimeLineID starttli; + XLogRecPtr endptr; + TimeLineID endtli; + StringInfo labelfile; + StringInfo tblspc_map_file; + backup_manifest_info manifest; + int datadirpathlen; + List *tablespaces = NIL; + + backup_total = 0; + backup_streamed = 0; + pgstat_progress_start_command(PROGRESS_COMMAND_BASEBACKUP, InvalidOid); + + /* + * If the estimation of the total backup size is disabled, make the + * backup_total column in the view return NULL by setting the parameter to + * -1. + */ + if (!opt->progress) + { + backup_total = -1; + pgstat_progress_update_param(PROGRESS_BASEBACKUP_BACKUP_TOTAL, + backup_total); + } + + /* we're going to use a BufFile, so we need a ResourceOwner */ + Assert(CurrentResourceOwner == NULL); + CurrentResourceOwner = ResourceOwnerCreate(NULL, "base backup"); + + datadirpathlen = strlen(DataDir); + + backup_started_in_recovery = RecoveryInProgress(); + + labelfile = makeStringInfo(); + tblspc_map_file = makeStringInfo(); + InitializeBackupManifest(&manifest, opt->manifest, + opt->manifest_checksum_type); + + total_checksum_failures = 0; + + pgstat_progress_update_param(PROGRESS_BASEBACKUP_PHASE, + PROGRESS_BASEBACKUP_PHASE_WAIT_CHECKPOINT); + startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint, &starttli, + labelfile, &tablespaces, + tblspc_map_file); + + /* + * Once do_pg_start_backup has been called, ensure that any failure causes + * us to abort the backup so we don't "leak" a backup counter. For this + * reason, *all* functionality between do_pg_start_backup() and the end of + * do_pg_stop_backup() should be inside the error cleanup block! + */ + + PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false)); + { + ListCell *lc; + tablespaceinfo *ti; + int tblspc_streamed = 0; + + /* + * Calculate the relative path of temporary statistics directory in + * order to skip the files which are located in that directory later. + */ + if (is_absolute_path(pgstat_stat_directory) && + strncmp(pgstat_stat_directory, DataDir, datadirpathlen) == 0) + statrelpath = psprintf("./%s", pgstat_stat_directory + datadirpathlen + 1); + else if (strncmp(pgstat_stat_directory, "./", 2) != 0) + statrelpath = psprintf("./%s", pgstat_stat_directory); + else + statrelpath = pgstat_stat_directory; + + /* Add a node for the base directory at the end */ + ti = palloc0(sizeof(tablespaceinfo)); + ti->size = -1; + tablespaces = lappend(tablespaces, ti); + + /* + * Calculate the total backup size by summing up the size of each + * tablespace + */ + if (opt->progress) + { + pgstat_progress_update_param(PROGRESS_BASEBACKUP_PHASE, + PROGRESS_BASEBACKUP_PHASE_ESTIMATE_BACKUP_SIZE); + + foreach(lc, tablespaces) + { + tablespaceinfo *tmp = (tablespaceinfo *) lfirst(lc); + + if (tmp->path == NULL) + tmp->size = sendDir(".", 1, true, tablespaces, true, NULL, + NULL); + else + tmp->size = sendTablespace(tmp->path, tmp->oid, true, + NULL); + backup_total += tmp->size; + } + } + + /* Report that we are now streaming database files as a base backup */ + { + const int index[] = { + PROGRESS_BASEBACKUP_PHASE, + PROGRESS_BASEBACKUP_BACKUP_TOTAL, + PROGRESS_BASEBACKUP_TBLSPC_TOTAL + }; + const int64 val[] = { + PROGRESS_BASEBACKUP_PHASE_STREAM_BACKUP, + backup_total, list_length(tablespaces) + }; + + pgstat_progress_update_multi_param(3, index, val); + } + + /* Send the starting position of the backup */ + SendXlogRecPtrResult(startptr, starttli); + + /* Send tablespace header */ + SendBackupHeader(tablespaces); + + /* Setup and activate network throttling, if client requested it */ + if (opt->maxrate > 0) + { + throttling_sample = + (int64) opt->maxrate * (int64) 1024 / THROTTLING_FREQUENCY; + + /* + * The minimum amount of time for throttling_sample bytes to be + * transferred. + */ + elapsed_min_unit = USECS_PER_SEC / THROTTLING_FREQUENCY; + + /* Enable throttling. */ + throttling_counter = 0; + + /* The 'real data' starts now (header was ignored). */ + throttled_last = GetCurrentTimestamp(); + } + else + { + /* Disable throttling. */ + throttling_counter = -1; + } + + /* Send off our tablespaces one by one */ + foreach(lc, tablespaces) + { + tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc); + StringInfoData buf; + + /* Send CopyOutResponse message */ + pq_beginmessage(&buf, 'H'); + pq_sendbyte(&buf, 0); /* overall format */ + pq_sendint16(&buf, 0); /* natts */ + pq_endmessage(&buf); + + if (ti->path == NULL) + { + struct stat statbuf; + bool sendtblspclinks = true; + + /* In the main tar, include the backup_label first... */ + sendFileWithContent(BACKUP_LABEL_FILE, labelfile->data, + &manifest); + + /* Then the tablespace_map file, if required... */ + if (opt->sendtblspcmapfile) + { + sendFileWithContent(TABLESPACE_MAP, tblspc_map_file->data, + &manifest); + sendtblspclinks = false; + } + + /* Then the bulk of the files... */ + sendDir(".", 1, false, tablespaces, sendtblspclinks, + &manifest, NULL); + + /* ... and pg_control after everything else. */ + if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + XLOG_CONTROL_FILE))); + sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, + false, InvalidOid, &manifest, NULL); + } + else + sendTablespace(ti->path, ti->oid, false, &manifest); + + /* + * If we're including WAL, and this is the main data directory we + * don't terminate the tar stream here. Instead, we will append + * the xlog files below and terminate it then. This is safe since + * the main data directory is always sent *last*. + */ + if (opt->includewal && ti->path == NULL) + { + Assert(lnext(tablespaces, lc) == NULL); + } + else + pq_putemptymessage('c'); /* CopyDone */ + + tblspc_streamed++; + pgstat_progress_update_param(PROGRESS_BASEBACKUP_TBLSPC_STREAMED, + tblspc_streamed); + } + + pgstat_progress_update_param(PROGRESS_BASEBACKUP_PHASE, + PROGRESS_BASEBACKUP_PHASE_WAIT_WAL_ARCHIVE); + endptr = do_pg_stop_backup(labelfile->data, !opt->nowait, &endtli); + } + PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false)); + + + if (opt->includewal) + { + /* + * We've left the last tar file "open", so we can now append the + * required WAL files to it. + */ + char pathbuf[MAXPGPATH]; + XLogSegNo segno; + XLogSegNo startsegno; + XLogSegNo endsegno; + struct stat statbuf; + List *historyFileList = NIL; + List *walFileList = NIL; + char firstoff[MAXFNAMELEN]; + char lastoff[MAXFNAMELEN]; + DIR *dir; + struct dirent *de; + ListCell *lc; + TimeLineID tli; + + pgstat_progress_update_param(PROGRESS_BASEBACKUP_PHASE, + PROGRESS_BASEBACKUP_PHASE_TRANSFER_WAL); + + /* + * I'd rather not worry about timelines here, so scan pg_wal and + * include all WAL files in the range between 'startptr' and 'endptr', + * regardless of the timeline the file is stamped with. If there are + * some spurious WAL files belonging to timelines that don't belong in + * this server's history, they will be included too. Normally there + * shouldn't be such files, but if there are, there's little harm in + * including them. + */ + XLByteToSeg(startptr, startsegno, wal_segment_size); + XLogFileName(firstoff, ThisTimeLineID, startsegno, wal_segment_size); + XLByteToPrevSeg(endptr, endsegno, wal_segment_size); + XLogFileName(lastoff, ThisTimeLineID, endsegno, wal_segment_size); + + dir = AllocateDir("pg_wal"); + while ((de = ReadDir(dir, "pg_wal")) != NULL) + { + /* Does it look like a WAL segment, and is it in the range? */ + if (IsXLogFileName(de->d_name) && + strcmp(de->d_name + 8, firstoff + 8) >= 0 && + strcmp(de->d_name + 8, lastoff + 8) <= 0) + { + walFileList = lappend(walFileList, pstrdup(de->d_name)); + } + /* Does it look like a timeline history file? */ + else if (IsTLHistoryFileName(de->d_name)) + { + historyFileList = lappend(historyFileList, pstrdup(de->d_name)); + } + } + FreeDir(dir); + + /* + * Before we go any further, check that none of the WAL segments we + * need were removed. + */ + CheckXLogRemoved(startsegno, ThisTimeLineID); + + /* + * Sort the WAL filenames. We want to send the files in order from + * oldest to newest, to reduce the chance that a file is recycled + * before we get a chance to send it over. + */ + list_sort(walFileList, compareWalFileNames); + + /* + * There must be at least one xlog file in the pg_wal directory, since + * we are doing backup-including-xlog. + */ + if (walFileList == NIL) + ereport(ERROR, + (errmsg("could not find any WAL files"))); + + /* + * Sanity check: the first and last segment should cover startptr and + * endptr, with no gaps in between. + */ + XLogFromFileName((char *) linitial(walFileList), + &tli, &segno, wal_segment_size); + if (segno != startsegno) + { + char startfname[MAXFNAMELEN]; + + XLogFileName(startfname, ThisTimeLineID, startsegno, + wal_segment_size); + ereport(ERROR, + (errmsg("could not find WAL file \"%s\"", startfname))); + } + foreach(lc, walFileList) + { + char *walFileName = (char *) lfirst(lc); + XLogSegNo currsegno = segno; + XLogSegNo nextsegno = segno + 1; + + XLogFromFileName(walFileName, &tli, &segno, wal_segment_size); + if (!(nextsegno == segno || currsegno == segno)) + { + char nextfname[MAXFNAMELEN]; + + XLogFileName(nextfname, ThisTimeLineID, nextsegno, + wal_segment_size); + ereport(ERROR, + (errmsg("could not find WAL file \"%s\"", nextfname))); + } + } + if (segno != endsegno) + { + char endfname[MAXFNAMELEN]; + + XLogFileName(endfname, ThisTimeLineID, endsegno, wal_segment_size); + ereport(ERROR, + (errmsg("could not find WAL file \"%s\"", endfname))); + } + + /* Ok, we have everything we need. Send the WAL files. */ + foreach(lc, walFileList) + { + char *walFileName = (char *) lfirst(lc); + int fd; + char buf[TAR_SEND_SIZE]; + size_t cnt; + pgoff_t len = 0; + + snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", walFileName); + XLogFromFileName(walFileName, &tli, &segno, wal_segment_size); + + fd = OpenTransientFile(pathbuf, O_RDONLY | PG_BINARY); + if (fd < 0) + { + int save_errno = errno; + + /* + * Most likely reason for this is that the file was already + * removed by a checkpoint, so check for that to get a better + * error message. + */ + CheckXLogRemoved(segno, tli); + + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", pathbuf))); + } + + if (fstat(fd, &statbuf) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + pathbuf))); + if (statbuf.st_size != wal_segment_size) + { + CheckXLogRemoved(segno, tli); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("unexpected WAL file size \"%s\"", walFileName))); + } + + /* send the WAL file itself */ + _tarWriteHeader(pathbuf, NULL, &statbuf, false); + + while ((cnt = basebackup_read_file(fd, buf, + Min(sizeof(buf), + wal_segment_size - len), + len, pathbuf, true)) > 0) + { + CheckXLogRemoved(segno, tli); + /* Send the chunk as a CopyData message */ + if (pq_putmessage('d', buf, cnt)) + ereport(ERROR, + (errmsg("base backup could not send data, aborting backup"))); + update_basebackup_progress(cnt); + + len += cnt; + throttle(cnt); + + if (len == wal_segment_size) + break; + } + + if (len != wal_segment_size) + { + CheckXLogRemoved(segno, tli); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("unexpected WAL file size \"%s\"", walFileName))); + } + + /* + * wal_segment_size is a multiple of TAR_BLOCK_SIZE, so no need + * for padding. + */ + Assert(wal_segment_size % TAR_BLOCK_SIZE == 0); + + CloseTransientFile(fd); + + /* + * Mark file as archived, otherwise files can get archived again + * after promotion of a new node. This is in line with + * walreceiver.c always doing an XLogArchiveForceDone() after a + * complete segment. + */ + StatusFilePath(pathbuf, walFileName, ".done"); + sendFileWithContent(pathbuf, "", &manifest); + } + + /* + * Send timeline history files too. Only the latest timeline history + * file is required for recovery, and even that only if there happens + * to be a timeline switch in the first WAL segment that contains the + * checkpoint record, or if we're taking a base backup from a standby + * server and the target timeline changes while the backup is taken. + * But they are small and highly useful for debugging purposes, so + * better include them all, always. + */ + foreach(lc, historyFileList) + { + char *fname = lfirst(lc); + + snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", fname); + + if (lstat(pathbuf, &statbuf) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", pathbuf))); + + sendFile(pathbuf, pathbuf, &statbuf, false, InvalidOid, + &manifest, NULL); + + /* unconditionally mark file as archived */ + StatusFilePath(pathbuf, fname, ".done"); + sendFileWithContent(pathbuf, "", &manifest); + } + + /* Send CopyDone message for the last tar file */ + pq_putemptymessage('c'); + } + + AddWALInfoToBackupManifest(&manifest, startptr, starttli, endptr, endtli); + + SendBackupManifest(&manifest); + + SendXlogRecPtrResult(endptr, endtli); + + if (total_checksum_failures) + { + if (total_checksum_failures > 1) + ereport(WARNING, + (errmsg_plural("%lld total checksum verification failure", + "%lld total checksum verification failures", + total_checksum_failures, + total_checksum_failures))); + + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("checksum verification failure during base backup"))); + } + + /* + * Make sure to free the manifest before the resource owners as manifests + * use cryptohash contexts that may depend on resource owners (like + * OpenSSL). + */ + FreeBackupManifest(&manifest); + + /* clean up the resource owner we created */ + WalSndResourceCleanup(true); + + pgstat_progress_end_command(); +} + +/* + * list_sort comparison function, to compare log/seg portion of WAL segment + * filenames, ignoring the timeline portion. + */ +static int +compareWalFileNames(const ListCell *a, const ListCell *b) +{ + char *fna = (char *) lfirst(a); + char *fnb = (char *) lfirst(b); + + return strcmp(fna + 8, fnb + 8); +} + +/* + * Parse the base backup options passed down by the parser + */ +static void +parse_basebackup_options(List *options, basebackup_options *opt) +{ + ListCell *lopt; + bool o_label = false; + bool o_progress = false; + bool o_fast = false; + bool o_nowait = false; + bool o_wal = false; + bool o_maxrate = false; + bool o_tablespace_map = false; + bool o_noverify_checksums = false; + bool o_manifest = false; + bool o_manifest_checksums = false; + + MemSet(opt, 0, sizeof(*opt)); + opt->manifest = MANIFEST_OPTION_NO; + opt->manifest_checksum_type = CHECKSUM_TYPE_CRC32C; + + foreach(lopt, options) + { + DefElem *defel = (DefElem *) lfirst(lopt); + + if (strcmp(defel->defname, "label") == 0) + { + if (o_label) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("duplicate option \"%s\"", defel->defname))); + opt->label = strVal(defel->arg); + o_label = true; + } + else if (strcmp(defel->defname, "progress") == 0) + { + if (o_progress) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("duplicate option \"%s\"", defel->defname))); + opt->progress = true; + o_progress = true; + } + else if (strcmp(defel->defname, "fast") == 0) + { + if (o_fast) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("duplicate option \"%s\"", defel->defname))); + opt->fastcheckpoint = true; + o_fast = true; + } + else if (strcmp(defel->defname, "nowait") == 0) + { + if (o_nowait) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("duplicate option \"%s\"", defel->defname))); + opt->nowait = true; + o_nowait = true; + } + else if (strcmp(defel->defname, "wal") == 0) + { + if (o_wal) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("duplicate option \"%s\"", defel->defname))); + opt->includewal = true; + o_wal = true; + } + else if (strcmp(defel->defname, "max_rate") == 0) + { + long maxrate; + + if (o_maxrate) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("duplicate option \"%s\"", defel->defname))); + + maxrate = intVal(defel->arg); + if (maxrate < MAX_RATE_LOWER || maxrate > MAX_RATE_UPPER) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("%d is outside the valid range for parameter \"%s\" (%d .. %d)", + (int) maxrate, "MAX_RATE", MAX_RATE_LOWER, MAX_RATE_UPPER))); + + opt->maxrate = (uint32) maxrate; + o_maxrate = true; + } + else if (strcmp(defel->defname, "tablespace_map") == 0) + { + if (o_tablespace_map) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("duplicate option \"%s\"", defel->defname))); + opt->sendtblspcmapfile = true; + o_tablespace_map = true; + } + else if (strcmp(defel->defname, "noverify_checksums") == 0) + { + if (o_noverify_checksums) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("duplicate option \"%s\"", defel->defname))); + noverify_checksums = true; + o_noverify_checksums = true; + } + else if (strcmp(defel->defname, "manifest") == 0) + { + char *optval = strVal(defel->arg); + bool manifest_bool; + + if (o_manifest) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("duplicate option \"%s\"", defel->defname))); + if (parse_bool(optval, &manifest_bool)) + { + if (manifest_bool) + opt->manifest = MANIFEST_OPTION_YES; + else + opt->manifest = MANIFEST_OPTION_NO; + } + else if (pg_strcasecmp(optval, "force-encode") == 0) + opt->manifest = MANIFEST_OPTION_FORCE_ENCODE; + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("unrecognized manifest option: \"%s\"", + optval))); + o_manifest = true; + } + else if (strcmp(defel->defname, "manifest_checksums") == 0) + { + char *optval = strVal(defel->arg); + + if (o_manifest_checksums) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("duplicate option \"%s\"", defel->defname))); + if (!pg_checksum_parse_type(optval, + &opt->manifest_checksum_type)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("unrecognized checksum algorithm: \"%s\"", + optval))); + o_manifest_checksums = true; + } + else + elog(ERROR, "option \"%s\" not recognized", + defel->defname); + } + if (opt->label == NULL) + opt->label = "base backup"; + if (opt->manifest == MANIFEST_OPTION_NO) + { + if (o_manifest_checksums) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("manifest checksums require a backup manifest"))); + opt->manifest_checksum_type = CHECKSUM_TYPE_NONE; + } +} + + +/* + * SendBaseBackup() - send a complete base backup. + * + * The function will put the system into backup mode like pg_start_backup() + * does, so that the backup is consistent even though we read directly from + * the filesystem, bypassing the buffer cache. + */ +void +SendBaseBackup(BaseBackupCmd *cmd) +{ + basebackup_options opt; + SessionBackupState status = get_backup_status(); + + if (status == SESSION_BACKUP_NON_EXCLUSIVE) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("a backup is already in progress in this session"))); + + parse_basebackup_options(cmd->options, &opt); + + WalSndSetState(WALSNDSTATE_BACKUP); + + if (update_process_title) + { + char activitymsg[50]; + + snprintf(activitymsg, sizeof(activitymsg), "sending backup \"%s\"", + opt.label); + set_ps_display(activitymsg); + } + + perform_base_backup(&opt); +} + +static void +send_int8_string(StringInfoData *buf, int64 intval) +{ + char is[32]; + + sprintf(is, INT64_FORMAT, intval); + pq_sendint32(buf, strlen(is)); + pq_sendbytes(buf, is, strlen(is)); +} + +static void +SendBackupHeader(List *tablespaces) +{ + StringInfoData buf; + ListCell *lc; + + /* Construct and send the directory information */ + pq_beginmessage(&buf, 'T'); /* RowDescription */ + pq_sendint16(&buf, 3); /* 3 fields */ + + /* First field - spcoid */ + pq_sendstring(&buf, "spcoid"); + pq_sendint32(&buf, 0); /* table oid */ + pq_sendint16(&buf, 0); /* attnum */ + pq_sendint32(&buf, OIDOID); /* type oid */ + pq_sendint16(&buf, 4); /* typlen */ + pq_sendint32(&buf, 0); /* typmod */ + pq_sendint16(&buf, 0); /* format code */ + + /* Second field - spclocation */ + pq_sendstring(&buf, "spclocation"); + pq_sendint32(&buf, 0); + pq_sendint16(&buf, 0); + pq_sendint32(&buf, TEXTOID); + pq_sendint16(&buf, -1); + pq_sendint32(&buf, 0); + pq_sendint16(&buf, 0); + + /* Third field - size */ + pq_sendstring(&buf, "size"); + pq_sendint32(&buf, 0); + pq_sendint16(&buf, 0); + pq_sendint32(&buf, INT8OID); + pq_sendint16(&buf, 8); + pq_sendint32(&buf, 0); + pq_sendint16(&buf, 0); + pq_endmessage(&buf); + + foreach(lc, tablespaces) + { + tablespaceinfo *ti = lfirst(lc); + + /* Send one datarow message */ + pq_beginmessage(&buf, 'D'); + pq_sendint16(&buf, 3); /* number of columns */ + if (ti->path == NULL) + { + pq_sendint32(&buf, -1); /* Length = -1 ==> NULL */ + pq_sendint32(&buf, -1); + } + else + { + Size len; + + len = strlen(ti->oid); + pq_sendint32(&buf, len); + pq_sendbytes(&buf, ti->oid, len); + + len = strlen(ti->path); + pq_sendint32(&buf, len); + pq_sendbytes(&buf, ti->path, len); + } + if (ti->size >= 0) + send_int8_string(&buf, ti->size / 1024); + else + pq_sendint32(&buf, -1); /* NULL */ + + pq_endmessage(&buf); + } + + /* Send a CommandComplete message */ + pq_puttextmessage('C', "SELECT"); +} + +/* + * Send a single resultset containing just a single + * XLogRecPtr record (in text format) + */ +static void +SendXlogRecPtrResult(XLogRecPtr ptr, TimeLineID tli) +{ + StringInfoData buf; + char str[MAXFNAMELEN]; + Size len; + + pq_beginmessage(&buf, 'T'); /* RowDescription */ + pq_sendint16(&buf, 2); /* 2 fields */ + + /* Field headers */ + pq_sendstring(&buf, "recptr"); + pq_sendint32(&buf, 0); /* table oid */ + pq_sendint16(&buf, 0); /* attnum */ + pq_sendint32(&buf, TEXTOID); /* type oid */ + pq_sendint16(&buf, -1); + pq_sendint32(&buf, 0); + pq_sendint16(&buf, 0); + + pq_sendstring(&buf, "tli"); + pq_sendint32(&buf, 0); /* table oid */ + pq_sendint16(&buf, 0); /* attnum */ + + /* + * int8 may seem like a surprising data type for this, but in theory int4 + * would not be wide enough for this, as TimeLineID is unsigned. + */ + pq_sendint32(&buf, INT8OID); /* type oid */ + pq_sendint16(&buf, -1); + pq_sendint32(&buf, 0); + pq_sendint16(&buf, 0); + pq_endmessage(&buf); + + /* Data row */ + pq_beginmessage(&buf, 'D'); + pq_sendint16(&buf, 2); /* number of columns */ + + len = snprintf(str, sizeof(str), + "%X/%X", LSN_FORMAT_ARGS(ptr)); + pq_sendint32(&buf, len); + pq_sendbytes(&buf, str, len); + + len = snprintf(str, sizeof(str), "%u", tli); + pq_sendint32(&buf, len); + pq_sendbytes(&buf, str, len); + + pq_endmessage(&buf); + + /* Send a CommandComplete message */ + pq_puttextmessage('C', "SELECT"); +} + +/* + * Inject a file with given name and content in the output tar stream. + */ +static void +sendFileWithContent(const char *filename, const char *content, + backup_manifest_info *manifest) +{ + struct stat statbuf; + int pad, + len; + pg_checksum_context checksum_ctx; + + if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0) + elog(ERROR, "could not initialize checksum of file \"%s\"", + filename); + + len = strlen(content); + + /* + * Construct a stat struct for the backup_label file we're injecting in + * the tar. + */ + /* Windows doesn't have the concept of uid and gid */ +#ifdef WIN32 + statbuf.st_uid = 0; + statbuf.st_gid = 0; +#else + statbuf.st_uid = geteuid(); + statbuf.st_gid = getegid(); +#endif + statbuf.st_mtime = time(NULL); + statbuf.st_mode = pg_file_create_mode; + statbuf.st_size = len; + + _tarWriteHeader(filename, NULL, &statbuf, false); + /* Send the contents as a CopyData message */ + pq_putmessage('d', content, len); + update_basebackup_progress(len); + + /* Pad to a multiple of the tar block size. */ + pad = tarPaddingBytesRequired(len); + if (pad > 0) + { + char buf[TAR_BLOCK_SIZE]; + + MemSet(buf, 0, pad); + pq_putmessage('d', buf, pad); + update_basebackup_progress(pad); + } + + if (pg_checksum_update(&checksum_ctx, (uint8 *) content, len) < 0) + elog(ERROR, "could not update checksum of file \"%s\"", + filename); + + AddFileToBackupManifest(manifest, NULL, filename, len, + (pg_time_t) statbuf.st_mtime, &checksum_ctx); +} + +/* + * Include the tablespace directory pointed to by 'path' in the output tar + * stream. If 'sizeonly' is true, we just calculate a total length and return + * it, without actually sending anything. + * + * Only used to send auxiliary tablespaces, not PGDATA. + */ +static int64 +sendTablespace(char *path, char *spcoid, bool sizeonly, + backup_manifest_info *manifest) +{ + int64 size; + char pathbuf[MAXPGPATH]; + struct stat statbuf; + + /* + * 'path' points to the tablespace location, but we only want to include + * the version directory in it that belongs to us. + */ + snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path, + TABLESPACE_VERSION_DIRECTORY); + + /* + * Store a directory entry in the tar file so we get the permissions + * right. + */ + if (lstat(pathbuf, &statbuf) != 0) + { + if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file or directory \"%s\": %m", + pathbuf))); + + /* If the tablespace went away while scanning, it's no error. */ + return 0; + } + + size = _tarWriteHeader(TABLESPACE_VERSION_DIRECTORY, NULL, &statbuf, + sizeonly); + + /* Send all the files in the tablespace version directory */ + size += sendDir(pathbuf, strlen(path), sizeonly, NIL, true, manifest, + spcoid); + + return size; +} + +/* + * Include all files from the given directory in the output tar stream. If + * 'sizeonly' is true, we just calculate a total length and return it, without + * actually sending anything. + * + * Omit any directory in the tablespaces list, to avoid backing up + * tablespaces twice when they were created inside PGDATA. + * + * If sendtblspclinks is true, we need to include symlink + * information in the tar file. If not, we can skip that + * as it will be sent separately in the tablespace_map file. + */ +static int64 +sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces, + bool sendtblspclinks, backup_manifest_info *manifest, + const char *spcoid) +{ + DIR *dir; + struct dirent *de; + char pathbuf[MAXPGPATH * 2]; + struct stat statbuf; + int64 size = 0; + const char *lastDir; /* Split last dir from parent path. */ + bool isDbDir = false; /* Does this directory contain relations? */ + + /* + * Determine if the current path is a database directory that can contain + * relations. + * + * Start by finding the location of the delimiter between the parent path + * and the current path. + */ + lastDir = last_dir_separator(path); + + /* Does this path look like a database path (i.e. all digits)? */ + if (lastDir != NULL && + strspn(lastDir + 1, "0123456789") == strlen(lastDir + 1)) + { + /* Part of path that contains the parent directory. */ + int parentPathLen = lastDir - path; + + /* + * Mark path as a database directory if the parent path is either + * $PGDATA/base or a tablespace version path. + */ + if (strncmp(path, "./base", parentPathLen) == 0 || + (parentPathLen >= (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) && + strncmp(lastDir - (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1), + TABLESPACE_VERSION_DIRECTORY, + sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) == 0)) + isDbDir = true; + } + + dir = AllocateDir(path); + while ((de = ReadDir(dir, path)) != NULL) + { + int excludeIdx; + bool excludeFound; + ForkNumber relForkNum; /* Type of fork if file is a relation */ + int relOidChars; /* Chars in filename that are the rel oid */ + + /* Skip special stuff */ + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + + /* Skip temporary files */ + if (strncmp(de->d_name, + PG_TEMP_FILE_PREFIX, + strlen(PG_TEMP_FILE_PREFIX)) == 0) + continue; + + /* + * Check if the postmaster has signaled us to exit, and abort with an + * error in that case. The error handler further up will call + * do_pg_abort_backup() for us. Also check that if the backup was + * started while still in recovery, the server wasn't promoted. + * do_pg_stop_backup() will check that too, but it's better to stop + * the backup early than continue to the end and fail there. + */ + CHECK_FOR_INTERRUPTS(); + if (RecoveryInProgress() != backup_started_in_recovery) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("the standby was promoted during online backup"), + errhint("This means that the backup being taken is corrupt " + "and should not be used. " + "Try taking another online backup."))); + + /* Scan for files that should be excluded */ + excludeFound = false; + for (excludeIdx = 0; excludeFiles[excludeIdx].name != NULL; excludeIdx++) + { + int cmplen = strlen(excludeFiles[excludeIdx].name); + + if (!excludeFiles[excludeIdx].match_prefix) + cmplen++; + if (strncmp(de->d_name, excludeFiles[excludeIdx].name, cmplen) == 0) + { + elog(DEBUG1, "file \"%s\" excluded from backup", de->d_name); + excludeFound = true; + break; + } + } + + if (excludeFound) + continue; + + /* Exclude all forks for unlogged tables except the init fork */ + if (isDbDir && + parse_filename_for_nontemp_relation(de->d_name, &relOidChars, + &relForkNum)) + { + /* Never exclude init forks */ + if (relForkNum != INIT_FORKNUM) + { + char initForkFile[MAXPGPATH]; + char relOid[OIDCHARS + 1]; + + /* + * If any other type of fork, check if there is an init fork + * with the same OID. If so, the file can be excluded. + */ + memcpy(relOid, de->d_name, relOidChars); + relOid[relOidChars] = '\0'; + snprintf(initForkFile, sizeof(initForkFile), "%s/%s_init", + path, relOid); + + if (lstat(initForkFile, &statbuf) == 0) + { + elog(DEBUG2, + "unlogged relation file \"%s\" excluded from backup", + de->d_name); + + continue; + } + } + } + + /* Exclude temporary relations */ + if (isDbDir && looks_like_temp_rel_name(de->d_name)) + { + elog(DEBUG2, + "temporary relation file \"%s\" excluded from backup", + de->d_name); + + continue; + } + + snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path, de->d_name); + + /* Skip pg_control here to back up it last */ + if (strcmp(pathbuf, "./global/pg_control") == 0) + continue; + + if (lstat(pathbuf, &statbuf) != 0) + { + if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file or directory \"%s\": %m", + pathbuf))); + + /* If the file went away while scanning, it's not an error. */ + continue; + } + + /* Scan for directories whose contents should be excluded */ + excludeFound = false; + for (excludeIdx = 0; excludeDirContents[excludeIdx] != NULL; excludeIdx++) + { + if (strcmp(de->d_name, excludeDirContents[excludeIdx]) == 0) + { + elog(DEBUG1, "contents of directory \"%s\" excluded from backup", de->d_name); + size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly); + excludeFound = true; + break; + } + } + + if (excludeFound) + continue; + + /* + * Exclude contents of directory specified by statrelpath if not set + * to the default (pg_stat_tmp) which is caught in the loop above. + */ + if (statrelpath != NULL && strcmp(pathbuf, statrelpath) == 0) + { + elog(DEBUG1, "contents of directory \"%s\" excluded from backup", statrelpath); + size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly); + continue; + } + + /* + * We can skip pg_wal, the WAL segments need to be fetched from the + * WAL archive anyway. But include it as an empty directory anyway, so + * we get permissions right. + */ + if (strcmp(pathbuf, "./pg_wal") == 0) + { + /* If pg_wal is a symlink, write it as a directory anyway */ + size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly); + + /* + * Also send archive_status directory (by hackishly reusing + * statbuf from above ...). + */ + size += _tarWriteHeader("./pg_wal/archive_status", NULL, &statbuf, + sizeonly); + + continue; /* don't recurse into pg_wal */ + } + + /* Allow symbolic links in pg_tblspc only */ + if (strcmp(path, "./pg_tblspc") == 0 && +#ifndef WIN32 + S_ISLNK(statbuf.st_mode) +#else + pgwin32_is_junction(pathbuf) +#endif + ) + { +#if defined(HAVE_READLINK) || defined(WIN32) + char linkpath[MAXPGPATH]; + int rllen; + + rllen = readlink(pathbuf, linkpath, sizeof(linkpath)); + if (rllen < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read symbolic link \"%s\": %m", + pathbuf))); + if (rllen >= sizeof(linkpath)) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("symbolic link \"%s\" target is too long", + pathbuf))); + linkpath[rllen] = '\0'; + + size += _tarWriteHeader(pathbuf + basepathlen + 1, linkpath, + &statbuf, sizeonly); +#else + + /* + * If the platform does not have symbolic links, it should not be + * possible to have tablespaces - clearly somebody else created + * them. Warn about it and ignore. + */ + ereport(WARNING, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("tablespaces are not supported on this platform"))); + continue; +#endif /* HAVE_READLINK */ + } + else if (S_ISDIR(statbuf.st_mode)) + { + bool skip_this_dir = false; + ListCell *lc; + + /* + * Store a directory entry in the tar file so we can get the + * permissions right. + */ + size += _tarWriteHeader(pathbuf + basepathlen + 1, NULL, &statbuf, + sizeonly); + + /* + * Call ourselves recursively for a directory, unless it happens + * to be a separate tablespace located within PGDATA. + */ + foreach(lc, tablespaces) + { + tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc); + + /* + * ti->rpath is the tablespace relative path within PGDATA, or + * NULL if the tablespace has been properly located somewhere + * else. + * + * Skip past the leading "./" in pathbuf when comparing. + */ + if (ti->rpath && strcmp(ti->rpath, pathbuf + 2) == 0) + { + skip_this_dir = true; + break; + } + } + + /* + * skip sending directories inside pg_tblspc, if not required. + */ + if (strcmp(pathbuf, "./pg_tblspc") == 0 && !sendtblspclinks) + skip_this_dir = true; + + if (!skip_this_dir) + size += sendDir(pathbuf, basepathlen, sizeonly, tablespaces, + sendtblspclinks, manifest, spcoid); + } + else if (S_ISREG(statbuf.st_mode)) + { + bool sent = false; + + if (!sizeonly) + sent = sendFile(pathbuf, pathbuf + basepathlen + 1, &statbuf, + true, isDbDir ? atooid(lastDir + 1) : InvalidOid, + manifest, spcoid); + + if (sent || sizeonly) + { + /* Add size. */ + size += statbuf.st_size; + + /* Pad to a multiple of the tar block size. */ + size += tarPaddingBytesRequired(statbuf.st_size); + + /* Size of the header for the file. */ + size += TAR_BLOCK_SIZE; + } + } + else + ereport(WARNING, + (errmsg("skipping special file \"%s\"", pathbuf))); + } + FreeDir(dir); + return size; +} + +/* + * Check if a file should have its checksum validated. + * We validate checksums on files in regular tablespaces + * (including global and default) only, and in those there + * are some files that are explicitly excluded. + */ +static bool +is_checksummed_file(const char *fullpath, const char *filename) +{ + /* Check that the file is in a tablespace */ + if (strncmp(fullpath, "./global/", 9) == 0 || + strncmp(fullpath, "./base/", 7) == 0 || + strncmp(fullpath, "/", 1) == 0) + { + int excludeIdx; + + /* Compare file against noChecksumFiles skip list */ + for (excludeIdx = 0; noChecksumFiles[excludeIdx].name != NULL; excludeIdx++) + { + int cmplen = strlen(noChecksumFiles[excludeIdx].name); + + if (!noChecksumFiles[excludeIdx].match_prefix) + cmplen++; + if (strncmp(filename, noChecksumFiles[excludeIdx].name, + cmplen) == 0) + return false; + } + + return true; + } + else + return false; +} + +/***** + * Functions for handling tar file format + * + * Copied from pg_dump, but modified to work with libpq for sending + */ + + +/* + * Given the member, write the TAR header & send the file. + * + * If 'missing_ok' is true, will not throw an error if the file is not found. + * + * If dboid is anything other than InvalidOid then any checksum failures detected + * will get reported to the stats collector. + * + * Returns true if the file was successfully sent, false if 'missing_ok', + * and the file did not exist. + */ +static bool +sendFile(const char *readfilename, const char *tarfilename, + struct stat *statbuf, bool missing_ok, Oid dboid, + backup_manifest_info *manifest, const char *spcoid) +{ + int fd; + BlockNumber blkno = 0; + bool block_retry = false; + char buf[TAR_SEND_SIZE]; + uint16 checksum; + int checksum_failures = 0; + off_t cnt; + int i; + pgoff_t len = 0; + char *page; + size_t pad; + PageHeader phdr; + int segmentno = 0; + char *segmentpath; + bool verify_checksum = false; + pg_checksum_context checksum_ctx; + + if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0) + elog(ERROR, "could not initialize checksum of file \"%s\"", + readfilename); + + fd = OpenTransientFile(readfilename, O_RDONLY | PG_BINARY); + if (fd < 0) + { + if (errno == ENOENT && missing_ok) + return false; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", readfilename))); + } + + _tarWriteHeader(tarfilename, NULL, statbuf, false); + + if (!noverify_checksums && DataChecksumsEnabled()) + { + char *filename; + + /* + * Get the filename (excluding path). As last_dir_separator() + * includes the last directory separator, we chop that off by + * incrementing the pointer. + */ + filename = last_dir_separator(readfilename) + 1; + + if (is_checksummed_file(readfilename, filename)) + { + verify_checksum = true; + + /* + * Cut off at the segment boundary (".") to get the segment number + * in order to mix it into the checksum. + */ + segmentpath = strstr(filename, "."); + if (segmentpath != NULL) + { + segmentno = atoi(segmentpath + 1); + if (segmentno == 0) + ereport(ERROR, + (errmsg("invalid segment number %d in file \"%s\"", + segmentno, filename))); + } + } + } + + /* + * Loop until we read the amount of data the caller told us to expect. The + * file could be longer, if it was extended while we were sending it, but + * for a base backup we can ignore such extended data. It will be restored + * from WAL. + */ + while (len < statbuf->st_size) + { + /* Try to read some more data. */ + cnt = basebackup_read_file(fd, buf, + Min(sizeof(buf), statbuf->st_size - len), + len, readfilename, true); + + /* + * If we hit end-of-file, a concurrent truncation must have occurred. + * That's not an error condition, because WAL replay will fix things + * up. + */ + if (cnt == 0) + break; + + /* + * The checksums are verified at block level, so we iterate over the + * buffer in chunks of BLCKSZ, after making sure that + * TAR_SEND_SIZE/buf is divisible by BLCKSZ and we read a multiple of + * BLCKSZ bytes. + */ + Assert(TAR_SEND_SIZE % BLCKSZ == 0); + + if (verify_checksum && (cnt % BLCKSZ != 0)) + { + ereport(WARNING, + (errmsg("could not verify checksum in file \"%s\", block " + "%u: read buffer size %d and page size %d " + "differ", + readfilename, blkno, (int) cnt, BLCKSZ))); + verify_checksum = false; + } + + if (verify_checksum) + { + for (i = 0; i < cnt / BLCKSZ; i++) + { + page = buf + BLCKSZ * i; + + /* + * Only check pages which have not been modified since the + * start of the base backup. Otherwise, they might have been + * written only halfway and the checksum would not be valid. + * However, replaying WAL would reinstate the correct page in + * this case. We also skip completely new pages, since they + * don't have a checksum yet. + */ + if (!PageIsNew(page) && PageGetLSN(page) < startptr) + { + checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE); + phdr = (PageHeader) page; + if (phdr->pd_checksum != checksum) + { + /* + * Retry the block on the first failure. It's + * possible that we read the first 4K page of the + * block just before postgres updated the entire block + * so it ends up looking torn to us. We only need to + * retry once because the LSN should be updated to + * something we can ignore on the next pass. If the + * error happens again then it is a true validation + * failure. + */ + if (block_retry == false) + { + int reread_cnt; + + /* Reread the failed block */ + reread_cnt = + basebackup_read_file(fd, buf + BLCKSZ * i, + BLCKSZ, len + BLCKSZ * i, + readfilename, + false); + if (reread_cnt == 0) + { + /* + * If we hit end-of-file, a concurrent + * truncation must have occurred, so break out + * of this loop just as if the initial fread() + * returned 0. We'll drop through to the same + * code that handles that case. (We must fix + * up cnt first, though.) + */ + cnt = BLCKSZ * i; + break; + } + + /* Set flag so we know a retry was attempted */ + block_retry = true; + + /* Reset loop to validate the block again */ + i--; + continue; + } + + checksum_failures++; + + if (checksum_failures <= 5) + ereport(WARNING, + (errmsg("checksum verification failed in " + "file \"%s\", block %u: calculated " + "%X but expected %X", + readfilename, blkno, checksum, + phdr->pd_checksum))); + if (checksum_failures == 5) + ereport(WARNING, + (errmsg("further checksum verification " + "failures in file \"%s\" will not " + "be reported", readfilename))); + } + } + block_retry = false; + blkno++; + } + } + + /* Send the chunk as a CopyData message */ + if (pq_putmessage('d', buf, cnt)) + ereport(ERROR, + (errmsg("base backup could not send data, aborting backup"))); + update_basebackup_progress(cnt); + + /* Also feed it to the checksum machinery. */ + if (pg_checksum_update(&checksum_ctx, (uint8 *) buf, cnt) < 0) + elog(ERROR, "could not update checksum of base backup"); + + len += cnt; + throttle(cnt); + } + + /* If the file was truncated while we were sending it, pad it with zeros */ + if (len < statbuf->st_size) + { + MemSet(buf, 0, sizeof(buf)); + while (len < statbuf->st_size) + { + cnt = Min(sizeof(buf), statbuf->st_size - len); + pq_putmessage('d', buf, cnt); + if (pg_checksum_update(&checksum_ctx, (uint8 *) buf, cnt) < 0) + elog(ERROR, "could not update checksum of base backup"); + update_basebackup_progress(cnt); + len += cnt; + throttle(cnt); + } + } + + /* + * Pad to a block boundary, per tar format requirements. (This small piece + * of data is probably not worth throttling, and is not checksummed + * because it's not actually part of the file.) + */ + pad = tarPaddingBytesRequired(len); + if (pad > 0) + { + MemSet(buf, 0, pad); + pq_putmessage('d', buf, pad); + update_basebackup_progress(pad); + } + + CloseTransientFile(fd); + + if (checksum_failures > 1) + { + ereport(WARNING, + (errmsg_plural("file \"%s\" has a total of %d checksum verification failure", + "file \"%s\" has a total of %d checksum verification failures", + checksum_failures, + readfilename, checksum_failures))); + + pgstat_report_checksum_failures_in_db(dboid, checksum_failures); + } + + total_checksum_failures += checksum_failures; + + AddFileToBackupManifest(manifest, spcoid, tarfilename, statbuf->st_size, + (pg_time_t) statbuf->st_mtime, &checksum_ctx); + + return true; +} + + +static int64 +_tarWriteHeader(const char *filename, const char *linktarget, + struct stat *statbuf, bool sizeonly) +{ + char h[TAR_BLOCK_SIZE]; + enum tarError rc; + + if (!sizeonly) + { + rc = tarCreateHeader(h, filename, linktarget, statbuf->st_size, + statbuf->st_mode, statbuf->st_uid, statbuf->st_gid, + statbuf->st_mtime); + + switch (rc) + { + case TAR_OK: + break; + case TAR_NAME_TOO_LONG: + ereport(ERROR, + (errmsg("file name too long for tar format: \"%s\"", + filename))); + break; + case TAR_SYMLINK_TOO_LONG: + ereport(ERROR, + (errmsg("symbolic link target too long for tar format: " + "file name \"%s\", target \"%s\"", + filename, linktarget))); + break; + default: + elog(ERROR, "unrecognized tar error: %d", rc); + } + + pq_putmessage('d', h, sizeof(h)); + update_basebackup_progress(sizeof(h)); + } + + return sizeof(h); +} + +/* + * Write tar header for a directory. If the entry in statbuf is a link then + * write it as a directory anyway. + */ +static int64 +_tarWriteDir(const char *pathbuf, int basepathlen, struct stat *statbuf, + bool sizeonly) +{ + /* If symlink, write it as a directory anyway */ +#ifndef WIN32 + if (S_ISLNK(statbuf->st_mode)) +#else + if (pgwin32_is_junction(pathbuf)) +#endif + statbuf->st_mode = S_IFDIR | pg_dir_create_mode; + + return _tarWriteHeader(pathbuf + basepathlen + 1, NULL, statbuf, sizeonly); +} + +/* + * Increment the network transfer counter by the given number of bytes, + * and sleep if necessary to comply with the requested network transfer + * rate. + */ +static void +throttle(size_t increment) +{ + TimeOffset elapsed_min; + + if (throttling_counter < 0) + return; + + throttling_counter += increment; + if (throttling_counter < throttling_sample) + return; + + /* How much time should have elapsed at minimum? */ + elapsed_min = elapsed_min_unit * + (throttling_counter / throttling_sample); + + /* + * Since the latch could be set repeatedly because of concurrently WAL + * activity, sleep in a loop to ensure enough time has passed. + */ + for (;;) + { + TimeOffset elapsed, + sleep; + int wait_result; + + /* Time elapsed since the last measurement (and possible wake up). */ + elapsed = GetCurrentTimestamp() - throttled_last; + + /* sleep if the transfer is faster than it should be */ + sleep = elapsed_min - elapsed; + if (sleep <= 0) + break; + + ResetLatch(MyLatch); + + /* We're eating a potentially set latch, so check for interrupts */ + CHECK_FOR_INTERRUPTS(); + + /* + * (TAR_SEND_SIZE / throttling_sample * elapsed_min_unit) should be + * the maximum time to sleep. Thus the cast to long is safe. + */ + wait_result = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + (long) (sleep / 1000), + WAIT_EVENT_BASE_BACKUP_THROTTLE); + + if (wait_result & WL_LATCH_SET) + CHECK_FOR_INTERRUPTS(); + + /* Done waiting? */ + if (wait_result & WL_TIMEOUT) + break; + } + + /* + * As we work with integers, only whole multiple of throttling_sample was + * processed. The rest will be done during the next call of this function. + */ + throttling_counter %= throttling_sample; + + /* + * Time interval for the remaining amount and possible next increments + * starts now. + */ + throttled_last = GetCurrentTimestamp(); +} + +/* + * Increment the counter for the amount of data already streamed + * by the given number of bytes, and update the progress report for + * pg_stat_progress_basebackup. + */ +static void +update_basebackup_progress(int64 delta) +{ + const int index[] = { + PROGRESS_BASEBACKUP_BACKUP_STREAMED, + PROGRESS_BASEBACKUP_BACKUP_TOTAL + }; + int64 val[2]; + int nparam = 0; + + backup_streamed += delta; + val[nparam++] = backup_streamed; + + /* + * Avoid overflowing past 100% or the full size. This may make the total + * size number change as we approach the end of the backup (the estimate + * will always be wrong if WAL is included), but that's better than having + * the done column be bigger than the total. + */ + if (backup_total > -1 && backup_streamed > backup_total) + { + backup_total = backup_streamed; + val[nparam++] = backup_total; + } + + pgstat_progress_update_multi_param(nparam, index, val); +} + +/* + * Read some data from a file, setting a wait event and reporting any error + * encountered. + * + * If partial_read_ok is false, also report an error if the number of bytes + * read is not equal to the number of bytes requested. + * + * Returns the number of bytes read. + */ +static int +basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset, + const char *filename, bool partial_read_ok) +{ + int rc; + + pgstat_report_wait_start(WAIT_EVENT_BASEBACKUP_READ); + rc = pg_pread(fd, buf, nbytes, offset); + pgstat_report_wait_end(); + + if (rc < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", filename))); + if (!partial_read_ok && rc > 0 && rc != nbytes) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": read %d of %zu", + filename, rc, nbytes))); + + return rc; +} |