Adding upstream version 15.5.upstream/15.5

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 12:17:33 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 12:17:33 +0000
commit: 5e45211a64149b3c659b90ff2de6fa982a5a93ed (patch)
tree: 739caf8c461053357daa9f162bef34516c7bf452 /src/bin/pg_rewind/pg_rewind.c
parent: Initial commit. (diff)
download: postgresql-15-5e45211a64149b3c659b90ff2de6fa982a5a93ed.tar.xz
postgresql-15-5e45211a64149b3c659b90ff2de6fa982a5a93ed.zip
1 files changed, 1169 insertions, 0 deletions
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
new file mode 100644
index 0000000..1ff8da1
--- /dev/null
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -0,0 +1,1169 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_rewind.c
+ *	  Synchronizes a PostgreSQL data directory to a new timeline
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres_fe.h"
+
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "access/timeline.h"
+#include "access/xlog_internal.h"
+#include "catalog/catversion.h"
+#include "catalog/pg_control.h"
+#include "common/controldata_utils.h"
+#include "common/file_perm.h"
+#include "common/restricted_token.h"
+#include "common/string.h"
+#include "fe_utils/recovery_gen.h"
+#include "fe_utils/string_utils.h"
+#include "file_ops.h"
+#include "filemap.h"
+#include "getopt_long.h"
+#include "pg_rewind.h"
+#include "rewind_source.h"
+#include "storage/bufpage.h"
+
+static void usage(const char *progname);
+
+static void perform_rewind(filemap_t *filemap, rewind_source *source,
+						   XLogRecPtr chkptrec,
+						   TimeLineID chkpttli,
+						   XLogRecPtr chkptredo);
+
+static void createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli,
+							  XLogRecPtr checkpointloc);
+
+static void digestControlFile(ControlFileData *ControlFile,
+							  const char *content, size_t size);
+static void getRestoreCommand(const char *argv0);
+static void sanityChecks(void);
+static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex);
+static void ensureCleanShutdown(const char *argv0);
+static void disconnect_atexit(void);
+
+static ControlFileData ControlFile_target;
+static ControlFileData ControlFile_source;
+static ControlFileData ControlFile_source_after;
+
+const char *progname;
+int			WalSegSz;
+
+/* Configuration options */
+char	   *datadir_target = NULL;
+char	   *datadir_source = NULL;
+char	   *connstr_source = NULL;
+char	   *restore_command = NULL;
+char	   *config_file = NULL;
+
+static bool debug = false;
+bool		showprogress = false;
+bool		dry_run = false;
+bool		do_sync = true;
+bool		restore_wal = false;
+
+/* Target history */
+TimeLineHistoryEntry *targetHistory;
+int			targetNentries;
+
+/* Progress counters */
+uint64		fetch_size;
+uint64		fetch_done;
+
+static PGconn *conn;
+static rewind_source *source;
+
+static void
+usage(const char *progname)
+{
+	printf(_("%s resynchronizes a PostgreSQL cluster with another copy of the cluster.\n\n"), progname);
+	printf(_("Usage:\n  %s [OPTION]...\n\n"), progname);
+	printf(_("Options:\n"));
+	printf(_("  -c, --restore-target-wal       use restore_command in target configuration to\n"
+			 "                                 retrieve WAL files from archives\n"));
+	printf(_("  -D, --target-pgdata=DIRECTORY  existing data directory to modify\n"));
+	printf(_("      --source-pgdata=DIRECTORY  source data directory to synchronize with\n"));
+	printf(_("      --source-server=CONNSTR    source server to synchronize with\n"));
+	printf(_("  -n, --dry-run                  stop before modifying anything\n"));
+	printf(_("  -N, --no-sync                  do not wait for changes to be written\n"
+			 "                                 safely to disk\n"));
+	printf(_("  -P, --progress                 write progress messages\n"));
+	printf(_("  -R, --write-recovery-conf      write configuration for replication\n"
+			 "                                 (requires --source-server)\n"));
+	printf(_("      --config-file=FILENAME     use specified main server configuration\n"
+			 "                                 file when running target cluster\n"));
+	printf(_("      --debug                    write a lot of debug messages\n"));
+	printf(_("      --no-ensure-shutdown       do not automatically fix unclean shutdown\n"));
+	printf(_("  -V, --version                  output version information, then exit\n"));
+	printf(_("  -?, --help                     show this help, then exit\n"));
+	printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
+	printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL);
+}
+
+
+int
+main(int argc, char **argv)
+{
+	static struct option long_options[] = {
+		{"help", no_argument, NULL, '?'},
+		{"target-pgdata", required_argument, NULL, 'D'},
+		{"write-recovery-conf", no_argument, NULL, 'R'},
+		{"source-pgdata", required_argument, NULL, 1},
+		{"source-server", required_argument, NULL, 2},
+		{"no-ensure-shutdown", no_argument, NULL, 4},
+		{"config-file", required_argument, NULL, 5},
+		{"version", no_argument, NULL, 'V'},
+		{"restore-target-wal", no_argument, NULL, 'c'},
+		{"dry-run", no_argument, NULL, 'n'},
+		{"no-sync", no_argument, NULL, 'N'},
+		{"progress", no_argument, NULL, 'P'},
+		{"debug", no_argument, NULL, 3},
+		{NULL, 0, NULL, 0}
+	};
+	int			option_index;
+	int			c;
+	XLogRecPtr	divergerec;
+	int			lastcommontliIndex;
+	XLogRecPtr	chkptrec;
+	TimeLineID	chkpttli;
+	XLogRecPtr	chkptredo;
+	XLogRecPtr	target_wal_endrec;
+	size_t		size;
+	char	   *buffer;
+	bool		no_ensure_shutdown = false;
+	bool		rewind_needed;
+	bool		writerecoveryconf = false;
+	filemap_t  *filemap;
+
+	pg_logging_init(argv[0]);
+	set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_rewind"));
+	progname = get_progname(argv[0]);
+
+	/* Process command-line arguments */
+	if (argc > 1)
+	{
+		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
+		{
+			usage(progname);
+			exit(0);
+		}
+		if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
+		{
+			puts("pg_rewind (PostgreSQL) " PG_VERSION);
+			exit(0);
+		}
+	}
+
+	while ((c = getopt_long(argc, argv, "cD:nNPR", long_options, &option_index)) != -1)
+	{
+		switch (c)
+		{
+			case 'c':
+				restore_wal = true;
+				break;
+
+			case 'P':
+				showprogress = true;
+				break;
+
+			case 'n':
+				dry_run = true;
+				break;
+
+			case 'N':
+				do_sync = false;
+				break;
+
+			case 'R':
+				writerecoveryconf = true;
+				break;
+
+			case 3:
+				debug = true;
+				pg_logging_increase_verbosity();
+				break;
+
+			case 'D':			/* -D or --target-pgdata */
+				datadir_target = pg_strdup(optarg);
+				break;
+
+			case 1:				/* --source-pgdata */
+				datadir_source = pg_strdup(optarg);
+				break;
+
+			case 2:				/* --source-server */
+				connstr_source = pg_strdup(optarg);
+				break;
+
+			case 4:
+				no_ensure_shutdown = true;
+				break;
+
+			case 5:
+				config_file = pg_strdup(optarg);
+				break;
+
+			default:
+				/* getopt_long already emitted a complaint */
+				pg_log_error_hint("Try \"%s --help\" for more information.", progname);
+				exit(1);
+		}
+	}
+
+	if (datadir_source == NULL && connstr_source == NULL)
+	{
+		pg_log_error("no source specified (--source-pgdata or --source-server)");
+		pg_log_error_hint("Try \"%s --help\" for more information.", progname);
+		exit(1);
+	}
+
+	if (datadir_source != NULL && connstr_source != NULL)
+	{
+		pg_log_error("only one of --source-pgdata or --source-server can be specified");
+		pg_log_error_hint("Try \"%s --help\" for more information.", progname);
+		exit(1);
+	}
+
+	if (datadir_target == NULL)
+	{
+		pg_log_error("no target data directory specified (--target-pgdata)");
+		pg_log_error_hint("Try \"%s --help\" for more information.", progname);
+		exit(1);
+	}
+
+	if (writerecoveryconf && connstr_source == NULL)
+	{
+		pg_log_error("no source server information (--source-server) specified for --write-recovery-conf");
+		pg_log_error_hint("Try \"%s --help\" for more information.", progname);
+		exit(1);
+	}
+
+	if (optind < argc)
+	{
+		pg_log_error("too many command-line arguments (first is \"%s\")",
+					 argv[optind]);
+		pg_log_error_hint("Try \"%s --help\" for more information.", progname);
+		exit(1);
+	}
+
+	/*
+	 * Don't allow pg_rewind to be run as root, to avoid overwriting the
+	 * ownership of files in the data directory. We need only check for root
+	 * -- any other user won't have sufficient permissions to modify files in
+	 * the data directory.
+	 */
+#ifndef WIN32
+	if (geteuid() == 0)
+	{
+		pg_log_error("cannot be executed by \"root\"");
+		pg_log_error_hint("You must run %s as the PostgreSQL superuser.",
+						  progname);
+		exit(1);
+	}
+#endif
+
+	get_restricted_token();
+
+	/* Set mask based on PGDATA permissions */
+	if (!GetDataDirectoryCreatePerm(datadir_target))
+		pg_fatal("could not read permissions of directory \"%s\": %m",
+				 datadir_target);
+
+	umask(pg_mode_mask);
+
+	getRestoreCommand(argv[0]);
+
+	atexit(disconnect_atexit);
+
+	/*
+	 * Ok, we have all the options and we're ready to start. First, connect to
+	 * remote server.
+	 */
+	if (connstr_source)
+	{
+		conn = PQconnectdb(connstr_source);
+
+		if (PQstatus(conn) == CONNECTION_BAD)
+			pg_fatal("%s", PQerrorMessage(conn));
+
+		if (showprogress)
+			pg_log_info("connected to server");
+
+		source = init_libpq_source(conn);
+	}
+	else
+		source = init_local_source(datadir_source);
+
+	/*
+	 * Check the status of the target instance.
+	 *
+	 * If the target instance was not cleanly shut down, start and stop the
+	 * target cluster once in single-user mode to enforce recovery to finish,
+	 * ensuring that the cluster can be used by pg_rewind.  Note that if
+	 * no_ensure_shutdown is specified, pg_rewind ignores this step, and users
+	 * need to make sure by themselves that the target cluster is in a clean
+	 * state.
+	 */
+	buffer = slurpFile(datadir_target, "global/pg_control", &size);
+	digestControlFile(&ControlFile_target, buffer, size);
+	pg_free(buffer);
+
+	if (!no_ensure_shutdown &&
+		ControlFile_target.state != DB_SHUTDOWNED &&
+		ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY)
+	{
+		ensureCleanShutdown(argv[0]);
+
+		buffer = slurpFile(datadir_target, "global/pg_control", &size);
+		digestControlFile(&ControlFile_target, buffer, size);
+		pg_free(buffer);
+	}
+
+	buffer = source->fetch_file(source, "global/pg_control", &size);
+	digestControlFile(&ControlFile_source, buffer, size);
+	pg_free(buffer);
+
+	sanityChecks();
+
+	/*
+	 * Find the common ancestor timeline between the clusters.
+	 *
+	 * If both clusters are already on the same timeline, there's nothing to
+	 * do.
+	 */
+	if (ControlFile_target.checkPointCopy.ThisTimeLineID ==
+		ControlFile_source.checkPointCopy.ThisTimeLineID)
+	{
+		pg_log_info("source and target cluster are on the same timeline");
+		rewind_needed = false;
+		target_wal_endrec = 0;
+	}
+	else
+	{
+		XLogRecPtr	chkptendrec;
+
+		findCommonAncestorTimeline(&divergerec, &lastcommontliIndex);
+		pg_log_info("servers diverged at WAL location %X/%X on timeline %u",
+					LSN_FORMAT_ARGS(divergerec),
+					targetHistory[lastcommontliIndex].tli);
+
+		/*
+		 * Determine the end-of-WAL on the target.
+		 *
+		 * The WAL ends at the last shutdown checkpoint, or at
+		 * minRecoveryPoint if it was a standby. (If we supported rewinding a
+		 * server that was not shut down cleanly, we would need to replay
+		 * until we reach the first invalid record, like crash recovery does.)
+		 */
+
+		/* read the checkpoint record on the target to see where it ends. */
+		chkptendrec = readOneRecord(datadir_target,
+									ControlFile_target.checkPoint,
+									targetNentries - 1,
+									restore_command);
+
+		if (ControlFile_target.minRecoveryPoint > chkptendrec)
+		{
+			target_wal_endrec = ControlFile_target.minRecoveryPoint;
+		}
+		else
+		{
+			target_wal_endrec = chkptendrec;
+		}
+
+		/*
+		 * Check for the possibility that the target is in fact a direct
+		 * ancestor of the source. In that case, there is no divergent history
+		 * in the target that needs rewinding.
+		 */
+		if (target_wal_endrec > divergerec)
+		{
+			rewind_needed = true;
+		}
+		else
+		{
+			/* the last common checkpoint record must be part of target WAL */
+			Assert(target_wal_endrec == divergerec);
+
+			rewind_needed = false;
+		}
+	}
+
+	if (!rewind_needed)
+	{
+		pg_log_info("no rewind required");
+		if (writerecoveryconf && !dry_run)
+			WriteRecoveryConfig(conn, datadir_target,
+								GenerateRecoveryConfig(conn, NULL));
+		exit(0);
+	}
+
+	findLastCheckpoint(datadir_target, divergerec, lastcommontliIndex,
+					   &chkptrec, &chkpttli, &chkptredo, restore_command);
+	pg_log_info("rewinding from last common checkpoint at %X/%X on timeline %u",
+				LSN_FORMAT_ARGS(chkptrec), chkpttli);
+
+	/* Initialize the hash table to track the status of each file */
+	filehash_init();
+
+	/*
+	 * Collect information about all files in the both data directories.
+	 */
+	if (showprogress)
+		pg_log_info("reading source file list");
+	source->traverse_files(source, &process_source_file);
+
+	if (showprogress)
+		pg_log_info("reading target file list");
+	traverse_datadir(datadir_target, &process_target_file);
+
+	/*
+	 * Read the target WAL from last checkpoint before the point of fork, to
+	 * extract all the pages that were modified on the target cluster after
+	 * the fork.
+	 */
+	if (showprogress)
+		pg_log_info("reading WAL in target");
+	extractPageMap(datadir_target, chkptrec, lastcommontliIndex,
+				   target_wal_endrec, restore_command);
+
+	/*
+	 * We have collected all information we need from both systems. Decide
+	 * what to do with each file.
+	 */
+	filemap = decide_file_actions();
+	if (showprogress)
+		calculate_totals(filemap);
+
+	/* this is too verbose even for verbose mode */
+	if (debug)
+		print_filemap(filemap);
+
+	/*
+	 * Ok, we're ready to start copying things over.
+	 */
+	if (showprogress)
+	{
+		pg_log_info("need to copy %lu MB (total source directory size is %lu MB)",
+					(unsigned long) (filemap->fetch_size / (1024 * 1024)),
+					(unsigned long) (filemap->total_size / (1024 * 1024)));
+
+		fetch_size = filemap->fetch_size;
+		fetch_done = 0;
+	}
+
+	/*
+	 * We have now collected all the information we need from both systems,
+	 * and we are ready to start modifying the target directory.
+	 *
+	 * This is the point of no return. Once we start copying things, there is
+	 * no turning back!
+	 */
+	perform_rewind(filemap, source, chkptrec, chkpttli, chkptredo);
+
+	if (showprogress)
+		pg_log_info("syncing target data directory");
+	sync_target_dir();
+
+	/* Also update the standby configuration, if requested. */
+	if (writerecoveryconf && !dry_run)
+		WriteRecoveryConfig(conn, datadir_target,
+							GenerateRecoveryConfig(conn, NULL));
+
+	/* don't need the source connection anymore */
+	source->destroy(source);
+	if (conn)
+	{
+		PQfinish(conn);
+		conn = NULL;
+	}
+
+	pg_log_info("Done!");
+
+	return 0;
+}
+
+/*
+ * Perform the rewind.
+ *
+ * We have already collected all the information we need from the
+ * target and the source.
+ */
+static void
+perform_rewind(filemap_t *filemap, rewind_source *source,
+			   XLogRecPtr chkptrec,
+			   TimeLineID chkpttli,
+			   XLogRecPtr chkptredo)
+{
+	XLogRecPtr	endrec;
+	TimeLineID	endtli;
+	ControlFileData ControlFile_new;
+	size_t		size;
+	char	   *buffer;
+
+	/*
+	 * Execute the actions in the file map, fetching data from the source
+	 * system as needed.
+	 */
+	for (int i = 0; i < filemap->nentries; i++)
+	{
+		file_entry_t *entry = filemap->entries[i];
+
+		/*
+		 * If this is a relation file, copy the modified blocks.
+		 *
+		 * This is in addition to any other changes.
+		 */
+		if (entry->target_pages_to_overwrite.bitmapsize > 0)
+		{
+			datapagemap_iterator_t *iter;
+			BlockNumber blkno;
+			off_t		offset;
+
+			iter = datapagemap_iterate(&entry->target_pages_to_overwrite);
+			while (datapagemap_next(iter, &blkno))
+			{
+				offset = blkno * BLCKSZ;
+				source->queue_fetch_range(source, entry->path, offset, BLCKSZ);
+			}
+			pg_free(iter);
+		}
+
+		switch (entry->action)
+		{
+			case FILE_ACTION_NONE:
+				/* nothing else to do */
+				break;
+
+			case FILE_ACTION_COPY:
+				source->queue_fetch_file(source, entry->path, entry->source_size);
+				break;
+
+			case FILE_ACTION_TRUNCATE:
+				truncate_target_file(entry->path, entry->source_size);
+				break;
+
+			case FILE_ACTION_COPY_TAIL:
+				source->queue_fetch_range(source, entry->path,
+										  entry->target_size,
+										  entry->source_size - entry->target_size);
+				break;
+
+			case FILE_ACTION_REMOVE:
+				remove_target(entry);
+				break;
+
+			case FILE_ACTION_CREATE:
+				create_target(entry);
+				break;
+
+			case FILE_ACTION_UNDECIDED:
+				pg_fatal("no action decided for file \"%s\"", entry->path);
+				break;
+		}
+	}
+
+	/* Complete any remaining range-fetches that we queued up above. */
+	source->finish_fetch(source);
+
+	close_target_file();
+
+	progress_report(true);
+
+	/*
+	 * Fetch the control file from the source last. This ensures that the
+	 * minRecoveryPoint is up-to-date.
+	 */
+	buffer = source->fetch_file(source, "global/pg_control", &size);
+	digestControlFile(&ControlFile_source_after, buffer, size);
+	pg_free(buffer);
+
+	/*
+	 * Sanity check: If the source is a local system, the control file should
+	 * not have changed since we started.
+	 *
+	 * XXX: We assume it hasn't been modified, but actually, what could go
+	 * wrong? The logic handles a libpq source that's modified concurrently,
+	 * why not a local datadir?
+	 */
+	if (datadir_source &&
+		memcmp(&ControlFile_source, &ControlFile_source_after,
+			   sizeof(ControlFileData)) != 0)
+	{
+		pg_fatal("source system was modified while pg_rewind was running");
+	}
+
+	if (showprogress)
+		pg_log_info("creating backup label and updating control file");
+
+	/*
+	 * Create a backup label file, to tell the target where to begin the WAL
+	 * replay. Normally, from the last common checkpoint between the source
+	 * and the target. But if the source is a standby server, it's possible
+	 * that the last common checkpoint is *after* the standby's restartpoint.
+	 * That implies that the source server has applied the checkpoint record,
+	 * but hasn't performed a corresponding restartpoint yet. Make sure we
+	 * start at the restartpoint's redo point in that case.
+	 *
+	 * Use the old version of the source's control file for this. The server
+	 * might have finished the restartpoint after we started copying files,
+	 * but we must begin from the redo point at the time that started copying.
+	 */
+	if (ControlFile_source.checkPointCopy.redo < chkptredo)
+	{
+		chkptredo = ControlFile_source.checkPointCopy.redo;
+		chkpttli = ControlFile_source.checkPointCopy.ThisTimeLineID;
+		chkptrec = ControlFile_source.checkPoint;
+	}
+	createBackupLabel(chkptredo, chkpttli, chkptrec);
+
+	/*
+	 * Update control file of target, to tell the target how far it must
+	 * replay the WAL (minRecoveryPoint).
+	 */
+	if (connstr_source)
+	{
+		/*
+		 * The source is a live server. Like in an online backup, it's
+		 * important that we recover all the WAL that was generated while we
+		 * were copying files.
+		 */
+		if (ControlFile_source_after.state == DB_IN_ARCHIVE_RECOVERY)
+		{
+			/*
+			 * Source is a standby server. We must replay to its
+			 * minRecoveryPoint.
+			 */
+			endrec = ControlFile_source_after.minRecoveryPoint;
+			endtli = ControlFile_source_after.minRecoveryPointTLI;
+		}
+		else
+		{
+			/*
+			 * Source is a production, non-standby, server. We must replay to
+			 * the last WAL insert location.
+			 */
+			if (ControlFile_source_after.state != DB_IN_PRODUCTION)
+				pg_fatal("source system was in unexpected state at end of rewind");
+
+			endrec = source->get_current_wal_insert_lsn(source);
+			endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
+		}
+	}
+	else
+	{
+		/*
+		 * Source is a local data directory. It should've shut down cleanly,
+		 * and we must replay to the latest shutdown checkpoint.
+		 */
+		endrec = ControlFile_source_after.checkPoint;
+		endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
+	}
+
+	memcpy(&ControlFile_new, &ControlFile_source_after, sizeof(ControlFileData));
+	ControlFile_new.minRecoveryPoint = endrec;
+	ControlFile_new.minRecoveryPointTLI = endtli;
+	ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY;
+	if (!dry_run)
+		update_controlfile(datadir_target, &ControlFile_new, do_sync);
+}
+
+static void
+sanityChecks(void)
+{
+	/* TODO Check that there's no backup_label in either cluster */
+
+	/* Check system_identifier match */
+	if (ControlFile_target.system_identifier != ControlFile_source.system_identifier)
+		pg_fatal("source and target clusters are from different systems");
+
+	/* check version */
+	if (ControlFile_target.pg_control_version != PG_CONTROL_VERSION ||
+		ControlFile_source.pg_control_version != PG_CONTROL_VERSION ||
+		ControlFile_target.catalog_version_no != CATALOG_VERSION_NO ||
+		ControlFile_source.catalog_version_no != CATALOG_VERSION_NO)
+	{
+		pg_fatal("clusters are not compatible with this version of pg_rewind");
+	}
+
+	/*
+	 * Target cluster need to use checksums or hint bit wal-logging, this to
+	 * prevent from data corruption that could occur because of hint bits.
+	 */
+	if (ControlFile_target.data_checksum_version != PG_DATA_CHECKSUM_VERSION &&
+		!ControlFile_target.wal_log_hints)
+	{
+		pg_fatal("target server needs to use either data checksums or \"wal_log_hints = on\"");
+	}
+
+	/*
+	 * Target cluster better not be running. This doesn't guard against
+	 * someone starting the cluster concurrently. Also, this is probably more
+	 * strict than necessary; it's OK if the target node was not shut down
+	 * cleanly, as long as it isn't running at the moment.
+	 */
+	if (ControlFile_target.state != DB_SHUTDOWNED &&
+		ControlFile_target.state != DB_SHUTDOWNED_IN_RECOVERY)
+		pg_fatal("target server must be shut down cleanly");
+
+	/*
+	 * When the source is a data directory, also require that the source
+	 * server is shut down. There isn't any very strong reason for this
+	 * limitation, but better safe than sorry.
+	 */
+	if (datadir_source &&
+		ControlFile_source.state != DB_SHUTDOWNED &&
+		ControlFile_source.state != DB_SHUTDOWNED_IN_RECOVERY)
+		pg_fatal("source data directory must be shut down cleanly");
+}
+
+/*
+ * Print a progress report based on the fetch_size and fetch_done variables.
+ *
+ * Progress report is written at maximum once per second, except that the
+ * last progress report is always printed.
+ *
+ * If finished is set to true, this is the last progress report. The cursor
+ * is moved to the next line.
+ */
+void
+progress_report(bool finished)
+{
+	static pg_time_t last_progress_report = 0;
+	int			percent;
+	char		fetch_done_str[32];
+	char		fetch_size_str[32];
+	pg_time_t	now;
+
+	if (!showprogress)
+		return;
+
+	now = time(NULL);
+	if (now == last_progress_report && !finished)
+		return;					/* Max once per second */
+
+	last_progress_report = now;
+	percent = fetch_size ? (int) ((fetch_done) * 100 / fetch_size) : 0;
+
+	/*
+	 * Avoid overflowing past 100% or the full size. This may make the total
+	 * size number change as we approach the end of the backup (the estimate
+	 * will always be wrong if WAL is included), but that's better than having
+	 * the done column be bigger than the total.
+	 */
+	if (percent > 100)
+		percent = 100;
+	if (fetch_done > fetch_size)
+		fetch_size = fetch_done;
+
+	snprintf(fetch_done_str, sizeof(fetch_done_str), UINT64_FORMAT,
+			 fetch_done / 1024);
+	snprintf(fetch_size_str, sizeof(fetch_size_str), UINT64_FORMAT,
+			 fetch_size / 1024);
+
+	fprintf(stderr, _("%*s/%s kB (%d%%) copied"),
+			(int) strlen(fetch_size_str), fetch_done_str, fetch_size_str,
+			percent);
+
+	/*
+	 * Stay on the same line if reporting to a terminal and we're not done
+	 * yet.
+	 */
+	fputc((!finished && isatty(fileno(stderr))) ? '\r' : '\n', stderr);
+}
+
+/*
+ * Find minimum from two WAL locations assuming InvalidXLogRecPtr means
+ * infinity as src/include/access/timeline.h states. This routine should
+ * be used only when comparing WAL locations related to history files.
+ */
+static XLogRecPtr
+MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b)
+{
+	if (XLogRecPtrIsInvalid(a))
+		return b;
+	else if (XLogRecPtrIsInvalid(b))
+		return a;
+	else
+		return Min(a, b);
+}
+
+/*
+ * Retrieve timeline history for given control file which should behold
+ * either source or target.
+ */
+static TimeLineHistoryEntry *
+getTimelineHistory(ControlFileData *controlFile, int *nentries)
+{
+	TimeLineHistoryEntry *history;
+	TimeLineID	tli;
+
+	tli = controlFile->checkPointCopy.ThisTimeLineID;
+
+	/*
+	 * Timeline 1 does not have a history file, so there is no need to check
+	 * and fake an entry with infinite start and end positions.
+	 */
+	if (tli == 1)
+	{
+		history = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry));
+		history->tli = tli;
+		history->begin = history->end = InvalidXLogRecPtr;
+		*nentries = 1;
+	}
+	else
+	{
+		char		path[MAXPGPATH];
+		char	   *histfile;
+
+		TLHistoryFilePath(path, tli);
+
+		/* Get history file from appropriate source */
+		if (controlFile == &ControlFile_source)
+			histfile = source->fetch_file(source, path, NULL);
+		else if (controlFile == &ControlFile_target)
+			histfile = slurpFile(datadir_target, path, NULL);
+		else
+			pg_fatal("invalid control file");
+
+		history = rewind_parseTimeLineHistory(histfile, tli, nentries);
+		pg_free(histfile);
+	}
+
+	if (debug)
+	{
+		int			i;
+
+		if (controlFile == &ControlFile_source)
+			pg_log_debug("Source timeline history:");
+		else if (controlFile == &ControlFile_target)
+			pg_log_debug("Target timeline history:");
+		else
+			Assert(false);
+
+		/*
+		 * Print the target timeline history.
+		 */
+		for (i = 0; i < targetNentries; i++)
+		{
+			TimeLineHistoryEntry *entry;
+
+			entry = &history[i];
+			pg_log_debug("%u: %X/%X - %X/%X", entry->tli,
+						 LSN_FORMAT_ARGS(entry->begin),
+						 LSN_FORMAT_ARGS(entry->end));
+		}
+	}
+
+	return history;
+}
+
+/*
+ * Determine the TLI of the last common timeline in the timeline history of the
+ * two clusters. targetHistory is filled with target timeline history and
+ * targetNentries is number of items in targetHistory. *tliIndex is set to the
+ * index of last common timeline in targetHistory array, and *recptr is set to
+ * the position where the timeline history diverged (ie. the first WAL record
+ * that's not the same in both clusters).
+ *
+ * Control files of both clusters must be read into ControlFile_target/source
+ * before calling this routine.
+ */
+static void
+findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex)
+{
+	TimeLineHistoryEntry *sourceHistory;
+	int			sourceNentries;
+	int			i,
+				n;
+
+	/* Retrieve timelines for both source and target */
+	sourceHistory = getTimelineHistory(&ControlFile_source, &sourceNentries);
+	targetHistory = getTimelineHistory(&ControlFile_target, &targetNentries);
+
+	/*
+	 * Trace the history forward, until we hit the timeline diverge. It may
+	 * still be possible that the source and target nodes used the same
+	 * timeline number in their history but with different start position
+	 * depending on the history files that each node has fetched in previous
+	 * recovery processes. Hence check the start position of the new timeline
+	 * as well and move down by one extra timeline entry if they do not match.
+	 */
+	n = Min(sourceNentries, targetNentries);
+	for (i = 0; i < n; i++)
+	{
+		if (sourceHistory[i].tli != targetHistory[i].tli ||
+			sourceHistory[i].begin != targetHistory[i].begin)
+			break;
+	}
+
+	if (i > 0)
+	{
+		i--;
+		*recptr = MinXLogRecPtr(sourceHistory[i].end, targetHistory[i].end);
+		*tliIndex = i;
+
+		pg_free(sourceHistory);
+		return;
+	}
+	else
+	{
+		pg_fatal("could not find common ancestor of the source and target cluster's timelines");
+	}
+}
+
+
+/*
+ * Create a backup_label file that forces recovery to begin at the last common
+ * checkpoint.
+ */
+static void
+createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli, XLogRecPtr checkpointloc)
+{
+	XLogSegNo	startsegno;
+	time_t		stamp_time;
+	char		strfbuf[128];
+	char		xlogfilename[MAXFNAMELEN];
+	struct tm  *tmp;
+	char		buf[1000];
+	int			len;
+
+	XLByteToSeg(startpoint, startsegno, WalSegSz);
+	XLogFileName(xlogfilename, starttli, startsegno, WalSegSz);
+
+	/*
+	 * Construct backup label file
+	 */
+	stamp_time = time(NULL);
+	tmp = localtime(&stamp_time);
+	strftime(strfbuf, sizeof(strfbuf), "%Y-%m-%d %H:%M:%S %Z", tmp);
+
+	len = snprintf(buf, sizeof(buf),
+				   "START WAL LOCATION: %X/%X (file %s)\n"
+				   "CHECKPOINT LOCATION: %X/%X\n"
+				   "BACKUP METHOD: pg_rewind\n"
+				   "BACKUP FROM: standby\n"
+				   "START TIME: %s\n",
+	/* omit LABEL: line */
+				   LSN_FORMAT_ARGS(startpoint), xlogfilename,
+				   LSN_FORMAT_ARGS(checkpointloc),
+				   strfbuf);
+	if (len >= sizeof(buf))
+		pg_fatal("backup label buffer too small");	/* shouldn't happen */
+
+	/* TODO: move old file out of the way, if any. */
+	open_target_file("backup_label", true); /* BACKUP_LABEL_FILE */
+	write_target_range(buf, 0, len);
+	close_target_file();
+}
+
+/*
+ * Check CRC of control file
+ */
+static void
+checkControlFile(ControlFileData *ControlFile)
+{
+	pg_crc32c	crc;
+
+	/* Calculate CRC */
+	INIT_CRC32C(crc);
+	COMP_CRC32C(crc, (char *) ControlFile, offsetof(ControlFileData, crc));
+	FIN_CRC32C(crc);
+
+	/* And simply compare it */
+	if (!EQ_CRC32C(crc, ControlFile->crc))
+		pg_fatal("unexpected control file CRC");
+}
+
+/*
+ * Verify control file contents in the buffer 'content', and copy it to
+ * *ControlFile.
+ */
+static void
+digestControlFile(ControlFileData *ControlFile, const char *content,
+				  size_t size)
+{
+	if (size != PG_CONTROL_FILE_SIZE)
+		pg_fatal("unexpected control file size %d, expected %d",
+				 (int) size, PG_CONTROL_FILE_SIZE);
+
+	memcpy(ControlFile, content, sizeof(ControlFileData));
+
+	/* set and validate WalSegSz */
+	WalSegSz = ControlFile->xlog_seg_size;
+
+	if (!IsValidWalSegSize(WalSegSz))
+		pg_fatal(ngettext("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
+						  "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
+						  WalSegSz),
+				 WalSegSz);
+
+	/* Additional checks on control file */
+	checkControlFile(ControlFile);
+}
+
+/*
+ * Get value of GUC parameter restore_command from the target cluster.
+ *
+ * This uses a logic based on "postgres -C" to get the value from the
+ * cluster.
+ */
+static void
+getRestoreCommand(const char *argv0)
+{
+	int			rc;
+	char		postgres_exec_path[MAXPGPATH],
+				cmd_output[MAXPGPATH];
+	PQExpBuffer postgres_cmd;
+
+	if (!restore_wal)
+		return;
+
+	/* find postgres executable */
+	rc = find_other_exec(argv0, "postgres",
+						 PG_BACKEND_VERSIONSTR,
+						 postgres_exec_path);
+
+	if (rc < 0)
+	{
+		char		full_path[MAXPGPATH];
+
+		if (find_my_exec(argv0, full_path) < 0)
+			strlcpy(full_path, progname, sizeof(full_path));
+
+		if (rc == -1)
+			pg_fatal("program \"%s\" is needed by %s but was not found in the same directory as \"%s\"",
+					 "postgres", progname, full_path);
+		else
+			pg_fatal("program \"%s\" was found by \"%s\" but was not the same version as %s",
+					 "postgres", full_path, progname);
+	}
+
+	/*
+	 * Build a command able to retrieve the value of GUC parameter
+	 * restore_command, if set.
+	 */
+	postgres_cmd = createPQExpBuffer();
+
+	/* path to postgres, properly quoted */
+	appendShellString(postgres_cmd, postgres_exec_path);
+
+	/* add -D switch, with properly quoted data directory */
+	appendPQExpBufferStr(postgres_cmd, " -D ");
+	appendShellString(postgres_cmd, datadir_target);
+
+	/* add custom configuration file only if requested */
+	if (config_file != NULL)
+	{
+		appendPQExpBufferStr(postgres_cmd, " -c config_file=");
+		appendShellString(postgres_cmd, config_file);
+	}
+
+	/* add -C switch, for restore_command */
+	appendPQExpBufferStr(postgres_cmd, " -C restore_command");
+
+	if (!pipe_read_line(postgres_cmd->data, cmd_output, sizeof(cmd_output)))
+		exit(1);
+
+	(void) pg_strip_crlf(cmd_output);
+
+	if (strcmp(cmd_output, "") == 0)
+		pg_fatal("restore_command is not set in the target cluster");
+
+	restore_command = pg_strdup(cmd_output);
+
+	pg_log_debug("using for rewind restore_command = \'%s\'",
+				 restore_command);
+
+	destroyPQExpBuffer(postgres_cmd);
+}
+
+
+/*
+ * Ensure clean shutdown of target instance by launching single-user mode
+ * postgres to do crash recovery.
+ */
+static void
+ensureCleanShutdown(const char *argv0)
+{
+	int			ret;
+#define MAXCMDLEN (2 * MAXPGPATH)
+	char		exec_path[MAXPGPATH];
+	PQExpBuffer postgres_cmd;
+
+	/* locate postgres binary */
+	if ((ret = find_other_exec(argv0, "postgres",
+							   PG_BACKEND_VERSIONSTR,
+							   exec_path)) < 0)
+	{
+		char		full_path[MAXPGPATH];
+
+		if (find_my_exec(argv0, full_path) < 0)
+			strlcpy(full_path, progname, sizeof(full_path));
+
+		if (ret == -1)
+			pg_fatal("program \"%s\" is needed by %s but was not found in the same directory as \"%s\"",
+					 "postgres", progname, full_path);
+		else
+			pg_fatal("program \"%s\" was found by \"%s\" but was not the same version as %s",
+					 "postgres", full_path, progname);
+	}
+
+	pg_log_info("executing \"%s\" for target server to complete crash recovery",
+				exec_path);
+
+	/*
+	 * Skip processing if requested, but only after ensuring presence of
+	 * postgres.
+	 */
+	if (dry_run)
+		return;
+
+	/*
+	 * Finally run postgres in single-user mode.  There is no need to use
+	 * fsync here.  This makes the recovery faster, and the target data folder
+	 * is synced at the end anyway.
+	 */
+	postgres_cmd = createPQExpBuffer();
+
+	/* path to postgres, properly quoted */
+	appendShellString(postgres_cmd, exec_path);
+
+	/* add set of options with properly quoted data directory */
+	appendPQExpBufferStr(postgres_cmd, " --single -F -D ");
+	appendShellString(postgres_cmd, datadir_target);
+
+	/* add custom configuration file only if requested */
+	if (config_file != NULL)
+	{
+		appendPQExpBufferStr(postgres_cmd, " -c config_file=");
+		appendShellString(postgres_cmd, config_file);
+	}
+
+	/* finish with the database name, and a properly quoted redirection */
+	appendPQExpBufferStr(postgres_cmd, " template1 < ");
+	appendShellString(postgres_cmd, DEVNULL);
+
+	if (system(postgres_cmd->data) != 0)
+	{
+		pg_log_error("postgres single-user mode in target cluster failed");
+		pg_log_error_detail("Command was: %s", postgres_cmd->data);
+		exit(1);
+	}
+
+	destroyPQExpBuffer(postgres_cmd);
+}
+
+static void
+disconnect_atexit(void)
+{
+	if (conn != NULL)
+		PQfinish(conn);
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-04 12:17:33 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-04 12:17:33 +0000
commit	5e45211a64149b3c659b90ff2de6fa982a5a93ed (patch)
tree	739caf8c461053357daa9f162bef34516c7bf452 /src/bin/pg_rewind/pg_rewind.c
parent	Initial commit. (diff)
download	postgresql-15-5e45211a64149b3c659b90ff2de6fa982a5a93ed.tar.xz postgresql-15-5e45211a64149b3c659b90ff2de6fa982a5a93ed.zip