1 files changed, 251 insertions, 0 deletions
diff --git a/src/test/recovery/t/020_archive_status.pl b/src/test/recovery/t/020_archive_status.pl
new file mode 100644
index 0000000..2108d50
--- /dev/null
+++ b/src/test/recovery/t/020_archive_status.pl
@@ -0,0 +1,251 @@
+
+# Copyright (c) 2021-2022, PostgreSQL Global Development Group
+
+#
+# Tests related to WAL archiving and recovery.
+#
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my $primary = PostgreSQL::Test::Cluster->new('primary');
+$primary->init(
+	has_archiving    => 1,
+	allows_streaming => 1);
+$primary->append_conf('postgresql.conf', 'autovacuum = off');
+$primary->start;
+my $primary_data = $primary->data_dir;
+
+# Temporarily use an archive_command value to make the archiver fail,
+# knowing that archiving is enabled.  Note that we cannot use a command
+# that does not exist as in this case the archiver process would just exit
+# without reporting the failure to pg_stat_archiver.  This also cannot
+# use a plain "false" as that's unportable on Windows.  So, instead, as
+# a portable solution, use an archive command based on a command known to
+# work but will fail: copy with an incorrect original path.
+my $incorrect_command =
+  $PostgreSQL::Test::Utils::windows_os
+  ? qq{copy "%p_does_not_exist" "%f_does_not_exist"}
+  : qq{cp "%p_does_not_exist" "%f_does_not_exist"};
+$primary->safe_psql(
+	'postgres', qq{
+    ALTER SYSTEM SET archive_command TO '$incorrect_command';
+    SELECT pg_reload_conf();
+});
+
+# Save the WAL segment currently in use and switch to a new segment.
+# This will be used to track the activity of the archiver.
+my $segment_name_1 = $primary->safe_psql('postgres',
+	q{SELECT pg_walfile_name(pg_current_wal_lsn())});
+my $segment_path_1       = "pg_wal/archive_status/$segment_name_1";
+my $segment_path_1_ready = "$segment_path_1.ready";
+my $segment_path_1_done  = "$segment_path_1.done";
+$primary->safe_psql(
+	'postgres', q{
+	CREATE TABLE mine AS SELECT generate_series(1,10) AS x;
+	SELECT pg_switch_wal();
+	CHECKPOINT;
+});
+
+# Wait for an archive failure.
+$primary->poll_query_until('postgres',
+	q{SELECT failed_count > 0 FROM pg_stat_archiver}, 't')
+  or die "Timed out while waiting for archiving to fail";
+ok( -f "$primary_data/$segment_path_1_ready",
+	".ready file exists for WAL segment $segment_name_1 waiting to be archived"
+);
+ok( !-f "$primary_data/$segment_path_1_done",
+	".done file does not exist for WAL segment $segment_name_1 waiting to be archived"
+);
+
+is( $primary->safe_psql(
+		'postgres', q{
+		SELECT archived_count, last_failed_wal
+		FROM pg_stat_archiver
+	}),
+	"0|$segment_name_1",
+	"pg_stat_archiver failed to archive $segment_name_1");
+
+# Crash the cluster for the next test in charge of checking that non-archived
+# WAL segments are not removed.
+$primary->stop('immediate');
+
+# Recovery tests for the archiving with a standby partially check
+# the recovery behavior when restoring a backup taken using a
+# snapshot with no pg_backup_start/stop.  In this situation,
+# the recovered standby should enter first crash recovery then
+# switch to regular archive recovery.  Note that the base backup
+# is taken here so as archive_command will fail.  This is necessary
+# for the assumptions of the tests done with the standbys below.
+$primary->backup_fs_cold('backup');
+
+$primary->start;
+ok( -f "$primary_data/$segment_path_1_ready",
+	".ready file for WAL segment $segment_name_1 still exists after crash recovery on primary"
+);
+
+# Allow WAL archiving again and wait for a success.
+$primary->safe_psql(
+	'postgres', q{
+	ALTER SYSTEM RESET archive_command;
+	SELECT pg_reload_conf();
+});
+
+$primary->poll_query_until('postgres',
+	q{SELECT archived_count FROM pg_stat_archiver}, '1')
+  or die "Timed out while waiting for archiving to finish";
+
+ok(!-f "$primary_data/$segment_path_1_ready",
+	".ready file for archived WAL segment $segment_name_1 removed");
+
+ok(-f "$primary_data/$segment_path_1_done",
+	".done file for archived WAL segment $segment_name_1 exists");
+
+is( $primary->safe_psql(
+		'postgres', q{ SELECT last_archived_wal FROM pg_stat_archiver }),
+	$segment_name_1,
+	"archive success reported in pg_stat_archiver for WAL segment $segment_name_1"
+);
+
+# Create some WAL activity and a new checkpoint so as the next standby can
+# create a restartpoint.  As this standby starts in crash recovery because
+# of the cold backup taken previously, it needs a clean restartpoint to deal
+# with existing status files.
+my $segment_name_2 = $primary->safe_psql('postgres',
+	q{SELECT pg_walfile_name(pg_current_wal_lsn())});
+my $segment_path_2       = "pg_wal/archive_status/$segment_name_2";
+my $segment_path_2_ready = "$segment_path_2.ready";
+my $segment_path_2_done  = "$segment_path_2.done";
+$primary->safe_psql(
+	'postgres', q{
+	INSERT INTO mine SELECT generate_series(10,20) AS x;
+	CHECKPOINT;
+});
+
+# Switch to a new segment and use the returned LSN to make sure that
+# standbys have caught up to this point.
+my $primary_lsn = $primary->safe_psql(
+	'postgres', q{
+	SELECT pg_switch_wal();
+});
+
+$primary->poll_query_until('postgres',
+	q{ SELECT last_archived_wal FROM pg_stat_archiver },
+	$segment_name_2)
+  or die "Timed out while waiting for archiving to finish";
+
+# Test standby with archive_mode = on.
+my $standby1 = PostgreSQL::Test::Cluster->new('standby');
+$standby1->init_from_backup($primary, 'backup', has_restoring => 1);
+$standby1->append_conf('postgresql.conf', "archive_mode = on");
+my $standby1_data = $standby1->data_dir;
+$standby1->start;
+
+# Wait for the replay of the segment switch done previously, ensuring
+# that all segments needed are restored from the archives.
+$standby1->poll_query_until('postgres',
+	qq{ SELECT pg_wal_lsn_diff(pg_last_wal_replay_lsn(), '$primary_lsn') >= 0 }
+) or die "Timed out while waiting for xlog replay on standby1";
+
+$standby1->safe_psql('postgres', q{CHECKPOINT});
+
+# Recovery with archive_mode=on does not keep .ready signal files inherited
+# from backup.  Note that this WAL segment existed in the backup.
+ok( !-f "$standby1_data/$segment_path_1_ready",
+	".ready file for WAL segment $segment_name_1 present in backup got removed with archive_mode=on on standby"
+);
+
+# Recovery with archive_mode=on should not create .ready files.
+# Note that this segment did not exist in the backup.
+ok( !-f "$standby1_data/$segment_path_2_ready",
+	".ready file for WAL segment $segment_name_2 not created on standby when archive_mode=on on standby"
+);
+
+# Recovery with archive_mode = on creates .done files.
+ok( -f "$standby1_data/$segment_path_2_done",
+	".done file for WAL segment $segment_name_2 created when archive_mode=on on standby"
+);
+
+# Test recovery with archive_mode = always, which should always keep
+# .ready files if archiving is enabled, though here we want the archive
+# command to fail to persist the .ready files.  Note that this node
+# has inherited the archive command of the previous cold backup that
+# will cause archiving failures.
+my $standby2 = PostgreSQL::Test::Cluster->new('standby2');
+$standby2->init_from_backup($primary, 'backup', has_restoring => 1);
+$standby2->append_conf('postgresql.conf', 'archive_mode = always');
+my $standby2_data = $standby2->data_dir;
+$standby2->start;
+
+# Wait for the replay of the segment switch done previously, ensuring
+# that all segments needed are restored from the archives.
+$standby2->poll_query_until('postgres',
+	qq{ SELECT pg_wal_lsn_diff(pg_last_wal_replay_lsn(), '$primary_lsn') >= 0 }
+) or die "Timed out while waiting for xlog replay on standby2";
+
+$standby2->safe_psql('postgres', q{CHECKPOINT});
+
+ok( -f "$standby2_data/$segment_path_1_ready",
+	".ready file for WAL segment $segment_name_1 existing in backup is kept with archive_mode=always on standby"
+);
+
+ok( -f "$standby2_data/$segment_path_2_ready",
+	".ready file for WAL segment $segment_name_2 created with archive_mode=always on standby"
+);
+
+# Reset statistics of the archiver for the next checks.
+$standby2->safe_psql('postgres', q{SELECT pg_stat_reset_shared('archiver')});
+
+# Now crash the cluster to check that recovery step does not
+# remove non-archived WAL segments on a standby where archiving
+# is enabled.
+$standby2->stop('immediate');
+$standby2->start;
+
+ok( -f "$standby2_data/$segment_path_1_ready",
+	"WAL segment still ready to archive after crash recovery on standby with archive_mode=always"
+);
+
+# Allow WAL archiving again, and wait for the segments to be archived.
+$standby2->safe_psql(
+	'postgres', q{
+	ALTER SYSTEM RESET archive_command;
+	SELECT pg_reload_conf();
+});
+$standby2->poll_query_until('postgres',
+	q{SELECT last_archived_wal FROM pg_stat_archiver},
+	$segment_name_2)
+  or die "Timed out while waiting for archiving to finish";
+
+is( $standby2->safe_psql(
+		'postgres', q{SELECT archived_count FROM pg_stat_archiver}),
+	'2',
+	"correct number of WAL segments archived from standby");
+
+ok( !-f "$standby2_data/$segment_path_1_ready"
+	  && !-f "$standby2_data/$segment_path_2_ready",
+	".ready files removed after archive success with archive_mode=always on standby"
+);
+
+ok( -f "$standby2_data/$segment_path_1_done"
+	  && -f "$standby2_data/$segment_path_2_done",
+	".done files created after archive success with archive_mode=always on standby"
+);
+
+# Check that the archiver process calls the shell archive module's shutdown
+# callback.
+$standby2->append_conf('postgresql.conf', "log_min_messages = debug1");
+$standby2->reload;
+
+# Run a query to make sure that the reload has taken effect.
+$standby2->safe_psql('postgres', q{SELECT 1});
+my $log_location = -s $standby2->logfile;
+
+$standby2->stop;
+my $logfile = slurp_file($standby2->logfile, $log_location);
+ok( $logfile =~ qr/archiver process shutting down/,
+	'check shutdown callback of shell archive module');
+
+done_testing();