# Copyright (c) 2021-2022, PostgreSQL Global Development Group

#
# Tests related to WAL archiving and recovery.
#
use strict;
use warnings;
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;

# Create and start a primary node initialized with archiving and streaming
# enabled; autovacuum is turned off in its postgresql.conf.
my $primary = PostgreSQL::Test::Cluster->new('primary');
$primary->init(
    has_archiving    => 1,
    allows_streaming => 1,
);
$primary->append_conf('postgresql.conf', 'autovacuum = off');
$primary->start;
my $primary_data = $primary->data_dir;

# Temporarily install an archive_command that is certain to fail while
# archiving itself stays enabled.  A command that does not exist cannot be
# used: the archiver process would then simply exit without reporting the
# failure to pg_stat_archiver.  A plain "false" is not an option either, as
# it is not portable to Windows.  The portable answer is a copy command that
# normally works but is given a source path that does not exist.
my $incorrect_command;
if ($PostgreSQL::Test::Utils::windows_os)
{
    $incorrect_command = qq{copy "%p_does_not_exist" "%f_does_not_exist"};
}
else
{
    $incorrect_command = qq{cp "%p_does_not_exist" "%f_does_not_exist"};
}
$primary->safe_psql(
    'postgres',
    qq{ ALTER SYSTEM SET archive_command TO '$incorrect_command'; SELECT pg_reload_conf(); }
);

# Remember the WAL segment currently in use, then switch to a new segment.
# The archive status files of the remembered segment are used afterwards to
# follow what the archiver does.
my $segment_name_1 = $primary->safe_psql(
    'postgres',
    q{SELECT pg_walfile_name(pg_current_wal_lsn())}
);
my $segment_path_1       = 'pg_wal/archive_status/' . $segment_name_1;
my $segment_path_1_ready = $segment_path_1 . '.ready';
my $segment_path_1_done  = $segment_path_1 . '.done';
$primary->safe_psql(
    'postgres',
    q{ CREATE TABLE mine AS SELECT generate_series(1,10) AS x; SELECT pg_switch_wal(); CHECKPOINT; }
);

# Wait for an archive failure.
# Block until pg_stat_archiver reports at least one failed archive attempt.
die "Timed out while waiting for archiving to fail"
  unless $primary->poll_query_until(
    'postgres',
    q{SELECT failed_count > 0 FROM pg_stat_archiver},
    't'
  );

# The switched-out segment must still be flagged as pending (.ready) and
# must not be flagged as archived (.done).
ok(
    -f "$primary_data/$segment_path_1_ready",
    ".ready file exists for WAL segment $segment_name_1 waiting to be archived"
);
ok(
    !-f "$primary_data/$segment_path_1_done",
    ".done file does not exist for WAL segment $segment_name_1 waiting to be archived"
);

# Nothing has been archived so far, and the segment whose archiving failed
# last is the one recorded above.
is(
    $primary->safe_psql(
        'postgres',
        q{ SELECT archived_count, last_failed_wal FROM pg_stat_archiver }
    ),
    "0|$segment_name_1",
    "pg_stat_archiver failed to archive $segment_name_1"
);

# Crash the cluster; the check after the restart verifies that WAL segments
# which have not been archived are not removed.
$primary->stop('immediate');

# The standby tests further down partially cover recovery from a backup that
# was taken as a plain snapshot, without pg_backup_start/stop.  A standby
# restored that way is expected to go through crash recovery first and then
# switch to regular archive recovery.  The cold backup is taken at this
# point, while archive_command is still the failing one, because the standby
# tests rely on that assumption.
$primary->backup_fs_cold('backup');

$primary->start;
ok(
    -f "$primary_data/$segment_path_1_ready",
    ".ready file for WAL segment $segment_name_1 still exists after crash recovery on primary"
);

# Allow WAL archiving again and wait for a success.
# Undo the ALTER SYSTEM override of archive_command and reload, so that
# archiving is able to succeed again.
$primary->safe_psql(
    'postgres',
    q{ ALTER SYSTEM RESET archive_command; SELECT pg_reload_conf(); }
);

# Block until exactly one segment is reported as archived.
die "Timed out while waiting for archiving to finish"
  unless $primary->poll_query_until(
    'postgres',
    q{SELECT archived_count FROM pg_stat_archiver},
    '1'
  );

# The status file of the first segment must have moved from .ready to .done.
ok(
    !-f "$primary_data/$segment_path_1_ready",
    ".ready file for archived WAL segment $segment_name_1 removed"
);
ok(
    -f "$primary_data/$segment_path_1_done",
    ".done file for archived WAL segment $segment_name_1 exists"
);

is(
    $primary->safe_psql(
        'postgres',
        q{ SELECT last_archived_wal FROM pg_stat_archiver }
    ),
    $segment_name_1,
    "archive success reported in pg_stat_archiver for WAL segment $segment_name_1"
);

# Generate some WAL activity followed by a checkpoint so that the next
# standby is able to create a restartpoint.  That standby is built from the
# cold backup taken previously and therefore starts in crash recovery; it
# needs a clean restartpoint to deal with the status files that already
# exist.
my $segment_name_2 = $primary->safe_psql(
    'postgres',
    q{SELECT pg_walfile_name(pg_current_wal_lsn())}
);
my $segment_path_2       = 'pg_wal/archive_status/' . $segment_name_2;
my $segment_path_2_ready = $segment_path_2 . '.ready';
my $segment_path_2_done  = $segment_path_2 . '.done';
$primary->safe_psql(
    'postgres',
    q{ INSERT INTO mine SELECT generate_series(10,20) AS x; CHECKPOINT; }
);

# Switch to a new segment.  The LSN returned by the switch is kept so that
# the standbys can later be checked to have caught up to this point.
my $primary_lsn = $primary->safe_psql(
    'postgres',
    q{ SELECT pg_switch_wal(); }
);

# Block until the second segment is the last one reported as archived.
die "Timed out while waiting for archiving to finish"
  unless $primary->poll_query_until(
    'postgres',
    q{ SELECT last_archived_wal FROM pg_stat_archiver },
    $segment_name_2
  );

# Test standby with archive_mode = on.
# First standby: restored from the cold backup, with archive_mode = on.
my $standby1 = PostgreSQL::Test::Cluster->new('standby');
$standby1->init_from_backup(
    $primary, 'backup',
    has_restoring => 1
);
$standby1->append_conf('postgresql.conf', 'archive_mode = on');
my $standby1_data = $standby1->data_dir;
$standby1->start;

# Block until the segment switch done on the primary has been replayed,
# which ensures that every segment needed was restored from the archives.
die "Timed out while waiting for xlog replay on standby1"
  unless $standby1->poll_query_until(
    'postgres',
    qq{ SELECT pg_wal_lsn_diff(pg_last_wal_replay_lsn(), '$primary_lsn') >= 0 }
  );

$standby1->safe_psql('postgres', q{CHECKPOINT});

# With archive_mode=on, recovery does not keep the .ready signal files
# inherited from the backup.  This WAL segment existed in the backup.
ok(
    !-f "$standby1_data/$segment_path_1_ready",
    ".ready file for WAL segment $segment_name_1 present in backup got removed with archive_mode=on on standby"
);

# With archive_mode=on, recovery should not create .ready files either.
# This segment did not exist in the backup.
ok(
    !-f "$standby1_data/$segment_path_2_ready",
    ".ready file for WAL segment $segment_name_2 not created on standby when archive_mode=on on standby"
);

# With archive_mode=on, recovery creates .done files.
ok(
    -f "$standby1_data/$segment_path_2_done",
    ".done file for WAL segment $segment_name_2 created when archive_mode=on on standby"
);

# Test recovery with archive_mode = always, which should always keep
# .ready files if archiving is enabled, though here we want the archive
# command to fail to persist the .ready files.  Note that this node
# has inherited the archive command of the previous cold backup that
# will cause archiving failures.
# Second standby: restored from the same cold backup, but running with
# archive_mode = always.
my $standby2 = PostgreSQL::Test::Cluster->new('standby2');
$standby2->init_from_backup(
    $primary, 'backup',
    has_restoring => 1
);
$standby2->append_conf('postgresql.conf', 'archive_mode = always');
my $standby2_data = $standby2->data_dir;
$standby2->start;

# Block until the segment switch done on the primary has been replayed,
# which ensures that every segment needed was restored from the archives.
die "Timed out while waiting for xlog replay on standby2"
  unless $standby2->poll_query_until(
    'postgres',
    qq{ SELECT pg_wal_lsn_diff(pg_last_wal_replay_lsn(), '$primary_lsn') >= 0 }
  );

$standby2->safe_psql('postgres', q{CHECKPOINT});

# Both the segment that came with the backup and the one restored later
# must be flagged as pending archival on this standby.
ok(
    -f "$standby2_data/$segment_path_1_ready",
    ".ready file for WAL segment $segment_name_1 existing in backup is kept with archive_mode=always on standby"
);
ok(
    -f "$standby2_data/$segment_path_2_ready",
    ".ready file for WAL segment $segment_name_2 created with archive_mode=always on standby"
);

# Start the archiver statistics from scratch for the checks that follow.
$standby2->safe_psql('postgres', q{SELECT pg_stat_reset_shared('archiver')});

# Crash and restart the standby to verify that the recovery step does not
# remove non-archived WAL segments on a standby where archiving is enabled.
$standby2->stop('immediate');
$standby2->start;

ok(
    -f "$standby2_data/$segment_path_1_ready",
    "WAL segment still ready to archive after crash recovery on standby with archive_mode=always"
);

# Allow WAL archiving again, and wait for the segments to be archived.
# Undo the ALTER SYSTEM override of archive_command on the standby and
# reload, so that its pending segments can be archived.
$standby2->safe_psql(
    'postgres',
    q{ ALTER SYSTEM RESET archive_command; SELECT pg_reload_conf(); }
);

# Block until the second segment is the last one reported as archived.
$standby2->poll_query_until(
    'postgres',
    q{SELECT last_archived_wal FROM pg_stat_archiver},
    $segment_name_2
) or die "Timed out while waiting for archiving to finish";

# The archiver statistics were reset on this standby before it was crashed,
# so the counter is expected to cover exactly the two tracked segments.
is(
    $standby2->safe_psql(
        'postgres', q{SELECT archived_count FROM pg_stat_archiver}
    ),
    '2',
    "correct number of WAL segments archived from standby"
);

# Both status files must have moved from .ready to .done.
ok(
    !-f "$standby2_data/$segment_path_1_ready"
      && !-f "$standby2_data/$segment_path_2_ready",
    ".ready files removed after archive success with archive_mode=always on standby"
);

ok(
    -f "$standby2_data/$segment_path_1_done"
      && -f "$standby2_data/$segment_path_2_done",
    ".done files created after archive success with archive_mode=always on standby"
);

# Check that the archiver process calls the shell archive module's shutdown
# callback.  The message looked for is only expected in the log once
# log_min_messages has been lowered to debug1.
$standby2->append_conf('postgresql.conf', "log_min_messages = debug1");
$standby2->reload;

# Run a query to make sure that the reload has taken effect.
$standby2->safe_psql('postgres', q{SELECT 1});

# Record the current size of the log file before stopping the node.  It is
# handed to slurp_file() below, presumably as the offset to start reading
# from, so that only output written from here on is inspected.
my $log_location = -s $standby2->logfile;

$standby2->stop;
my $logfile = slurp_file($standby2->logfile, $log_location);

# like() rather than ok() with a bare match: on failure it reports the text
# that was examined along with the pattern, instead of a plain "not ok".
like(
    $logfile,
    qr/archiver process shutting down/,
    'check shutdown callback of shell archive module'
);

done_testing();