Diffstat (limited to 'src/test/recovery/t/019_replslot_limit.pl')
-rw-r--r-- | src/test/recovery/t/019_replslot_limit.pl | 444
1 file changed, 444 insertions(+), 0 deletions(-)
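
Every check in the test below ultimately polls the pg_replication_slots view for the slot's wal_status and safe_wal_size columns. For orientation before reading the full diff, here is a minimal sketch of that status check, written against the same PostgreSQL::Test::Cluster API the test uses. It is not part of the commit, the node name 'demo' is a placeholder, and it assumes it is run under the src/test/perl TAP harness just like the test itself.

# Sketch only (not part of the commit): create a cluster and a reserved
# physical slot, then run the status query the test repeats.
use strict;
use warnings;
use PostgreSQL::Test::Cluster;

my $node = PostgreSQL::Test::Cluster->new('demo');
$node->init(allows_streaming => 1);
# Give the slot a retention limit so safe_wal_size is reported (it is NULL
# while max_slot_wal_keep_size is -1, the default).
$node->append_conf('postgresql.conf', "max_slot_wal_keep_size = 10MB\n");
$node->start;

# The second argument reserves WAL immediately, so wal_status is populated.
$node->safe_psql('postgres',
    "SELECT pg_create_physical_replication_slot('rep1', true)");

# wal_status moves through reserved -> extended -> unreserved -> lost as the
# slot falls behind; safe_wal_size is how much more WAL can be written before
# the slot risks losing required segments.
print $node->safe_psql('postgres',
    "SELECT wal_status, safe_wal_size FROM pg_replication_slots"
      . " WHERE slot_name = 'rep1'"), "\n";

$node->stop;
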
diff --git a/src/test/recovery/t/019_replslot_limit.pl b/src/test/recovery/t/019_replslot_limit.pl
new file mode 100644
index 0000000..4ec1a9a
--- /dev/null
+++ b/src/test/recovery/t/019_replslot_limit.pl
@@ -0,0 +1,444 @@
+
+# Copyright (c) 2021-2022, PostgreSQL Global Development Group
+
+# Test for the replication slot limit
+# Ensure that max_slot_wal_keep_size limits the number of WAL files to
+# be kept by replication slots.
+use strict;
+use warnings;
+
+use PostgreSQL::Test::Utils;
+use PostgreSQL::Test::Cluster;
+
+use File::Path qw(rmtree);
+use Test::More;
+use Time::HiRes qw(usleep);
+
+$ENV{PGDATABASE} = 'postgres';
+
+# Initialize the primary node, setting wal-segsize to 1MB
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 1, extra => ['--wal-segsize=1']);
+$node_primary->append_conf(
+    'postgresql.conf', qq(
+min_wal_size = 2MB
+max_wal_size = 4MB
+log_checkpoints = yes
+));
+$node_primary->start;
+$node_primary->safe_psql('postgres',
+    "SELECT pg_create_physical_replication_slot('rep1')");
+
+# The slot's status columns should all be NULL before the first connection
+my $result = $node_primary->safe_psql('postgres',
+    "SELECT restart_lsn IS NULL, wal_status is NULL, safe_wal_size is NULL FROM pg_replication_slots WHERE slot_name = 'rep1'"
+);
+is($result, "t|t|t", 'check the state of non-reserved slot is "unknown"');
+
+
+# Take backup
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+
+# Create a standby linking to the primary using the replication slot
+my $node_standby = PostgreSQL::Test::Cluster->new('standby_1');
+$node_standby->init_from_backup($node_primary, $backup_name,
+    has_streaming => 1);
+$node_standby->append_conf('postgresql.conf', "primary_slot_name = 'rep1'");
+
+$node_standby->start;
+
+# Wait until the standby has replayed enough data
+$node_primary->wait_for_catchup($node_standby);
+
+# Stop the standby
+$node_standby->stop;
+
+# Preparation done; the slot is now in the "reserved" state
+$result = $node_primary->safe_psql('postgres',
+    "SELECT wal_status, safe_wal_size IS NULL FROM pg_replication_slots WHERE slot_name = 'rep1'"
+);
+is($result, "reserved|t", 'check the catching-up state');
+
+# Advance WAL by five segments (= 5MB) in total on the primary
+advance_wal($node_primary, 1);
+$node_primary->safe_psql('postgres', "CHECKPOINT;");
+
+# The slot is always "safe" when the WAL still fits in max_wal_size
+$result = $node_primary->safe_psql('postgres',
+    "SELECT wal_status, safe_wal_size IS NULL FROM pg_replication_slots WHERE slot_name = 'rep1'"
+);
+is($result, "reserved|t",
+    'check that it is safe if WAL fits in max_wal_size');
+
+advance_wal($node_primary, 4);
+$node_primary->safe_psql('postgres', "CHECKPOINT;");
+
+# The slot is always "safe" when max_slot_wal_keep_size is not set
+$result = $node_primary->safe_psql('postgres',
+    "SELECT wal_status, safe_wal_size IS NULL FROM pg_replication_slots WHERE slot_name = 'rep1'"
+);
+is($result, "reserved|t", 'check that slot is working');
+
+# The standby can reconnect to the primary
+$node_standby->start;
+
+$node_primary->wait_for_catchup($node_standby);
+
+$node_standby->stop;
+
+# Set max_slot_wal_keep_size on the primary
+my $max_slot_wal_keep_size_mb = 6;
+$node_primary->append_conf(
+    'postgresql.conf', qq(
+max_slot_wal_keep_size = ${max_slot_wal_keep_size_mb}MB
+));
+$node_primary->reload;
+
+# The slot is in the safe state.
+
+$result = $node_primary->safe_psql('postgres',
+    "SELECT wal_status FROM pg_replication_slots WHERE slot_name = 'rep1'");
+is($result, "reserved", 'check that max_slot_wal_keep_size is working');
+
+# Advance WAL again, then checkpoint, reducing safe_wal_size by 2 MB.
+advance_wal($node_primary, 2);
+$node_primary->safe_psql('postgres', "CHECKPOINT;");
+
+# The slot is still working
+$result = $node_primary->safe_psql('postgres',
+    "SELECT wal_status FROM pg_replication_slots WHERE slot_name = 'rep1'");
+is($result, "reserved",
+    'check that safe_wal_size gets close to the current LSN');
+
+# The standby can reconnect to the primary
+$node_standby->start;
+$node_primary->wait_for_catchup($node_standby);
+$node_standby->stop;
+
+# wal_keep_size overrides max_slot_wal_keep_size
+$result = $node_primary->safe_psql('postgres',
+    "ALTER SYSTEM SET wal_keep_size to '8MB'; SELECT pg_reload_conf();");
+# Advance WAL again, then checkpoint, reducing safe_wal_size by 6 MB.
+advance_wal($node_primary, 6);
+$result = $node_primary->safe_psql('postgres',
+    "SELECT wal_status as remain FROM pg_replication_slots WHERE slot_name = 'rep1'"
+);
+is($result, "extended",
+    'check that wal_keep_size overrides max_slot_wal_keep_size');
+# Restore wal_keep_size
+$result = $node_primary->safe_psql('postgres',
+    "ALTER SYSTEM SET wal_keep_size to 0; SELECT pg_reload_conf();");
+
+# The standby can reconnect to the primary
+$node_standby->start;
+$node_primary->wait_for_catchup($node_standby);
+$node_standby->stop;
+
+# Advance WAL again without a checkpoint, reducing safe_wal_size by 6 MB.
+advance_wal($node_primary, 6);
+
+# Slot gets into the 'extended' state
+$result = $node_primary->safe_psql('postgres',
+    "SELECT wal_status FROM pg_replication_slots WHERE slot_name = 'rep1'");
+is($result, "extended", 'check that the slot state changes to "extended"');
+
+# Do a checkpoint so that the next checkpoint runs too early
+$node_primary->safe_psql('postgres', "CHECKPOINT;");
+
+# Advance WAL again without a checkpoint; safe_wal_size drops to zero or below.
+advance_wal($node_primary, 1);
+
+# Slot gets into the 'unreserved' state and safe_wal_size is negative
+$result = $node_primary->safe_psql('postgres',
+    "SELECT wal_status, safe_wal_size <= 0 FROM pg_replication_slots WHERE slot_name = 'rep1'"
+);
+is($result, "unreserved|t",
+    'check that the slot state changes to "unreserved"');
+
+# The standby can still connect to the primary before a checkpoint
+$node_standby->start;
+
+$node_primary->wait_for_catchup($node_standby);
+
+$node_standby->stop;
+
+ok( !$node_standby->log_contains(
+        "requested WAL segment [0-9A-F]+ has already been removed"),
+    'check that required WAL segments are still available');
+
+# Create one checkpoint, to improve the stability of the next steps
+$node_primary->safe_psql('postgres', "CHECKPOINT;");
+
+# Prevent other checkpoints from occurring while advancing WAL segments
+$node_primary->safe_psql('postgres',
+    "ALTER SYSTEM SET max_wal_size='40MB'; SELECT pg_reload_conf()");
+
+# Advance WAL again.  The slot will lose its oldest segment at the next checkpoint
+my $logstart = get_log_size($node_primary);
+advance_wal($node_primary, 7);
+
+# Now create another checkpoint and wait until the invalidation message is logged
+$node_primary->safe_psql('postgres',
+    'ALTER SYSTEM RESET max_wal_size; SELECT pg_reload_conf()');
+$node_primary->safe_psql('postgres', "CHECKPOINT;");
+my $invalidated = 0;
+for (my $i = 0; $i < 10000; $i++)
+{
+    if ($node_primary->log_contains(
+            "invalidating slot \"rep1\" because its restart_lsn [0-9A-F/]+ exceeds max_slot_wal_keep_size",
+            $logstart))
+    {
+        $invalidated = 1;
+        last;
+    }
+    usleep(100_000);
+}
+ok($invalidated, 'check that slot invalidation has been logged');
+
+$result = $node_primary->safe_psql(
+    'postgres',
+    qq[
+    SELECT slot_name, active, restart_lsn IS NULL, wal_status, safe_wal_size
+    FROM pg_replication_slots WHERE slot_name = 'rep1']);
+is($result, "rep1|f|t|lost|",
+    'check that the slot became inactive and the state "lost" persists');
+
+# Wait until the current checkpoint ends
+my $checkpoint_ended = 0;
+for (my $i = 0; $i < 10000; $i++)
+{
+    if ($node_primary->log_contains("checkpoint complete: ", $logstart))
+    {
+        $checkpoint_ended = 1;
+        last;
+    }
+    usleep(100_000);
+}
+ok($checkpoint_ended, 'waited for checkpoint to end');
+
+# The invalidated slot shouldn't keep the old-segment horizon back;
+# see bug #17103: https://postgr.es/m/17103-004130e8f27782c9@postgresql.org
+# Test for this by creating a new slot and comparing its restart LSN
+# to the oldest existing file.
+my $redoseg = $node_primary->safe_psql('postgres',
+    "SELECT pg_walfile_name(lsn) FROM pg_create_physical_replication_slot('s2', true)"
+);
+my $oldestseg = $node_primary->safe_psql('postgres',
+    "SELECT pg_ls_dir AS f FROM pg_ls_dir('pg_wal') WHERE pg_ls_dir ~ '^[0-9A-F]{24}\$' ORDER BY 1 LIMIT 1"
+);
+$node_primary->safe_psql('postgres',
+    qq[SELECT pg_drop_replication_slot('s2')]);
+is($oldestseg, $redoseg, "check that segments have been removed");
+
+# The standby can no longer connect to the primary
+$logstart = get_log_size($node_standby);
+$node_standby->start;
+
+my $failed = 0;
+for (my $i = 0; $i < 10000; $i++)
+{
+    if ($node_standby->log_contains(
+            "requested WAL segment [0-9A-F]+ has already been removed",
+            $logstart))
+    {
+        $failed = 1;
+        last;
+    }
+    usleep(100_000);
+}
+ok($failed, 'check that replication has been broken');
+
+$node_primary->stop;
+$node_standby->stop;
+
+my $node_primary2 = PostgreSQL::Test::Cluster->new('primary2');
+$node_primary2->init(allows_streaming => 1);
+$node_primary2->append_conf(
+    'postgresql.conf', qq(
+min_wal_size = 32MB
+max_wal_size = 32MB
+log_checkpoints = yes
+));
+$node_primary2->start;
+$node_primary2->safe_psql('postgres',
+    "SELECT pg_create_physical_replication_slot('rep1')");
+$backup_name = 'my_backup2';
+$node_primary2->backup($backup_name);
+
+$node_primary2->stop;
+$node_primary2->append_conf(
+    'postgresql.conf', qq(
+max_slot_wal_keep_size = 0
+));
+$node_primary2->start;
+
+$node_standby = PostgreSQL::Test::Cluster->new('standby_2');
+$node_standby->init_from_backup($node_primary2, $backup_name,
+    has_streaming => 1);
+$node_standby->append_conf('postgresql.conf', "primary_slot_name = 'rep1'");
+$node_standby->start;
+my @result =
+  split(
+    '\n',
+    $node_primary2->safe_psql(
+        'postgres',
+        "CREATE TABLE tt();
+         DROP TABLE tt;
+         SELECT pg_switch_wal();
+         CHECKPOINT;
+         SELECT 'finished';",
+        timeout => $PostgreSQL::Test::Utils::timeout_default));
+is($result[1], 'finished', 'check if checkpoint command is not blocked');
+
+$node_primary2->stop;
+$node_standby->stop;
+
+# The next test depends on Perl's `kill`, which apparently is not
+# portable to Windows.  (It would be nice to use Test::More's `subtest`,
+# but that's not in the ancient version we require.)
+if ($PostgreSQL::Test::Utils::windows_os)
+{
+    done_testing();
+    exit;
+}
+
+# Get a slot terminated while the walsender is active.
+# We do this by sending SIGSTOP to the walsender and walreceiver, hence the
+# Windows skip above.
+my $node_primary3 = PostgreSQL::Test::Cluster->new('primary3');
+$node_primary3->init(allows_streaming => 1, extra => ['--wal-segsize=1']);
+$node_primary3->append_conf(
+    'postgresql.conf', qq(
+    min_wal_size = 2MB
+    max_wal_size = 2MB
+    log_checkpoints = yes
+    max_slot_wal_keep_size = 1MB
+    ));
+$node_primary3->start;
+$node_primary3->safe_psql('postgres',
+    "SELECT pg_create_physical_replication_slot('rep3')");
+# Take backup
+$backup_name = 'my_backup';
+$node_primary3->backup($backup_name);
+# Create standby
+my $node_standby3 = PostgreSQL::Test::Cluster->new('standby_3');
+$node_standby3->init_from_backup($node_primary3, $backup_name,
+    has_streaming => 1);
+$node_standby3->append_conf('postgresql.conf', "primary_slot_name = 'rep3'");
+$node_standby3->start;
+$node_primary3->wait_for_catchup($node_standby3);
+
+my $senderpid;
+
+# We've seen occasional cases where multiple walsender pids are still active
+# at this point, apparently just due to process shutdown being slow.  To
+# avoid spurious failures, retry a couple of times.
+my $i = 0;
+while (1)
+{
+    my ($stdout, $stderr);
+
+    $senderpid = $node_primary3->safe_psql('postgres',
+        "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");
+
+    last if $senderpid =~ qr/^[0-9]+$/;
+
+    diag "multiple walsenders active in iteration $i";
+
+    # Show information about all active connections
+    $node_primary3->psql(
+        'postgres',
+        "\\a\\t\nSELECT * FROM pg_stat_activity",
+        stdout => \$stdout,
+        stderr => \$stderr);
+    diag $stdout, $stderr;
+
+    # It is unlikely that the problem would resolve itself after 15s, so
+    # give up at this point.
+    if ($i++ == 150)
+    {
+        # An immediate shutdown may hide evidence of a locking bug.  If
+        # retrying didn't resolve the issue, shut down in fast mode.
+        $node_primary3->stop('fast');
+        $node_standby3->stop('fast');
+        die "could not determine walsender pid, can't continue";
+    }
+
+    usleep(100_000);
+}
+
+like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");
+
+my $receiverpid = $node_standby3->safe_psql('postgres',
+    "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walreceiver'");
+like($receiverpid, qr/^[0-9]+$/, "have walreceiver pid $receiverpid");
+
+$logstart = get_log_size($node_primary3);
+# Freeze the walsender and walreceiver.  The slot will still be active, but
+# the walreceiver won't receive anything anymore.
+kill 'STOP', $senderpid, $receiverpid;
+advance_wal($node_primary3, 2);
+
+my $max_attempts = $PostgreSQL::Test::Utils::timeout_default;
+while ($max_attempts-- >= 0)
+{
+    if ($node_primary3->log_contains(
+            "terminating process $senderpid to release replication slot \"rep3\"",
+            $logstart))
+    {
+        ok(1, "walsender termination logged");
+        last;
+    }
+    sleep 1;
+}
+
+# Now let the walsender continue; the slot should be killed now.
+# (Must not let the walreceiver run yet; otherwise the standby could start
+# another one before the slot can be killed.)
+kill 'CONT', $senderpid;
+$node_primary3->poll_query_until('postgres',
+    "SELECT wal_status FROM pg_replication_slots WHERE slot_name = 'rep3'",
+    "lost")
+  or die "timed out waiting for slot to be lost";
+
+$max_attempts = $PostgreSQL::Test::Utils::timeout_default;
+while ($max_attempts-- >= 0)
+{
+    if ($node_primary3->log_contains(
+            'invalidating slot "rep3" because its restart_lsn', $logstart))
+    {
+        ok(1, "slot invalidation logged");
+        last;
+    }
+    sleep 1;
+}
+
+# Now let the walreceiver continue, so that the node can be stopped cleanly
+kill 'CONT', $receiverpid;
+
+$node_primary3->stop;
+$node_standby3->stop;
+
+#####################################
+# Advance the WAL of $node by $n segments
+sub advance_wal
+{
+    my ($node, $n) = @_;
+
+    # Generate some WAL, then switch to the next segment, $n times
+    for (my $i = 0; $i < $n; $i++)
+    {
+        $node->safe_psql('postgres',
+            "CREATE TABLE t (); DROP TABLE t; SELECT pg_switch_wal();");
+    }
+    return;
+}
+
+# Return the size of the log file of $node, in bytes
+sub get_log_size
+{
+    my ($node) = @_;
+
+    return (stat $node->logfile)[7];
+}
+
+done_testing();
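
As a companion to the helpers above, the following condensed sketch shows the core invalidation scenario the first half of the test exercises: a reserved slot that falls more than max_slot_wal_keep_size behind is invalidated by the next checkpoint and reports wal_status = 'lost'. It is not part of the committed test; the node and slot names ('demo', 'demo_slot') are placeholders, and it assumes the same src/test/perl TAP environment the test runs under.

# Sketch only (not part of the committed test): drive an unused physical slot
# past max_slot_wal_keep_size and watch it become 'lost'.
use strict;
use warnings;
use PostgreSQL::Test::Cluster;
use Test::More;

my $node = PostgreSQL::Test::Cluster->new('demo');
$node->init(allows_streaming => 1, extra => ['--wal-segsize=1']);
$node->append_conf(
    'postgresql.conf', qq(
min_wal_size = 2MB
max_wal_size = 4MB
max_slot_wal_keep_size = 2MB
));
$node->start;
$node->safe_psql('postgres',
    "SELECT pg_create_physical_replication_slot('demo_slot', true)");

# Consume WAL the same way advance_wal() does: generate a little WAL and
# force a segment switch, once per segment.
for (1 .. 8)
{
    $node->safe_psql('postgres',
        "CREATE TABLE t (); DROP TABLE t; SELECT pg_switch_wal();");
}

# The checkpoint is what removes old segments and invalidates the slot once
# its restart_lsn has fallen more than max_slot_wal_keep_size behind.
$node->safe_psql('postgres', 'CHECKPOINT');

$node->poll_query_until('postgres',
    "SELECT wal_status FROM pg_replication_slots WHERE slot_name = 'demo_slot'",
    'lost')
  or die "timed out waiting for the slot to be invalidated";

ok(1, 'demo slot was invalidated as expected');

$node->stop;
done_testing();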