# src/test/recovery/t/028_pitr_timelines.pl
# Copyright (c) 2022, PostgreSQL Global Development Group

# Test recovering to a point-in-time using WAL archive, such that the
# target point is physically in a WAL segment with a higher TLI than
# the target point's TLI.  For example, imagine that the following WAL
# segments exist in the WAL archive:
#
#      000000010000000000000001
#      000000010000000000000002
#      000000020000000000000003
#
# The timeline switch happened in the middle of WAL segment 3, but
# that segment was never archived on timeline 1.  The first half of
# 000000020000000000000003 contains the WAL from timeline 1 up to the
# point where the timeline switch happened.  If you now perform
# archive recovery with a recovery target in that first half of
# segment 3, archive recovery will find the WAL up to that point in
# segment 000000020000000000000003, but it will not follow the
# timeline switch to timeline 2; instead it creates a new timeline
# and writes a timeline-switching end-of-recovery record with
# TLI 1 -> 3.  That's what this test case tests.
#
# The comments below contain lists of WAL segments at different points
# in the tests, to make it easier to follow along.  They are correct
# as of this writing, but the exact WAL segment numbers could change
# if the backend logic for when it switches to a new segment changes.
# The actual checks are not sensitive to that.
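#
# For reference when reading the segment lists in this file: a WAL
# segment file name is 24 hex digits, where the first 8 digits are the
# timeline ID (TLI) and the remaining 16 identify the segment.  A
# sketch of splitting a name apart in Perl (the name below is just the
# example from above):
#
#   my ($tli, $log, $seg) =
#     ('000000020000000000000003' =~ /^(.{8})(.{8})(.{8})$/);
#   # $tli eq '00000002': timeline 2; the rest identifies segment 3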

use strict;
use warnings;
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;

# Initialize and start primary node with WAL archiving
my $node_primary = PostgreSQL::Test::Cluster->new('primary');
$node_primary->init(has_archiving => 1, allows_streaming => 1);
$node_primary->start;

# Take a backup.
my $backup_name = 'my_backup';
$node_primary->backup($backup_name);

# Workload with some transactions, and the target restore point.
$node_primary->psql(
	'postgres', qq{
CREATE TABLE foo(i int);
INSERT INTO foo VALUES(1);
SELECT pg_create_restore_point('rp');
INSERT INTO foo VALUES(2);
});

# Contents of the WAL archive at this point:
#
# 000000010000000000000001
# 000000010000000000000002
# 000000010000000000000002.00000028.backup
#
# The operations on the test table and the restore point went into WAL
# segment 3, but it hasn't been archived yet.
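
# As a debugging aid, log what has actually been archived so far.  This
# is a sketch, not an assertion: the exact segment numbers can change,
# as noted in the header comment, so we only emit a diagnostic note.
my @archived =
  sort map { s!.*[/\\]!!r } glob($node_primary->archive_dir . '/*');
note "WAL archive contents: @archived";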

# Start a standby node, and wait for it to catch up.
my $node_standby = PostgreSQL::Test::Cluster->new('standby');
$node_standby->init_from_backup(
	$node_primary, $backup_name,
	standby       => 1,
	has_streaming => 1,
	has_archiving => 1,
	has_restoring => 0);
$node_standby->append_conf('postgresql.conf', 'archive_mode = always');
$node_standby->start;
$node_primary->wait_for_catchup($node_standby);

# Check that it's really caught up.
my $result = $node_standby->safe_psql('postgres', "SELECT max(i) FROM foo;");
is($result, qq{2}, "check table contents on standby after catchup");

# Kill the old primary, before it archives the most recent WAL segment that
# contains all the INSERTs.
$node_primary->stop('immediate');

# Promote the standby, and switch WAL so that it archives a WAL segment
# that contains all the INSERTs, on a new timeline.
$node_standby->promote;

# Find the next WAL segment to be archived.
my $walfile_to_be_archived = $node_standby->safe_psql('postgres',
	"SELECT pg_walfile_name(pg_current_wal_lsn());");

# Make the WAL segment eligible for archival by forcing a switch to a
# new segment.
$node_standby->safe_psql('postgres', 'SELECT pg_switch_wal()');
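
# The segment name is not needed for any assertion; log it to help
# debugging if archiving misbehaves.  (The shutdown below waits for
# the remaining WAL to be archived.)
note "next WAL segment to be archived: $walfile_to_be_archived";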

# We don't need the standby anymore, so request shutdown.  The server
# will finish archiving all the WAL on timeline 2 before it exits.
$node_standby->stop;

# Contents of the WAL archive at this point:
#
# 000000010000000000000001
# 000000010000000000000002
# 000000010000000000000002.00000028.backup
# 000000010000000000000003.partial
# 000000020000000000000003
# 00000002.history
#
# The operations on the test table and the restore point are in
# segment 3.  They are part of timeline 1, but were not archived by
# the primary yet.  However, they were copied into the beginning of
# segment 000000020000000000000003, before the timeline switching
# record.  (They are also present in the
# 000000010000000000000003.partial file, but .partial files are not
# used automatically.)
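#
# (If a test *did* want recovery to read the timeline 1 WAL from the
# .partial file, the file would first have to be copied into place
# under the full segment name; a sketch using File::Copy, which this
# test deliberately does not do:
#
#   copy("$archivedir/000000010000000000000003.partial",
#        "$archivedir/000000010000000000000003");
#
# where $archivedir is the WAL archive directory.)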

# Now test PITR to the recovery target.  It should find the WAL in
# segment 000000020000000000000003, but not follow the timeline switch
# to timeline 2.
my $node_pitr = PostgreSQL::Test::Cluster->new('node_pitr');
$node_pitr->init_from_backup(
	$node_primary, $backup_name,
	standby       => 0,
	has_restoring => 1);
$node_pitr->append_conf(
	'postgresql.conf', qq{
recovery_target_name = 'rp'
recovery_target_action = 'promote'
});

$node_pitr->start;

# Wait until recovery finishes.
$node_pitr->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';")
  or die "Timed out while waiting for PITR promotion";

# Check that we see the data we expect.
$result = $node_pitr->safe_psql('postgres', "SELECT max(i) FROM foo;");
is($result, qq{1}, "check table contents after point-in-time recovery");

# Insert a row so that we can check later that we successfully recover
# back to this timeline.
$node_pitr->safe_psql('postgres', "INSERT INTO foo VALUES(3);");

# Wait for the archiver to be running.  The startup process might not
# have exited yet, in which case the postmaster has not started the
# archiver.  If we stop() without an archiver, the archive will be
# incomplete.
$node_pitr->poll_query_until('postgres',
	"SELECT true FROM pg_stat_activity WHERE backend_type = 'archiver';")
  or die "Timed out while waiting for archiver to start";

# Stop the node.  This archives the last segment.
$node_pitr->stop();
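
# The archive now also contains 00000003.history, recording the
# 1 -> 3 timeline switch.  A sketch of inspecting it, not asserted by
# the test (node_pitr archived into the primary's archive directory,
# since its configuration came from the primary's backup):
#
#   note slurp_file($node_primary->archive_dir . '/00000003.history');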

# Test archive recovery on the timeline created by the PITR.  This
# replays the end-of-recovery record that switches from timeline 1 to
# 3.
my $node_pitr2 = PostgreSQL::Test::Cluster->new('node_pitr2');
$node_pitr2->init_from_backup(
	$node_primary, $backup_name,
	standby       => 0,
	has_restoring => 1);
$node_pitr2->append_conf(
	'postgresql.conf', qq{
recovery_target_action = 'promote'
});

$node_pitr2->start;

# Wait until recovery finishes.
$node_pitr2->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';")
  or die "Timed out while waiting for PITR promotion";

# Verify that we can see the row inserted after the PITR.
$result = $node_pitr2->safe_psql('postgres', "SELECT max(i) FROM foo;");
is($result, qq{3}, "check table contents after recovery on the PITR timeline");

done_testing();