1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
|
/*-------------------------------------------------------------------------
*
* libpq_source.c
* Functions for fetching files from a remote server via libpq.
*
* Copyright (c) 2013-2023, PostgreSQL Global Development Group
*
*-------------------------------------------------------------------------
*/
#include "postgres_fe.h"
#include "catalog/pg_type_d.h"
#include "common/connect.h"
#include "datapagemap.h"
#include "file_ops.h"
#include "filemap.h"
#include "lib/stringinfo.h"
#include "pg_rewind.h"
#include "port/pg_bswap.h"
#include "rewind_source.h"
/*
 * Limits on how file contents are requested from the source server.
 *
 * Files are fetched at most MAX_CHUNK_SIZE bytes at a time, and with a
 * maximum of MAX_CHUNKS_PER_QUERY chunks in a single query.
 */
/* size of one chunk, in bytes (1 MiB) */
#define MAX_CHUNK_SIZE (1024 * 1024)
/* number of chunks per query; also the capacity of the request queue */
#define MAX_CHUNKS_PER_QUERY 1000
/*
 * Represents a request to fetch one contiguous piece of a file from the
 * source server.
 */
typedef struct
{
const char *path; /* path relative to data directory root; the pointer is stored, the string is not copied */
off_t offset; /* byte offset within the file where the piece starts */
size_t length; /* number of bytes requested */
} fetch_range_request;
/*
 * State of a rewind source that reads from a remote server over libpq.
 *
 * 'common' must stay the first member: the interface callbacks receive a
 * rewind_source pointer and cast it back to libpq_source.
 */
typedef struct
{
rewind_source common; /* common interface functions */
PGconn *conn; /* connection to the source server; not closed by libpq_destroy() */
/*
 * Queue of chunks that have been requested with the queue_fetch_range()
 * function, but have not been fetched from the remote server yet.
 * num_requests is the number of entries currently in use, counted from
 * the start of request_queue.
 */
int num_requests;
fetch_range_request request_queue[MAX_CHUNKS_PER_QUERY];
/*
 * Temporary space for process_queued_fetch_requests(): the array-literal
 * strings built from the queued paths, offsets and lengths.
 */
StringInfoData paths;
StringInfoData offsets;
StringInfoData lengths;
} libpq_source;
/* private helper functions */
static void init_libpq_conn(PGconn *conn);
static char *run_simple_query(PGconn *conn, const char *sql);
static void run_simple_command(PGconn *conn, const char *sql);
static void appendArrayEscapedString(StringInfo buf, const char *str);
static void process_queued_fetch_requests(libpq_source *src);
/*
 * Public interface functions. These are installed as the rewind_source
 * callbacks by init_libpq_source().
 */
static void libpq_traverse_files(rewind_source *source,
process_file_callback_t callback);
static void libpq_queue_fetch_file(rewind_source *source, const char *path, size_t len);
static void libpq_queue_fetch_range(rewind_source *source, const char *path,
off_t off, size_t len);
static void libpq_finish_fetch(rewind_source *source);
static char *libpq_fetch_file(rewind_source *source, const char *path,
size_t *filesize);
static XLogRecPtr libpq_get_current_wal_insert_lsn(rewind_source *source);
static void libpq_destroy(rewind_source *source);
/*
 * Build a rewind source that reads from a remote server through libpq.
 *
 * 'conn' must be an already-established connection.  It is configured with
 * init_libpq_conn() before anything else happens.  The caller should not
 * use the connection itself for as long as the returned source is active.
 *
 * Returns a pointer to the common interface embedded in the newly allocated
 * source; dispose of it through its 'destroy' callback.
 */
rewind_source *
init_libpq_source(PGconn *conn)
{
	libpq_source *self;
	rewind_source *iface;

	/* set up the session first; this bails out on any failure */
	init_libpq_conn(conn);

	self = pg_malloc0(sizeof(libpq_source));
	self->conn = conn;

	/* scratch buffers, reused for every batch of queued chunk requests */
	initStringInfo(&self->paths);
	initStringInfo(&self->offsets);
	initStringInfo(&self->lengths);

	/* fill in the callback table */
	iface = &self->common;
	iface->traverse_files = libpq_traverse_files;
	iface->fetch_file = libpq_fetch_file;
	iface->queue_fetch_file = libpq_queue_fetch_file;
	iface->queue_fetch_range = libpq_queue_fetch_range;
	iface->finish_fetch = libpq_finish_fetch;
	iface->get_current_wal_insert_lsn = libpq_get_current_wal_insert_lsn;
	iface->destroy = libpq_destroy;

	return iface;
}
/*
 * Get a libpq connection ready for use as a rewind source.
 *
 * This adjusts session settings, secures search_path, verifies that
 * full_page_writes is enabled on the source, and prepares the statement
 * that is later used to fetch file chunks.  Every failure is reported with
 * pg_fatal().
 */
static void
init_libpq_conn(PGconn *conn)
{
	/*
	 * Session settings, applied in this order.  The first three disable all
	 * types of timeouts.  The last one puts the connection in read-only
	 * mode: we don't intend to do any updates, and this keeps us honest.
	 */
	static const char *const session_settings[] = {
		"SET statement_timeout = 0",
		"SET lock_timeout = 0",
		"SET idle_in_transaction_session_timeout = 0",
		"SET default_transaction_read_only = on",
		NULL
	};
	PGresult   *result;
	char	   *fpw_setting;
	int			n;

	for (n = 0; session_settings[n] != NULL; n++)
		run_simple_command(conn, session_settings[n]);

	/* secure search_path */
	result = PQexec(conn, ALWAYS_SECURE_SEARCH_PATH_SQL);
	if (PQresultStatus(result) != PGRES_TUPLES_OK)
		pg_fatal("could not clear search_path: %s",
				 PQresultErrorMessage(result));
	PQclear(result);

	/*
	 * full_page_writes has to be on.  A page can be modified while
	 * pg_read_binary_file() is reading it, giving us a torn page, and we
	 * depend on full page images to repair those.
	 */
	fpw_setting = run_simple_query(conn, "SHOW full_page_writes");
	if (strcmp(fpw_setting, "on") != 0)
		pg_fatal("full_page_writes must be enabled in the source server");
	pg_free(fpw_setting);

	/* the statement that process_queued_fetch_requests() executes */
	result = PQprepare(conn, "fetch_chunks_stmt",
					   "SELECT path, begin,\n"
					   " pg_read_binary_file(path, begin, len, true) AS chunk\n"
					   "FROM unnest ($1::text[], $2::int8[], $3::int4[]) as x(path, begin, len)",
					   3, NULL);
	if (PQresultStatus(result) != PGRES_COMMAND_OK)
		pg_fatal("could not prepare statement to fetch file contents: %s",
				 PQresultErrorMessage(result));
	PQclear(result);
}
/*
 * Execute a query that is expected to produce exactly one non-null value,
 * and return that value as a freshly allocated string.
 *
 * A failed query, or a result that is not exactly one row with one non-null
 * column, is reported with pg_fatal().
 *
 * The caller is responsible for releasing the result with pg_free().
 */
static char *
run_simple_query(PGconn *conn, const char *sql)
{
	PGresult   *res = PQexec(conn, sql);
	char	   *value;

	if (PQresultStatus(res) != PGRES_TUPLES_OK)
		pg_fatal("error running query (%s) on source server: %s",
				 sql, PQresultErrorMessage(res));

	/* the result must have the expected one-by-one, non-null shape */
	if (!(PQnfields(res) == 1 &&
		  PQntuples(res) == 1 &&
		  !PQgetisnull(res, 0, 0)))
		pg_fatal("unexpected result set from query");

	/* copy the value out before the result set is released */
	value = pg_strdup(PQgetvalue(res, 0, 0));
	PQclear(res);

	return value;
}
/*
 * Execute a command that returns no rows.
 *
 * If the command does not complete with PGRES_COMMAND_OK, the error is
 * reported with pg_fatal() and we exit immediately.
 */
static void
run_simple_command(PGconn *conn, const char *sql)
{
	PGresult   *outcome;

	outcome = PQexec(conn, sql);

	if (PQresultStatus(outcome) == PGRES_COMMAND_OK)
	{
		PQclear(outcome);
		return;
	}

	pg_fatal("error running query (%s) in source server: %s",
			 sql, PQresultErrorMessage(outcome));
}
/*
 * Ask the remote system for its current WAL insert location, by calling
 * pg_current_wal_insert_lsn() there.
 *
 * The server answers in "hi/lo" hexadecimal text form; the two halves are
 * combined into a 64-bit XLogRecPtr.  An answer that cannot be parsed is
 * reported with pg_fatal().
 */
static XLogRecPtr
libpq_get_current_wal_insert_lsn(rewind_source *source)
{
	libpq_source *src = (libpq_source *) source;
	char	   *lsn_text;
	uint32		high_half;
	uint32		low_half;
	XLogRecPtr	lsn;

	lsn_text = run_simple_query(src->conn, "SELECT pg_current_wal_insert_lsn()");

	if (sscanf(lsn_text, "%X/%X", &high_half, &low_half) != 2)
		pg_fatal("unrecognized result \"%s\" for current WAL insert location", lsn_text);

	/* the upper 32 bits come from the part before the slash */
	lsn = (((uint64) high_half) << 32) | low_half;

	pg_free(lsn_text);

	return lsn;
}
/*
 * Get a list of all files in the source data directory, and invoke
 * 'callback' once for every entry.
 *
 * The callback receives the path relative to the data directory, the file
 * type (regular file, directory, or symlink), the size reported by the
 * server, and the link target (an empty string unless the entry is a
 * tablespace symlink).  The path and link target point into the query
 * result, which is cleared before this function returns.
 *
 * Entries whose size comes back NULL are skipped: the file was removed
 * while the query was running.  Any failure of the query itself is reported
 * with pg_fatal().
 */
static void
libpq_traverse_files(rewind_source *source, process_file_callback_t callback)
{
	PGconn	   *conn = ((libpq_source *) source)->conn;
	PGresult   *res;
	const char *sql;
	int			ntuples;
	int			i;

	/*
	 * Create a recursive directory listing of the whole data directory.
	 *
	 * The WITH RECURSIVE part does most of the work. The second part gets the
	 * targets of the symlinks in pg_tblspc directory.
	 *
	 * XXX: There is no backend function to get a symbolic link's target in
	 * general, so if the admin has put any custom symbolic links in the data
	 * directory, they won't be copied correctly.
	 */
	sql =
		"WITH RECURSIVE files (path, filename, size, isdir) AS (\n"
		" SELECT '' AS path, filename, size, isdir FROM\n"
		" (SELECT pg_ls_dir('.', true, false) AS filename) AS fn,\n"
		" pg_stat_file(fn.filename, true) AS this\n"
		" UNION ALL\n"
		" SELECT parent.path || parent.filename || '/' AS path,\n"
		" fn, this.size, this.isdir\n"
		" FROM files AS parent,\n"
		" pg_ls_dir(parent.path || parent.filename, true, false) AS fn,\n"
		" pg_stat_file(parent.path || parent.filename || '/' || fn, true) AS this\n"
		" WHERE parent.isdir = 't'\n"
		")\n"
		"SELECT path || filename, size, isdir,\n"
		" pg_tablespace_location(pg_tablespace.oid) AS link_target\n"
		"FROM files\n"
		"LEFT OUTER JOIN pg_tablespace ON files.path = 'pg_tblspc/'\n"
		" AND oid::text = files.filename\n";

	res = PQexec(conn, sql);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
		pg_fatal("could not fetch file list: %s",
				 PQresultErrorMessage(res));

	/* sanity check the result set */
	if (PQnfields(res) != 4)
		pg_fatal("unexpected result set while fetching file list");

	/* Read result to local variables */
	ntuples = PQntuples(res);
	for (i = 0; i < ntuples; i++)
	{
		char	   *path;
		int64		filesize;
		bool		isdir;
		char	   *link_target;
		file_type_t type;

		if (PQgetisnull(res, i, 1))
		{
			/*
			 * The file was removed from the server while the query was
			 * running. Ignore it.
			 */
			continue;
		}

		path = PQgetvalue(res, i, 0);

		/*
		 * The size is a 64-bit value.  Parse it with strtoll() rather than
		 * atol(): 'long' is only 32 bits wide on some platforms (notably
		 * 64-bit Windows), which would mangle sizes of 2 GB and above.
		 */
		filesize = (int64) strtoll(PQgetvalue(res, i, 1), NULL, 10);

		isdir = (strcmp(PQgetvalue(res, i, 2), "t") == 0);
		link_target = PQgetvalue(res, i, 3);

		/* a non-empty link target means this is a tablespace symlink */
		if (link_target[0])
			type = FILE_TYPE_SYMLINK;
		else if (isdir)
			type = FILE_TYPE_DIRECTORY;
		else
			type = FILE_TYPE_REGULAR;

		callback(path, type, filesize, link_target);
	}

	PQclear(res);
}
/*
 * Queue up a request to fetch a whole file from the remote system.
 *
 * 'len' is the size the file had when the source directory was scanned.
 * The target file is truncated right away; the content is written once the
 * queued request is processed.
 */
static void
libpq_queue_fetch_file(rewind_source *source, const char *path, size_t len)
{
	size_t		fetch_len = len;

	/*
	 * Always ask for at least one full-sized chunk.  That way, if a small
	 * file has grown in the source system since we scanned the source
	 * directory, we still get all of it.  This only helps for files up to
	 * MAX_CHUNK_SIZE, but that's good enough for small configuration files
	 * and such that are changed every now and then, but not WAL-logged.
	 * Larger files are fetched up to their original size.
	 *
	 * An inherent race condition remains: if the file is modified at the
	 * very moment we copy it, we might end up with a torn version, one half
	 * old and the other half new.  pg_basebackup has the same problem, and
	 * it hasn't been a problem in practice.
	 */
	if (fetch_len < MAX_CHUNK_SIZE)
		fetch_len = MAX_CHUNK_SIZE;

	/*
	 * Truncate the target file immediately.  Doing it later, when the data
	 * arrives from the source server, might seem more natural, but then
	 * we'd need to track which fetch-requests are for a whole file.
	 */
	open_target_file(path, true);

	libpq_queue_fetch_range(source, path, 0, fetch_len);
}
/*
 * Queue up a request to fetch 'len' bytes of file 'path', starting at
 * offset 'off', from the remote system.
 *
 * The request is first merged into the most recently queued chunk where
 * possible, and whatever remains is split into chunks of at most
 * MAX_CHUNK_SIZE bytes.  Whenever the queue is full, everything queued so
 * far is fetched before more chunks are added.
 *
 * The 'path' pointer itself is stored in the queue, not a copy of the
 * string.
 */
static void
libpq_queue_fetch_range(rewind_source *source, const char *path, off_t off,
						size_t len)
{
	libpq_source *src = (libpq_source *) source;

	if (src->num_requests > 0)
	{
		fetch_range_request *tail = &src->request_queue[src->num_requests - 1];

		/*
		 * If this request continues right where the last queued chunk ends,
		 * in the same file, and that chunk isn't full yet, grow that chunk
		 * instead of starting a new one.
		 *
		 * XXX: The path is compared by pointer equality. That's good enough
		 * for our purposes; the caller always passes the same pointer for
		 * the same filename. If it didn't, we would merely fail to merge
		 * requests, which doesn't affect correctness.
		 */
		if (tail->offset + tail->length == off &&
			tail->length < MAX_CHUNK_SIZE &&
			tail->path == path)
		{
			/* take as much as still fits under MAX_CHUNK_SIZE */
			size_t		merged = Min(len, MAX_CHUNK_SIZE - tail->length);

			tail->length += merged;
			off += merged;
			len -= merged;

			/* any leftover 'len' is handled by the loop below */
		}
	}

	/* Cut what is left into pieces of at most MAX_CHUNK_SIZE bytes */
	while (len > 0)
	{
		fetch_range_request *slot;
		int32		piece;

		/* no free slot left: fetch everything queued so far */
		if (src->num_requests == MAX_CHUNKS_PER_QUERY)
			process_queued_fetch_requests(src);

		piece = Min(len, MAX_CHUNK_SIZE);

		slot = &src->request_queue[src->num_requests];
		slot->path = path;
		slot->offset = off;
		slot->length = piece;
		src->num_requests++;

		off += piece;
		len -= piece;
	}
}
/*
 * Flush the request queue: fetch every chunk that is still queued and write
 * it to the target data directory.
 */
static void
libpq_finish_fetch(rewind_source *source)
{
	libpq_source *src = (libpq_source *) source;

	process_queued_fetch_requests(src);
}
/*
 * Fetch all chunks currently in the request queue with a single execution
 * of the prepared statement 'fetch_chunks_stmt', and write the received
 * data to the target data directory.
 *
 * The result is consumed in single-row mode, one row per queued request, in
 * queue order.  Each row is checked against the request it answers.  A NULL
 * chunk means the file no longer exists in the source, so it is removed
 * from the target as well.
 *
 * Does nothing if the queue is empty.  On return the queue is empty again.
 * Any protocol or sanity-check failure is reported with pg_fatal().
 */
static void
process_queued_fetch_requests(libpq_source *src)
{
	const char *params[3];
	PGresult   *res;
	int			chunkno;

	if (src->num_requests == 0)
		return;

	pg_log_debug("getting %d file chunks", src->num_requests);

	/*
	 * The prepared statement, 'fetch_chunks_stmt', takes three arrays with
	 * the same length as parameters: paths, offsets and lengths. Construct
	 * the string representations of them.
	 */
	resetStringInfo(&src->paths);
	resetStringInfo(&src->offsets);
	resetStringInfo(&src->lengths);

	appendStringInfoChar(&src->paths, '{');
	appendStringInfoChar(&src->offsets, '{');
	appendStringInfoChar(&src->lengths, '{');
	for (int i = 0; i < src->num_requests; i++)
	{
		fetch_range_request *rq = &src->request_queue[i];

		if (i > 0)
		{
			appendStringInfoChar(&src->paths, ',');
			appendStringInfoChar(&src->offsets, ',');
			appendStringInfoChar(&src->lengths, ',');
		}

		appendArrayEscapedString(&src->paths, rq->path);
		appendStringInfo(&src->offsets, INT64_FORMAT, (int64) rq->offset);
		appendStringInfo(&src->lengths, INT64_FORMAT, (int64) rq->length);
	}
	appendStringInfoChar(&src->paths, '}');
	appendStringInfoChar(&src->offsets, '}');
	appendStringInfoChar(&src->lengths, '}');

	/*
	 * Execute the prepared statement, asking for results in binary format.
	 */
	params[0] = src->paths.data;
	params[1] = src->offsets.data;
	params[2] = src->lengths.data;

	if (PQsendQueryPrepared(src->conn, "fetch_chunks_stmt", 3, params, NULL, NULL, 1) != 1)
		pg_fatal("could not send query: %s", PQerrorMessage(src->conn));

	if (PQsetSingleRowMode(src->conn) != 1)
		pg_fatal("could not set libpq connection to single row mode");

	/*----
	 * The result set is of format:
	 *
	 * path		text	-- path in the data directory, e.g "base/1/123"
	 * begin	int8	-- offset within the file
	 * chunk	bytea	-- file content
	 *----
	 */
	chunkno = 0;
	while ((res = PQgetResult(src->conn)) != NULL)
	{
		fetch_range_request *rq;
		char	   *filename;
		int			filenamelen;
		int64		chunkoff;
		int			chunksize;
		char	   *chunk;

		switch (PQresultStatus(res))
		{
			case PGRES_SINGLE_TUPLE:
				break;

			case PGRES_TUPLES_OK:
				PQclear(res);
				continue;		/* final zero-row result */

			default:
				pg_fatal("unexpected result while fetching remote files: %s",
						 PQresultErrorMessage(res));
		}

		/*
		 * Valid queue entries are 0 .. num_requests - 1, so a row arriving
		 * when chunkno has reached num_requests is already one too many.
		 * Check this before touching the queue entry, so that a stale or
		 * out-of-bounds slot is never looked at.
		 */
		if (chunkno >= src->num_requests)
			pg_fatal("received more data chunks than requested");

		rq = &src->request_queue[chunkno];

		/* sanity check the result set */
		if (PQnfields(res) != 3 || PQntuples(res) != 1)
			pg_fatal("unexpected result set size while fetching remote files");

		if (PQftype(res, 0) != TEXTOID ||
			PQftype(res, 1) != INT8OID ||
			PQftype(res, 2) != BYTEAOID)
		{
			pg_fatal("unexpected data types in result set while fetching remote files: %u %u %u",
					 PQftype(res, 0), PQftype(res, 1), PQftype(res, 2));
		}

		/*
		 * Every column must be in binary format, since the values are
		 * decoded as raw binary below.  Hence complain if any one of them
		 * is not.
		 */
		if (PQfformat(res, 0) != 1 ||
			PQfformat(res, 1) != 1 ||
			PQfformat(res, 2) != 1)
		{
			pg_fatal("unexpected result format while fetching remote files");
		}

		if (PQgetisnull(res, 0, 0) ||
			PQgetisnull(res, 0, 1))
		{
			pg_fatal("unexpected null values in result while fetching remote files");
		}

		if (PQgetlength(res, 0, 1) != sizeof(int64))
			pg_fatal("unexpected result length while fetching remote files");

		/* Read result set to local variables */
		memcpy(&chunkoff, PQgetvalue(res, 0, 1), sizeof(int64));
		chunkoff = pg_ntoh64(chunkoff);		/* binary int8 is in network byte order */
		chunksize = PQgetlength(res, 0, 2);

		/* binary text values are not guaranteed to be usable as C strings as-is; make a terminated copy */
		filenamelen = PQgetlength(res, 0, 0);
		filename = pg_malloc(filenamelen + 1);
		memcpy(filename, PQgetvalue(res, 0, 0), filenamelen);
		filename[filenamelen] = '\0';

		chunk = PQgetvalue(res, 0, 2);

		/*
		 * If a file has been deleted on the source, remove it on the target
		 * as well. Note that multiple unlink() calls may happen on the same
		 * file if multiple data chunks are associated with it, hence ignore
		 * unconditionally anything missing.
		 */
		if (PQgetisnull(res, 0, 2))
		{
			pg_log_debug("received null value for chunk for file \"%s\", file has been deleted",
						 filename);
			remove_target_file(filename, true);
		}
		else
		{
			pg_log_debug("received chunk for file \"%s\", offset %lld, size %d",
						 filename, (long long int) chunkoff, chunksize);

			if (strcmp(filename, rq->path) != 0)
			{
				pg_fatal("received data for file \"%s\", when requested for \"%s\"",
						 filename, rq->path);
			}
			if (chunkoff != rq->offset)
				pg_fatal("received data at offset %lld of file \"%s\", when requested for offset %lld",
						 (long long int) chunkoff, rq->path, (long long int) rq->offset);

			/*
			 * We should not receive more data than we requested, or
			 * pg_read_binary_file() messed up. We could receive less,
			 * though, if the file was truncated in the source after we
			 * checked its size. That's OK, there should be a WAL record of
			 * the truncation, which will get replayed when you start the
			 * target system for the first time after pg_rewind has completed.
			 */
			if (chunksize > rq->length)
				pg_fatal("received more than requested for file \"%s\"", rq->path);

			open_target_file(filename, false);

			write_target_range(chunk, chunkoff, chunksize);
		}

		pg_free(filename);

		PQclear(res);
		chunkno++;
	}

	/* fewer rows than requests is just as wrong as too many */
	if (chunkno != src->num_requests)
		pg_fatal("unexpected number of data chunks received");

	src->num_requests = 0;
}
/*
 * Append 'str' to 'buf' as one double-quoted element of a text array
 * constant.  Double quotes and backslashes inside the string are escaped
 * with a backslash.
 */
static void
appendArrayEscapedString(StringInfo buf, const char *str)
{
	const char *p;

	/* opening quote */
	appendStringInfoCharMacro(buf, '\"');

	for (p = str; *p != '\0'; p++)
	{
		/* these two characters need a backslash in front of them */
		if (*p == '"' || *p == '\\')
			appendStringInfoCharMacro(buf, '\\');

		appendStringInfoCharMacro(buf, *p);
	}

	/* closing quote */
	appendStringInfoCharMacro(buf, '\"');
}
/*
 * Fetch one whole file from the source server into a malloc'd buffer.
 *
 * The returned buffer holds the file content followed by an extra
 * terminating zero byte.  If 'filesize' is not NULL, the length of the
 * content (not counting the terminator) is stored there.
 *
 * A failed query, or a result that is not a single non-null row, is
 * reported with pg_fatal().
 */
static char *
libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize)
{
	libpq_source *src = (libpq_source *) source;
	const char *paramValues[1] = {path};
	PGresult   *res;
	char	   *buffer;
	int			nbytes;

	/* ask for the result in binary format */
	res = PQexecParams(src->conn, "SELECT pg_read_binary_file($1)",
					   1, NULL, paramValues, NULL, NULL, 1);

	if (PQresultStatus(res) != PGRES_TUPLES_OK)
		pg_fatal("could not fetch remote file \"%s\": %s",
				 path, PQresultErrorMessage(res));

	/* sanity check the result set */
	if (PQntuples(res) != 1 || PQgetisnull(res, 0, 0))
		pg_fatal("unexpected result set while fetching remote file \"%s\"",
				 path);

	/* copy the content out of the result set and terminate it */
	nbytes = PQgetlength(res, 0, 0);
	buffer = pg_malloc(nbytes + 1);
	memcpy(buffer, PQgetvalue(res, 0, 0), nbytes);
	buffer[nbytes] = '\0';

	PQclear(res);

	pg_log_debug("fetched file \"%s\", length %d", path, nbytes);

	if (filesize)
		*filesize = nbytes;

	return buffer;
}
/*
 * Release a libpq source: its three scratch buffers and the source struct
 * itself.
 *
 * The connection is deliberately left open.  We did not open it, so closing
 * it is not our job.
 */
static void
libpq_destroy(rewind_source *source)
{
	libpq_source *self = (libpq_source *) source;

	/* the scratch buffers first ... */
	pfree(self->lengths.data);
	pfree(self->offsets.data);
	pfree(self->paths.data);

	/* ... then the struct that held them */
	pfree(self);
}
|