/*-------------------------------------------------------------------------
 *
 * libpq_source.c
 *	  Functions for fetching files from a remote server via libpq.
 *
 * Copyright (c) 2013-2021, PostgreSQL Global Development Group
 *
 *-------------------------------------------------------------------------
 */
#include "postgres_fe.h"

#include "catalog/pg_type_d.h"
#include "common/connect.h"
#include "datapagemap.h"
#include "file_ops.h"
#include "filemap.h"
#include "lib/stringinfo.h"
#include "pg_rewind.h"
#include "port/pg_bswap.h"
#include "rewind_source.h"

/*
 * Files are fetched MAX_CHUNK_SIZE bytes at a time, and with a
 * maximum of MAX_CHUNKS_PER_QUERY chunks in a single query.
 */
#define MAX_CHUNK_SIZE (1024 * 1024)
#define MAX_CHUNKS_PER_QUERY 1000

/* represents a request to fetch a piece of a file from the source */
typedef struct
{
    const char *path;           /* path relative to data directory root */
    off_t       offset;
    size_t      length;
} fetch_range_request;

typedef struct
{
    rewind_source common;       /* common interface functions */

    PGconn     *conn;

    /*
     * Queue of chunks that have been requested with the queue_fetch_range()
     * function, but have not been fetched from the remote server yet.
     */
    int         num_requests;
    fetch_range_request request_queue[MAX_CHUNKS_PER_QUERY];

    /* temporary space for process_queued_fetch_requests() */
    StringInfoData paths;
    StringInfoData offsets;
    StringInfoData lengths;
} libpq_source;
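
/*
 * Illustration (added for clarity, not used by the code; file name and
 * sizes are made up): a caller that queues a 2.5 MB read of a hypothetical
 * file "base/1/16384" starting at offset 0 ends up with three entries in
 * request_queue, because libpq_queue_fetch_range() splits requests into
 * MAX_CHUNK_SIZE pieces:
 *
 *   { "base/1/16384", 0,       1048576 }
 *   { "base/1/16384", 1048576, 1048576 }
 *   { "base/1/16384", 2097152, 524288  }
 *
 * Contiguous requests for the same file are merged into the previous entry
 * as long as it stays within MAX_CHUNK_SIZE, and the queue is flushed with
 * process_queued_fetch_requests() whenever it reaches MAX_CHUNKS_PER_QUERY
 * entries.
 */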

static void init_libpq_conn(PGconn *conn);
static char *run_simple_query(PGconn *conn, const char *sql);
static void run_simple_command(PGconn *conn, const char *sql);
static void appendArrayEscapedString(StringInfo buf, const char *str);

static void process_queued_fetch_requests(libpq_source *src);

/* public interface functions */
static void libpq_traverse_files(rewind_source *source,
                                 process_file_callback_t callback);
static void libpq_queue_fetch_range(rewind_source *source, const char *path,
                                    off_t off, size_t len);
static void libpq_finish_fetch(rewind_source *source);
static char *libpq_fetch_file(rewind_source *source, const char *path,
                              size_t *filesize);
static XLogRecPtr libpq_get_current_wal_insert_lsn(rewind_source *source);
static void libpq_destroy(rewind_source *source);

/*
 * Create a new libpq source.
 *
 * The caller has already established the connection, but should not try
 * to use it while the source is active.
 */
rewind_source *
init_libpq_source(PGconn *conn)
{
    libpq_source *src;

    init_libpq_conn(conn);

    src = pg_malloc0(sizeof(libpq_source));

    src->common.traverse_files = libpq_traverse_files;
    src->common.fetch_file = libpq_fetch_file;
    src->common.queue_fetch_range = libpq_queue_fetch_range;
    src->common.finish_fetch = libpq_finish_fetch;
    src->common.get_current_wal_insert_lsn = libpq_get_current_wal_insert_lsn;
    src->common.destroy = libpq_destroy;

    src->conn = conn;

    initStringInfo(&src->paths);
    initStringInfo(&src->offsets);
    initStringInfo(&src->lengths);

    return &src->common;
}

/*
 * Initialize a libpq connection for use.
 */
static void
init_libpq_conn(PGconn *conn)
{
    PGresult   *res;
    char       *str;

    /* disable all types of timeouts */
    run_simple_command(conn, "SET statement_timeout = 0");
    run_simple_command(conn, "SET lock_timeout = 0");
    run_simple_command(conn, "SET idle_in_transaction_session_timeout = 0");

    /*
     * we don't intend to do any updates, put the connection in read-only mode
     * to keep us honest
     */
    run_simple_command(conn, "SET default_transaction_read_only = on");

    /* secure search_path */
    res = PQexec(conn, ALWAYS_SECURE_SEARCH_PATH_SQL);
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
        pg_fatal("could not clear search_path: %s",
                 PQresultErrorMessage(res));
    PQclear(res);

    /*
     * Also check that full_page_writes is enabled.  We can get torn pages if
     * a page is modified while we read it with pg_read_binary_file(), and we
     * rely on full page images to fix them.
     */
    str = run_simple_query(conn, "SHOW full_page_writes");
    if (strcmp(str, "on") != 0)
        pg_fatal("full_page_writes must be enabled in the source server");
    pg_free(str);

    /* Prepare a statement we'll use to fetch files */
    res = PQprepare(conn, "fetch_chunks_stmt",
                    "SELECT path, begin,\n"
                    "  pg_read_binary_file(path, begin, len, true) AS chunk\n"
                    "FROM unnest ($1::text[], $2::int8[], $3::int4[]) as x(path, begin, len)",
                    3, NULL);

    if (PQresultStatus(res) != PGRES_COMMAND_OK)
        pg_fatal("could not prepare statement to fetch file contents: %s",
                 PQresultErrorMessage(res));
    PQclear(res);
}
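
/*
 * For illustration only (this query is not issued by pg_rewind itself, and
 * the file names and lengths are made-up examples): the prepared statement
 * above is equivalent to running something like
 *
 *   SELECT path, begin,
 *          pg_read_binary_file(path, begin, len, true) AS chunk
 *   FROM unnest('{"base/1/16384","global/pg_control"}'::text[],
 *               '{0,0}'::int8[],
 *               '{8192,8192}'::int4[]) AS x(path, begin, len);
 *
 * which returns one row per requested chunk (path, begin, chunk).
 * process_queued_fetch_requests() builds the real array literals from the
 * request queue and consumes the rows in single-row mode.
 */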

/*
 * Run a query that returns a single value.
 *
 * The result should be pg_free'd after use.
 */
static char *
run_simple_query(PGconn *conn, const char *sql)
{
    PGresult   *res;
    char       *result;

    res = PQexec(conn, sql);

    if (PQresultStatus(res) != PGRES_TUPLES_OK)
        pg_fatal("error running query (%s) on source server: %s",
                 sql, PQresultErrorMessage(res));

    /* sanity check the result set */
    if (PQnfields(res) != 1 || PQntuples(res) != 1 || PQgetisnull(res, 0, 0))
        pg_fatal("unexpected result set from query");

    result = pg_strdup(PQgetvalue(res, 0, 0));

    PQclear(res);

    return result;
}

/*
 * Run a command.
 *
 * In the event of a failure, exit immediately.
 */
static void
run_simple_command(PGconn *conn, const char *sql)
{
    PGresult   *res;

    res = PQexec(conn, sql);

    if (PQresultStatus(res) != PGRES_COMMAND_OK)
        pg_fatal("error running query (%s) in source server: %s",
                 sql, PQresultErrorMessage(res));

    PQclear(res);
}

/*
 * Call the pg_current_wal_insert_lsn() function in the remote system.
 */
static XLogRecPtr
libpq_get_current_wal_insert_lsn(rewind_source *source)
{
    PGconn     *conn = ((libpq_source *) source)->conn;
    XLogRecPtr  result;
    uint32      hi;
    uint32      lo;
    char       *val;

    val = run_simple_query(conn, "SELECT pg_current_wal_insert_lsn()");

    if (sscanf(val, "%X/%X", &hi, &lo) != 2)
        pg_fatal("unrecognized result \"%s\" for current WAL insert location",
                 val);

    result = ((uint64) hi) << 32 | lo;

    pg_free(val);

    return result;
}

/*
 * Get a list of all files in the data directory.
 */
static void
libpq_traverse_files(rewind_source *source, process_file_callback_t callback)
{
    PGconn     *conn = ((libpq_source *) source)->conn;
    PGresult   *res;
    const char *sql;
    int         i;

    /*
     * Create a recursive directory listing of the whole data directory.
     *
     * The WITH RECURSIVE part does most of the work. The second part gets the
     * targets of the symlinks in pg_tblspc directory.
     *
     * XXX: There is no backend function to get a symbolic link's target in
     * general, so if the admin has put any custom symbolic links in the data
     * directory, they won't be copied correctly.
     */
    sql =
        "WITH RECURSIVE files (path, filename, size, isdir) AS (\n"
        "  SELECT '' AS path, filename, size, isdir FROM\n"
        "  (SELECT pg_ls_dir('.', true, false) AS filename) AS fn,\n"
        "       pg_stat_file(fn.filename, true) AS this\n"
        "  UNION ALL\n"
        "  SELECT parent.path || parent.filename || '/' AS path,\n"
        "         fn, this.size, this.isdir\n"
        "  FROM files AS parent,\n"
        "       pg_ls_dir(parent.path || parent.filename, true, false) AS fn,\n"
        "       pg_stat_file(parent.path || parent.filename || '/' || fn, true) AS this\n"
        "  WHERE parent.isdir = 't'\n"
        ")\n"
        "SELECT path || filename, size, isdir,\n"
        "       pg_tablespace_location(pg_tablespace.oid) AS link_target\n"
        "FROM files\n"
        "LEFT OUTER JOIN pg_tablespace ON files.path = 'pg_tblspc/'\n"
        "                             AND oid::text = files.filename\n";

    res = PQexec(conn, sql);

    if (PQresultStatus(res) != PGRES_TUPLES_OK)
        pg_fatal("could not fetch file list: %s",
                 PQresultErrorMessage(res));

    /* sanity check the result set */
    if (PQnfields(res) != 4)
        pg_fatal("unexpected result set while fetching file list");

    /* Read result to local variables */
    for (i = 0; i < PQntuples(res); i++)
    {
        char       *path;
        int64       filesize;
        bool        isdir;
        char       *link_target;
        file_type_t type;

        if (PQgetisnull(res, i, 1))
        {
            /*
             * The file was removed from the server while the query was
             * running. Ignore it.
             */
            continue;
        }

        path = PQgetvalue(res, i, 0);
        filesize = atol(PQgetvalue(res, i, 1));
        isdir = (strcmp(PQgetvalue(res, i, 2), "t") == 0);
        link_target = PQgetvalue(res, i, 3);

        if (link_target[0])
            type = FILE_TYPE_SYMLINK;
        else if (isdir)
            type = FILE_TYPE_DIRECTORY;
        else
            type = FILE_TYPE_REGULAR;

        process_source_file(path, type, filesize, link_target);
    }
    PQclear(res);
}
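
/*
 * Example (hypothetical values, added for clarity): the file-listing query
 * above yields rows such as
 *
 *   path              | size | isdir | link_target
 *   ------------------+------+-------+-------------
 *   global/pg_control | 8192 | f     |
 *   base/1            | 4096 | t     |
 *   pg_tblspc/16500   | 4096 | t     | /mnt/ts1
 *
 * A non-empty link_target marks a tablespace symlink, isdir marks a
 * directory, and everything else is treated as a regular file.
 */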

/*
 * Queue up a request to fetch a piece of a file from remote system.
 */
static void
libpq_queue_fetch_range(rewind_source *source, const char *path, off_t off,
                        size_t len)
{
    libpq_source *src = (libpq_source *) source;

    /*
     * Does this request happen to be a continuation of the previous chunk? If
     * so, merge it with the previous one.
     *
     * XXX: We use pointer equality to compare the path. That's good enough
     * for our purposes; the caller always passes the same pointer for the
     * same filename. If it didn't, we would fail to merge requests, but it
     * wouldn't affect correctness.
     */
    if (src->num_requests > 0)
    {
        fetch_range_request *prev = &src->request_queue[src->num_requests - 1];

        if (prev->offset + prev->length == off &&
            prev->length < MAX_CHUNK_SIZE &&
            prev->path == path)
        {
            /*
             * Extend the previous request to cover as much of this new
             * request as possible, without exceeding MAX_CHUNK_SIZE.
             */
            size_t      thislen;

            thislen = Min(len, MAX_CHUNK_SIZE - prev->length);
            prev->length += thislen;

            off += thislen;
            len -= thislen;

            /*
             * Fall through to create new requests for any remaining 'len'
             * that didn't fit in the previous chunk.
             */
        }
    }

    /* Divide the request into pieces of MAX_CHUNK_SIZE bytes each */
    while (len > 0)
    {
        int32       thislen;

        /* if the queue is full, perform all the work queued up so far */
        if (src->num_requests == MAX_CHUNKS_PER_QUERY)
            process_queued_fetch_requests(src);

        thislen = Min(len, MAX_CHUNK_SIZE);
        src->request_queue[src->num_requests].path = path;
        src->request_queue[src->num_requests].offset = off;
        src->request_queue[src->num_requests].length = thislen;
        src->num_requests++;

        off += thislen;
        len -= thislen;
    }
}

/*
 * Fetch all the queued chunks and write them to the target data directory.
 */
static void
libpq_finish_fetch(rewind_source *source)
{
    process_queued_fetch_requests((libpq_source *) source);
}
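
/*
 * For illustration only (hypothetical file names and sizes): with two queued
 * requests, { "base/1/16384", 0, 8192 } and { "base/1/16396", 0, 8192 },
 * process_queued_fetch_requests() below builds these three parameter strings
 * for fetch_chunks_stmt:
 *
 *   paths:   {"base/1/16384","base/1/16396"}
 *   offsets: {0,0}
 *   lengths: {8192,8192}
 *
 * appendArrayEscapedString() double-quotes each path and backslash-escapes
 * any embedded '"' or '\' so that unusual file names survive the round trip
 * through the text array literal.
 */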

static void
process_queued_fetch_requests(libpq_source *src)
{
    const char *params[3];
    PGresult   *res;
    int         chunkno;

    if (src->num_requests == 0)
        return;

    pg_log_debug("getting %d file chunks", src->num_requests);

    /*
     * The prepared statement, 'fetch_chunks_stmt', takes three arrays with
     * the same length as parameters: paths, offsets and lengths. Construct
     * the string representations of them.
     */
    resetStringInfo(&src->paths);
    resetStringInfo(&src->offsets);
    resetStringInfo(&src->lengths);

    appendStringInfoChar(&src->paths, '{');
    appendStringInfoChar(&src->offsets, '{');
    appendStringInfoChar(&src->lengths, '{');
    for (int i = 0; i < src->num_requests; i++)
    {
        fetch_range_request *rq = &src->request_queue[i];

        if (i > 0)
        {
            appendStringInfoChar(&src->paths, ',');
            appendStringInfoChar(&src->offsets, ',');
            appendStringInfoChar(&src->lengths, ',');
        }

        appendArrayEscapedString(&src->paths, rq->path);
        appendStringInfo(&src->offsets, INT64_FORMAT, (int64) rq->offset);
        appendStringInfo(&src->lengths, INT64_FORMAT, (int64) rq->length);
    }
    appendStringInfoChar(&src->paths, '}');
    appendStringInfoChar(&src->offsets, '}');
    appendStringInfoChar(&src->lengths, '}');

    /*
     * Execute the prepared statement.
     */
    params[0] = src->paths.data;
    params[1] = src->offsets.data;
    params[2] = src->lengths.data;

    if (PQsendQueryPrepared(src->conn, "fetch_chunks_stmt", 3, params, NULL, NULL, 1) != 1)
        pg_fatal("could not send query: %s", PQerrorMessage(src->conn));

    if (PQsetSingleRowMode(src->conn) != 1)
        pg_fatal("could not set libpq connection to single row mode");

    /*----
     * The result set is of format:
     *
     * path     text    -- path in the data directory, e.g "base/1/123"
     * begin    int8    -- offset within the file
     * chunk    bytea   -- file content
     *----
     */
    chunkno = 0;
    while ((res = PQgetResult(src->conn)) != NULL)
    {
        fetch_range_request *rq = &src->request_queue[chunkno];
        char       *filename;
        int         filenamelen;
        int64       chunkoff;
        int         chunksize;
        char       *chunk;

        switch (PQresultStatus(res))
        {
            case PGRES_SINGLE_TUPLE:
                break;

            case PGRES_TUPLES_OK:
                PQclear(res);
                continue;       /* final zero-row result */

            default:
                pg_fatal("unexpected result while fetching remote files: %s",
                         PQresultErrorMessage(res));
        }

        if (chunkno > src->num_requests)
            pg_fatal("received more data chunks than requested");

        /* sanity check the result set */
        if (PQnfields(res) != 3 || PQntuples(res) != 1)
            pg_fatal("unexpected result set size while fetching remote files");

        if (PQftype(res, 0) != TEXTOID ||
            PQftype(res, 1) != INT8OID ||
            PQftype(res, 2) != BYTEAOID)
        {
            pg_fatal("unexpected data types in result set while fetching remote files: %u %u %u",
                     PQftype(res, 0), PQftype(res, 1), PQftype(res, 2));
        }

        if (PQfformat(res, 0) != 1 &&
            PQfformat(res, 1) != 1 &&
            PQfformat(res, 2) != 1)
        {
            pg_fatal("unexpected result format while fetching remote files");
        }

        if (PQgetisnull(res, 0, 0) ||
            PQgetisnull(res, 0, 1))
        {
            pg_fatal("unexpected null values in result while fetching remote files");
        }

        if (PQgetlength(res, 0, 1) != sizeof(int64))
            pg_fatal("unexpected result length while fetching remote files");

        /* Read result set to local variables */
        memcpy(&chunkoff, PQgetvalue(res, 0, 1), sizeof(int64));
        chunkoff = pg_ntoh64(chunkoff);
        chunksize = PQgetlength(res, 0, 2);

        filenamelen = PQgetlength(res, 0, 0);
        filename = pg_malloc(filenamelen + 1);
        memcpy(filename, PQgetvalue(res, 0, 0), filenamelen);
        filename[filenamelen] = '\0';

        chunk = PQgetvalue(res, 0, 2);

        /*
         * If a file has been deleted on the source, remove it on the target
         * as well.  Note that multiple unlink() calls may happen on the same
         * file if multiple data chunks are associated with it, hence ignore
         * unconditionally anything missing.
         */
        if (PQgetisnull(res, 0, 2))
        {
            pg_log_debug("received null value for chunk for file \"%s\", file has been deleted",
                         filename);
            remove_target_file(filename, true);
        }
        else
        {
            pg_log_debug("received chunk for file \"%s\", offset %lld, size %d",
                         filename, (long long int) chunkoff, chunksize);

            if (strcmp(filename, rq->path) != 0)
            {
                pg_fatal("received data for file \"%s\", when requested for \"%s\"",
                         filename, rq->path);
            }
            if (chunkoff != rq->offset)
                pg_fatal("received data at offset %lld of file \"%s\", when requested for offset %lld",
                         (long long int) chunkoff, rq->path, (long long int) rq->offset);

            /*
             * We should not receive more data than we requested, or
             * pg_read_binary_file() messed up.  We could receive less,
             * though, if the file was truncated in the source after we
             * checked its size.  That's OK, there should be a WAL record of
             * the truncation, which will get replayed when you start the
             * target system for the first time after pg_rewind has completed.
             */
            if (chunksize > rq->length)
                pg_fatal("received more than requested for file \"%s\"",
                         rq->path);

            open_target_file(filename, false);

            write_target_range(chunk, chunkoff, chunksize);
        }

        pg_free(filename);

        PQclear(res);
        chunkno++;
    }
    if (chunkno != src->num_requests)
        pg_fatal("unexpected number of data chunks received");

    src->num_requests = 0;
}

/*
 * Escape a string to be used as element in a text array constant
 */
static void
appendArrayEscapedString(StringInfo buf, const char *str)
{
    appendStringInfoCharMacro(buf, '\"');
    while (*str)
    {
        char        ch = *str;

        if (ch == '"' || ch == '\\')
            appendStringInfoCharMacro(buf, '\\');

        appendStringInfoCharMacro(buf, ch);

        str++;
    }
    appendStringInfoCharMacro(buf, '\"');
}

/*
 * Fetch a single file as a malloc'd buffer.
 */
static char *
libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize)
{
    PGconn     *conn = ((libpq_source *) source)->conn;
    PGresult   *res;
    char       *result;
    int         len;
    const char *paramValues[1];

    paramValues[0] = path;
    res = PQexecParams(conn, "SELECT pg_read_binary_file($1)",
                       1, NULL, paramValues, NULL, NULL, 1);

    if (PQresultStatus(res) != PGRES_TUPLES_OK)
        pg_fatal("could not fetch remote file \"%s\": %s",
                 path, PQresultErrorMessage(res));

    /* sanity check the result set */
    if (PQntuples(res) != 1 || PQgetisnull(res, 0, 0))
        pg_fatal("unexpected result set while fetching remote file \"%s\"",
                 path);

    /* Read result to local variables */
    len = PQgetlength(res, 0, 0);
    result = pg_malloc(len + 1);
    memcpy(result, PQgetvalue(res, 0, 0), len);
    result[len] = '\0';

    PQclear(res);

    pg_log_debug("fetched file \"%s\", length %d", path, len);

    if (filesize)
        *filesize = len;
    return result;
}

/*
 * Close a libpq source.
 */
static void
libpq_destroy(rewind_source *source)
{
    libpq_source *src = (libpq_source *) source;

    pfree(src->paths.data);
    pfree(src->offsets.data);
    pfree(src->lengths.data);
    pfree(src);

    /* NOTE: we don't close the connection here, as it was not opened by us. */
}