diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 13:44:03 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 13:44:03 +0000 |
commit | 293913568e6a7a86fd1479e1cff8e2ecb58d6568 (patch) | |
tree | fc3b469a3ec5ab71b36ea97cc7aaddb838423a0c /src/bin/pg_upgrade/file.c | |
parent | Initial commit. (diff) | |
download | postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.tar.xz postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.zip |
Adding upstream version 16.2.upstream/16.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/bin/pg_upgrade/file.c')
-rw-r--r-- | src/bin/pg_upgrade/file.c | 377 |
1 files changed, 377 insertions, 0 deletions
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c new file mode 100644 index 0000000..d173602 --- /dev/null +++ b/src/bin/pg_upgrade/file.c @@ -0,0 +1,377 @@ +/* + * file.c + * + * file system operations + * + * Copyright (c) 2010-2023, PostgreSQL Global Development Group + * src/bin/pg_upgrade/file.c + */ + +#include "postgres_fe.h" + +#include <sys/stat.h> +#include <fcntl.h> +#ifdef HAVE_COPYFILE_H +#include <copyfile.h> +#endif +#ifdef __linux__ +#include <sys/ioctl.h> +#include <linux/fs.h> +#endif + +#include "access/visibilitymapdefs.h" +#include "common/file_perm.h" +#include "pg_upgrade.h" +#include "storage/bufpage.h" +#include "storage/checksum.h" +#include "storage/checksum_impl.h" + + +/* + * cloneFile() + * + * Clones/reflinks a relation file from src to dst. + * + * schemaName/relName are relation's SQL name (used for error messages only). + */ +void +cloneFile(const char *src, const char *dst, + const char *schemaName, const char *relName) +{ +#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE) + if (copyfile(src, dst, NULL, COPYFILE_CLONE_FORCE) < 0) + pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s", + schemaName, relName, src, dst, strerror(errno)); +#elif defined(__linux__) && defined(FICLONE) + int src_fd; + int dest_fd; + + if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("error while cloning relation \"%s.%s\": could not open file \"%s\": %s", + schemaName, relName, src, strerror(errno)); + + if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("error while cloning relation \"%s.%s\": could not create file \"%s\": %s", + schemaName, relName, dst, strerror(errno)); + + if (ioctl(dest_fd, FICLONE, src_fd) < 0) + { + int save_errno = errno; + + unlink(dst); + + pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s", + schemaName, relName, src, dst, strerror(save_errno)); + } + + close(src_fd); + close(dest_fd); +#endif +} + + +/* + * copyFile() + * + * Copies a relation file from src to dst. + * schemaName/relName are relation's SQL name (used for error messages only). + */ +void +copyFile(const char *src, const char *dst, + const char *schemaName, const char *relName) +{ +#ifndef WIN32 + int src_fd; + int dest_fd; + char *buffer; + + if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s", + schemaName, relName, src, strerror(errno)); + + if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s", + schemaName, relName, dst, strerror(errno)); + + /* copy in fairly large chunks for best efficiency */ +#define COPY_BUF_SIZE (50 * BLCKSZ) + + buffer = (char *) pg_malloc(COPY_BUF_SIZE); + + /* perform data copying i.e read src source, write to destination */ + while (true) + { + ssize_t nbytes = read(src_fd, buffer, COPY_BUF_SIZE); + + if (nbytes < 0) + pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s", + schemaName, relName, src, strerror(errno)); + + if (nbytes == 0) + break; + + errno = 0; + if (write(dest_fd, buffer, nbytes) != nbytes) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + pg_fatal("error while copying relation \"%s.%s\": could not write file \"%s\": %s", + schemaName, relName, dst, strerror(errno)); + } + } + + pg_free(buffer); + close(src_fd); + close(dest_fd); + +#else /* WIN32 */ + + if (CopyFile(src, dst, true) == 0) + { + _dosmaperr(GetLastError()); + pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s", + schemaName, relName, src, dst, strerror(errno)); + } + +#endif /* WIN32 */ +} + + +/* + * linkFile() + * + * Hard-links a relation file from src to dst. + * schemaName/relName are relation's SQL name (used for error messages only). + */ +void +linkFile(const char *src, const char *dst, + const char *schemaName, const char *relName) +{ + if (link(src, dst) < 0) + pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s", + schemaName, relName, src, dst, strerror(errno)); +} + + +/* + * rewriteVisibilityMap() + * + * Transform a visibility map file, copying from src to dst. + * schemaName/relName are relation's SQL name (used for error messages only). + * + * In versions of PostgreSQL prior to catversion 201603011, PostgreSQL's + * visibility map included one bit per heap page; it now includes two. + * When upgrading a cluster from before that time to a current PostgreSQL + * version, we could refuse to copy visibility maps from the old cluster + * to the new cluster; the next VACUUM would recreate them, but at the + * price of scanning the entire table. So, instead, we rewrite the old + * visibility maps in the new format. That way, the all-visible bits + * remain set for the pages for which they were set previously. The + * all-frozen bits are never set by this conversion; we leave that to VACUUM. + */ +void +rewriteVisibilityMap(const char *fromfile, const char *tofile, + const char *schemaName, const char *relName) +{ + int src_fd; + int dst_fd; + PGIOAlignedBlock buffer; + PGIOAlignedBlock new_vmbuf; + ssize_t totalBytesRead = 0; + ssize_t src_filesize; + int rewriteVmBytesPerPage; + BlockNumber new_blkno = 0; + struct stat statbuf; + + /* Compute number of old-format bytes per new page */ + rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2; + + if ((src_fd = open(fromfile, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s", + schemaName, relName, fromfile, strerror(errno)); + + if (fstat(src_fd, &statbuf) != 0) + pg_fatal("error while copying relation \"%s.%s\": could not stat file \"%s\": %s", + schemaName, relName, fromfile, strerror(errno)); + + if ((dst_fd = open(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s", + schemaName, relName, tofile, strerror(errno)); + + /* Save old file size */ + src_filesize = statbuf.st_size; + + /* + * Turn each visibility map page into 2 pages one by one. Each new page + * has the same page header as the old one. If the last section of the + * last page is empty, we skip it, mostly to avoid turning one-page + * visibility maps for small relations into two pages needlessly. + */ + while (totalBytesRead < src_filesize) + { + ssize_t bytesRead; + char *old_cur; + char *old_break; + char *old_blkend; + PageHeaderData pageheader; + bool old_lastblk; + + if ((bytesRead = read(src_fd, buffer.data, BLCKSZ)) != BLCKSZ) + { + if (bytesRead < 0) + pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s", + schemaName, relName, fromfile, strerror(errno)); + else + pg_fatal("error while copying relation \"%s.%s\": partial page found in file \"%s\"", + schemaName, relName, fromfile); + } + + totalBytesRead += BLCKSZ; + old_lastblk = (totalBytesRead == src_filesize); + + /* Save the page header data */ + memcpy(&pageheader, buffer.data, SizeOfPageHeaderData); + + /* + * These old_* variables point to old visibility map page. old_cur + * points to current position on old page. old_blkend points to end of + * old block. old_break is the end+1 position on the old page for the + * data that will be transferred to the current new page. + */ + old_cur = buffer.data + SizeOfPageHeaderData; + old_blkend = buffer.data + bytesRead; + old_break = old_cur + rewriteVmBytesPerPage; + + while (old_break <= old_blkend) + { + char *new_cur; + bool empty = true; + bool old_lastpart; + + /* First, copy old page header to new page */ + memcpy(new_vmbuf.data, &pageheader, SizeOfPageHeaderData); + + /* Rewriting the last part of the last old page? */ + old_lastpart = old_lastblk && (old_break == old_blkend); + + new_cur = new_vmbuf.data + SizeOfPageHeaderData; + + /* Process old page bytes one by one, and turn it into new page. */ + while (old_cur < old_break) + { + uint8 byte = *(uint8 *) old_cur; + uint16 new_vmbits = 0; + int i; + + /* Generate new format bits while keeping old information */ + for (i = 0; i < BITS_PER_BYTE; i++) + { + if (byte & (1 << i)) + { + empty = false; + new_vmbits |= + VISIBILITYMAP_ALL_VISIBLE << (BITS_PER_HEAPBLOCK * i); + } + } + + /* Copy new visibility map bytes to new-format page */ + new_cur[0] = (char) (new_vmbits & 0xFF); + new_cur[1] = (char) (new_vmbits >> 8); + + old_cur++; + new_cur += BITS_PER_HEAPBLOCK; + } + + /* If the last part of the last page is empty, skip writing it */ + if (old_lastpart && empty) + break; + + /* Set new checksum for visibility map page, if enabled */ + if (new_cluster.controldata.data_checksum_version != 0) + ((PageHeader) new_vmbuf.data)->pd_checksum = + pg_checksum_page(new_vmbuf.data, new_blkno); + + errno = 0; + if (write(dst_fd, new_vmbuf.data, BLCKSZ) != BLCKSZ) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + pg_fatal("error while copying relation \"%s.%s\": could not write file \"%s\": %s", + schemaName, relName, tofile, strerror(errno)); + } + + /* Advance for next new page */ + old_break += rewriteVmBytesPerPage; + new_blkno++; + } + } + + /* Clean up */ + close(dst_fd); + close(src_fd); +} + +void +check_file_clone(void) +{ + char existing_file[MAXPGPATH]; + char new_link_file[MAXPGPATH]; + + snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata); + snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.clonetest", new_cluster.pgdata); + unlink(new_link_file); /* might fail */ + +#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE) + if (copyfile(existing_file, new_link_file, NULL, COPYFILE_CLONE_FORCE) < 0) + pg_fatal("could not clone file between old and new data directories: %s", + strerror(errno)); +#elif defined(__linux__) && defined(FICLONE) + { + int src_fd; + int dest_fd; + + if ((src_fd = open(existing_file, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("could not open file \"%s\": %s", + existing_file, strerror(errno)); + + if ((dest_fd = open(new_link_file, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("could not create file \"%s\": %s", + new_link_file, strerror(errno)); + + if (ioctl(dest_fd, FICLONE, src_fd) < 0) + pg_fatal("could not clone file between old and new data directories: %s", + strerror(errno)); + + close(src_fd); + close(dest_fd); + } +#else + pg_fatal("file cloning not supported on this platform"); +#endif + + unlink(new_link_file); +} + +void +check_hard_link(void) +{ + char existing_file[MAXPGPATH]; + char new_link_file[MAXPGPATH]; + + snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata); + snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.linktest", new_cluster.pgdata); + unlink(new_link_file); /* might fail */ + + if (link(existing_file, new_link_file) < 0) + pg_fatal("could not create hard link between old and new data directories: %s\n" + "In link mode the old and new data directories must be on the same file system.", + strerror(errno)); + + unlink(new_link_file); +} |