/* * file.c * * file system operations * * Copyright (c) 2010-2023, PostgreSQL Global Development Group * src/bin/pg_upgrade/file.c */ #include "postgres_fe.h" #include #include #ifdef HAVE_COPYFILE_H #include #endif #ifdef __linux__ #include #include #endif #include "access/visibilitymapdefs.h" #include "common/file_perm.h" #include "pg_upgrade.h" #include "storage/bufpage.h" #include "storage/checksum.h" #include "storage/checksum_impl.h" /* * cloneFile() * * Clones/reflinks a relation file from src to dst. * * schemaName/relName are relation's SQL name (used for error messages only). */ void cloneFile(const char *src, const char *dst, const char *schemaName, const char *relName) { #if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE) if (copyfile(src, dst, NULL, COPYFILE_CLONE_FORCE) < 0) pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s", schemaName, relName, src, dst, strerror(errno)); #elif defined(__linux__) && defined(FICLONE) int src_fd; int dest_fd; if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) pg_fatal("error while cloning relation \"%s.%s\": could not open file \"%s\": %s", schemaName, relName, src, strerror(errno)); if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, pg_file_create_mode)) < 0) pg_fatal("error while cloning relation \"%s.%s\": could not create file \"%s\": %s", schemaName, relName, dst, strerror(errno)); if (ioctl(dest_fd, FICLONE, src_fd) < 0) { int save_errno = errno; unlink(dst); pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s", schemaName, relName, src, dst, strerror(save_errno)); } close(src_fd); close(dest_fd); #endif } /* * copyFile() * * Copies a relation file from src to dst. * schemaName/relName are relation's SQL name (used for error messages only). */ void copyFile(const char *src, const char *dst, const char *schemaName, const char *relName) { #ifndef WIN32 int src_fd; int dest_fd; char *buffer; if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s", schemaName, relName, src, strerror(errno)); if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, pg_file_create_mode)) < 0) pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s", schemaName, relName, dst, strerror(errno)); /* copy in fairly large chunks for best efficiency */ #define COPY_BUF_SIZE (50 * BLCKSZ) buffer = (char *) pg_malloc(COPY_BUF_SIZE); /* perform data copying i.e read src source, write to destination */ while (true) { ssize_t nbytes = read(src_fd, buffer, COPY_BUF_SIZE); if (nbytes < 0) pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s", schemaName, relName, src, strerror(errno)); if (nbytes == 0) break; errno = 0; if (write(dest_fd, buffer, nbytes) != nbytes) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) errno = ENOSPC; pg_fatal("error while copying relation \"%s.%s\": could not write file \"%s\": %s", schemaName, relName, dst, strerror(errno)); } } pg_free(buffer); close(src_fd); close(dest_fd); #else /* WIN32 */ if (CopyFile(src, dst, true) == 0) { _dosmaperr(GetLastError()); pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s", schemaName, relName, src, dst, strerror(errno)); } #endif /* WIN32 */ } /* * linkFile() * * Hard-links a relation file from src to dst. * schemaName/relName are relation's SQL name (used for error messages only). */ void linkFile(const char *src, const char *dst, const char *schemaName, const char *relName) { if (link(src, dst) < 0) pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s", schemaName, relName, src, dst, strerror(errno)); } /* * rewriteVisibilityMap() * * Transform a visibility map file, copying from src to dst. * schemaName/relName are relation's SQL name (used for error messages only). * * In versions of PostgreSQL prior to catversion 201603011, PostgreSQL's * visibility map included one bit per heap page; it now includes two. * When upgrading a cluster from before that time to a current PostgreSQL * version, we could refuse to copy visibility maps from the old cluster * to the new cluster; the next VACUUM would recreate them, but at the * price of scanning the entire table. So, instead, we rewrite the old * visibility maps in the new format. That way, the all-visible bits * remain set for the pages for which they were set previously. The * all-frozen bits are never set by this conversion; we leave that to VACUUM. */ void rewriteVisibilityMap(const char *fromfile, const char *tofile, const char *schemaName, const char *relName) { int src_fd; int dst_fd; PGIOAlignedBlock buffer; PGIOAlignedBlock new_vmbuf; ssize_t totalBytesRead = 0; ssize_t src_filesize; int rewriteVmBytesPerPage; BlockNumber new_blkno = 0; struct stat statbuf; /* Compute number of old-format bytes per new page */ rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2; if ((src_fd = open(fromfile, O_RDONLY | PG_BINARY, 0)) < 0) pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s", schemaName, relName, fromfile, strerror(errno)); if (fstat(src_fd, &statbuf) != 0) pg_fatal("error while copying relation \"%s.%s\": could not stat file \"%s\": %s", schemaName, relName, fromfile, strerror(errno)); if ((dst_fd = open(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, pg_file_create_mode)) < 0) pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s", schemaName, relName, tofile, strerror(errno)); /* Save old file size */ src_filesize = statbuf.st_size; /* * Turn each visibility map page into 2 pages one by one. Each new page * has the same page header as the old one. If the last section of the * last page is empty, we skip it, mostly to avoid turning one-page * visibility maps for small relations into two pages needlessly. */ while (totalBytesRead < src_filesize) { ssize_t bytesRead; char *old_cur; char *old_break; char *old_blkend; PageHeaderData pageheader; bool old_lastblk; if ((bytesRead = read(src_fd, buffer.data, BLCKSZ)) != BLCKSZ) { if (bytesRead < 0) pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s", schemaName, relName, fromfile, strerror(errno)); else pg_fatal("error while copying relation \"%s.%s\": partial page found in file \"%s\"", schemaName, relName, fromfile); } totalBytesRead += BLCKSZ; old_lastblk = (totalBytesRead == src_filesize); /* Save the page header data */ memcpy(&pageheader, buffer.data, SizeOfPageHeaderData); /* * These old_* variables point to old visibility map page. old_cur * points to current position on old page. old_blkend points to end of * old block. old_break is the end+1 position on the old page for the * data that will be transferred to the current new page. */ old_cur = buffer.data + SizeOfPageHeaderData; old_blkend = buffer.data + bytesRead; old_break = old_cur + rewriteVmBytesPerPage; while (old_break <= old_blkend) { char *new_cur; bool empty = true; bool old_lastpart; /* First, copy old page header to new page */ memcpy(new_vmbuf.data, &pageheader, SizeOfPageHeaderData); /* Rewriting the last part of the last old page? */ old_lastpart = old_lastblk && (old_break == old_blkend); new_cur = new_vmbuf.data + SizeOfPageHeaderData; /* Process old page bytes one by one, and turn it into new page. */ while (old_cur < old_break) { uint8 byte = *(uint8 *) old_cur; uint16 new_vmbits = 0; int i; /* Generate new format bits while keeping old information */ for (i = 0; i < BITS_PER_BYTE; i++) { if (byte & (1 << i)) { empty = false; new_vmbits |= VISIBILITYMAP_ALL_VISIBLE << (BITS_PER_HEAPBLOCK * i); } } /* Copy new visibility map bytes to new-format page */ new_cur[0] = (char) (new_vmbits & 0xFF); new_cur[1] = (char) (new_vmbits >> 8); old_cur++; new_cur += BITS_PER_HEAPBLOCK; } /* If the last part of the last page is empty, skip writing it */ if (old_lastpart && empty) break; /* Set new checksum for visibility map page, if enabled */ if (new_cluster.controldata.data_checksum_version != 0) ((PageHeader) new_vmbuf.data)->pd_checksum = pg_checksum_page(new_vmbuf.data, new_blkno); errno = 0; if (write(dst_fd, new_vmbuf.data, BLCKSZ) != BLCKSZ) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) errno = ENOSPC; pg_fatal("error while copying relation \"%s.%s\": could not write file \"%s\": %s", schemaName, relName, tofile, strerror(errno)); } /* Advance for next new page */ old_break += rewriteVmBytesPerPage; new_blkno++; } } /* Clean up */ close(dst_fd); close(src_fd); } void check_file_clone(void) { char existing_file[MAXPGPATH]; char new_link_file[MAXPGPATH]; snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata); snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.clonetest", new_cluster.pgdata); unlink(new_link_file); /* might fail */ #if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE) if (copyfile(existing_file, new_link_file, NULL, COPYFILE_CLONE_FORCE) < 0) pg_fatal("could not clone file between old and new data directories: %s", strerror(errno)); #elif defined(__linux__) && defined(FICLONE) { int src_fd; int dest_fd; if ((src_fd = open(existing_file, O_RDONLY | PG_BINARY, 0)) < 0) pg_fatal("could not open file \"%s\": %s", existing_file, strerror(errno)); if ((dest_fd = open(new_link_file, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, pg_file_create_mode)) < 0) pg_fatal("could not create file \"%s\": %s", new_link_file, strerror(errno)); if (ioctl(dest_fd, FICLONE, src_fd) < 0) pg_fatal("could not clone file between old and new data directories: %s", strerror(errno)); close(src_fd); close(dest_fd); } #else pg_fatal("file cloning not supported on this platform"); #endif unlink(new_link_file); } void check_hard_link(void) { char existing_file[MAXPGPATH]; char new_link_file[MAXPGPATH]; snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata); snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.linktest", new_cluster.pgdata); unlink(new_link_file); /* might fail */ if (link(existing_file, new_link_file) < 0) pg_fatal("could not create hard link between old and new data directories: %s\n" "In link mode the old and new data directories must be on the same file system.", strerror(errno)); unlink(new_link_file); }