diff options
Diffstat (limited to 'src/backend/storage/file')
-rw-r--r-- | src/backend/storage/file/Makefile | 22 | ||||
-rw-r--r-- | src/backend/storage/file/buffile.c | 949 | ||||
-rw-r--r-- | src/backend/storage/file/copydir.c | 226 | ||||
-rw-r--r-- | src/backend/storage/file/fd.c | 3789 | ||||
-rw-r--r-- | src/backend/storage/file/reinit.c | 410 | ||||
-rw-r--r-- | src/backend/storage/file/sharedfileset.c | 354 |
6 files changed, 5750 insertions, 0 deletions
diff --git a/src/backend/storage/file/Makefile b/src/backend/storage/file/Makefile new file mode 100644 index 0000000..5e1291b --- /dev/null +++ b/src/backend/storage/file/Makefile @@ -0,0 +1,22 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for storage/file +# +# IDENTIFICATION +# src/backend/storage/file/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/storage/file +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + buffile.o \ + copydir.o \ + fd.o \ + reinit.o \ + sharedfileset.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c new file mode 100644 index 0000000..a4be5fe --- /dev/null +++ b/src/backend/storage/file/buffile.c @@ -0,0 +1,949 @@ +/*------------------------------------------------------------------------- + * + * buffile.c + * Management of large buffered temporary files. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/file/buffile.c + * + * NOTES: + * + * BufFiles provide a very incomplete emulation of stdio atop virtual Files + * (as managed by fd.c). Currently, we only support the buffered-I/O + * aspect of stdio: a read or write of the low-level File occurs only + * when the buffer is filled or emptied. This is an even bigger win + * for virtual Files than for ordinary kernel files, since reducing the + * frequency with which a virtual File is touched reduces "thrashing" + * of opening/closing file descriptors. + * + * Note that BufFile structs are allocated with palloc(), and therefore + * will go away automatically at query/transaction end. Since the underlying + * virtual Files are made with OpenTemporaryFile, all resources for + * the file are certain to be cleaned up even if processing is aborted + * by ereport(ERROR). The data structures required are made in the + * palloc context that was current when the BufFile was created, and + * any external resources such as temp files are owned by the ResourceOwner + * that was current at that time. + * + * BufFile also supports temporary files that exceed the OS file size limit + * (by opening multiple fd.c temporary files). This is an essential feature + * for sorts and hashjoins on large amounts of data. + * + * BufFile supports temporary files that can be shared with other backends, as + * infrastructure for parallel execution. Such files need to be created as a + * member of a SharedFileSet that all participants are attached to. + * + * BufFile also supports temporary files that can be used by the single backend + * when the corresponding files need to be survived across the transaction and + * need to be opened and closed multiple times. Such files need to be created + * as a member of a SharedFileSet. + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "commands/tablespace.h" +#include "executor/instrument.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/buf_internals.h" +#include "storage/buffile.h" +#include "storage/fd.h" +#include "utils/resowner.h" + +/* + * We break BufFiles into gigabyte-sized segments, regardless of RELSEG_SIZE. + * The reason is that we'd like large BufFiles to be spread across multiple + * tablespaces when available. + */ +#define MAX_PHYSICAL_FILESIZE 0x40000000 +#define BUFFILE_SEG_SIZE (MAX_PHYSICAL_FILESIZE / BLCKSZ) + +/* + * This data structure represents a buffered file that consists of one or + * more physical files (each accessed through a virtual file descriptor + * managed by fd.c). + */ +struct BufFile +{ + int numFiles; /* number of physical files in set */ + /* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */ + File *files; /* palloc'd array with numFiles entries */ + + bool isInterXact; /* keep open over transactions? */ + bool dirty; /* does buffer need to be written? */ + bool readOnly; /* has the file been set to read only? */ + + SharedFileSet *fileset; /* space for segment files if shared */ + const char *name; /* name of this BufFile if shared */ + + /* + * resowner is the ResourceOwner to use for underlying temp files. (We + * don't need to remember the memory context we're using explicitly, + * because after creation we only repalloc our arrays larger.) + */ + ResourceOwner resowner; + + /* + * "current pos" is position of start of buffer within the logical file. + * Position as seen by user of BufFile is (curFile, curOffset + pos). + */ + int curFile; /* file index (0..n) part of current pos */ + off_t curOffset; /* offset part of current pos */ + int pos; /* next read/write position in buffer */ + int nbytes; /* total # of valid bytes in buffer */ + PGAlignedBlock buffer; +}; + +static BufFile *makeBufFileCommon(int nfiles); +static BufFile *makeBufFile(File firstfile); +static void extendBufFile(BufFile *file); +static void BufFileLoadBuffer(BufFile *file); +static void BufFileDumpBuffer(BufFile *file); +static void BufFileFlush(BufFile *file); +static File MakeNewSharedSegment(BufFile *file, int segment); + +/* + * Create BufFile and perform the common initialization. + */ +static BufFile * +makeBufFileCommon(int nfiles) +{ + BufFile *file = (BufFile *) palloc(sizeof(BufFile)); + + file->numFiles = nfiles; + file->isInterXact = false; + file->dirty = false; + file->resowner = CurrentResourceOwner; + file->curFile = 0; + file->curOffset = 0L; + file->pos = 0; + file->nbytes = 0; + + return file; +} + +/* + * Create a BufFile given the first underlying physical file. + * NOTE: caller must set isInterXact if appropriate. + */ +static BufFile * +makeBufFile(File firstfile) +{ + BufFile *file = makeBufFileCommon(1); + + file->files = (File *) palloc(sizeof(File)); + file->files[0] = firstfile; + file->readOnly = false; + file->fileset = NULL; + file->name = NULL; + + return file; +} + +/* + * Add another component temp file. + */ +static void +extendBufFile(BufFile *file) +{ + File pfile; + ResourceOwner oldowner; + + /* Be sure to associate the file with the BufFile's resource owner */ + oldowner = CurrentResourceOwner; + CurrentResourceOwner = file->resowner; + + if (file->fileset == NULL) + pfile = OpenTemporaryFile(file->isInterXact); + else + pfile = MakeNewSharedSegment(file, file->numFiles); + + Assert(pfile >= 0); + + CurrentResourceOwner = oldowner; + + file->files = (File *) repalloc(file->files, + (file->numFiles + 1) * sizeof(File)); + file->files[file->numFiles] = pfile; + file->numFiles++; +} + +/* + * Create a BufFile for a new temporary file (which will expand to become + * multiple temporary files if more than MAX_PHYSICAL_FILESIZE bytes are + * written to it). + * + * If interXact is true, the temp file will not be automatically deleted + * at end of transaction. + * + * Note: if interXact is true, the caller had better be calling us in a + * memory context, and with a resource owner, that will survive across + * transaction boundaries. + */ +BufFile * +BufFileCreateTemp(bool interXact) +{ + BufFile *file; + File pfile; + + /* + * Ensure that temp tablespaces are set up for OpenTemporaryFile to use. + * Possibly the caller will have done this already, but it seems useful to + * double-check here. Failure to do this at all would result in the temp + * files always getting placed in the default tablespace, which is a + * pretty hard-to-detect bug. Callers may prefer to do it earlier if they + * want to be sure that any required catalog access is done in some other + * resource context. + */ + PrepareTempTablespaces(); + + pfile = OpenTemporaryFile(interXact); + Assert(pfile >= 0); + + file = makeBufFile(pfile); + file->isInterXact = interXact; + + return file; +} + +/* + * Build the name for a given segment of a given BufFile. + */ +static void +SharedSegmentName(char *name, const char *buffile_name, int segment) +{ + snprintf(name, MAXPGPATH, "%s.%d", buffile_name, segment); +} + +/* + * Create a new segment file backing a shared BufFile. + */ +static File +MakeNewSharedSegment(BufFile *buffile, int segment) +{ + char name[MAXPGPATH]; + File file; + + /* + * It is possible that there are files left over from before a crash + * restart with the same name. In order for BufFileOpenShared() not to + * get confused about how many segments there are, we'll unlink the next + * segment number if it already exists. + */ + SharedSegmentName(name, buffile->name, segment + 1); + SharedFileSetDelete(buffile->fileset, name, true); + + /* Create the new segment. */ + SharedSegmentName(name, buffile->name, segment); + file = SharedFileSetCreate(buffile->fileset, name); + + /* SharedFileSetCreate would've errored out */ + Assert(file > 0); + + return file; +} + +/* + * Create a BufFile that can be discovered and opened read-only by other + * backends that are attached to the same SharedFileSet using the same name. + * + * The naming scheme for shared BufFiles is left up to the calling code. The + * name will appear as part of one or more filenames on disk, and might + * provide clues to administrators about which subsystem is generating + * temporary file data. Since each SharedFileSet object is backed by one or + * more uniquely named temporary directory, names don't conflict with + * unrelated SharedFileSet objects. + */ +BufFile * +BufFileCreateShared(SharedFileSet *fileset, const char *name) +{ + BufFile *file; + + file = makeBufFileCommon(1); + file->fileset = fileset; + file->name = pstrdup(name); + file->files = (File *) palloc(sizeof(File)); + file->files[0] = MakeNewSharedSegment(file, 0); + file->readOnly = false; + + return file; +} + +/* + * Open a file that was previously created in another backend (or this one) + * with BufFileCreateShared in the same SharedFileSet using the same name. + * The backend that created the file must have called BufFileClose() or + * BufFileExportShared() to make sure that it is ready to be opened by other + * backends and render it read-only. + */ +BufFile * +BufFileOpenShared(SharedFileSet *fileset, const char *name, int mode) +{ + BufFile *file; + char segment_name[MAXPGPATH]; + Size capacity = 16; + File *files; + int nfiles = 0; + + files = palloc(sizeof(File) * capacity); + + /* + * We don't know how many segments there are, so we'll probe the + * filesystem to find out. + */ + for (;;) + { + /* See if we need to expand our file segment array. */ + if (nfiles + 1 > capacity) + { + capacity *= 2; + files = repalloc(files, sizeof(File) * capacity); + } + /* Try to load a segment. */ + SharedSegmentName(segment_name, name, nfiles); + files[nfiles] = SharedFileSetOpen(fileset, segment_name, mode); + if (files[nfiles] <= 0) + break; + ++nfiles; + + CHECK_FOR_INTERRUPTS(); + } + + /* + * If we didn't find any files at all, then no BufFile exists with this + * name. + */ + if (nfiles == 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open temporary file \"%s\" from BufFile \"%s\": %m", + segment_name, name))); + + file = makeBufFileCommon(nfiles); + file->files = files; + file->readOnly = (mode == O_RDONLY) ? true : false; + file->fileset = fileset; + file->name = pstrdup(name); + + return file; +} + +/* + * Delete a BufFile that was created by BufFileCreateShared in the given + * SharedFileSet using the given name. + * + * It is not necessary to delete files explicitly with this function. It is + * provided only as a way to delete files proactively, rather than waiting for + * the SharedFileSet to be cleaned up. + * + * Only one backend should attempt to delete a given name, and should know + * that it exists and has been exported or closed. + */ +void +BufFileDeleteShared(SharedFileSet *fileset, const char *name) +{ + char segment_name[MAXPGPATH]; + int segment = 0; + bool found = false; + + /* + * We don't know how many segments the file has. We'll keep deleting + * until we run out. If we don't manage to find even an initial segment, + * raise an error. + */ + for (;;) + { + SharedSegmentName(segment_name, name, segment); + if (!SharedFileSetDelete(fileset, segment_name, true)) + break; + found = true; + ++segment; + + CHECK_FOR_INTERRUPTS(); + } + + if (!found) + elog(ERROR, "could not delete unknown shared BufFile \"%s\"", name); +} + +/* + * BufFileExportShared --- flush and make read-only, in preparation for sharing. + */ +void +BufFileExportShared(BufFile *file) +{ + /* Must be a file belonging to a SharedFileSet. */ + Assert(file->fileset != NULL); + + /* It's probably a bug if someone calls this twice. */ + Assert(!file->readOnly); + + BufFileFlush(file); + file->readOnly = true; +} + +/* + * Close a BufFile + * + * Like fclose(), this also implicitly FileCloses the underlying File. + */ +void +BufFileClose(BufFile *file) +{ + int i; + + /* flush any unwritten data */ + BufFileFlush(file); + /* close and delete the underlying file(s) */ + for (i = 0; i < file->numFiles; i++) + FileClose(file->files[i]); + /* release the buffer space */ + pfree(file->files); + pfree(file); +} + +/* + * BufFileLoadBuffer + * + * Load some data into buffer, if possible, starting from curOffset. + * At call, must have dirty = false, pos and nbytes = 0. + * On exit, nbytes is number of bytes loaded. + */ +static void +BufFileLoadBuffer(BufFile *file) +{ + File thisfile; + + /* + * Advance to next component file if necessary and possible. + */ + if (file->curOffset >= MAX_PHYSICAL_FILESIZE && + file->curFile + 1 < file->numFiles) + { + file->curFile++; + file->curOffset = 0L; + } + + /* + * Read whatever we can get, up to a full bufferload. + */ + thisfile = file->files[file->curFile]; + file->nbytes = FileRead(thisfile, + file->buffer.data, + sizeof(file->buffer), + file->curOffset, + WAIT_EVENT_BUFFILE_READ); + if (file->nbytes < 0) + { + file->nbytes = 0; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + FilePathName(thisfile)))); + } + + /* we choose not to advance curOffset here */ + + if (file->nbytes > 0) + pgBufferUsage.temp_blks_read++; +} + +/* + * BufFileDumpBuffer + * + * Dump buffer contents starting at curOffset. + * At call, should have dirty = true, nbytes > 0. + * On exit, dirty is cleared if successful write, and curOffset is advanced. + */ +static void +BufFileDumpBuffer(BufFile *file) +{ + int wpos = 0; + int bytestowrite; + File thisfile; + + /* + * Unlike BufFileLoadBuffer, we must dump the whole buffer even if it + * crosses a component-file boundary; so we need a loop. + */ + while (wpos < file->nbytes) + { + off_t availbytes; + + /* + * Advance to next component file if necessary and possible. + */ + if (file->curOffset >= MAX_PHYSICAL_FILESIZE) + { + while (file->curFile + 1 >= file->numFiles) + extendBufFile(file); + file->curFile++; + file->curOffset = 0L; + } + + /* + * Determine how much we need to write into this file. + */ + bytestowrite = file->nbytes - wpos; + availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset; + + if ((off_t) bytestowrite > availbytes) + bytestowrite = (int) availbytes; + + thisfile = file->files[file->curFile]; + bytestowrite = FileWrite(thisfile, + file->buffer.data + wpos, + bytestowrite, + file->curOffset, + WAIT_EVENT_BUFFILE_WRITE); + if (bytestowrite <= 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", + FilePathName(thisfile)))); + file->curOffset += bytestowrite; + wpos += bytestowrite; + + pgBufferUsage.temp_blks_written++; + } + file->dirty = false; + + /* + * At this point, curOffset has been advanced to the end of the buffer, + * ie, its original value + nbytes. We need to make it point to the + * logical file position, ie, original value + pos, in case that is less + * (as could happen due to a small backwards seek in a dirty buffer!) + */ + file->curOffset -= (file->nbytes - file->pos); + if (file->curOffset < 0) /* handle possible segment crossing */ + { + file->curFile--; + Assert(file->curFile >= 0); + file->curOffset += MAX_PHYSICAL_FILESIZE; + } + + /* + * Now we can set the buffer empty without changing the logical position + */ + file->pos = 0; + file->nbytes = 0; +} + +/* + * BufFileRead + * + * Like fread() except we assume 1-byte element size and report I/O errors via + * ereport(). + */ +size_t +BufFileRead(BufFile *file, void *ptr, size_t size) +{ + size_t nread = 0; + size_t nthistime; + + BufFileFlush(file); + + while (size > 0) + { + if (file->pos >= file->nbytes) + { + /* Try to load more data into buffer. */ + file->curOffset += file->pos; + file->pos = 0; + file->nbytes = 0; + BufFileLoadBuffer(file); + if (file->nbytes <= 0) + break; /* no more data available */ + } + + nthistime = file->nbytes - file->pos; + if (nthistime > size) + nthistime = size; + Assert(nthistime > 0); + + memcpy(ptr, file->buffer.data + file->pos, nthistime); + + file->pos += nthistime; + ptr = (void *) ((char *) ptr + nthistime); + size -= nthistime; + nread += nthistime; + } + + return nread; +} + +/* + * BufFileWrite + * + * Like fwrite() except we assume 1-byte element size and report errors via + * ereport(). + */ +void +BufFileWrite(BufFile *file, void *ptr, size_t size) +{ + size_t nthistime; + + Assert(!file->readOnly); + + while (size > 0) + { + if (file->pos >= BLCKSZ) + { + /* Buffer full, dump it out */ + if (file->dirty) + BufFileDumpBuffer(file); + else + { + /* Hmm, went directly from reading to writing? */ + file->curOffset += file->pos; + file->pos = 0; + file->nbytes = 0; + } + } + + nthistime = BLCKSZ - file->pos; + if (nthistime > size) + nthistime = size; + Assert(nthistime > 0); + + memcpy(file->buffer.data + file->pos, ptr, nthistime); + + file->dirty = true; + file->pos += nthistime; + if (file->nbytes < file->pos) + file->nbytes = file->pos; + ptr = (void *) ((char *) ptr + nthistime); + size -= nthistime; + } +} + +/* + * BufFileFlush + * + * Like fflush(), except that I/O errors are reported with ereport(). + */ +static void +BufFileFlush(BufFile *file) +{ + if (file->dirty) + BufFileDumpBuffer(file); + + Assert(!file->dirty); +} + +/* + * BufFileSeek + * + * Like fseek(), except that target position needs two values in order to + * work when logical filesize exceeds maximum value representable by off_t. + * We do not support relative seeks across more than that, however. + * I/O errors are reported by ereport(). + * + * Result is 0 if OK, EOF if not. Logical position is not moved if an + * impossible seek is attempted. + */ +int +BufFileSeek(BufFile *file, int fileno, off_t offset, int whence) +{ + int newFile; + off_t newOffset; + + switch (whence) + { + case SEEK_SET: + if (fileno < 0) + return EOF; + newFile = fileno; + newOffset = offset; + break; + case SEEK_CUR: + + /* + * Relative seek considers only the signed offset, ignoring + * fileno. Note that large offsets (> 1 GB) risk overflow in this + * add, unless we have 64-bit off_t. + */ + newFile = file->curFile; + newOffset = (file->curOffset + file->pos) + offset; + break; + case SEEK_END: + + /* + * The file size of the last file gives us the end offset of that + * file. + */ + newFile = file->numFiles - 1; + newOffset = FileSize(file->files[file->numFiles - 1]); + if (newOffset < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m", + FilePathName(file->files[file->numFiles - 1]), + file->name))); + break; + default: + elog(ERROR, "invalid whence: %d", whence); + return EOF; + } + while (newOffset < 0) + { + if (--newFile < 0) + return EOF; + newOffset += MAX_PHYSICAL_FILESIZE; + } + if (newFile == file->curFile && + newOffset >= file->curOffset && + newOffset <= file->curOffset + file->nbytes) + { + /* + * Seek is to a point within existing buffer; we can just adjust + * pos-within-buffer, without flushing buffer. Note this is OK + * whether reading or writing, but buffer remains dirty if we were + * writing. + */ + file->pos = (int) (newOffset - file->curOffset); + return 0; + } + /* Otherwise, must reposition buffer, so flush any dirty data */ + BufFileFlush(file); + + /* + * At this point and no sooner, check for seek past last segment. The + * above flush could have created a new segment, so checking sooner would + * not work (at least not with this code). + */ + + /* convert seek to "start of next seg" to "end of last seg" */ + if (newFile == file->numFiles && newOffset == 0) + { + newFile--; + newOffset = MAX_PHYSICAL_FILESIZE; + } + while (newOffset > MAX_PHYSICAL_FILESIZE) + { + if (++newFile >= file->numFiles) + return EOF; + newOffset -= MAX_PHYSICAL_FILESIZE; + } + if (newFile >= file->numFiles) + return EOF; + /* Seek is OK! */ + file->curFile = newFile; + file->curOffset = newOffset; + file->pos = 0; + file->nbytes = 0; + return 0; +} + +void +BufFileTell(BufFile *file, int *fileno, off_t *offset) +{ + *fileno = file->curFile; + *offset = file->curOffset + file->pos; +} + +/* + * BufFileSeekBlock --- block-oriented seek + * + * Performs absolute seek to the start of the n'th BLCKSZ-sized block of + * the file. Note that users of this interface will fail if their files + * exceed BLCKSZ * LONG_MAX bytes, but that is quite a lot; we don't work + * with tables bigger than that, either... + * + * Result is 0 if OK, EOF if not. Logical position is not moved if an + * impossible seek is attempted. + */ +int +BufFileSeekBlock(BufFile *file, long blknum) +{ + return BufFileSeek(file, + (int) (blknum / BUFFILE_SEG_SIZE), + (off_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ, + SEEK_SET); +} + +#ifdef NOT_USED +/* + * BufFileTellBlock --- block-oriented tell + * + * Any fractional part of a block in the current seek position is ignored. + */ +long +BufFileTellBlock(BufFile *file) +{ + long blknum; + + blknum = (file->curOffset + file->pos) / BLCKSZ; + blknum += file->curFile * BUFFILE_SEG_SIZE; + return blknum; +} + +#endif + +/* + * Return the current shared BufFile size. + * + * Counts any holes left behind by BufFileAppend as part of the size. + * ereport()s on failure. + */ +int64 +BufFileSize(BufFile *file) +{ + int64 lastFileSize; + + Assert(file->fileset != NULL); + + /* Get the size of the last physical file. */ + lastFileSize = FileSize(file->files[file->numFiles - 1]); + if (lastFileSize < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m", + FilePathName(file->files[file->numFiles - 1]), + file->name))); + + return ((file->numFiles - 1) * (int64) MAX_PHYSICAL_FILESIZE) + + lastFileSize; +} + +/* + * Append the contents of source file (managed within shared fileset) to + * end of target file (managed within same shared fileset). + * + * Note that operation subsumes ownership of underlying resources from + * "source". Caller should never call BufFileClose against source having + * called here first. Resource owners for source and target must match, + * too. + * + * This operation works by manipulating lists of segment files, so the + * file content is always appended at a MAX_PHYSICAL_FILESIZE-aligned + * boundary, typically creating empty holes before the boundary. These + * areas do not contain any interesting data, and cannot be read from by + * caller. + * + * Returns the block number within target where the contents of source + * begins. Caller should apply this as an offset when working off block + * positions that are in terms of the original BufFile space. + */ +long +BufFileAppend(BufFile *target, BufFile *source) +{ + long startBlock = target->numFiles * BUFFILE_SEG_SIZE; + int newNumFiles = target->numFiles + source->numFiles; + int i; + + Assert(target->fileset != NULL); + Assert(source->readOnly); + Assert(!source->dirty); + Assert(source->fileset != NULL); + + if (target->resowner != source->resowner) + elog(ERROR, "could not append BufFile with non-matching resource owner"); + + target->files = (File *) + repalloc(target->files, sizeof(File) * newNumFiles); + for (i = target->numFiles; i < newNumFiles; i++) + target->files[i] = source->files[i - target->numFiles]; + target->numFiles = newNumFiles; + + return startBlock; +} + +/* + * Truncate a BufFile created by BufFileCreateShared up to the given fileno and + * the offset. + */ +void +BufFileTruncateShared(BufFile *file, int fileno, off_t offset) +{ + int numFiles = file->numFiles; + int newFile = fileno; + off_t newOffset = file->curOffset; + char segment_name[MAXPGPATH]; + int i; + + /* + * Loop over all the files up to the given fileno and remove the files + * that are greater than the fileno and truncate the given file up to the + * offset. Note that we also remove the given fileno if the offset is 0 + * provided it is not the first file in which we truncate it. + */ + for (i = file->numFiles - 1; i >= fileno; i--) + { + if ((i != fileno || offset == 0) && i != 0) + { + SharedSegmentName(segment_name, file->name, i); + FileClose(file->files[i]); + if (!SharedFileSetDelete(file->fileset, segment_name, true)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not delete shared fileset \"%s\": %m", + segment_name))); + numFiles--; + newOffset = MAX_PHYSICAL_FILESIZE; + + /* + * This is required to indicate that we have deleted the given + * fileno. + */ + if (i == fileno) + newFile--; + } + else + { + if (FileTruncate(file->files[i], offset, + WAIT_EVENT_BUFFILE_TRUNCATE) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not truncate file \"%s\": %m", + FilePathName(file->files[i])))); + newOffset = offset; + } + } + + file->numFiles = numFiles; + + /* + * If the truncate point is within existing buffer then we can just adjust + * pos within buffer. + */ + if (newFile == file->curFile && + newOffset >= file->curOffset && + newOffset <= file->curOffset + file->nbytes) + { + /* No need to reset the current pos if the new pos is greater. */ + if (newOffset <= file->curOffset + file->pos) + file->pos = (int) (newOffset - file->curOffset); + + /* Adjust the nbytes for the current buffer. */ + file->nbytes = (int) (newOffset - file->curOffset); + } + else if (newFile == file->curFile && + newOffset < file->curOffset) + { + /* + * The truncate point is within the existing file but prior to the + * current position, so we can forget the current buffer and reset the + * current position. + */ + file->curOffset = newOffset; + file->pos = 0; + file->nbytes = 0; + } + else if (newFile < file->curFile) + { + /* + * The truncate point is prior to the current file, so need to reset + * the current position accordingly. + */ + file->curFile = newFile; + file->curOffset = newOffset; + file->pos = 0; + file->nbytes = 0; + } + /* Nothing to do, if the truncate point is beyond current file. */ +} diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c new file mode 100644 index 0000000..da8b7cb --- /dev/null +++ b/src/backend/storage/file/copydir.c @@ -0,0 +1,226 @@ +/*------------------------------------------------------------------------- + * + * copydir.c + * copies a directory + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * While "xcopy /e /i /q" works fine for copying directories, on Windows XP + * it requires a Window handle which prevents it from working when invoked + * as a service. + * + * IDENTIFICATION + * src/backend/storage/file/copydir.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <fcntl.h> +#include <unistd.h> +#include <sys/stat.h> + +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/copydir.h" +#include "storage/fd.h" + +/* + * copydir: copy a directory + * + * If recurse is false, subdirectories are ignored. Anything that's not + * a directory or a regular file is ignored. + */ +void +copydir(char *fromdir, char *todir, bool recurse) +{ + DIR *xldir; + struct dirent *xlde; + char fromfile[MAXPGPATH * 2]; + char tofile[MAXPGPATH * 2]; + + if (MakePGDirectory(todir) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", todir))); + + xldir = AllocateDir(fromdir); + + while ((xlde = ReadDir(xldir, fromdir)) != NULL) + { + struct stat fst; + + /* If we got a cancel signal during the copy of the directory, quit */ + CHECK_FOR_INTERRUPTS(); + + if (strcmp(xlde->d_name, ".") == 0 || + strcmp(xlde->d_name, "..") == 0) + continue; + + snprintf(fromfile, sizeof(fromfile), "%s/%s", fromdir, xlde->d_name); + snprintf(tofile, sizeof(tofile), "%s/%s", todir, xlde->d_name); + + if (lstat(fromfile, &fst) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", fromfile))); + + if (S_ISDIR(fst.st_mode)) + { + /* recurse to handle subdirectories */ + if (recurse) + copydir(fromfile, tofile, true); + } + else if (S_ISREG(fst.st_mode)) + copy_file(fromfile, tofile); + } + FreeDir(xldir); + + /* + * Be paranoid here and fsync all files to ensure the copy is really done. + * But if fsync is disabled, we're done. + */ + if (!enableFsync) + return; + + xldir = AllocateDir(todir); + + while ((xlde = ReadDir(xldir, todir)) != NULL) + { + struct stat fst; + + if (strcmp(xlde->d_name, ".") == 0 || + strcmp(xlde->d_name, "..") == 0) + continue; + + snprintf(tofile, sizeof(tofile), "%s/%s", todir, xlde->d_name); + + /* + * We don't need to sync subdirectories here since the recursive + * copydir will do it before it returns + */ + if (lstat(tofile, &fst) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", tofile))); + + if (S_ISREG(fst.st_mode)) + fsync_fname(tofile, false); + } + FreeDir(xldir); + + /* + * It's important to fsync the destination directory itself as individual + * file fsyncs don't guarantee that the directory entry for the file is + * synced. Recent versions of ext4 have made the window much wider but + * it's been true for ext3 and other filesystems in the past. + */ + fsync_fname(todir, true); +} + +/* + * copy one file + */ +void +copy_file(char *fromfile, char *tofile) +{ + char *buffer; + int srcfd; + int dstfd; + int nbytes; + off_t offset; + off_t flush_offset; + + /* Size of copy buffer (read and write requests) */ +#define COPY_BUF_SIZE (8 * BLCKSZ) + + /* + * Size of data flush requests. It seems beneficial on most platforms to + * do this every 1MB or so. But macOS, at least with early releases of + * APFS, is really unfriendly to small mmap/msync requests, so there do it + * only every 32MB. + */ +#if defined(__darwin__) +#define FLUSH_DISTANCE (32 * 1024 * 1024) +#else +#define FLUSH_DISTANCE (1024 * 1024) +#endif + + /* Use palloc to ensure we get a maxaligned buffer */ + buffer = palloc(COPY_BUF_SIZE); + + /* + * Open the files + */ + srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY); + if (srcfd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", fromfile))); + + dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + if (dstfd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", tofile))); + + /* + * Do the data copying. + */ + flush_offset = 0; + for (offset = 0;; offset += nbytes) + { + /* If we got a cancel signal during the copy of the file, quit */ + CHECK_FOR_INTERRUPTS(); + + /* + * We fsync the files later, but during the copy, flush them every so + * often to avoid spamming the cache and hopefully get the kernel to + * start writing them out before the fsync comes. + */ + if (offset - flush_offset >= FLUSH_DISTANCE) + { + pg_flush_data(dstfd, flush_offset, offset - flush_offset); + flush_offset = offset; + } + + pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_READ); + nbytes = read(srcfd, buffer, COPY_BUF_SIZE); + pgstat_report_wait_end(); + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", fromfile))); + if (nbytes == 0) + break; + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_WRITE); + if ((int) write(dstfd, buffer, nbytes) != nbytes) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tofile))); + } + pgstat_report_wait_end(); + } + + if (offset > flush_offset) + pg_flush_data(dstfd, flush_offset, offset - flush_offset); + + if (CloseTransientFile(dstfd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", tofile))); + + if (CloseTransientFile(srcfd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", fromfile))); + + pfree(buffer); +} diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c new file mode 100644 index 0000000..e76daff --- /dev/null +++ b/src/backend/storage/file/fd.c @@ -0,0 +1,3789 @@ +/*------------------------------------------------------------------------- + * + * fd.c + * Virtual file descriptor code. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/file/fd.c + * + * NOTES: + * + * This code manages a cache of 'virtual' file descriptors (VFDs). + * The server opens many file descriptors for a variety of reasons, + * including base tables, scratch files (e.g., sort and hash spool + * files), and random calls to C library routines like system(3); it + * is quite easy to exceed system limits on the number of open files a + * single process can have. (This is around 1024 on many modern + * operating systems, but may be lower on others.) + * + * VFDs are managed as an LRU pool, with actual OS file descriptors + * being opened and closed as needed. Obviously, if a routine is + * opened using these interfaces, all subsequent operations must also + * be through these interfaces (the File type is not a real file + * descriptor). + * + * For this scheme to work, most (if not all) routines throughout the + * server should use these interfaces instead of calling the C library + * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we + * may find ourselves short of real file descriptors anyway. + * + * INTERFACE ROUTINES + * + * PathNameOpenFile and OpenTemporaryFile are used to open virtual files. + * A File opened with OpenTemporaryFile is automatically deleted when the + * File is closed, either explicitly or implicitly at end of transaction or + * process exit. PathNameOpenFile is intended for files that are held open + * for a long time, like relation files. It is the caller's responsibility + * to close them, there is no automatic mechanism in fd.c for that. + * + * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage + * temporary files that have names so that they can be shared between + * backends. Such files are automatically closed and count against the + * temporary file limit of the backend that creates them, but unlike anonymous + * files they are not automatically deleted. See sharedfileset.c for a shared + * ownership mechanism that provides automatic cleanup for shared files when + * the last of a group of backends detaches. + * + * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are + * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively. + * They behave like the corresponding native functions, except that the handle + * is registered with the current subtransaction, and will be automatically + * closed at abort. These are intended mainly for short operations like + * reading a configuration file; there is a limit on the number of files that + * can be opened using these functions at any one time. + * + * Finally, BasicOpenFile is just a thin wrapper around open() that can + * release file descriptors in use by the virtual file descriptors if + * necessary. There is no automatic cleanup of file descriptors returned by + * BasicOpenFile, it is solely the caller's responsibility to close the file + * descriptor by calling close(2). + * + * If a non-virtual file descriptor needs to be held open for any length of + * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD + * (and eventually ReleaseExternalFD), so that we can take it into account + * while deciding how many VFDs can be open. This applies to FDs obtained + * with BasicOpenFile as well as those obtained without use of any fd.c API. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <dirent.h> +#include <sys/file.h> +#include <sys/param.h> +#include <sys/stat.h> +#include <sys/types.h> +#ifndef WIN32 +#include <sys/mman.h> +#endif +#include <limits.h> +#include <unistd.h> +#include <fcntl.h> +#ifdef HAVE_SYS_RESOURCE_H +#include <sys/resource.h> /* for getrlimit */ +#endif + +#include "access/xact.h" +#include "access/xlog.h" +#include "catalog/pg_tablespace.h" +#include "common/file_perm.h" +#include "common/file_utils.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "port/pg_iovec.h" +#include "portability/mem.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "utils/guc.h" +#include "utils/resowner_private.h" + +/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */ +#if defined(HAVE_SYNC_FILE_RANGE) +#define PG_FLUSH_DATA_WORKS 1 +#elif !defined(WIN32) && defined(MS_ASYNC) +#define PG_FLUSH_DATA_WORKS 1 +#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) +#define PG_FLUSH_DATA_WORKS 1 +#endif + +/* + * We must leave some file descriptors free for system(), the dynamic loader, + * and other code that tries to open files without consulting fd.c. This + * is the number left free. (While we try fairly hard to prevent EMFILE + * errors, there's never any guarantee that we won't get ENFILE due to + * other processes chewing up FDs. So it's a bad idea to try to open files + * without consulting fd.c. Nonetheless we cannot control all code.) + * + * Because this is just a fixed setting, we are effectively assuming that + * no such code will leave FDs open over the long term; otherwise the slop + * is likely to be insufficient. Note in particular that we expect that + * loading a shared library does not result in any permanent increase in + * the number of open files. (This appears to be true on most if not + * all platforms as of Feb 2004.) + */ +#define NUM_RESERVED_FDS 10 + +/* + * If we have fewer than this many usable FDs after allowing for the reserved + * ones, choke. (This value is chosen to work with "ulimit -n 64", but not + * much less than that. Note that this value ensures numExternalFDs can be + * at least 16; as of this writing, the contrib/postgres_fdw regression tests + * will not pass unless that can grow to at least 14.) + */ +#define FD_MINFREE 48 + +/* + * A number of platforms allow individual processes to open many more files + * than they can really support when *many* processes do the same thing. + * This GUC parameter lets the DBA limit max_safe_fds to something less than + * what the postmaster's initial probe suggests will work. + */ +int max_files_per_process = 1000; + +/* + * Maximum number of file descriptors to open for operations that fd.c knows + * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized + * to a conservative value, and remains that way indefinitely in bootstrap or + * standalone-backend cases. In normal postmaster operation, the postmaster + * calls set_max_safe_fds() late in initialization to update the value, and + * that value is then inherited by forked subprocesses. + * + * Note: the value of max_files_per_process is taken into account while + * setting this variable, and so need not be tested separately. + */ +int max_safe_fds = FD_MINFREE; /* default if not changed */ + +/* Whether it is safe to continue running after fsync() fails. */ +bool data_sync_retry = false; + +/* How SyncDataDirectory() should do its job. */ +int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC; + +/* Debugging.... */ + +#ifdef FDDEBUG +#define DO_DB(A) \ + do { \ + int _do_db_save_errno = errno; \ + A; \ + errno = _do_db_save_errno; \ + } while (0) +#else +#define DO_DB(A) \ + ((void) 0) +#endif + +#define VFD_CLOSED (-1) + +#define FileIsValid(file) \ + ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL) + +#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED) + +/* these are the assigned bits in fdstate below: */ +#define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */ +#define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */ +#define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */ + +typedef struct vfd +{ + int fd; /* current FD, or VFD_CLOSED if none */ + unsigned short fdstate; /* bitflags for VFD's state */ + ResourceOwner resowner; /* owner, for automatic cleanup */ + File nextFree; /* link to next free VFD, if in freelist */ + File lruMoreRecently; /* doubly linked recency-of-use list */ + File lruLessRecently; + off_t fileSize; /* current size of file (0 if not temporary) */ + char *fileName; /* name of file, or NULL for unused VFD */ + /* NB: fileName is malloc'd, and must be free'd when closing the VFD */ + int fileFlags; /* open(2) flags for (re)opening the file */ + mode_t fileMode; /* mode to pass to open(2) */ +} Vfd; + +/* + * Virtual File Descriptor array pointer and size. This grows as + * needed. 'File' values are indexes into this array. + * Note that VfdCache[0] is not a usable VFD, just a list header. + */ +static Vfd *VfdCache; +static Size SizeVfdCache = 0; + +/* + * Number of file descriptors known to be in use by VFD entries. + */ +static int nfile = 0; + +/* + * Flag to tell whether it's worth scanning VfdCache looking for temp files + * to close + */ +static bool have_xact_temporary_files = false; + +/* + * Tracks the total size of all temporary files. Note: when temp_file_limit + * is being enforced, this cannot overflow since the limit cannot be more + * than INT_MAX kilobytes. When not enforcing, it could theoretically + * overflow, but we don't care. + */ +static uint64 temporary_files_size = 0; + +/* + * List of OS handles opened with AllocateFile, AllocateDir and + * OpenTransientFile. + */ +typedef enum +{ + AllocateDescFile, + AllocateDescPipe, + AllocateDescDir, + AllocateDescRawFD +} AllocateDescKind; + +typedef struct +{ + AllocateDescKind kind; + SubTransactionId create_subid; + union + { + FILE *file; + DIR *dir; + int fd; + } desc; +} AllocateDesc; + +static int numAllocatedDescs = 0; +static int maxAllocatedDescs = 0; +static AllocateDesc *allocatedDescs = NULL; + +/* + * Number of open "external" FDs reported to Reserve/ReleaseExternalFD. + */ +static int numExternalFDs = 0; + +/* + * Number of temporary files opened during the current session; + * this is used in generation of tempfile names. + */ +static long tempFileCounter = 0; + +/* + * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid, + * indicating that the current database's default tablespace should be used.) + * When numTempTableSpaces is -1, this has not been set in the current + * transaction. + */ +static Oid *tempTableSpaces = NULL; +static int numTempTableSpaces = -1; +static int nextTempTableSpace = 0; + + +/*-------------------- + * + * Private Routines + * + * Delete - delete a file from the Lru ring + * LruDelete - remove a file from the Lru ring and close its FD + * Insert - put a file at the front of the Lru ring + * LruInsert - put a file at the front of the Lru ring and open it + * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring + * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit + * AllocateVfd - grab a free (or new) file record (from VfdCache) + * FreeVfd - free a file record + * + * The Least Recently Used ring is a doubly linked list that begins and + * ends on element zero. Element zero is special -- it doesn't represent + * a file and its "fd" field always == VFD_CLOSED. Element zero is just an + * anchor that shows us the beginning/end of the ring. + * Only VFD elements that are currently really open (have an FD assigned) are + * in the Lru ring. Elements that are "virtually" open can be recognized + * by having a non-null fileName field. + * + * example: + * + * /--less----\ /---------\ + * v \ v \ + * #0 --more---> LeastRecentlyUsed --more-\ \ + * ^\ | | + * \\less--> MostRecentlyUsedFile <---/ | + * \more---/ \--less--/ + * + *-------------------- + */ +static void Delete(File file); +static void LruDelete(File file); +static void Insert(File file); +static int LruInsert(File file); +static bool ReleaseLruFile(void); +static void ReleaseLruFiles(void); +static File AllocateVfd(void); +static void FreeVfd(File file); + +static int FileAccess(File file); +static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError); +static bool reserveAllocatedDesc(void); +static int FreeDesc(AllocateDesc *desc); + +static void AtProcExit_Files(int code, Datum arg); +static void CleanupTempFiles(bool isCommit, bool isProcExit); +static void RemovePgTempRelationFiles(const char *tsdirname); +static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname); + +static void walkdir(const char *path, + void (*action) (const char *fname, bool isdir, int elevel), + bool process_symlinks, + int elevel); +#ifdef PG_FLUSH_DATA_WORKS +static void pre_sync_fname(const char *fname, bool isdir, int elevel); +#endif +static void datadir_fsync_fname(const char *fname, bool isdir, int elevel); +static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel); + +static int fsync_parent_path(const char *fname, int elevel); + + +/* + * pg_fsync --- do fsync with or without writethrough + */ +int +pg_fsync(int fd) +{ +#if !defined(WIN32) && defined(USE_ASSERT_CHECKING) + struct stat st; + + /* + * Some operating system implementations of fsync() have requirements + * about the file access modes that were used when their file descriptor + * argument was opened, and these requirements differ depending on whether + * the file descriptor is for a directory. + * + * For any file descriptor that may eventually be handed to fsync(), we + * should have opened it with access modes that are compatible with + * fsync() on all supported systems, otherwise the code may not be + * portable, even if it runs ok on the current system. + * + * We assert here that a descriptor for a file was opened with write + * permissions (either O_RDWR or O_WRONLY) and for a directory without + * write permissions (O_RDONLY). + * + * Ignore any fstat errors and let the follow-up fsync() do its work. + * Doing this sanity check here counts for the case where fsync() is + * disabled. + */ + if (fstat(fd, &st) == 0) + { + int desc_flags = fcntl(fd, F_GETFL); + + /* + * O_RDONLY is historically 0, so just make sure that for directories + * no write flags are used. + */ + if (S_ISDIR(st.st_mode)) + Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0); + else + Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0); + } + errno = 0; +#endif + + /* #if is to skip the sync_method test if there's no need for it */ +#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC) + if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH) + return pg_fsync_writethrough(fd); + else +#endif + return pg_fsync_no_writethrough(fd); +} + + +/* + * pg_fsync_no_writethrough --- same as fsync except does nothing if + * enableFsync is off + */ +int +pg_fsync_no_writethrough(int fd) +{ + if (enableFsync) + return fsync(fd); + else + return 0; +} + +/* + * pg_fsync_writethrough + */ +int +pg_fsync_writethrough(int fd) +{ + if (enableFsync) + { +#ifdef WIN32 + return _commit(fd); +#elif defined(F_FULLFSYNC) + return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0; +#else + errno = ENOSYS; + return -1; +#endif + } + else + return 0; +} + +/* + * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off + * + * Not all platforms have fdatasync; treat as fsync if not available. + */ +int +pg_fdatasync(int fd) +{ + if (enableFsync) + { +#ifdef HAVE_FDATASYNC + return fdatasync(fd); +#else + return fsync(fd); +#endif + } + else + return 0; +} + +/* + * pg_flush_data --- advise OS that the described dirty data should be flushed + * + * offset of 0 with nbytes 0 means that the entire file should be flushed + */ +void +pg_flush_data(int fd, off_t offset, off_t nbytes) +{ + /* + * Right now file flushing is primarily used to avoid making later + * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes + * if fsyncs are disabled - that's a decision we might want to make + * configurable at some point. + */ + if (!enableFsync) + return; + + /* + * We compile all alternatives that are supported on the current platform, + * to find portability problems more easily. + */ +#if defined(HAVE_SYNC_FILE_RANGE) + { + int rc; + static bool not_implemented_by_kernel = false; + + if (not_implemented_by_kernel) + return; + + /* + * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific, + * tells the OS that writeback for the specified blocks should be + * started, but that we don't want to wait for completion. Note that + * this call might block if too much dirty data exists in the range. + * This is the preferable method on OSs supporting it, as it works + * reliably when available (contrast to msync()) and doesn't flush out + * clean data (like FADV_DONTNEED). + */ + rc = sync_file_range(fd, offset, nbytes, + SYNC_FILE_RANGE_WRITE); + if (rc != 0) + { + int elevel; + + /* + * For systems that don't have an implementation of + * sync_file_range() such as Windows WSL, generate only one + * warning and then suppress all further attempts by this process. + */ + if (errno == ENOSYS) + { + elevel = WARNING; + not_implemented_by_kernel = true; + } + else + elevel = data_sync_elevel(WARNING); + + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not flush dirty data: %m"))); + } + + return; + } +#endif +#if !defined(WIN32) && defined(MS_ASYNC) + { + void *p; + static int pagesize = 0; + + /* + * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers + * writeback. On linux it only does so if MS_SYNC is specified, but + * then it does the writeback synchronously. Luckily all common linux + * systems have sync_file_range(). This is preferable over + * FADV_DONTNEED because it doesn't flush out clean data. + * + * We map the file (mmap()), tell the kernel to sync back the contents + * (msync()), and then remove the mapping again (munmap()). + */ + + /* mmap() needs actual length if we want to map whole file */ + if (offset == 0 && nbytes == 0) + { + nbytes = lseek(fd, 0, SEEK_END); + if (nbytes < 0) + { + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not determine dirty data size: %m"))); + return; + } + } + + /* + * Some platforms reject partial-page mmap() attempts. To deal with + * that, just truncate the request to a page boundary. If any extra + * bytes don't get flushed, well, it's only a hint anyway. + */ + + /* fetch pagesize only once */ + if (pagesize == 0) + pagesize = sysconf(_SC_PAGESIZE); + + /* align length to pagesize, dropping any fractional page */ + if (pagesize > 0) + nbytes = (nbytes / pagesize) * pagesize; + + /* fractional-page request is a no-op */ + if (nbytes <= 0) + return; + + /* + * mmap could well fail, particularly on 32-bit platforms where there + * may simply not be enough address space. If so, silently fall + * through to the next implementation. + */ + if (nbytes <= (off_t) SSIZE_MAX) + p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset); + else + p = MAP_FAILED; + + if (p != MAP_FAILED) + { + int rc; + + rc = msync(p, (size_t) nbytes, MS_ASYNC); + if (rc != 0) + { + ereport(data_sync_elevel(WARNING), + (errcode_for_file_access(), + errmsg("could not flush dirty data: %m"))); + /* NB: need to fall through to munmap()! */ + } + + rc = munmap(p, (size_t) nbytes); + if (rc != 0) + { + /* FATAL error because mapping would remain */ + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not munmap() while flushing data: %m"))); + } + + return; + } + } +#endif +#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) + { + int rc; + + /* + * Signal the kernel that the passed in range should not be cached + * anymore. This has the, desired, side effect of writing out dirty + * data, and the, undesired, side effect of likely discarding useful + * clean cached blocks. For the latter reason this is the least + * preferable method. + */ + + rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED); + + if (rc != 0) + { + /* don't error out, this is just a performance optimization */ + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not flush dirty data: %m"))); + } + + return; + } +#endif +} + +/* + * Truncate a file to a given length by name. + */ +int +pg_truncate(const char *path, off_t length) +{ +#ifdef WIN32 + int save_errno; + int ret; + int fd; + + fd = OpenTransientFile(path, O_RDWR | PG_BINARY); + if (fd >= 0) + { + ret = ftruncate(fd, 0); + save_errno = errno; + CloseTransientFile(fd); + errno = save_errno; + } + else + ret = -1; + + return ret; +#else + return truncate(path, length); +#endif +} + +/* + * fsync_fname -- fsync a file or directory, handling errors properly + * + * Try to fsync a file or directory. When doing the latter, ignore errors that + * indicate the OS just doesn't allow/require fsyncing directories. + */ +void +fsync_fname(const char *fname, bool isdir) +{ + fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR)); +} + +/* + * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability + * + * This routine ensures that, after returning, the effect of renaming file + * persists in case of a crash. A crash while this routine is running will + * leave you with either the pre-existing or the moved file in place of the + * new file; no mixed state or truncated files are possible. + * + * It does so by using fsync on the old filename and the possibly existing + * target filename before the rename, and the target file and directory after. + * + * Note that rename() cannot be used across arbitrary directories, as they + * might not be on the same filesystem. Therefore this routine does not + * support renaming across directories. + * + * Log errors with the caller specified severity. + * + * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not + * valid upon return. + */ +int +durable_rename(const char *oldfile, const char *newfile, int elevel) +{ + int fd; + + /* + * First fsync the old and target path (if it exists), to ensure that they + * are properly persistent on disk. Syncing the target file is not + * strictly necessary, but it makes it easier to reason about crashes; + * because it's then guaranteed that either source or target file exists + * after a crash. + */ + if (fsync_fname_ext(oldfile, false, false, elevel) != 0) + return -1; + + fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR); + if (fd < 0) + { + if (errno != ENOENT) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", newfile))); + return -1; + } + } + else + { + if (pg_fsync(fd) != 0) + { + int save_errno; + + /* close file upon error, might not be in transaction context */ + save_errno = errno; + CloseTransientFile(fd); + errno = save_errno; + + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", newfile))); + return -1; + } + + if (CloseTransientFile(fd) != 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", newfile))); + return -1; + } + } + + /* Time to do the real deal... */ + if (rename(oldfile, newfile) < 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + oldfile, newfile))); + return -1; + } + + /* + * To guarantee renaming the file is persistent, fsync the file with its + * new name, and its containing directory. + */ + if (fsync_fname_ext(newfile, false, false, elevel) != 0) + return -1; + + if (fsync_parent_path(newfile, elevel) != 0) + return -1; + + return 0; +} + +/* + * durable_unlink -- remove a file in a durable manner + * + * This routine ensures that, after returning, the effect of removing file + * persists in case of a crash. A crash while this routine is running will + * leave the system in no mixed state. + * + * It does so by using fsync on the parent directory of the file after the + * actual removal is done. + * + * Log errors with the severity specified by caller. + * + * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not + * valid upon return. + */ +int +durable_unlink(const char *fname, int elevel) +{ + if (unlink(fname) < 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + fname))); + return -1; + } + + /* + * To guarantee that the removal of the file is persistent, fsync its + * parent directory. + */ + if (fsync_parent_path(fname, elevel) != 0) + return -1; + + return 0; +} + +/* + * durable_rename_excl -- rename a file in a durable manner. + * + * Similar to durable_rename(), except that this routine tries (but does not + * guarantee) not to overwrite the target file. + * + * Note that a crash in an unfortunate moment can leave you with two links to + * the target file. + * + * Log errors with the caller specified severity. + * + * On Windows, using a hard link followed by unlink() causes concurrency + * issues, while a simple rename() does not cause that, so be careful when + * changing the logic of this routine. + * + * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not + * valid upon return. + */ +int +durable_rename_excl(const char *oldfile, const char *newfile, int elevel) +{ + /* + * Ensure that, if we crash directly after the rename/link, a file with + * valid contents is moved into place. + */ + if (fsync_fname_ext(oldfile, false, false, elevel) != 0) + return -1; + +#ifdef HAVE_WORKING_LINK + if (link(oldfile, newfile) < 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not link file \"%s\" to \"%s\": %m", + oldfile, newfile), + (AmCheckpointerProcess() ? + errhint("This is known to fail occasionally during archive recovery, where it is harmless.") : + 0))); + return -1; + } + unlink(oldfile); +#else + if (rename(oldfile, newfile) < 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + oldfile, newfile), + (AmCheckpointerProcess() ? + errhint("This is known to fail occasionally during archive recovery, where it is harmless.") : + 0))); + return -1; + } +#endif + + /* + * Make change persistent in case of an OS crash, both the new entry and + * its parent directory need to be flushed. + */ + if (fsync_fname_ext(newfile, false, false, elevel) != 0) + return -1; + + /* Same for parent directory */ + if (fsync_parent_path(newfile, elevel) != 0) + return -1; + + return 0; +} + +/* + * InitFileAccess --- initialize this module during backend startup + * + * This is called during either normal or standalone backend start. + * It is *not* called in the postmaster. + */ +void +InitFileAccess(void) +{ + Assert(SizeVfdCache == 0); /* call me only once */ + + /* initialize cache header entry */ + VfdCache = (Vfd *) malloc(sizeof(Vfd)); + if (VfdCache == NULL) + ereport(FATAL, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd)); + VfdCache->fd = VFD_CLOSED; + + SizeVfdCache = 1; + + /* register proc-exit hook to ensure temp files are dropped at exit */ + on_proc_exit(AtProcExit_Files, 0); +} + +/* + * count_usable_fds --- count how many FDs the system will let us open, + * and estimate how many are already open. + * + * We stop counting if usable_fds reaches max_to_probe. Note: a small + * value of max_to_probe might result in an underestimate of already_open; + * we must fill in any "gaps" in the set of used FDs before the calculation + * of already_open will give the right answer. In practice, max_to_probe + * of a couple of dozen should be enough to ensure good results. + * + * We assume stderr (FD 2) is available for dup'ing. While the calling + * script could theoretically close that, it would be a really bad idea, + * since then one risks loss of error messages from, e.g., libc. + */ +static void +count_usable_fds(int max_to_probe, int *usable_fds, int *already_open) +{ + int *fd; + int size; + int used = 0; + int highestfd = 0; + int j; + +#ifdef HAVE_GETRLIMIT + struct rlimit rlim; + int getrlimit_status; +#endif + + size = 1024; + fd = (int *) palloc(size * sizeof(int)); + +#ifdef HAVE_GETRLIMIT +#ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */ + getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim); +#else /* but BSD doesn't ... */ + getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim); +#endif /* RLIMIT_NOFILE */ + if (getrlimit_status != 0) + ereport(WARNING, (errmsg("getrlimit failed: %m"))); +#endif /* HAVE_GETRLIMIT */ + + /* dup until failure or probe limit reached */ + for (;;) + { + int thisfd; + +#ifdef HAVE_GETRLIMIT + + /* + * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on + * some platforms + */ + if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1) + break; +#endif + + thisfd = dup(2); + if (thisfd < 0) + { + /* Expect EMFILE or ENFILE, else it's fishy */ + if (errno != EMFILE && errno != ENFILE) + elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used); + break; + } + + if (used >= size) + { + size *= 2; + fd = (int *) repalloc(fd, size * sizeof(int)); + } + fd[used++] = thisfd; + + if (highestfd < thisfd) + highestfd = thisfd; + + if (used >= max_to_probe) + break; + } + + /* release the files we opened */ + for (j = 0; j < used; j++) + close(fd[j]); + + pfree(fd); + + /* + * Return results. usable_fds is just the number of successful dups. We + * assume that the system limit is highestfd+1 (remember 0 is a legal FD + * number) and so already_open is highestfd+1 - usable_fds. + */ + *usable_fds = used; + *already_open = highestfd + 1 - used; +} + +/* + * set_max_safe_fds + * Determine number of file descriptors that fd.c is allowed to use + */ +void +set_max_safe_fds(void) +{ + int usable_fds; + int already_open; + + /*---------- + * We want to set max_safe_fds to + * MIN(usable_fds, max_files_per_process - already_open) + * less the slop factor for files that are opened without consulting + * fd.c. This ensures that we won't exceed either max_files_per_process + * or the experimentally-determined EMFILE limit. + *---------- + */ + count_usable_fds(max_files_per_process, + &usable_fds, &already_open); + + max_safe_fds = Min(usable_fds, max_files_per_process - already_open); + + /* + * Take off the FDs reserved for system() etc. + */ + max_safe_fds -= NUM_RESERVED_FDS; + + /* + * Make sure we still have enough to get by. + */ + if (max_safe_fds < FD_MINFREE) + ereport(FATAL, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("insufficient file descriptors available to start server process"), + errdetail("System allows %d, we need at least %d.", + max_safe_fds + NUM_RESERVED_FDS, + FD_MINFREE + NUM_RESERVED_FDS))); + + elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d", + max_safe_fds, usable_fds, already_open); +} + +/* + * Open a file with BasicOpenFilePerm() and pass default file mode for the + * fileMode parameter. + */ +int +BasicOpenFile(const char *fileName, int fileFlags) +{ + return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode); +} + +/* + * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed + * + * This is exported for use by places that really want a plain kernel FD, + * but need to be proof against running out of FDs. Once an FD has been + * successfully returned, it is the caller's responsibility to ensure that + * it will not be leaked on ereport()! Most users should *not* call this + * routine directly, but instead use the VFD abstraction level, which + * provides protection against descriptor leaks as well as management of + * files that need to be open for more than a short period of time. + * + * Ideally this should be the *only* direct call of open() in the backend. + * In practice, the postmaster calls open() directly, and there are some + * direct open() calls done early in backend startup. Those are OK since + * this module wouldn't have any open files to close at that point anyway. + */ +int +BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode) +{ + int fd; + +tryAgain: + fd = open(fileName, fileFlags, fileMode); + + if (fd >= 0) + return fd; /* success! */ + + if (errno == EMFILE || errno == ENFILE) + { + int save_errno = errno; + + ereport(LOG, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("out of file descriptors: %m; release and retry"))); + errno = 0; + if (ReleaseLruFile()) + goto tryAgain; + errno = save_errno; + } + + return -1; /* failure */ +} + +/* + * AcquireExternalFD - attempt to reserve an external file descriptor + * + * This should be used by callers that need to hold a file descriptor open + * over more than a short interval, but cannot use any of the other facilities + * provided by this module. + * + * The difference between this and the underlying ReserveExternalFD function + * is that this will report failure (by setting errno and returning false) + * if "too many" external FDs are already reserved. This should be used in + * any code where the total number of FDs to be reserved is not predictable + * and small. + */ +bool +AcquireExternalFD(void) +{ + /* + * We don't want more than max_safe_fds / 3 FDs to be consumed for + * "external" FDs. + */ + if (numExternalFDs < max_safe_fds / 3) + { + ReserveExternalFD(); + return true; + } + errno = EMFILE; + return false; +} + +/* + * ReserveExternalFD - report external consumption of a file descriptor + * + * This should be used by callers that need to hold a file descriptor open + * over more than a short interval, but cannot use any of the other facilities + * provided by this module. This just tracks the use of the FD and closes + * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available. + * + * Call this directly only in code where failure to reserve the FD would be + * fatal; for example, the WAL-writing code does so, since the alternative is + * session failure. Also, it's very unwise to do so in code that could + * consume more than one FD per process. + * + * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain + * available, it doesn't matter too much whether this is called before or + * after actually opening the FD; but doing so beforehand reduces the risk of + * an EMFILE failure if not everybody played nice. In any case, it's solely + * caller's responsibility to keep the external-FD count in sync with reality. + */ +void +ReserveExternalFD(void) +{ + /* + * Release VFDs if needed to stay safe. Because we do this before + * incrementing numExternalFDs, the final state will be as desired, i.e., + * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds. + */ + ReleaseLruFiles(); + + numExternalFDs++; +} + +/* + * ReleaseExternalFD - report release of an external file descriptor + * + * This is guaranteed not to change errno, so it can be used in failure paths. + */ +void +ReleaseExternalFD(void) +{ + Assert(numExternalFDs > 0); + numExternalFDs--; +} + + +#if defined(FDDEBUG) + +static void +_dump_lru(void) +{ + int mru = VfdCache[0].lruLessRecently; + Vfd *vfdP = &VfdCache[mru]; + char buf[2048]; + + snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru); + while (mru != 0) + { + mru = vfdP->lruLessRecently; + vfdP = &VfdCache[mru]; + snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru); + } + snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST"); + elog(LOG, "%s", buf); +} +#endif /* FDDEBUG */ + +static void +Delete(File file) +{ + Vfd *vfdP; + + Assert(file != 0); + + DO_DB(elog(LOG, "Delete %d (%s)", + file, VfdCache[file].fileName)); + DO_DB(_dump_lru()); + + vfdP = &VfdCache[file]; + + VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently; + VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently; + + DO_DB(_dump_lru()); +} + +static void +LruDelete(File file) +{ + Vfd *vfdP; + + Assert(file != 0); + + DO_DB(elog(LOG, "LruDelete %d (%s)", + file, VfdCache[file].fileName)); + + vfdP = &VfdCache[file]; + + /* + * Close the file. We aren't expecting this to fail; if it does, better + * to leak the FD than to mess up our internal state. + */ + if (close(vfdP->fd) != 0) + elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG), + "could not close file \"%s\": %m", vfdP->fileName); + vfdP->fd = VFD_CLOSED; + --nfile; + + /* delete the vfd record from the LRU ring */ + Delete(file); +} + +static void +Insert(File file) +{ + Vfd *vfdP; + + Assert(file != 0); + + DO_DB(elog(LOG, "Insert %d (%s)", + file, VfdCache[file].fileName)); + DO_DB(_dump_lru()); + + vfdP = &VfdCache[file]; + + vfdP->lruMoreRecently = 0; + vfdP->lruLessRecently = VfdCache[0].lruLessRecently; + VfdCache[0].lruLessRecently = file; + VfdCache[vfdP->lruLessRecently].lruMoreRecently = file; + + DO_DB(_dump_lru()); +} + +/* returns 0 on success, -1 on re-open failure (with errno set) */ +static int +LruInsert(File file) +{ + Vfd *vfdP; + + Assert(file != 0); + + DO_DB(elog(LOG, "LruInsert %d (%s)", + file, VfdCache[file].fileName)); + + vfdP = &VfdCache[file]; + + if (FileIsNotOpen(file)) + { + /* Close excess kernel FDs. */ + ReleaseLruFiles(); + + /* + * The open could still fail for lack of file descriptors, eg due to + * overall system file table being full. So, be prepared to release + * another FD if necessary... + */ + vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags, + vfdP->fileMode); + if (vfdP->fd < 0) + { + DO_DB(elog(LOG, "re-open failed: %m")); + return -1; + } + else + { + ++nfile; + } + } + + /* + * put it at the head of the Lru ring + */ + + Insert(file); + + return 0; +} + +/* + * Release one kernel FD by closing the least-recently-used VFD. + */ +static bool +ReleaseLruFile(void) +{ + DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile)); + + if (nfile > 0) + { + /* + * There are opened files and so there should be at least one used vfd + * in the ring. + */ + Assert(VfdCache[0].lruMoreRecently != 0); + LruDelete(VfdCache[0].lruMoreRecently); + return true; /* freed a file */ + } + return false; /* no files available to free */ +} + +/* + * Release kernel FDs as needed to get under the max_safe_fds limit. + * After calling this, it's OK to try to open another file. + */ +static void +ReleaseLruFiles(void) +{ + while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds) + { + if (!ReleaseLruFile()) + break; + } +} + +static File +AllocateVfd(void) +{ + Index i; + File file; + + DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache)); + + Assert(SizeVfdCache > 0); /* InitFileAccess not called? */ + + if (VfdCache[0].nextFree == 0) + { + /* + * The free list is empty so it is time to increase the size of the + * array. We choose to double it each time this happens. However, + * there's not much point in starting *real* small. + */ + Size newCacheSize = SizeVfdCache * 2; + Vfd *newVfdCache; + + if (newCacheSize < 32) + newCacheSize = 32; + + /* + * Be careful not to clobber VfdCache ptr if realloc fails. + */ + newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize); + if (newVfdCache == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + VfdCache = newVfdCache; + + /* + * Initialize the new entries and link them into the free list. + */ + for (i = SizeVfdCache; i < newCacheSize; i++) + { + MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd)); + VfdCache[i].nextFree = i + 1; + VfdCache[i].fd = VFD_CLOSED; + } + VfdCache[newCacheSize - 1].nextFree = 0; + VfdCache[0].nextFree = SizeVfdCache; + + /* + * Record the new size + */ + SizeVfdCache = newCacheSize; + } + + file = VfdCache[0].nextFree; + + VfdCache[0].nextFree = VfdCache[file].nextFree; + + return file; +} + +static void +FreeVfd(File file) +{ + Vfd *vfdP = &VfdCache[file]; + + DO_DB(elog(LOG, "FreeVfd: %d (%s)", + file, vfdP->fileName ? vfdP->fileName : "")); + + if (vfdP->fileName != NULL) + { + free(vfdP->fileName); + vfdP->fileName = NULL; + } + vfdP->fdstate = 0x0; + + vfdP->nextFree = VfdCache[0].nextFree; + VfdCache[0].nextFree = file; +} + +/* returns 0 on success, -1 on re-open failure (with errno set) */ +static int +FileAccess(File file) +{ + int returnValue; + + DO_DB(elog(LOG, "FileAccess %d (%s)", + file, VfdCache[file].fileName)); + + /* + * Is the file open? If not, open it and put it at the head of the LRU + * ring (possibly closing the least recently used file to get an FD). + */ + + if (FileIsNotOpen(file)) + { + returnValue = LruInsert(file); + if (returnValue != 0) + return returnValue; + } + else if (VfdCache[0].lruLessRecently != file) + { + /* + * We now know that the file is open and that it is not the last one + * accessed, so we need to move it to the head of the Lru ring. + */ + + Delete(file); + Insert(file); + } + + return 0; +} + +/* + * Called whenever a temporary file is deleted to report its size. + */ +static void +ReportTemporaryFileUsage(const char *path, off_t size) +{ + pgstat_report_tempfile(size); + + if (log_temp_files >= 0) + { + if ((size / 1024) >= log_temp_files) + ereport(LOG, + (errmsg("temporary file: path \"%s\", size %lu", + path, (unsigned long) size))); + } +} + +/* + * Called to register a temporary file for automatic close. + * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called + * before the file was opened. + */ +static void +RegisterTemporaryFile(File file) +{ + ResourceOwnerRememberFile(CurrentResourceOwner, file); + VfdCache[file].resowner = CurrentResourceOwner; + + /* Backup mechanism for closing at end of xact. */ + VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT; + have_xact_temporary_files = true; +} + +/* + * Called when we get a shared invalidation message on some relation. + */ +#ifdef NOT_USED +void +FileInvalidate(File file) +{ + Assert(FileIsValid(file)); + if (!FileIsNotOpen(file)) + LruDelete(file); +} +#endif + +/* + * Open a file with PathNameOpenFilePerm() and pass default file mode for the + * fileMode parameter. + */ +File +PathNameOpenFile(const char *fileName, int fileFlags) +{ + return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode); +} + +/* + * open a file in an arbitrary directory + * + * NB: if the passed pathname is relative (which it usually is), + * it will be interpreted relative to the process' working directory + * (which should always be $PGDATA when this code is running). + */ +File +PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode) +{ + char *fnamecopy; + File file; + Vfd *vfdP; + + DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o", + fileName, fileFlags, fileMode)); + + /* + * We need a malloc'd copy of the file name; fail cleanly if no room. + */ + fnamecopy = strdup(fileName); + if (fnamecopy == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + file = AllocateVfd(); + vfdP = &VfdCache[file]; + + /* Close excess kernel FDs. */ + ReleaseLruFiles(); + + vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode); + + if (vfdP->fd < 0) + { + int save_errno = errno; + + FreeVfd(file); + free(fnamecopy); + errno = save_errno; + return -1; + } + ++nfile; + DO_DB(elog(LOG, "PathNameOpenFile: success %d", + vfdP->fd)); + + vfdP->fileName = fnamecopy; + /* Saved flags are adjusted to be OK for re-opening file */ + vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL); + vfdP->fileMode = fileMode; + vfdP->fileSize = 0; + vfdP->fdstate = 0x0; + vfdP->resowner = NULL; + + Insert(file); + + return file; +} + +/* + * Create directory 'directory'. If necessary, create 'basedir', which must + * be the directory above it. This is designed for creating the top-level + * temporary directory on demand before creating a directory underneath it. + * Do nothing if the directory already exists. + * + * Directories created within the top-level temporary directory should begin + * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and + * deleted at startup by RemovePgTempFiles(). Further subdirectories below + * that do not need any particular prefix. +*/ +void +PathNameCreateTemporaryDir(const char *basedir, const char *directory) +{ + if (MakePGDirectory(directory) < 0) + { + if (errno == EEXIST) + return; + + /* + * Failed. Try to create basedir first in case it's missing. Tolerate + * EEXIST to close a race against another process following the same + * algorithm. + */ + if (MakePGDirectory(basedir) < 0 && errno != EEXIST) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("cannot create temporary directory \"%s\": %m", + basedir))); + + /* Try again. */ + if (MakePGDirectory(directory) < 0 && errno != EEXIST) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("cannot create temporary subdirectory \"%s\": %m", + directory))); + } +} + +/* + * Delete a directory and everything in it, if it exists. + */ +void +PathNameDeleteTemporaryDir(const char *dirname) +{ + struct stat statbuf; + + /* Silently ignore missing directory. */ + if (stat(dirname, &statbuf) != 0 && errno == ENOENT) + return; + + /* + * Currently, walkdir doesn't offer a way for our passed in function to + * maintain state. Perhaps it should, so that we could tell the caller + * whether this operation succeeded or failed. Since this operation is + * used in a cleanup path, we wouldn't actually behave differently: we'll + * just log failures. + */ + walkdir(dirname, unlink_if_exists_fname, false, LOG); +} + +/* + * Open a temporary file that will disappear when we close it. + * + * This routine takes care of generating an appropriate tempfile name. + * There's no need to pass in fileFlags or fileMode either, since only + * one setting makes any sense for a temp file. + * + * Unless interXact is true, the file is remembered by CurrentResourceOwner + * to ensure it's closed and deleted when it's no longer needed, typically at + * the end-of-transaction. In most cases, you don't want temporary files to + * outlive the transaction that created them, so this should be false -- but + * if you need "somewhat" temporary storage, this might be useful. In either + * case, the file is removed when the File is explicitly closed. + */ +File +OpenTemporaryFile(bool interXact) +{ + File file = 0; + + /* + * Make sure the current resource owner has space for this File before we + * open it, if we'll be registering it below. + */ + if (!interXact) + ResourceOwnerEnlargeFiles(CurrentResourceOwner); + + /* + * If some temp tablespace(s) have been given to us, try to use the next + * one. If a given tablespace can't be found, we silently fall back to + * the database's default tablespace. + * + * BUT: if the temp file is slated to outlive the current transaction, + * force it into the database's default tablespace, so that it will not + * pose a threat to possible tablespace drop attempts. + */ + if (numTempTableSpaces > 0 && !interXact) + { + Oid tblspcOid = GetNextTempTableSpace(); + + if (OidIsValid(tblspcOid)) + file = OpenTemporaryFileInTablespace(tblspcOid, false); + } + + /* + * If not, or if tablespace is bad, create in database's default + * tablespace. MyDatabaseTableSpace should normally be set before we get + * here, but just in case it isn't, fall back to pg_default tablespace. + */ + if (file <= 0) + file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ? + MyDatabaseTableSpace : + DEFAULTTABLESPACE_OID, + true); + + /* Mark it for deletion at close and temporary file size limit */ + VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT; + + /* Register it with the current resource owner */ + if (!interXact) + RegisterTemporaryFile(file); + + return file; +} + +/* + * Return the path of the temp directory in a given tablespace. + */ +void +TempTablespacePath(char *path, Oid tablespace) +{ + /* + * Identify the tempfile directory for this tablespace. + * + * If someone tries to specify pg_global, use pg_default instead. + */ + if (tablespace == InvalidOid || + tablespace == DEFAULTTABLESPACE_OID || + tablespace == GLOBALTABLESPACE_OID) + snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR); + else + { + /* All other tablespaces are accessed via symlinks */ + snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s", + tablespace, TABLESPACE_VERSION_DIRECTORY, + PG_TEMP_FILES_DIR); + } +} + +/* + * Open a temporary file in a specific tablespace. + * Subroutine for OpenTemporaryFile, which see for details. + */ +static File +OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError) +{ + char tempdirpath[MAXPGPATH]; + char tempfilepath[MAXPGPATH]; + File file; + + TempTablespacePath(tempdirpath, tblspcOid); + + /* + * Generate a tempfile name that should be unique within the current + * database instance. + */ + snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld", + tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++); + + /* + * Open the file. Note: we don't use O_EXCL, in case there is an orphaned + * temp file that can be reused. + */ + file = PathNameOpenFile(tempfilepath, + O_RDWR | O_CREAT | O_TRUNC | PG_BINARY); + if (file <= 0) + { + /* + * We might need to create the tablespace's tempfile directory, if no + * one has yet done so. + * + * Don't check for an error from MakePGDirectory; it could fail if + * someone else just did the same thing. If it doesn't work then + * we'll bomb out on the second create attempt, instead. + */ + (void) MakePGDirectory(tempdirpath); + + file = PathNameOpenFile(tempfilepath, + O_RDWR | O_CREAT | O_TRUNC | PG_BINARY); + if (file <= 0 && rejectError) + elog(ERROR, "could not create temporary file \"%s\": %m", + tempfilepath); + } + + return file; +} + + +/* + * Create a new file. The directory containing it must already exist. Files + * created this way are subject to temp_file_limit and are automatically + * closed at end of transaction, but are not automatically deleted on close + * because they are intended to be shared between cooperating backends. + * + * If the file is inside the top-level temporary directory, its name should + * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary + * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be + * inside a directory created with PathNameCreateTemporaryDir(), in which case + * the prefix isn't needed. + */ +File +PathNameCreateTemporaryFile(const char *path, bool error_on_failure) +{ + File file; + + ResourceOwnerEnlargeFiles(CurrentResourceOwner); + + /* + * Open the file. Note: we don't use O_EXCL, in case there is an orphaned + * temp file that can be reused. + */ + file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY); + if (file <= 0) + { + if (error_on_failure) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create temporary file \"%s\": %m", + path))); + else + return file; + } + + /* Mark it for temp_file_limit accounting. */ + VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT; + + /* Register it for automatic close. */ + RegisterTemporaryFile(file); + + return file; +} + +/* + * Open a file that was created with PathNameCreateTemporaryFile, possibly in + * another backend. Files opened this way don't count against the + * temp_file_limit of the caller, are automatically closed at the end of the + * transaction but are not deleted on close. + */ +File +PathNameOpenTemporaryFile(const char *path, int mode) +{ + File file; + + ResourceOwnerEnlargeFiles(CurrentResourceOwner); + + file = PathNameOpenFile(path, mode | PG_BINARY); + + /* If no such file, then we don't raise an error. */ + if (file <= 0 && errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open temporary file \"%s\": %m", + path))); + + if (file > 0) + { + /* Register it for automatic close. */ + RegisterTemporaryFile(file); + } + + return file; +} + +/* + * Delete a file by pathname. Return true if the file existed, false if + * didn't. + */ +bool +PathNameDeleteTemporaryFile(const char *path, bool error_on_failure) +{ + struct stat filestats; + int stat_errno; + + /* Get the final size for pgstat reporting. */ + if (stat(path, &filestats) != 0) + stat_errno = errno; + else + stat_errno = 0; + + /* + * Unlike FileClose's automatic file deletion code, we tolerate + * non-existence to support BufFileDeleteShared which doesn't know how + * many segments it has to delete until it runs out. + */ + if (stat_errno == ENOENT) + return false; + + if (unlink(path) < 0) + { + if (errno != ENOENT) + ereport(error_on_failure ? ERROR : LOG, + (errcode_for_file_access(), + errmsg("could not unlink temporary file \"%s\": %m", + path))); + return false; + } + + if (stat_errno == 0) + ReportTemporaryFileUsage(path, filestats.st_size); + else + { + errno = stat_errno; + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", path))); + } + + return true; +} + +/* + * close a file when done with it + */ +void +FileClose(File file) +{ + Vfd *vfdP; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileClose: %d (%s)", + file, VfdCache[file].fileName)); + + vfdP = &VfdCache[file]; + + if (!FileIsNotOpen(file)) + { + /* close the file */ + if (close(vfdP->fd) != 0) + { + /* + * We may need to panic on failure to close non-temporary files; + * see LruDelete. + */ + elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG), + "could not close file \"%s\": %m", vfdP->fileName); + } + + --nfile; + vfdP->fd = VFD_CLOSED; + + /* remove the file from the lru ring */ + Delete(file); + } + + if (vfdP->fdstate & FD_TEMP_FILE_LIMIT) + { + /* Subtract its size from current usage (do first in case of error) */ + temporary_files_size -= vfdP->fileSize; + vfdP->fileSize = 0; + } + + /* + * Delete the file if it was temporary, and make a log entry if wanted + */ + if (vfdP->fdstate & FD_DELETE_AT_CLOSE) + { + struct stat filestats; + int stat_errno; + + /* + * If we get an error, as could happen within the ereport/elog calls, + * we'll come right back here during transaction abort. Reset the + * flag to ensure that we can't get into an infinite loop. This code + * is arranged to ensure that the worst-case consequence is failing to + * emit log message(s), not failing to attempt the unlink. + */ + vfdP->fdstate &= ~FD_DELETE_AT_CLOSE; + + + /* first try the stat() */ + if (stat(vfdP->fileName, &filestats)) + stat_errno = errno; + else + stat_errno = 0; + + /* in any case do the unlink */ + if (unlink(vfdP->fileName)) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not delete file \"%s\": %m", vfdP->fileName))); + + /* and last report the stat results */ + if (stat_errno == 0) + ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size); + else + { + errno = stat_errno; + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", vfdP->fileName))); + } + } + + /* Unregister it from the resource owner */ + if (vfdP->resowner) + ResourceOwnerForgetFile(vfdP->resowner, file); + + /* + * Return the Vfd slot to the free list + */ + FreeVfd(file); +} + +/* + * FilePrefetch - initiate asynchronous read of a given range of the file. + * + * Currently the only implementation of this function is using posix_fadvise + * which is the simplest standardized interface that accomplishes this. + * We could add an implementation using libaio in the future; but note that + * this API is inappropriate for libaio, which wants to have a buffer provided + * to read into. + */ +int +FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info) +{ +#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED) + int returnCode; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d", + file, VfdCache[file].fileName, + (int64) offset, amount)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + + pgstat_report_wait_start(wait_event_info); + returnCode = posix_fadvise(VfdCache[file].fd, offset, amount, + POSIX_FADV_WILLNEED); + pgstat_report_wait_end(); + + return returnCode; +#else + Assert(FileIsValid(file)); + return 0; +#endif +} + +void +FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info) +{ + int returnCode; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT, + file, VfdCache[file].fileName, + (int64) offset, (int64) nbytes)); + + if (nbytes <= 0) + return; + + returnCode = FileAccess(file); + if (returnCode < 0) + return; + + pgstat_report_wait_start(wait_event_info); + pg_flush_data(VfdCache[file].fd, offset, nbytes); + pgstat_report_wait_end(); +} + +int +FileRead(File file, char *buffer, int amount, off_t offset, + uint32 wait_event_info) +{ + int returnCode; + Vfd *vfdP; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p", + file, VfdCache[file].fileName, + (int64) offset, + amount, buffer)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + + vfdP = &VfdCache[file]; + +retry: + pgstat_report_wait_start(wait_event_info); + returnCode = pg_pread(vfdP->fd, buffer, amount, offset); + pgstat_report_wait_end(); + + if (returnCode < 0) + { + /* + * Windows may run out of kernel buffers and return "Insufficient + * system resources" error. Wait a bit and retry to solve it. + * + * It is rumored that EINTR is also possible on some Unix filesystems, + * in which case immediate retry is indicated. + */ +#ifdef WIN32 + DWORD error = GetLastError(); + + switch (error) + { + case ERROR_NO_SYSTEM_RESOURCES: + pg_usleep(1000L); + errno = EINTR; + break; + default: + _dosmaperr(error); + break; + } +#endif + /* OK to retry if interrupted */ + if (errno == EINTR) + goto retry; + } + + return returnCode; +} + +int +FileWrite(File file, char *buffer, int amount, off_t offset, + uint32 wait_event_info) +{ + int returnCode; + Vfd *vfdP; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p", + file, VfdCache[file].fileName, + (int64) offset, + amount, buffer)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + + vfdP = &VfdCache[file]; + + /* + * If enforcing temp_file_limit and it's a temp file, check to see if the + * write would overrun temp_file_limit, and throw error if so. Note: it's + * really a modularity violation to throw error here; we should set errno + * and return -1. However, there's no way to report a suitable error + * message if we do that. All current callers would just throw error + * immediately anyway, so this is safe at present. + */ + if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT)) + { + off_t past_write = offset + amount; + + if (past_write > vfdP->fileSize) + { + uint64 newTotal = temporary_files_size; + + newTotal += past_write - vfdP->fileSize; + if (newTotal > (uint64) temp_file_limit * (uint64) 1024) + ereport(ERROR, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("temporary file size exceeds temp_file_limit (%dkB)", + temp_file_limit))); + } + } + +retry: + errno = 0; + pgstat_report_wait_start(wait_event_info); + returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset); + pgstat_report_wait_end(); + + /* if write didn't set errno, assume problem is no disk space */ + if (returnCode != amount && errno == 0) + errno = ENOSPC; + + if (returnCode >= 0) + { + /* + * Maintain fileSize and temporary_files_size if it's a temp file. + */ + if (vfdP->fdstate & FD_TEMP_FILE_LIMIT) + { + off_t past_write = offset + amount; + + if (past_write > vfdP->fileSize) + { + temporary_files_size += past_write - vfdP->fileSize; + vfdP->fileSize = past_write; + } + } + } + else + { + /* + * See comments in FileRead() + */ +#ifdef WIN32 + DWORD error = GetLastError(); + + switch (error) + { + case ERROR_NO_SYSTEM_RESOURCES: + pg_usleep(1000L); + errno = EINTR; + break; + default: + _dosmaperr(error); + break; + } +#endif + /* OK to retry if interrupted */ + if (errno == EINTR) + goto retry; + } + + return returnCode; +} + +int +FileSync(File file, uint32 wait_event_info) +{ + int returnCode; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileSync: %d (%s)", + file, VfdCache[file].fileName)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + + pgstat_report_wait_start(wait_event_info); + returnCode = pg_fsync(VfdCache[file].fd); + pgstat_report_wait_end(); + + return returnCode; +} + +off_t +FileSize(File file) +{ + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileSize %d (%s)", + file, VfdCache[file].fileName)); + + if (FileIsNotOpen(file)) + { + if (FileAccess(file) < 0) + return (off_t) -1; + } + + return lseek(VfdCache[file].fd, 0, SEEK_END); +} + +int +FileTruncate(File file, off_t offset, uint32 wait_event_info) +{ + int returnCode; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileTruncate %d (%s)", + file, VfdCache[file].fileName)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + + pgstat_report_wait_start(wait_event_info); + returnCode = ftruncate(VfdCache[file].fd, offset); + pgstat_report_wait_end(); + + if (returnCode == 0 && VfdCache[file].fileSize > offset) + { + /* adjust our state for truncation of a temp file */ + Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT); + temporary_files_size -= VfdCache[file].fileSize - offset; + VfdCache[file].fileSize = offset; + } + + return returnCode; +} + +/* + * Return the pathname associated with an open file. + * + * The returned string points to an internal buffer, which is valid until + * the file is closed. + */ +char * +FilePathName(File file) +{ + Assert(FileIsValid(file)); + + return VfdCache[file].fileName; +} + +/* + * Return the raw file descriptor of an opened file. + * + * The returned file descriptor will be valid until the file is closed, but + * there are a lot of things that can make that happen. So the caller should + * be careful not to do much of anything else before it finishes using the + * returned file descriptor. + */ +int +FileGetRawDesc(File file) +{ + Assert(FileIsValid(file)); + return VfdCache[file].fd; +} + +/* + * FileGetRawFlags - returns the file flags on open(2) + */ +int +FileGetRawFlags(File file) +{ + Assert(FileIsValid(file)); + return VfdCache[file].fileFlags; +} + +/* + * FileGetRawMode - returns the mode bitmask passed to open(2) + */ +mode_t +FileGetRawMode(File file) +{ + Assert(FileIsValid(file)); + return VfdCache[file].fileMode; +} + +/* + * Make room for another allocatedDescs[] array entry if needed and possible. + * Returns true if an array element is available. + */ +static bool +reserveAllocatedDesc(void) +{ + AllocateDesc *newDescs; + int newMax; + + /* Quick out if array already has a free slot. */ + if (numAllocatedDescs < maxAllocatedDescs) + return true; + + /* + * If the array hasn't yet been created in the current process, initialize + * it with FD_MINFREE / 3 elements. In many scenarios this is as many as + * we will ever need, anyway. We don't want to look at max_safe_fds + * immediately because set_max_safe_fds() may not have run yet. + */ + if (allocatedDescs == NULL) + { + newMax = FD_MINFREE / 3; + newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc)); + /* Out of memory already? Treat as fatal error. */ + if (newDescs == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + allocatedDescs = newDescs; + maxAllocatedDescs = newMax; + return true; + } + + /* + * Consider enlarging the array beyond the initial allocation used above. + * By the time this happens, max_safe_fds should be known accurately. + * + * We mustn't let allocated descriptors hog all the available FDs, and in + * practice we'd better leave a reasonable number of FDs for VFD use. So + * set the maximum to max_safe_fds / 3. (This should certainly be at + * least as large as the initial size, FD_MINFREE / 3, so we aren't + * tightening the restriction here.) Recall that "external" FDs are + * allowed to consume another third of max_safe_fds. + */ + newMax = max_safe_fds / 3; + if (newMax > maxAllocatedDescs) + { + newDescs = (AllocateDesc *) realloc(allocatedDescs, + newMax * sizeof(AllocateDesc)); + /* Treat out-of-memory as a non-fatal error. */ + if (newDescs == NULL) + return false; + allocatedDescs = newDescs; + maxAllocatedDescs = newMax; + return true; + } + + /* Can't enlarge allocatedDescs[] any more. */ + return false; +} + +/* + * Routines that want to use stdio (ie, FILE*) should use AllocateFile + * rather than plain fopen(). This lets fd.c deal with freeing FDs if + * necessary to open the file. When done, call FreeFile rather than fclose. + * + * Note that files that will be open for any significant length of time + * should NOT be handled this way, since they cannot share kernel file + * descriptors with other files; there is grave risk of running out of FDs + * if anyone locks down too many FDs. Most callers of this routine are + * simply reading a config file that they will read and close immediately. + * + * fd.c will automatically close all files opened with AllocateFile at + * transaction commit or abort; this prevents FD leakage if a routine + * that calls AllocateFile is terminated prematurely by ereport(ERROR). + * + * Ideally this should be the *only* direct call of fopen() in the backend. + */ +FILE * +AllocateFile(const char *name, const char *mode) +{ + FILE *file; + + DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)", + numAllocatedDescs, name)); + + /* Can we allocate another non-virtual FD? */ + if (!reserveAllocatedDesc()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"", + maxAllocatedDescs, name))); + + /* Close excess kernel FDs. */ + ReleaseLruFiles(); + +TryAgain: + if ((file = fopen(name, mode)) != NULL) + { + AllocateDesc *desc = &allocatedDescs[numAllocatedDescs]; + + desc->kind = AllocateDescFile; + desc->desc.file = file; + desc->create_subid = GetCurrentSubTransactionId(); + numAllocatedDescs++; + return desc->desc.file; + } + + if (errno == EMFILE || errno == ENFILE) + { + int save_errno = errno; + + ereport(LOG, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("out of file descriptors: %m; release and retry"))); + errno = 0; + if (ReleaseLruFile()) + goto TryAgain; + errno = save_errno; + } + + return NULL; +} + +/* + * Open a file with OpenTransientFilePerm() and pass default file mode for + * the fileMode parameter. + */ +int +OpenTransientFile(const char *fileName, int fileFlags) +{ + return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode); +} + +/* + * Like AllocateFile, but returns an unbuffered fd like open(2) + */ +int +OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode) +{ + int fd; + + DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)", + numAllocatedDescs, fileName)); + + /* Can we allocate another non-virtual FD? */ + if (!reserveAllocatedDesc()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"", + maxAllocatedDescs, fileName))); + + /* Close excess kernel FDs. */ + ReleaseLruFiles(); + + fd = BasicOpenFilePerm(fileName, fileFlags, fileMode); + + if (fd >= 0) + { + AllocateDesc *desc = &allocatedDescs[numAllocatedDescs]; + + desc->kind = AllocateDescRawFD; + desc->desc.fd = fd; + desc->create_subid = GetCurrentSubTransactionId(); + numAllocatedDescs++; + + return fd; + } + + return -1; /* failure */ +} + +/* + * Routines that want to initiate a pipe stream should use OpenPipeStream + * rather than plain popen(). This lets fd.c deal with freeing FDs if + * necessary. When done, call ClosePipeStream rather than pclose. + * + * This function also ensures that the popen'd program is run with default + * SIGPIPE processing, rather than the SIG_IGN setting the backend normally + * uses. This ensures desirable response to, eg, closing a read pipe early. + */ +FILE * +OpenPipeStream(const char *command, const char *mode) +{ + FILE *file; + int save_errno; + + DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)", + numAllocatedDescs, command)); + + /* Can we allocate another non-virtual FD? */ + if (!reserveAllocatedDesc()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"", + maxAllocatedDescs, command))); + + /* Close excess kernel FDs. */ + ReleaseLruFiles(); + +TryAgain: + fflush(stdout); + fflush(stderr); + pqsignal(SIGPIPE, SIG_DFL); + errno = 0; + file = popen(command, mode); + save_errno = errno; + pqsignal(SIGPIPE, SIG_IGN); + errno = save_errno; + if (file != NULL) + { + AllocateDesc *desc = &allocatedDescs[numAllocatedDescs]; + + desc->kind = AllocateDescPipe; + desc->desc.file = file; + desc->create_subid = GetCurrentSubTransactionId(); + numAllocatedDescs++; + return desc->desc.file; + } + + if (errno == EMFILE || errno == ENFILE) + { + ereport(LOG, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("out of file descriptors: %m; release and retry"))); + if (ReleaseLruFile()) + goto TryAgain; + errno = save_errno; + } + + return NULL; +} + +/* + * Free an AllocateDesc of any type. + * + * The argument *must* point into the allocatedDescs[] array. + */ +static int +FreeDesc(AllocateDesc *desc) +{ + int result; + + /* Close the underlying object */ + switch (desc->kind) + { + case AllocateDescFile: + result = fclose(desc->desc.file); + break; + case AllocateDescPipe: + result = pclose(desc->desc.file); + break; + case AllocateDescDir: + result = closedir(desc->desc.dir); + break; + case AllocateDescRawFD: + result = close(desc->desc.fd); + break; + default: + elog(ERROR, "AllocateDesc kind not recognized"); + result = 0; /* keep compiler quiet */ + break; + } + + /* Compact storage in the allocatedDescs array */ + numAllocatedDescs--; + *desc = allocatedDescs[numAllocatedDescs]; + + return result; +} + +/* + * Close a file returned by AllocateFile. + * + * Note we do not check fclose's return value --- it is up to the caller + * to handle close errors. + */ +int +FreeFile(FILE *file) +{ + int i; + + DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs)); + + /* Remove file from list of allocated files, if it's present */ + for (i = numAllocatedDescs; --i >= 0;) + { + AllocateDesc *desc = &allocatedDescs[i]; + + if (desc->kind == AllocateDescFile && desc->desc.file == file) + return FreeDesc(desc); + } + + /* Only get here if someone passes us a file not in allocatedDescs */ + elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile"); + + return fclose(file); +} + +/* + * Close a file returned by OpenTransientFile. + * + * Note we do not check close's return value --- it is up to the caller + * to handle close errors. + */ +int +CloseTransientFile(int fd) +{ + int i; + + DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs)); + + /* Remove fd from list of allocated files, if it's present */ + for (i = numAllocatedDescs; --i >= 0;) + { + AllocateDesc *desc = &allocatedDescs[i]; + + if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd) + return FreeDesc(desc); + } + + /* Only get here if someone passes us a file not in allocatedDescs */ + elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile"); + + return close(fd); +} + +/* + * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir + * rather than plain opendir(). This lets fd.c deal with freeing FDs if + * necessary to open the directory, and with closing it after an elog. + * When done, call FreeDir rather than closedir. + * + * Returns NULL, with errno set, on failure. Note that failure detection + * is commonly left to the following call of ReadDir or ReadDirExtended; + * see the comments for ReadDir. + * + * Ideally this should be the *only* direct call of opendir() in the backend. + */ +DIR * +AllocateDir(const char *dirname) +{ + DIR *dir; + + DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)", + numAllocatedDescs, dirname)); + + /* Can we allocate another non-virtual FD? */ + if (!reserveAllocatedDesc()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"", + maxAllocatedDescs, dirname))); + + /* Close excess kernel FDs. */ + ReleaseLruFiles(); + +TryAgain: + if ((dir = opendir(dirname)) != NULL) + { + AllocateDesc *desc = &allocatedDescs[numAllocatedDescs]; + + desc->kind = AllocateDescDir; + desc->desc.dir = dir; + desc->create_subid = GetCurrentSubTransactionId(); + numAllocatedDescs++; + return desc->desc.dir; + } + + if (errno == EMFILE || errno == ENFILE) + { + int save_errno = errno; + + ereport(LOG, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("out of file descriptors: %m; release and retry"))); + errno = 0; + if (ReleaseLruFile()) + goto TryAgain; + errno = save_errno; + } + + return NULL; +} + +/* + * Read a directory opened with AllocateDir, ereport'ing any error. + * + * This is easier to use than raw readdir() since it takes care of some + * otherwise rather tedious and error-prone manipulation of errno. Also, + * if you are happy with a generic error message for AllocateDir failure, + * you can just do + * + * dir = AllocateDir(path); + * while ((dirent = ReadDir(dir, path)) != NULL) + * process dirent; + * FreeDir(dir); + * + * since a NULL dir parameter is taken as indicating AllocateDir failed. + * (Make sure errno isn't changed between AllocateDir and ReadDir if you + * use this shortcut.) + * + * The pathname passed to AllocateDir must be passed to this routine too, + * but it is only used for error reporting. + */ +struct dirent * +ReadDir(DIR *dir, const char *dirname) +{ + return ReadDirExtended(dir, dirname, ERROR); +} + +/* + * Alternate version of ReadDir that allows caller to specify the elevel + * for any error report (whether it's reporting an initial failure of + * AllocateDir or a subsequent directory read failure). + * + * If elevel < ERROR, returns NULL after any error. With the normal coding + * pattern, this will result in falling out of the loop immediately as + * though the directory contained no (more) entries. + */ +struct dirent * +ReadDirExtended(DIR *dir, const char *dirname, int elevel) +{ + struct dirent *dent; + + /* Give a generic message for AllocateDir failure, if caller didn't */ + if (dir == NULL) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open directory \"%s\": %m", + dirname))); + return NULL; + } + + errno = 0; + if ((dent = readdir(dir)) != NULL) + return dent; + + if (errno) + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not read directory \"%s\": %m", + dirname))); + return NULL; +} + +/* + * Close a directory opened with AllocateDir. + * + * Returns closedir's return value (with errno set if it's not 0). + * Note we do not check the return value --- it is up to the caller + * to handle close errors if wanted. + * + * Does nothing if dir == NULL; we assume that directory open failure was + * already reported if desired. + */ +int +FreeDir(DIR *dir) +{ + int i; + + /* Nothing to do if AllocateDir failed */ + if (dir == NULL) + return 0; + + DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs)); + + /* Remove dir from list of allocated dirs, if it's present */ + for (i = numAllocatedDescs; --i >= 0;) + { + AllocateDesc *desc = &allocatedDescs[i]; + + if (desc->kind == AllocateDescDir && desc->desc.dir == dir) + return FreeDesc(desc); + } + + /* Only get here if someone passes us a dir not in allocatedDescs */ + elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir"); + + return closedir(dir); +} + + +/* + * Close a pipe stream returned by OpenPipeStream. + */ +int +ClosePipeStream(FILE *file) +{ + int i; + + DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs)); + + /* Remove file from list of allocated files, if it's present */ + for (i = numAllocatedDescs; --i >= 0;) + { + AllocateDesc *desc = &allocatedDescs[i]; + + if (desc->kind == AllocateDescPipe && desc->desc.file == file) + return FreeDesc(desc); + } + + /* Only get here if someone passes us a file not in allocatedDescs */ + elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream"); + + return pclose(file); +} + +/* + * closeAllVfds + * + * Force all VFDs into the physically-closed state, so that the fewest + * possible number of kernel file descriptors are in use. There is no + * change in the logical state of the VFDs. + */ +void +closeAllVfds(void) +{ + Index i; + + if (SizeVfdCache > 0) + { + Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */ + for (i = 1; i < SizeVfdCache; i++) + { + if (!FileIsNotOpen(i)) + LruDelete(i); + } + } +} + + +/* + * SetTempTablespaces + * + * Define a list (actually an array) of OIDs of tablespaces to use for + * temporary files. This list will be used until end of transaction, + * unless this function is called again before then. It is caller's + * responsibility that the passed-in array has adequate lifespan (typically + * it'd be allocated in TopTransactionContext). + * + * Some entries of the array may be InvalidOid, indicating that the current + * database's default tablespace should be used. + */ +void +SetTempTablespaces(Oid *tableSpaces, int numSpaces) +{ + Assert(numSpaces >= 0); + tempTableSpaces = tableSpaces; + numTempTableSpaces = numSpaces; + + /* + * Select a random starting point in the list. This is to minimize + * conflicts between backends that are most likely sharing the same list + * of temp tablespaces. Note that if we create multiple temp files in the + * same transaction, we'll advance circularly through the list --- this + * ensures that large temporary sort files are nicely spread across all + * available tablespaces. + */ + if (numSpaces > 1) + nextTempTableSpace = random() % numSpaces; + else + nextTempTableSpace = 0; +} + +/* + * TempTablespacesAreSet + * + * Returns true if SetTempTablespaces has been called in current transaction. + * (This is just so that tablespaces.c doesn't need its own per-transaction + * state.) + */ +bool +TempTablespacesAreSet(void) +{ + return (numTempTableSpaces >= 0); +} + +/* + * GetTempTablespaces + * + * Populate an array with the OIDs of the tablespaces that should be used for + * temporary files. (Some entries may be InvalidOid, indicating that the + * current database's default tablespace should be used.) At most numSpaces + * entries will be filled. + * Returns the number of OIDs that were copied into the output array. + */ +int +GetTempTablespaces(Oid *tableSpaces, int numSpaces) +{ + int i; + + Assert(TempTablespacesAreSet()); + for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i) + tableSpaces[i] = tempTableSpaces[i]; + + return i; +} + +/* + * GetNextTempTableSpace + * + * Select the next temp tablespace to use. A result of InvalidOid means + * to use the current database's default tablespace. + */ +Oid +GetNextTempTableSpace(void) +{ + if (numTempTableSpaces > 0) + { + /* Advance nextTempTableSpace counter with wraparound */ + if (++nextTempTableSpace >= numTempTableSpaces) + nextTempTableSpace = 0; + return tempTableSpaces[nextTempTableSpace]; + } + return InvalidOid; +} + + +/* + * AtEOSubXact_Files + * + * Take care of subtransaction commit/abort. At abort, we close temp files + * that the subtransaction may have opened. At commit, we reassign the + * files that were opened to the parent subtransaction. + */ +void +AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, + SubTransactionId parentSubid) +{ + Index i; + + for (i = 0; i < numAllocatedDescs; i++) + { + if (allocatedDescs[i].create_subid == mySubid) + { + if (isCommit) + allocatedDescs[i].create_subid = parentSubid; + else + { + /* have to recheck the item after FreeDesc (ugly) */ + FreeDesc(&allocatedDescs[i--]); + } + } + } +} + +/* + * AtEOXact_Files + * + * This routine is called during transaction commit or abort. All still-open + * per-transaction temporary file VFDs are closed, which also causes the + * underlying files to be deleted (although they should've been closed already + * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are + * closed. We also forget any transaction-local temp tablespace list. + * + * The isCommit flag is used only to decide whether to emit warnings about + * unclosed files. + */ +void +AtEOXact_Files(bool isCommit) +{ + CleanupTempFiles(isCommit, false); + tempTableSpaces = NULL; + numTempTableSpaces = -1; +} + +/* + * AtProcExit_Files + * + * on_proc_exit hook to clean up temp files during backend shutdown. + * Here, we want to clean up *all* temp files including interXact ones. + */ +static void +AtProcExit_Files(int code, Datum arg) +{ + CleanupTempFiles(false, true); +} + +/* + * Close temporary files and delete their underlying files. + * + * isCommit: if true, this is normal transaction commit, and we don't + * expect any remaining files; warn if there are some. + * + * isProcExit: if true, this is being called as the backend process is + * exiting. If that's the case, we should remove all temporary files; if + * that's not the case, we are being called for transaction commit/abort + * and should only remove transaction-local temp files. In either case, + * also clean up "allocated" stdio files, dirs and fds. + */ +static void +CleanupTempFiles(bool isCommit, bool isProcExit) +{ + Index i; + + /* + * Careful here: at proc_exit we need extra cleanup, not just + * xact_temporary files. + */ + if (isProcExit || have_xact_temporary_files) + { + Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */ + for (i = 1; i < SizeVfdCache; i++) + { + unsigned short fdstate = VfdCache[i].fdstate; + + if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) && + VfdCache[i].fileName != NULL) + { + /* + * If we're in the process of exiting a backend process, close + * all temporary files. Otherwise, only close temporary files + * local to the current transaction. They should be closed by + * the ResourceOwner mechanism already, so this is just a + * debugging cross-check. + */ + if (isProcExit) + FileClose(i); + else if (fdstate & FD_CLOSE_AT_EOXACT) + { + elog(WARNING, + "temporary file %s not closed at end-of-transaction", + VfdCache[i].fileName); + FileClose(i); + } + } + } + + have_xact_temporary_files = false; + } + + /* Complain if any allocated files remain open at commit. */ + if (isCommit && numAllocatedDescs > 0) + elog(WARNING, "%d temporary files and directories not closed at end-of-transaction", + numAllocatedDescs); + + /* Clean up "allocated" stdio files, dirs and fds. */ + while (numAllocatedDescs > 0) + FreeDesc(&allocatedDescs[0]); +} + + +/* + * Remove temporary and temporary relation files left over from a prior + * postmaster session + * + * This should be called during postmaster startup. It will forcibly + * remove any leftover files created by OpenTemporaryFile and any leftover + * temporary relation files created by mdcreate. + * + * During post-backend-crash restart cycle, this routine is called when + * remove_temp_files_after_crash GUC is enabled. Multiple crashes while + * queries are using temp files could result in useless storage usage that can + * only be reclaimed by a service restart. The argument against enabling it is + * that someone might want to examine the temporary files for debugging + * purposes. This does however mean that OpenTemporaryFile had better allow for + * collision with an existing temp file name. + * + * NOTE: this function and its subroutines generally report syscall failures + * with ereport(LOG) and keep going. Removing temp files is not so critical + * that we should fail to start the database when we can't do it. + */ +void +RemovePgTempFiles(void) +{ + char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)]; + DIR *spc_dir; + struct dirent *spc_de; + + /* + * First process temp files in pg_default ($PGDATA/base) + */ + snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR); + RemovePgTempFilesInDir(temp_path, true, false); + RemovePgTempRelationFiles("base"); + + /* + * Cycle through temp directories for all non-default tablespaces. + */ + spc_dir = AllocateDir("pg_tblspc"); + + while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL) + { + if (strcmp(spc_de->d_name, ".") == 0 || + strcmp(spc_de->d_name, "..") == 0) + continue; + + snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s", + spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR); + RemovePgTempFilesInDir(temp_path, true, false); + + snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s", + spc_de->d_name, TABLESPACE_VERSION_DIRECTORY); + RemovePgTempRelationFiles(temp_path); + } + + FreeDir(spc_dir); + + /* + * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of + * DataDir as well. However, that is *not* cleaned here because doing so + * would create a race condition. It's done separately, earlier in + * postmaster startup. + */ +} + +/* + * Process one pgsql_tmp directory for RemovePgTempFiles. + * + * If missing_ok is true, it's all right for the named directory to not exist. + * Any other problem results in a LOG message. (missing_ok should be true at + * the top level, since pgsql_tmp directories are not created until needed.) + * + * At the top level, this should be called with unlink_all = false, so that + * only files matching the temporary name prefix will be unlinked. When + * recursing it will be called with unlink_all = true to unlink everything + * under a top-level temporary directory. + * + * (These two flags could be replaced by one, but it seems clearer to keep + * them separate.) + */ +void +RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all) +{ + DIR *temp_dir; + struct dirent *temp_de; + char rm_path[MAXPGPATH * 2]; + + temp_dir = AllocateDir(tmpdirname); + + if (temp_dir == NULL && errno == ENOENT && missing_ok) + return; + + while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL) + { + if (strcmp(temp_de->d_name, ".") == 0 || + strcmp(temp_de->d_name, "..") == 0) + continue; + + snprintf(rm_path, sizeof(rm_path), "%s/%s", + tmpdirname, temp_de->d_name); + + if (unlink_all || + strncmp(temp_de->d_name, + PG_TEMP_FILE_PREFIX, + strlen(PG_TEMP_FILE_PREFIX)) == 0) + { + struct stat statbuf; + + if (lstat(rm_path, &statbuf) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", rm_path))); + continue; + } + + if (S_ISDIR(statbuf.st_mode)) + { + /* recursively remove contents, then directory itself */ + RemovePgTempFilesInDir(rm_path, false, true); + + if (rmdir(rm_path) < 0) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not remove directory \"%s\": %m", + rm_path))); + } + else + { + if (unlink(rm_path) < 0) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + rm_path))); + } + } + else + ereport(LOG, + (errmsg("unexpected file found in temporary-files directory: \"%s\"", + rm_path))); + } + + FreeDir(temp_dir); +} + +/* Process one tablespace directory, look for per-DB subdirectories */ +static void +RemovePgTempRelationFiles(const char *tsdirname) +{ + DIR *ts_dir; + struct dirent *de; + char dbspace_path[MAXPGPATH * 2]; + + ts_dir = AllocateDir(tsdirname); + + while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL) + { + /* + * We're only interested in the per-database directories, which have + * numeric names. Note that this code will also (properly) ignore "." + * and "..". + */ + if (strspn(de->d_name, "0123456789") != strlen(de->d_name)) + continue; + + snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s", + tsdirname, de->d_name); + RemovePgTempRelationFilesInDbspace(dbspace_path); + } + + FreeDir(ts_dir); +} + +/* Process one per-dbspace directory for RemovePgTempRelationFiles */ +static void +RemovePgTempRelationFilesInDbspace(const char *dbspacedirname) +{ + DIR *dbspace_dir; + struct dirent *de; + char rm_path[MAXPGPATH * 2]; + + dbspace_dir = AllocateDir(dbspacedirname); + + while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL) + { + if (!looks_like_temp_rel_name(de->d_name)) + continue; + + snprintf(rm_path, sizeof(rm_path), "%s/%s", + dbspacedirname, de->d_name); + + if (unlink(rm_path) < 0) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + rm_path))); + } + + FreeDir(dbspace_dir); +} + +/* t<digits>_<digits>, or t<digits>_<digits>_<forkname> */ +bool +looks_like_temp_rel_name(const char *name) +{ + int pos; + int savepos; + + /* Must start with "t". */ + if (name[0] != 't') + return false; + + /* Followed by a non-empty string of digits and then an underscore. */ + for (pos = 1; isdigit((unsigned char) name[pos]); ++pos) + ; + if (pos == 1 || name[pos] != '_') + return false; + + /* Followed by another nonempty string of digits. */ + for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos) + ; + if (savepos == pos) + return false; + + /* We might have _forkname or .segment or both. */ + if (name[pos] == '_') + { + int forkchar = forkname_chars(&name[pos + 1], NULL); + + if (forkchar <= 0) + return false; + pos += forkchar + 1; + } + if (name[pos] == '.') + { + int segchar; + + for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar) + ; + if (segchar <= 1) + return false; + pos += segchar; + } + + /* Now we should be at the end. */ + if (name[pos] != '\0') + return false; + return true; +} + +#ifdef HAVE_SYNCFS +static void +do_syncfs(const char *path) +{ + int fd; + + fd = OpenTransientFile(path, O_RDONLY); + if (fd < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + return; + } + if (syncfs(fd) < 0) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not synchronize file system for file \"%s\": %m", path))); + CloseTransientFile(fd); +} +#endif + +/* + * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for + * all potential filesystem, depending on recovery_init_sync_method setting. + * + * We fsync regular files and directories wherever they are, but we + * follow symlinks only for pg_wal and immediately under pg_tblspc. + * Other symlinks are presumed to point at files we're not responsible + * for fsyncing, and might not have privileges to write at all. + * + * Errors are logged but not considered fatal; that's because this is used + * only during database startup, to deal with the possibility that there are + * issued-but-unsynced writes pending against the data directory. We want to + * ensure that such writes reach disk before anything that's done in the new + * run. However, aborting on error would result in failure to start for + * harmless cases such as read-only files in the data directory, and that's + * not good either. + * + * Note that if we previously crashed due to a PANIC on fsync(), we'll be + * rewriting all changes again during recovery. + * + * Note we assume we're chdir'd into PGDATA to begin with. + */ +void +SyncDataDirectory(void) +{ + bool xlog_is_symlink; + + /* We can skip this whole thing if fsync is disabled. */ + if (!enableFsync) + return; + + /* + * If pg_wal is a symlink, we'll need to recurse into it separately, + * because the first walkdir below will ignore it. + */ + xlog_is_symlink = false; + +#ifndef WIN32 + { + struct stat st; + + if (lstat("pg_wal", &st) < 0) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + "pg_wal"))); + else if (S_ISLNK(st.st_mode)) + xlog_is_symlink = true; + } +#else + if (pgwin32_is_junction("pg_wal")) + xlog_is_symlink = true; +#endif + +#ifdef HAVE_SYNCFS + if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_SYNCFS) + { + DIR *dir; + struct dirent *de; + + /* + * On Linux, we don't have to open every single file one by one. We + * can use syncfs() to sync whole filesystems. We only expect + * filesystem boundaries to exist where we tolerate symlinks, namely + * pg_wal and the tablespaces, so we call syncfs() for each of those + * directories. + */ + + /* Sync the top level pgdata directory. */ + do_syncfs("."); + /* If any tablespaces are configured, sync each of those. */ + dir = AllocateDir("pg_tblspc"); + while ((de = ReadDirExtended(dir, "pg_tblspc", LOG))) + { + char path[MAXPGPATH]; + + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + + snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name); + do_syncfs(path); + } + FreeDir(dir); + /* If pg_wal is a symlink, process that too. */ + if (xlog_is_symlink) + do_syncfs("pg_wal"); + return; + } +#endif /* !HAVE_SYNCFS */ + + /* + * If possible, hint to the kernel that we're soon going to fsync the data + * directory and its contents. Errors in this step are even less + * interesting than normal, so log them only at DEBUG1. + */ +#ifdef PG_FLUSH_DATA_WORKS + walkdir(".", pre_sync_fname, false, DEBUG1); + if (xlog_is_symlink) + walkdir("pg_wal", pre_sync_fname, false, DEBUG1); + walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1); +#endif + + /* + * Now we do the fsync()s in the same order. + * + * The main call ignores symlinks, so in addition to specially processing + * pg_wal if it's a symlink, pg_tblspc has to be visited separately with + * process_symlinks = true. Note that if there are any plain directories + * in pg_tblspc, they'll get fsync'd twice. That's not an expected case + * so we don't worry about optimizing it. + */ + walkdir(".", datadir_fsync_fname, false, LOG); + if (xlog_is_symlink) + walkdir("pg_wal", datadir_fsync_fname, false, LOG); + walkdir("pg_tblspc", datadir_fsync_fname, true, LOG); +} + +/* + * walkdir: recursively walk a directory, applying the action to each + * regular file and directory (including the named directory itself). + * + * If process_symlinks is true, the action and recursion are also applied + * to regular files and directories that are pointed to by symlinks in the + * given directory; otherwise symlinks are ignored. Symlinks are always + * ignored in subdirectories, ie we intentionally don't pass down the + * process_symlinks flag to recursive calls. + * + * Errors are reported at level elevel, which might be ERROR or less. + * + * See also walkdir in file_utils.c, which is a frontend version of this + * logic. + */ +static void +walkdir(const char *path, + void (*action) (const char *fname, bool isdir, int elevel), + bool process_symlinks, + int elevel) +{ + DIR *dir; + struct dirent *de; + + dir = AllocateDir(path); + + while ((de = ReadDirExtended(dir, path, elevel)) != NULL) + { + char subpath[MAXPGPATH * 2]; + + CHECK_FOR_INTERRUPTS(); + + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; + + snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name); + + switch (get_dirent_type(subpath, de, process_symlinks, elevel)) + { + case PGFILETYPE_REG: + (*action) (subpath, false, elevel); + break; + case PGFILETYPE_DIR: + walkdir(subpath, action, false, elevel); + break; + default: + + /* + * Errors are already reported directly by get_dirent_type(), + * and any remaining symlinks and unknown file types are + * ignored. + */ + break; + } + } + + FreeDir(dir); /* we ignore any error here */ + + /* + * It's important to fsync the destination directory itself as individual + * file fsyncs don't guarantee that the directory entry for the file is + * synced. However, skip this if AllocateDir failed; the action function + * might not be robust against that. + */ + if (dir) + (*action) (path, true, elevel); +} + + +/* + * Hint to the OS that it should get ready to fsync() this file. + * + * Ignores errors trying to open unreadable files, and logs other errors at a + * caller-specified level. + */ +#ifdef PG_FLUSH_DATA_WORKS + +static void +pre_sync_fname(const char *fname, bool isdir, int elevel) +{ + int fd; + + /* Don't try to flush directories, it'll likely just fail */ + if (isdir) + return; + + fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY); + + if (fd < 0) + { + if (errno == EACCES) + return; + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", fname))); + return; + } + + /* + * pg_flush_data() ignores errors, which is ok because this is only a + * hint. + */ + pg_flush_data(fd, 0, 0); + + if (CloseTransientFile(fd) != 0) + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", fname))); +} + +#endif /* PG_FLUSH_DATA_WORKS */ + +static void +datadir_fsync_fname(const char *fname, bool isdir, int elevel) +{ + /* + * We want to silently ignoring errors about unreadable files. Pass that + * desire on to fsync_fname_ext(). + */ + fsync_fname_ext(fname, isdir, true, elevel); +} + +static void +unlink_if_exists_fname(const char *fname, bool isdir, int elevel) +{ + if (isdir) + { + if (rmdir(fname) != 0 && errno != ENOENT) + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not remove directory \"%s\": %m", fname))); + } + else + { + /* Use PathNameDeleteTemporaryFile to report filesize */ + PathNameDeleteTemporaryFile(fname, false); + } +} + +/* + * fsync_fname_ext -- Try to fsync a file or directory + * + * If ignore_perm is true, ignore errors upon trying to open unreadable + * files. Logs other errors at a caller-specified level. + * + * Returns 0 if the operation succeeded, -1 otherwise. + */ +int +fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel) +{ + int fd; + int flags; + int returncode; + + /* + * Some OSs require directories to be opened read-only whereas other + * systems don't allow us to fsync files opened read-only; so we need both + * cases here. Using O_RDWR will cause us to fail to fsync files that are + * not writable by our userid, but we assume that's OK. + */ + flags = PG_BINARY; + if (!isdir) + flags |= O_RDWR; + else + flags |= O_RDONLY; + + fd = OpenTransientFile(fname, flags); + + /* + * Some OSs don't allow us to open directories at all (Windows returns + * EACCES), just ignore the error in that case. If desired also silently + * ignoring errors about unreadable files. Log others. + */ + if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES)) + return 0; + else if (fd < 0 && ignore_perm && errno == EACCES) + return 0; + else if (fd < 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", fname))); + return -1; + } + + returncode = pg_fsync(fd); + + /* + * Some OSes don't allow us to fsync directories at all, so we can ignore + * those errors. Anything else needs to be logged. + */ + if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL))) + { + int save_errno; + + /* close file upon error, might not be in transaction context */ + save_errno = errno; + (void) CloseTransientFile(fd); + errno = save_errno; + + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", fname))); + return -1; + } + + if (CloseTransientFile(fd) != 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", fname))); + return -1; + } + + return 0; +} + +/* + * fsync_parent_path -- fsync the parent path of a file or directory + * + * This is aimed at making file operations persistent on disk in case of + * an OS crash or power failure. + */ +static int +fsync_parent_path(const char *fname, int elevel) +{ + char parentpath[MAXPGPATH]; + + strlcpy(parentpath, fname, MAXPGPATH); + get_parent_directory(parentpath); + + /* + * get_parent_directory() returns an empty string if the input argument is + * just a file name (see comments in path.c), so handle that as being the + * current directory. + */ + if (strlen(parentpath) == 0) + strlcpy(parentpath, ".", MAXPGPATH); + + if (fsync_fname_ext(parentpath, true, false, elevel) != 0) + return -1; + + return 0; +} + +/* + * Create a PostgreSQL data sub-directory + * + * The data directory itself, and most of its sub-directories, are created at + * initdb time, but we do have some occasions when we create directories in + * the backend (CREATE TABLESPACE, for example). In those cases, we want to + * make sure that those directories are created consistently. Today, that means + * making sure that the created directory has the correct permissions, which is + * what pg_dir_create_mode tracks for us. + * + * Note that we also set the umask() based on what we understand the correct + * permissions to be (see file_perm.c). + * + * For permissions other than the default, mkdir() can be used directly, but + * be sure to consider carefully such cases -- a sub-directory with incorrect + * permissions in a PostgreSQL data directory could cause backups and other + * processes to fail. + */ +int +MakePGDirectory(const char *directoryName) +{ + return mkdir(directoryName, pg_dir_create_mode); +} + +/* + * Return the passed-in error level, or PANIC if data_sync_retry is off. + * + * Failure to fsync any data file is cause for immediate panic, unless + * data_sync_retry is enabled. Data may have been written to the operating + * system and removed from our buffer pool already, and if we are running on + * an operating system that forgets dirty data on write-back failure, there + * may be only one copy of the data remaining: in the WAL. A later attempt to + * fsync again might falsely report success. Therefore we must not allow any + * further checkpoints to be attempted. data_sync_retry can in theory be + * enabled on systems known not to drop dirty buffered data on write-back + * failure (with the likely outcome that checkpoints will continue to fail + * until the underlying problem is fixed). + * + * Any code that reports a failure from fsync() or related functions should + * filter the error level with this function. + */ +int +data_sync_elevel(int elevel) +{ + return data_sync_retry ? elevel : PANIC; +} + +/* + * A convenience wrapper for pg_pwritev() that retries on partial write. If an + * error is returned, it is unspecified how much has been written. + */ +ssize_t +pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + struct iovec iov_copy[PG_IOV_MAX]; + ssize_t sum = 0; + ssize_t part; + + /* We'd better have space to make a copy, in case we need to retry. */ + if (iovcnt > PG_IOV_MAX) + { + errno = EINVAL; + return -1; + } + + for (;;) + { + /* Write as much as we can. */ + part = pg_pwritev(fd, iov, iovcnt, offset); + if (part < 0) + return -1; + +#ifdef SIMULATE_SHORT_WRITE + part = Min(part, 4096); +#endif + + /* Count our progress. */ + sum += part; + offset += part; + + /* Step over iovecs that are done. */ + while (iovcnt > 0 && iov->iov_len <= part) + { + part -= iov->iov_len; + ++iov; + --iovcnt; + } + + /* Are they all done? */ + if (iovcnt == 0) + { + /* We don't expect the kernel to write more than requested. */ + Assert(part == 0); + break; + } + + /* + * Move whatever's left to the front of our mutable copy and adjust + * the leading iovec. + */ + Assert(iovcnt > 0); + memmove(iov_copy, iov, sizeof(*iov) * iovcnt); + Assert(iov->iov_len > part); + iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part; + iov_copy[0].iov_len -= part; + iov = iov_copy; + } + + return sum; +} diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c new file mode 100644 index 0000000..40c758d --- /dev/null +++ b/src/backend/storage/file/reinit.c @@ -0,0 +1,410 @@ +/*------------------------------------------------------------------------- + * + * reinit.c + * Reinitialization of unlogged relations + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/file/reinit.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <unistd.h> + +#include "common/relpath.h" +#include "storage/copydir.h" +#include "storage/fd.h" +#include "storage/reinit.h" +#include "utils/hsearch.h" +#include "utils/memutils.h" + +static void ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, + int op); +static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, + int op); + +typedef struct +{ + Oid reloid; /* hash key */ +} unlogged_relation_entry; + +/* + * Reset unlogged relations from before the last restart. + * + * If op includes UNLOGGED_RELATION_CLEANUP, we remove all forks of any + * relation with an "init" fork, except for the "init" fork itself. + * + * If op includes UNLOGGED_RELATION_INIT, we copy the "init" fork to the main + * fork. + */ +void +ResetUnloggedRelations(int op) +{ + char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY)]; + DIR *spc_dir; + struct dirent *spc_de; + MemoryContext tmpctx, + oldctx; + + /* Log it. */ + elog(DEBUG1, "resetting unlogged relations: cleanup %d init %d", + (op & UNLOGGED_RELATION_CLEANUP) != 0, + (op & UNLOGGED_RELATION_INIT) != 0); + + /* + * Just to be sure we don't leak any memory, let's create a temporary + * memory context for this operation. + */ + tmpctx = AllocSetContextCreate(CurrentMemoryContext, + "ResetUnloggedRelations", + ALLOCSET_DEFAULT_SIZES); + oldctx = MemoryContextSwitchTo(tmpctx); + + /* + * First process unlogged files in pg_default ($PGDATA/base) + */ + ResetUnloggedRelationsInTablespaceDir("base", op); + + /* + * Cycle through directories for all non-default tablespaces. + */ + spc_dir = AllocateDir("pg_tblspc"); + + while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL) + { + if (strcmp(spc_de->d_name, ".") == 0 || + strcmp(spc_de->d_name, "..") == 0) + continue; + + snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s", + spc_de->d_name, TABLESPACE_VERSION_DIRECTORY); + ResetUnloggedRelationsInTablespaceDir(temp_path, op); + } + + FreeDir(spc_dir); + + /* + * Restore memory context. + */ + MemoryContextSwitchTo(oldctx); + MemoryContextDelete(tmpctx); +} + +/* + * Process one tablespace directory for ResetUnloggedRelations + */ +static void +ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op) +{ + DIR *ts_dir; + struct dirent *de; + char dbspace_path[MAXPGPATH * 2]; + + ts_dir = AllocateDir(tsdirname); + + /* + * If we get ENOENT on a tablespace directory, log it and return. This + * can happen if a previous DROP TABLESPACE crashed between removing the + * tablespace directory and removing the symlink in pg_tblspc. We don't + * really want to prevent database startup in that scenario, so let it + * pass instead. Any other type of error will be reported by ReadDir + * (causing a startup failure). + */ + if (ts_dir == NULL && errno == ENOENT) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open directory \"%s\": %m", + tsdirname))); + return; + } + + while ((de = ReadDir(ts_dir, tsdirname)) != NULL) + { + /* + * We're only interested in the per-database directories, which have + * numeric names. Note that this code will also (properly) ignore "." + * and "..". + */ + if (strspn(de->d_name, "0123456789") != strlen(de->d_name)) + continue; + + snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s", + tsdirname, de->d_name); + ResetUnloggedRelationsInDbspaceDir(dbspace_path, op); + } + + FreeDir(ts_dir); +} + +/* + * Process one per-dbspace directory for ResetUnloggedRelations + */ +static void +ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) +{ + DIR *dbspace_dir; + struct dirent *de; + char rm_path[MAXPGPATH * 2]; + + /* Caller must specify at least one operation. */ + Assert((op & (UNLOGGED_RELATION_CLEANUP | UNLOGGED_RELATION_INIT)) != 0); + + /* + * Cleanup is a two-pass operation. First, we go through and identify all + * the files with init forks. Then, we go through again and nuke + * everything with the same OID except the init fork. + */ + if ((op & UNLOGGED_RELATION_CLEANUP) != 0) + { + HTAB *hash; + HASHCTL ctl; + + /* + * It's possible that someone could create a ton of unlogged relations + * in the same database & tablespace, so we'd better use a hash table + * rather than an array or linked list to keep track of which files + * need to be reset. Otherwise, this cleanup operation would be + * O(n^2). + */ + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(unlogged_relation_entry); + ctl.hcxt = CurrentMemoryContext; + hash = hash_create("unlogged relation OIDs", 32, &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + /* Scan the directory. */ + dbspace_dir = AllocateDir(dbspacedirname); + while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) + { + ForkNumber forkNum; + int oidchars; + unlogged_relation_entry ent; + + /* Skip anything that doesn't look like a relation data file. */ + if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, + &forkNum)) + continue; + + /* Also skip it unless this is the init fork. */ + if (forkNum != INIT_FORKNUM) + continue; + + /* + * Put the OID portion of the name into the hash table, if it + * isn't already. + */ + ent.reloid = atooid(de->d_name); + (void) hash_search(hash, &ent, HASH_ENTER, NULL); + } + + /* Done with the first pass. */ + FreeDir(dbspace_dir); + + /* + * If we didn't find any init forks, there's no point in continuing; + * we can bail out now. + */ + if (hash_get_num_entries(hash) == 0) + { + hash_destroy(hash); + return; + } + + /* + * Now, make a second pass and remove anything that matches. + */ + dbspace_dir = AllocateDir(dbspacedirname); + while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) + { + ForkNumber forkNum; + int oidchars; + unlogged_relation_entry ent; + + /* Skip anything that doesn't look like a relation data file. */ + if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, + &forkNum)) + continue; + + /* We never remove the init fork. */ + if (forkNum == INIT_FORKNUM) + continue; + + /* + * See whether the OID portion of the name shows up in the hash + * table. If so, nuke it! + */ + ent.reloid = atooid(de->d_name); + if (hash_search(hash, &ent, HASH_FIND, NULL)) + { + snprintf(rm_path, sizeof(rm_path), "%s/%s", + dbspacedirname, de->d_name); + if (unlink(rm_path) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + rm_path))); + else + elog(DEBUG2, "unlinked file \"%s\"", rm_path); + } + } + + /* Cleanup is complete. */ + FreeDir(dbspace_dir); + hash_destroy(hash); + } + + /* + * Initialization happens after cleanup is complete: we copy each init + * fork file to the corresponding main fork file. Note that if we are + * asked to do both cleanup and init, we may never get here: if the + * cleanup code determines that there are no init forks in this dbspace, + * it will return before we get to this point. + */ + if ((op & UNLOGGED_RELATION_INIT) != 0) + { + /* Scan the directory. */ + dbspace_dir = AllocateDir(dbspacedirname); + while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) + { + ForkNumber forkNum; + int oidchars; + char oidbuf[OIDCHARS + 1]; + char srcpath[MAXPGPATH * 2]; + char dstpath[MAXPGPATH]; + + /* Skip anything that doesn't look like a relation data file. */ + if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, + &forkNum)) + continue; + + /* Also skip it unless this is the init fork. */ + if (forkNum != INIT_FORKNUM) + continue; + + /* Construct source pathname. */ + snprintf(srcpath, sizeof(srcpath), "%s/%s", + dbspacedirname, de->d_name); + + /* Construct destination pathname. */ + memcpy(oidbuf, de->d_name, oidchars); + oidbuf[oidchars] = '\0'; + snprintf(dstpath, sizeof(dstpath), "%s/%s%s", + dbspacedirname, oidbuf, de->d_name + oidchars + 1 + + strlen(forkNames[INIT_FORKNUM])); + + /* OK, we're ready to perform the actual copy. */ + elog(DEBUG2, "copying %s to %s", srcpath, dstpath); + copy_file(srcpath, dstpath); + } + + FreeDir(dbspace_dir); + + /* + * copy_file() above has already called pg_flush_data() on the files + * it created. Now we need to fsync those files, because a checkpoint + * won't do it for us while we're in recovery. We do this in a + * separate pass to allow the kernel to perform all the flushes + * (especially the metadata ones) at once. + */ + dbspace_dir = AllocateDir(dbspacedirname); + while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) + { + ForkNumber forkNum; + int oidchars; + char oidbuf[OIDCHARS + 1]; + char mainpath[MAXPGPATH]; + + /* Skip anything that doesn't look like a relation data file. */ + if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, + &forkNum)) + continue; + + /* Also skip it unless this is the init fork. */ + if (forkNum != INIT_FORKNUM) + continue; + + /* Construct main fork pathname. */ + memcpy(oidbuf, de->d_name, oidchars); + oidbuf[oidchars] = '\0'; + snprintf(mainpath, sizeof(mainpath), "%s/%s%s", + dbspacedirname, oidbuf, de->d_name + oidchars + 1 + + strlen(forkNames[INIT_FORKNUM])); + + fsync_fname(mainpath, false); + } + + FreeDir(dbspace_dir); + + /* + * Lastly, fsync the database directory itself, ensuring the + * filesystem remembers the file creations and deletions we've done. + * We don't bother with this during a call that does only + * UNLOGGED_RELATION_CLEANUP, because if recovery crashes before we + * get to doing UNLOGGED_RELATION_INIT, we'll redo the cleanup step + * too at the next startup attempt. + */ + fsync_fname(dbspacedirname, true); + } +} + +/* + * Basic parsing of putative relation filenames. + * + * This function returns true if the file appears to be in the correct format + * for a non-temporary relation and false otherwise. + * + * NB: If this function returns true, the caller is entitled to assume that + * *oidchars has been set to the a value no more than OIDCHARS, and thus + * that a buffer of OIDCHARS+1 characters is sufficient to hold the OID + * portion of the filename. This is critical to protect against a possible + * buffer overrun. + */ +bool +parse_filename_for_nontemp_relation(const char *name, int *oidchars, + ForkNumber *fork) +{ + int pos; + + /* Look for a non-empty string of digits (that isn't too long). */ + for (pos = 0; isdigit((unsigned char) name[pos]); ++pos) + ; + if (pos == 0 || pos > OIDCHARS) + return false; + *oidchars = pos; + + /* Check for a fork name. */ + if (name[pos] != '_') + *fork = MAIN_FORKNUM; + else + { + int forkchar; + + forkchar = forkname_chars(&name[pos + 1], fork); + if (forkchar <= 0) + return false; + pos += forkchar + 1; + } + + /* Check for a segment number. */ + if (name[pos] == '.') + { + int segchar; + + for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar) + ; + if (segchar <= 1) + return false; + pos += segchar; + } + + /* Now we should be at the end. */ + if (name[pos] != '\0') + return false; + return true; +} diff --git a/src/backend/storage/file/sharedfileset.c b/src/backend/storage/file/sharedfileset.c new file mode 100644 index 0000000..ed37c94 --- /dev/null +++ b/src/backend/storage/file/sharedfileset.c @@ -0,0 +1,354 @@ +/*------------------------------------------------------------------------- + * + * sharedfileset.c + * Shared temporary file management. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/file/sharedfileset.c + * + * SharedFileSets provide a temporary namespace (think directory) so that + * files can be discovered by name, and a shared ownership semantics so that + * shared files survive until the last user detaches. + * + * SharedFileSets can be used by backends when the temporary files need to be + * opened/closed multiple times and the underlying files need to survive across + * transactions. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <limits.h> + +#include "catalog/pg_tablespace.h" +#include "commands/tablespace.h" +#include "common/hashfn.h" +#include "miscadmin.h" +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "storage/sharedfileset.h" +#include "utils/builtins.h" + +static List *filesetlist = NIL; + +static void SharedFileSetOnDetach(dsm_segment *segment, Datum datum); +static void SharedFileSetDeleteOnProcExit(int status, Datum arg); +static void SharedFileSetPath(char *path, SharedFileSet *fileset, Oid tablespace); +static void SharedFilePath(char *path, SharedFileSet *fileset, const char *name); +static Oid ChooseTablespace(const SharedFileSet *fileset, const char *name); + +/* + * Initialize a space for temporary files that can be opened by other backends. + * Other backends must attach to it before accessing it. Associate this + * SharedFileSet with 'seg'. Any contained files will be deleted when the + * last backend detaches. + * + * We can also use this interface if the temporary files are used only by + * single backend but the files need to be opened and closed multiple times + * and also the underlying files need to survive across transactions. For + * such cases, dsm segment 'seg' should be passed as NULL. Callers are + * expected to explicitly remove such files by using SharedFileSetDelete/ + * SharedFileSetDeleteAll or we remove such files on proc exit. + * + * Files will be distributed over the tablespaces configured in + * temp_tablespaces. + * + * Under the covers the set is one or more directories which will eventually + * be deleted. + */ +void +SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg) +{ + static uint32 counter = 0; + + SpinLockInit(&fileset->mutex); + fileset->refcnt = 1; + fileset->creator_pid = MyProcPid; + fileset->number = counter; + counter = (counter + 1) % INT_MAX; + + /* Capture the tablespace OIDs so that all backends agree on them. */ + PrepareTempTablespaces(); + fileset->ntablespaces = + GetTempTablespaces(&fileset->tablespaces[0], + lengthof(fileset->tablespaces)); + if (fileset->ntablespaces == 0) + { + /* If the GUC is empty, use current database's default tablespace */ + fileset->tablespaces[0] = MyDatabaseTableSpace; + fileset->ntablespaces = 1; + } + else + { + int i; + + /* + * An entry of InvalidOid means use the default tablespace for the + * current database. Replace that now, to be sure that all users of + * the SharedFileSet agree on what to do. + */ + for (i = 0; i < fileset->ntablespaces; i++) + { + if (fileset->tablespaces[i] == InvalidOid) + fileset->tablespaces[i] = MyDatabaseTableSpace; + } + } + + /* Register our cleanup callback. */ + if (seg) + on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset)); + else + { + static bool registered_cleanup = false; + + if (!registered_cleanup) + { + /* + * We must not have registered any fileset before registering the + * fileset clean up. + */ + Assert(filesetlist == NIL); + on_proc_exit(SharedFileSetDeleteOnProcExit, 0); + registered_cleanup = true; + } + + filesetlist = lcons((void *) fileset, filesetlist); + } +} + +/* + * Attach to a set of directories that was created with SharedFileSetInit. + */ +void +SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg) +{ + bool success; + + SpinLockAcquire(&fileset->mutex); + if (fileset->refcnt == 0) + success = false; + else + { + ++fileset->refcnt; + success = true; + } + SpinLockRelease(&fileset->mutex); + + if (!success) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not attach to a SharedFileSet that is already destroyed"))); + + /* Register our cleanup callback. */ + on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset)); +} + +/* + * Create a new file in the given set. + */ +File +SharedFileSetCreate(SharedFileSet *fileset, const char *name) +{ + char path[MAXPGPATH]; + File file; + + SharedFilePath(path, fileset, name); + file = PathNameCreateTemporaryFile(path, false); + + /* If we failed, see if we need to create the directory on demand. */ + if (file <= 0) + { + char tempdirpath[MAXPGPATH]; + char filesetpath[MAXPGPATH]; + Oid tablespace = ChooseTablespace(fileset, name); + + TempTablespacePath(tempdirpath, tablespace); + SharedFileSetPath(filesetpath, fileset, tablespace); + PathNameCreateTemporaryDir(tempdirpath, filesetpath); + file = PathNameCreateTemporaryFile(path, true); + } + + return file; +} + +/* + * Open a file that was created with SharedFileSetCreate(), possibly in + * another backend. + */ +File +SharedFileSetOpen(SharedFileSet *fileset, const char *name, int mode) +{ + char path[MAXPGPATH]; + File file; + + SharedFilePath(path, fileset, name); + file = PathNameOpenTemporaryFile(path, mode); + + return file; +} + +/* + * Delete a file that was created with SharedFileSetCreate(). + * Return true if the file existed, false if didn't. + */ +bool +SharedFileSetDelete(SharedFileSet *fileset, const char *name, + bool error_on_failure) +{ + char path[MAXPGPATH]; + + SharedFilePath(path, fileset, name); + + return PathNameDeleteTemporaryFile(path, error_on_failure); +} + +/* + * Delete all files in the set. + */ +void +SharedFileSetDeleteAll(SharedFileSet *fileset) +{ + char dirpath[MAXPGPATH]; + int i; + + /* + * Delete the directory we created in each tablespace. Doesn't fail + * because we use this in error cleanup paths, but can generate LOG + * message on IO error. + */ + for (i = 0; i < fileset->ntablespaces; ++i) + { + SharedFileSetPath(dirpath, fileset, fileset->tablespaces[i]); + PathNameDeleteTemporaryDir(dirpath); + } + + /* Unregister the shared fileset */ + SharedFileSetUnregister(fileset); +} + +/* + * Callback function that will be invoked when this backend detaches from a + * DSM segment holding a SharedFileSet that it has created or attached to. If + * we are the last to detach, then try to remove the directories and + * everything in them. We can't raise an error on failures, because this runs + * in error cleanup paths. + */ +static void +SharedFileSetOnDetach(dsm_segment *segment, Datum datum) +{ + bool unlink_all = false; + SharedFileSet *fileset = (SharedFileSet *) DatumGetPointer(datum); + + SpinLockAcquire(&fileset->mutex); + Assert(fileset->refcnt > 0); + if (--fileset->refcnt == 0) + unlink_all = true; + SpinLockRelease(&fileset->mutex); + + /* + * If we are the last to detach, we delete the directory in all + * tablespaces. Note that we are still actually attached for the rest of + * this function so we can safely access its data. + */ + if (unlink_all) + SharedFileSetDeleteAll(fileset); +} + +/* + * Callback function that will be invoked on the process exit. This will + * process the list of all the registered sharedfilesets and delete the + * underlying files. + */ +static void +SharedFileSetDeleteOnProcExit(int status, Datum arg) +{ + /* + * Remove all the pending shared fileset entries. We don't use foreach() + * here because SharedFileSetDeleteAll will remove the current element in + * filesetlist. Though we have used foreach_delete_current() to remove the + * element from filesetlist it could only fix up the state of one of the + * loops, see SharedFileSetUnregister. + */ + while (list_length(filesetlist) > 0) + { + SharedFileSet *fileset = (SharedFileSet *) linitial(filesetlist); + + SharedFileSetDeleteAll(fileset); + } + + filesetlist = NIL; +} + +/* + * Unregister the shared fileset entry registered for cleanup on proc exit. + */ +void +SharedFileSetUnregister(SharedFileSet *input_fileset) +{ + ListCell *l; + + /* + * If the caller is following the dsm based cleanup then we don't maintain + * the filesetlist so return. + */ + if (filesetlist == NIL) + return; + + foreach(l, filesetlist) + { + SharedFileSet *fileset = (SharedFileSet *) lfirst(l); + + /* Remove the entry from the list */ + if (input_fileset == fileset) + { + filesetlist = foreach_delete_current(filesetlist, l); + return; + } + } + + /* Should have found a match */ + Assert(false); +} + +/* + * Build the path for the directory holding the files backing a SharedFileSet + * in a given tablespace. + */ +static void +SharedFileSetPath(char *path, SharedFileSet *fileset, Oid tablespace) +{ + char tempdirpath[MAXPGPATH]; + + TempTablespacePath(tempdirpath, tablespace); + snprintf(path, MAXPGPATH, "%s/%s%lu.%u.sharedfileset", + tempdirpath, PG_TEMP_FILE_PREFIX, + (unsigned long) fileset->creator_pid, fileset->number); +} + +/* + * Sorting hat to determine which tablespace a given shared temporary file + * belongs in. + */ +static Oid +ChooseTablespace(const SharedFileSet *fileset, const char *name) +{ + uint32 hash = hash_any((const unsigned char *) name, strlen(name)); + + return fileset->tablespaces[hash % fileset->ntablespaces]; +} + +/* + * Compute the full path of a file in a SharedFileSet. + */ +static void +SharedFilePath(char *path, SharedFileSet *fileset, const char *name) +{ + char dirpath[MAXPGPATH]; + + SharedFileSetPath(dirpath, fileset, ChooseTablespace(fileset, name)); + snprintf(path, MAXPGPATH, "%s/%s", dirpath, name); +} |