Adding upstream version 16.2.upstream/16.2

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-13 13:44:03 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-13 13:44:03 +0000
commit: 293913568e6a7a86fd1479e1cff8e2ecb58d6568 (patch)
tree: fc3b469a3ec5ab71b36ea97cc7aaddb838423a0c /src/backend/storage/file
parent: Initial commit. (diff)
download: postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.tar.xz
postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.zip
8 files changed, 6011 insertions, 0 deletions
diff --git a/src/backend/storage/file/Makefile b/src/backend/storage/file/Makefile
new file mode 100644
index 0000000..660ac51
--- /dev/null
+++ b/src/backend/storage/file/Makefile
@@ -0,0 +1,23 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for storage/file
+#
+# IDENTIFICATION
+#    src/backend/storage/file/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/file
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+	buffile.o \
+	copydir.o \
+	fd.o \
+	fileset.o \
+	reinit.o \
+	sharedfileset.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
new file mode 100644
index 0000000..41ab641
--- /dev/null
+++ b/src/backend/storage/file/buffile.c
@@ -0,0 +1,1039 @@
+/*-------------------------------------------------------------------------
+ *
+ * buffile.c
+ *	  Management of large buffered temporary files.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/file/buffile.c
+ *
+ * NOTES:
+ *
+ * BufFiles provide a very incomplete emulation of stdio atop virtual Files
+ * (as managed by fd.c).  Currently, we only support the buffered-I/O
+ * aspect of stdio: a read or write of the low-level File occurs only
+ * when the buffer is filled or emptied.  This is an even bigger win
+ * for virtual Files than for ordinary kernel files, since reducing the
+ * frequency with which a virtual File is touched reduces "thrashing"
+ * of opening/closing file descriptors.
+ *
+ * Note that BufFile structs are allocated with palloc(), and therefore
+ * will go away automatically at query/transaction end.  Since the underlying
+ * virtual Files are made with OpenTemporaryFile, all resources for
+ * the file are certain to be cleaned up even if processing is aborted
+ * by ereport(ERROR).  The data structures required are made in the
+ * palloc context that was current when the BufFile was created, and
+ * any external resources such as temp files are owned by the ResourceOwner
+ * that was current at that time.
+ *
+ * BufFile also supports temporary files that exceed the OS file size limit
+ * (by opening multiple fd.c temporary files).  This is an essential feature
+ * for sorts and hashjoins on large amounts of data.
+ *
+ * BufFile supports temporary files that can be shared with other backends, as
+ * infrastructure for parallel execution.  Such files need to be created as a
+ * member of a SharedFileSet that all participants are attached to.
+ *
+ * BufFile also supports temporary files that can be used by the single backend
+ * when the corresponding files need to be survived across the transaction and
+ * need to be opened and closed multiple times.  Such files need to be created
+ * as a member of a FileSet.
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "commands/tablespace.h"
+#include "executor/instrument.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/buf_internals.h"
+#include "storage/buffile.h"
+#include "storage/fd.h"
+#include "utils/resowner.h"
+
+/*
+ * We break BufFiles into gigabyte-sized segments, regardless of RELSEG_SIZE.
+ * The reason is that we'd like large BufFiles to be spread across multiple
+ * tablespaces when available.
+ */
+#define MAX_PHYSICAL_FILESIZE	0x40000000
+#define BUFFILE_SEG_SIZE		(MAX_PHYSICAL_FILESIZE / BLCKSZ)
+
+/*
+ * This data structure represents a buffered file that consists of one or
+ * more physical files (each accessed through a virtual file descriptor
+ * managed by fd.c).
+ */
+struct BufFile
+{
+	int			numFiles;		/* number of physical files in set */
+	/* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
+	File	   *files;			/* palloc'd array with numFiles entries */
+
+	bool		isInterXact;	/* keep open over transactions? */
+	bool		dirty;			/* does buffer need to be written? */
+	bool		readOnly;		/* has the file been set to read only? */
+
+	FileSet    *fileset;		/* space for fileset based segment files */
+	const char *name;			/* name of fileset based BufFile */
+
+	/*
+	 * resowner is the ResourceOwner to use for underlying temp files.  (We
+	 * don't need to remember the memory context we're using explicitly,
+	 * because after creation we only repalloc our arrays larger.)
+	 */
+	ResourceOwner resowner;
+
+	/*
+	 * "current pos" is position of start of buffer within the logical file.
+	 * Position as seen by user of BufFile is (curFile, curOffset + pos).
+	 */
+	int			curFile;		/* file index (0..n) part of current pos */
+	off_t		curOffset;		/* offset part of current pos */
+	int			pos;			/* next read/write position in buffer */
+	int			nbytes;			/* total # of valid bytes in buffer */
+
+	/*
+	 * XXX Should ideally us PGIOAlignedBlock, but might need a way to avoid
+	 * wasting per-file alignment padding when some users create many files.
+	 */
+	PGAlignedBlock buffer;
+};
+
+static BufFile *makeBufFileCommon(int nfiles);
+static BufFile *makeBufFile(File firstfile);
+static void extendBufFile(BufFile *file);
+static void BufFileLoadBuffer(BufFile *file);
+static void BufFileDumpBuffer(BufFile *file);
+static void BufFileFlush(BufFile *file);
+static File MakeNewFileSetSegment(BufFile *buffile, int segment);
+
+/*
+ * Create BufFile and perform the common initialization.
+ */
+static BufFile *
+makeBufFileCommon(int nfiles)
+{
+	BufFile    *file = (BufFile *) palloc(sizeof(BufFile));
+
+	file->numFiles = nfiles;
+	file->isInterXact = false;
+	file->dirty = false;
+	file->resowner = CurrentResourceOwner;
+	file->curFile = 0;
+	file->curOffset = 0;
+	file->pos = 0;
+	file->nbytes = 0;
+
+	return file;
+}
+
+/*
+ * Create a BufFile given the first underlying physical file.
+ * NOTE: caller must set isInterXact if appropriate.
+ */
+static BufFile *
+makeBufFile(File firstfile)
+{
+	BufFile    *file = makeBufFileCommon(1);
+
+	file->files = (File *) palloc(sizeof(File));
+	file->files[0] = firstfile;
+	file->readOnly = false;
+	file->fileset = NULL;
+	file->name = NULL;
+
+	return file;
+}
+
+/*
+ * Add another component temp file.
+ */
+static void
+extendBufFile(BufFile *file)
+{
+	File		pfile;
+	ResourceOwner oldowner;
+
+	/* Be sure to associate the file with the BufFile's resource owner */
+	oldowner = CurrentResourceOwner;
+	CurrentResourceOwner = file->resowner;
+
+	if (file->fileset == NULL)
+		pfile = OpenTemporaryFile(file->isInterXact);
+	else
+		pfile = MakeNewFileSetSegment(file, file->numFiles);
+
+	Assert(pfile >= 0);
+
+	CurrentResourceOwner = oldowner;
+
+	file->files = (File *) repalloc(file->files,
+									(file->numFiles + 1) * sizeof(File));
+	file->files[file->numFiles] = pfile;
+	file->numFiles++;
+}
+
+/*
+ * Create a BufFile for a new temporary file (which will expand to become
+ * multiple temporary files if more than MAX_PHYSICAL_FILESIZE bytes are
+ * written to it).
+ *
+ * If interXact is true, the temp file will not be automatically deleted
+ * at end of transaction.
+ *
+ * Note: if interXact is true, the caller had better be calling us in a
+ * memory context, and with a resource owner, that will survive across
+ * transaction boundaries.
+ */
+BufFile *
+BufFileCreateTemp(bool interXact)
+{
+	BufFile    *file;
+	File		pfile;
+
+	/*
+	 * Ensure that temp tablespaces are set up for OpenTemporaryFile to use.
+	 * Possibly the caller will have done this already, but it seems useful to
+	 * double-check here.  Failure to do this at all would result in the temp
+	 * files always getting placed in the default tablespace, which is a
+	 * pretty hard-to-detect bug.  Callers may prefer to do it earlier if they
+	 * want to be sure that any required catalog access is done in some other
+	 * resource context.
+	 */
+	PrepareTempTablespaces();
+
+	pfile = OpenTemporaryFile(interXact);
+	Assert(pfile >= 0);
+
+	file = makeBufFile(pfile);
+	file->isInterXact = interXact;
+
+	return file;
+}
+
+/*
+ * Build the name for a given segment of a given BufFile.
+ */
+static void
+FileSetSegmentName(char *name, const char *buffile_name, int segment)
+{
+	snprintf(name, MAXPGPATH, "%s.%d", buffile_name, segment);
+}
+
+/*
+ * Create a new segment file backing a fileset based BufFile.
+ */
+static File
+MakeNewFileSetSegment(BufFile *buffile, int segment)
+{
+	char		name[MAXPGPATH];
+	File		file;
+
+	/*
+	 * It is possible that there are files left over from before a crash
+	 * restart with the same name.  In order for BufFileOpenFileSet() not to
+	 * get confused about how many segments there are, we'll unlink the next
+	 * segment number if it already exists.
+	 */
+	FileSetSegmentName(name, buffile->name, segment + 1);
+	FileSetDelete(buffile->fileset, name, true);
+
+	/* Create the new segment. */
+	FileSetSegmentName(name, buffile->name, segment);
+	file = FileSetCreate(buffile->fileset, name);
+
+	/* FileSetCreate would've errored out */
+	Assert(file > 0);
+
+	return file;
+}
+
+/*
+ * Create a BufFile that can be discovered and opened read-only by other
+ * backends that are attached to the same SharedFileSet using the same name.
+ *
+ * The naming scheme for fileset based BufFiles is left up to the calling code.
+ * The name will appear as part of one or more filenames on disk, and might
+ * provide clues to administrators about which subsystem is generating
+ * temporary file data.  Since each SharedFileSet object is backed by one or
+ * more uniquely named temporary directory, names don't conflict with
+ * unrelated SharedFileSet objects.
+ */
+BufFile *
+BufFileCreateFileSet(FileSet *fileset, const char *name)
+{
+	BufFile    *file;
+
+	file = makeBufFileCommon(1);
+	file->fileset = fileset;
+	file->name = pstrdup(name);
+	file->files = (File *) palloc(sizeof(File));
+	file->files[0] = MakeNewFileSetSegment(file, 0);
+	file->readOnly = false;
+
+	return file;
+}
+
+/*
+ * Open a file that was previously created in another backend (or this one)
+ * with BufFileCreateFileSet in the same FileSet using the same name.
+ * The backend that created the file must have called BufFileClose() or
+ * BufFileExportFileSet() to make sure that it is ready to be opened by other
+ * backends and render it read-only.  If missing_ok is true, which indicates
+ * that missing files can be safely ignored, then return NULL if the BufFile
+ * with the given name is not found, otherwise, throw an error.
+ */
+BufFile *
+BufFileOpenFileSet(FileSet *fileset, const char *name, int mode,
+				   bool missing_ok)
+{
+	BufFile    *file;
+	char		segment_name[MAXPGPATH];
+	Size		capacity = 16;
+	File	   *files;
+	int			nfiles = 0;
+
+	files = palloc(sizeof(File) * capacity);
+
+	/*
+	 * We don't know how many segments there are, so we'll probe the
+	 * filesystem to find out.
+	 */
+	for (;;)
+	{
+		/* See if we need to expand our file segment array. */
+		if (nfiles + 1 > capacity)
+		{
+			capacity *= 2;
+			files = repalloc(files, sizeof(File) * capacity);
+		}
+		/* Try to load a segment. */
+		FileSetSegmentName(segment_name, name, nfiles);
+		files[nfiles] = FileSetOpen(fileset, segment_name, mode);
+		if (files[nfiles] <= 0)
+			break;
+		++nfiles;
+
+		CHECK_FOR_INTERRUPTS();
+	}
+
+	/*
+	 * If we didn't find any files at all, then no BufFile exists with this
+	 * name.
+	 */
+	if (nfiles == 0)
+	{
+		/* free the memory */
+		pfree(files);
+
+		if (missing_ok)
+			return NULL;
+
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open temporary file \"%s\" from BufFile \"%s\": %m",
+						segment_name, name)));
+	}
+
+	file = makeBufFileCommon(nfiles);
+	file->files = files;
+	file->readOnly = (mode == O_RDONLY);
+	file->fileset = fileset;
+	file->name = pstrdup(name);
+
+	return file;
+}
+
+/*
+ * Delete a BufFile that was created by BufFileCreateFileSet in the given
+ * FileSet using the given name.
+ *
+ * It is not necessary to delete files explicitly with this function.  It is
+ * provided only as a way to delete files proactively, rather than waiting for
+ * the FileSet to be cleaned up.
+ *
+ * Only one backend should attempt to delete a given name, and should know
+ * that it exists and has been exported or closed otherwise missing_ok should
+ * be passed true.
+ */
+void
+BufFileDeleteFileSet(FileSet *fileset, const char *name, bool missing_ok)
+{
+	char		segment_name[MAXPGPATH];
+	int			segment = 0;
+	bool		found = false;
+
+	/*
+	 * We don't know how many segments the file has.  We'll keep deleting
+	 * until we run out.  If we don't manage to find even an initial segment,
+	 * raise an error.
+	 */
+	for (;;)
+	{
+		FileSetSegmentName(segment_name, name, segment);
+		if (!FileSetDelete(fileset, segment_name, true))
+			break;
+		found = true;
+		++segment;
+
+		CHECK_FOR_INTERRUPTS();
+	}
+
+	if (!found && !missing_ok)
+		elog(ERROR, "could not delete unknown BufFile \"%s\"", name);
+}
+
+/*
+ * BufFileExportFileSet --- flush and make read-only, in preparation for sharing.
+ */
+void
+BufFileExportFileSet(BufFile *file)
+{
+	/* Must be a file belonging to a FileSet. */
+	Assert(file->fileset != NULL);
+
+	/* It's probably a bug if someone calls this twice. */
+	Assert(!file->readOnly);
+
+	BufFileFlush(file);
+	file->readOnly = true;
+}
+
+/*
+ * Close a BufFile
+ *
+ * Like fclose(), this also implicitly FileCloses the underlying File.
+ */
+void
+BufFileClose(BufFile *file)
+{
+	int			i;
+
+	/* flush any unwritten data */
+	BufFileFlush(file);
+	/* close and delete the underlying file(s) */
+	for (i = 0; i < file->numFiles; i++)
+		FileClose(file->files[i]);
+	/* release the buffer space */
+	pfree(file->files);
+	pfree(file);
+}
+
+/*
+ * BufFileLoadBuffer
+ *
+ * Load some data into buffer, if possible, starting from curOffset.
+ * At call, must have dirty = false, pos and nbytes = 0.
+ * On exit, nbytes is number of bytes loaded.
+ */
+static void
+BufFileLoadBuffer(BufFile *file)
+{
+	File		thisfile;
+	instr_time	io_start;
+	instr_time	io_time;
+
+	/*
+	 * Advance to next component file if necessary and possible.
+	 */
+	if (file->curOffset >= MAX_PHYSICAL_FILESIZE &&
+		file->curFile + 1 < file->numFiles)
+	{
+		file->curFile++;
+		file->curOffset = 0;
+	}
+
+	thisfile = file->files[file->curFile];
+
+	if (track_io_timing)
+		INSTR_TIME_SET_CURRENT(io_start);
+	else
+		INSTR_TIME_SET_ZERO(io_start);
+
+	/*
+	 * Read whatever we can get, up to a full bufferload.
+	 */
+	file->nbytes = FileRead(thisfile,
+							file->buffer.data,
+							sizeof(file->buffer),
+							file->curOffset,
+							WAIT_EVENT_BUFFILE_READ);
+	if (file->nbytes < 0)
+	{
+		file->nbytes = 0;
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read file \"%s\": %m",
+						FilePathName(thisfile))));
+	}
+
+	if (track_io_timing)
+	{
+		INSTR_TIME_SET_CURRENT(io_time);
+		INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_read_time, io_time, io_start);
+	}
+
+	/* we choose not to advance curOffset here */
+
+	if (file->nbytes > 0)
+		pgBufferUsage.temp_blks_read++;
+}
+
+/*
+ * BufFileDumpBuffer
+ *
+ * Dump buffer contents starting at curOffset.
+ * At call, should have dirty = true, nbytes > 0.
+ * On exit, dirty is cleared if successful write, and curOffset is advanced.
+ */
+static void
+BufFileDumpBuffer(BufFile *file)
+{
+	int			wpos = 0;
+	int			bytestowrite;
+	File		thisfile;
+
+	/*
+	 * Unlike BufFileLoadBuffer, we must dump the whole buffer even if it
+	 * crosses a component-file boundary; so we need a loop.
+	 */
+	while (wpos < file->nbytes)
+	{
+		off_t		availbytes;
+		instr_time	io_start;
+		instr_time	io_time;
+
+		/*
+		 * Advance to next component file if necessary and possible.
+		 */
+		if (file->curOffset >= MAX_PHYSICAL_FILESIZE)
+		{
+			while (file->curFile + 1 >= file->numFiles)
+				extendBufFile(file);
+			file->curFile++;
+			file->curOffset = 0;
+		}
+
+		/*
+		 * Determine how much we need to write into this file.
+		 */
+		bytestowrite = file->nbytes - wpos;
+		availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset;
+
+		if ((off_t) bytestowrite > availbytes)
+			bytestowrite = (int) availbytes;
+
+		thisfile = file->files[file->curFile];
+
+		if (track_io_timing)
+			INSTR_TIME_SET_CURRENT(io_start);
+		else
+			INSTR_TIME_SET_ZERO(io_start);
+
+		bytestowrite = FileWrite(thisfile,
+								 file->buffer.data + wpos,
+								 bytestowrite,
+								 file->curOffset,
+								 WAIT_EVENT_BUFFILE_WRITE);
+		if (bytestowrite <= 0)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not write to file \"%s\": %m",
+							FilePathName(thisfile))));
+
+		if (track_io_timing)
+		{
+			INSTR_TIME_SET_CURRENT(io_time);
+			INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_write_time, io_time, io_start);
+		}
+
+		file->curOffset += bytestowrite;
+		wpos += bytestowrite;
+
+		pgBufferUsage.temp_blks_written++;
+	}
+	file->dirty = false;
+
+	/*
+	 * At this point, curOffset has been advanced to the end of the buffer,
+	 * ie, its original value + nbytes.  We need to make it point to the
+	 * logical file position, ie, original value + pos, in case that is less
+	 * (as could happen due to a small backwards seek in a dirty buffer!)
+	 */
+	file->curOffset -= (file->nbytes - file->pos);
+	if (file->curOffset < 0)	/* handle possible segment crossing */
+	{
+		file->curFile--;
+		Assert(file->curFile >= 0);
+		file->curOffset += MAX_PHYSICAL_FILESIZE;
+	}
+
+	/*
+	 * Now we can set the buffer empty without changing the logical position
+	 */
+	file->pos = 0;
+	file->nbytes = 0;
+}
+
+/*
+ * BufFileRead variants
+ *
+ * Like fread() except we assume 1-byte element size and report I/O errors via
+ * ereport().
+ *
+ * If 'exact' is true, then an error is also raised if the number of bytes
+ * read is not exactly 'size' (no short reads).  If 'exact' and 'eofOK' are
+ * true, then reading zero bytes is ok.
+ */
+static size_t
+BufFileReadCommon(BufFile *file, void *ptr, size_t size, bool exact, bool eofOK)
+{
+	size_t		start_size = size;
+	size_t		nread = 0;
+	size_t		nthistime;
+
+	BufFileFlush(file);
+
+	while (size > 0)
+	{
+		if (file->pos >= file->nbytes)
+		{
+			/* Try to load more data into buffer. */
+			file->curOffset += file->pos;
+			file->pos = 0;
+			file->nbytes = 0;
+			BufFileLoadBuffer(file);
+			if (file->nbytes <= 0)
+				break;			/* no more data available */
+		}
+
+		nthistime = file->nbytes - file->pos;
+		if (nthistime > size)
+			nthistime = size;
+		Assert(nthistime > 0);
+
+		memcpy(ptr, file->buffer.data + file->pos, nthistime);
+
+		file->pos += nthistime;
+		ptr = (char *) ptr + nthistime;
+		size -= nthistime;
+		nread += nthistime;
+	}
+
+	if (exact &&
+		(nread != start_size && !(nread == 0 && eofOK)))
+		ereport(ERROR,
+				errcode_for_file_access(),
+				file->name ?
+				errmsg("could not read from file set \"%s\": read only %zu of %zu bytes",
+					   file->name, nread, start_size) :
+				errmsg("could not read from temporary file: read only %zu of %zu bytes",
+					   nread, start_size));
+
+	return nread;
+}
+
+/*
+ * Legacy interface where the caller needs to check for end of file or short
+ * reads.
+ */
+size_t
+BufFileRead(BufFile *file, void *ptr, size_t size)
+{
+	return BufFileReadCommon(file, ptr, size, false, false);
+}
+
+/*
+ * Require read of exactly the specified size.
+ */
+void
+BufFileReadExact(BufFile *file, void *ptr, size_t size)
+{
+	BufFileReadCommon(file, ptr, size, true, false);
+}
+
+/*
+ * Require read of exactly the specified size, but optionally allow end of
+ * file (in which case 0 is returned).
+ */
+size_t
+BufFileReadMaybeEOF(BufFile *file, void *ptr, size_t size, bool eofOK)
+{
+	return BufFileReadCommon(file, ptr, size, true, eofOK);
+}
+
+/*
+ * BufFileWrite
+ *
+ * Like fwrite() except we assume 1-byte element size and report errors via
+ * ereport().
+ */
+void
+BufFileWrite(BufFile *file, const void *ptr, size_t size)
+{
+	size_t		nthistime;
+
+	Assert(!file->readOnly);
+
+	while (size > 0)
+	{
+		if (file->pos >= BLCKSZ)
+		{
+			/* Buffer full, dump it out */
+			if (file->dirty)
+				BufFileDumpBuffer(file);
+			else
+			{
+				/* Hmm, went directly from reading to writing? */
+				file->curOffset += file->pos;
+				file->pos = 0;
+				file->nbytes = 0;
+			}
+		}
+
+		nthistime = BLCKSZ - file->pos;
+		if (nthistime > size)
+			nthistime = size;
+		Assert(nthistime > 0);
+
+		memcpy(file->buffer.data + file->pos, ptr, nthistime);
+
+		file->dirty = true;
+		file->pos += nthistime;
+		if (file->nbytes < file->pos)
+			file->nbytes = file->pos;
+		ptr = (const char *) ptr + nthistime;
+		size -= nthistime;
+	}
+}
+
+/*
+ * BufFileFlush
+ *
+ * Like fflush(), except that I/O errors are reported with ereport().
+ */
+static void
+BufFileFlush(BufFile *file)
+{
+	if (file->dirty)
+		BufFileDumpBuffer(file);
+
+	Assert(!file->dirty);
+}
+
+/*
+ * BufFileSeek
+ *
+ * Like fseek(), except that target position needs two values in order to
+ * work when logical filesize exceeds maximum value representable by off_t.
+ * We do not support relative seeks across more than that, however.
+ * I/O errors are reported by ereport().
+ *
+ * Result is 0 if OK, EOF if not.  Logical position is not moved if an
+ * impossible seek is attempted.
+ */
+int
+BufFileSeek(BufFile *file, int fileno, off_t offset, int whence)
+{
+	int			newFile;
+	off_t		newOffset;
+
+	switch (whence)
+	{
+		case SEEK_SET:
+			if (fileno < 0)
+				return EOF;
+			newFile = fileno;
+			newOffset = offset;
+			break;
+		case SEEK_CUR:
+
+			/*
+			 * Relative seek considers only the signed offset, ignoring
+			 * fileno. Note that large offsets (> 1 GB) risk overflow in this
+			 * add, unless we have 64-bit off_t.
+			 */
+			newFile = file->curFile;
+			newOffset = (file->curOffset + file->pos) + offset;
+			break;
+		case SEEK_END:
+
+			/*
+			 * The file size of the last file gives us the end offset of that
+			 * file.
+			 */
+			newFile = file->numFiles - 1;
+			newOffset = FileSize(file->files[file->numFiles - 1]);
+			if (newOffset < 0)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m",
+								FilePathName(file->files[file->numFiles - 1]),
+								file->name)));
+			break;
+		default:
+			elog(ERROR, "invalid whence: %d", whence);
+			return EOF;
+	}
+	while (newOffset < 0)
+	{
+		if (--newFile < 0)
+			return EOF;
+		newOffset += MAX_PHYSICAL_FILESIZE;
+	}
+	if (newFile == file->curFile &&
+		newOffset >= file->curOffset &&
+		newOffset <= file->curOffset + file->nbytes)
+	{
+		/*
+		 * Seek is to a point within existing buffer; we can just adjust
+		 * pos-within-buffer, without flushing buffer.  Note this is OK
+		 * whether reading or writing, but buffer remains dirty if we were
+		 * writing.
+		 */
+		file->pos = (int) (newOffset - file->curOffset);
+		return 0;
+	}
+	/* Otherwise, must reposition buffer, so flush any dirty data */
+	BufFileFlush(file);
+
+	/*
+	 * At this point and no sooner, check for seek past last segment. The
+	 * above flush could have created a new segment, so checking sooner would
+	 * not work (at least not with this code).
+	 */
+
+	/* convert seek to "start of next seg" to "end of last seg" */
+	if (newFile == file->numFiles && newOffset == 0)
+	{
+		newFile--;
+		newOffset = MAX_PHYSICAL_FILESIZE;
+	}
+	while (newOffset > MAX_PHYSICAL_FILESIZE)
+	{
+		if (++newFile >= file->numFiles)
+			return EOF;
+		newOffset -= MAX_PHYSICAL_FILESIZE;
+	}
+	if (newFile >= file->numFiles)
+		return EOF;
+	/* Seek is OK! */
+	file->curFile = newFile;
+	file->curOffset = newOffset;
+	file->pos = 0;
+	file->nbytes = 0;
+	return 0;
+}
+
+void
+BufFileTell(BufFile *file, int *fileno, off_t *offset)
+{
+	*fileno = file->curFile;
+	*offset = file->curOffset + file->pos;
+}
+
+/*
+ * BufFileSeekBlock --- block-oriented seek
+ *
+ * Performs absolute seek to the start of the n'th BLCKSZ-sized block of
+ * the file.  Note that users of this interface will fail if their files
+ * exceed BLCKSZ * LONG_MAX bytes, but that is quite a lot; we don't work
+ * with tables bigger than that, either...
+ *
+ * Result is 0 if OK, EOF if not.  Logical position is not moved if an
+ * impossible seek is attempted.
+ */
+int
+BufFileSeekBlock(BufFile *file, long blknum)
+{
+	return BufFileSeek(file,
+					   (int) (blknum / BUFFILE_SEG_SIZE),
+					   (off_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ,
+					   SEEK_SET);
+}
+
+#ifdef NOT_USED
+/*
+ * BufFileTellBlock --- block-oriented tell
+ *
+ * Any fractional part of a block in the current seek position is ignored.
+ */
+long
+BufFileTellBlock(BufFile *file)
+{
+	long		blknum;
+
+	blknum = (file->curOffset + file->pos) / BLCKSZ;
+	blknum += file->curFile * BUFFILE_SEG_SIZE;
+	return blknum;
+}
+
+#endif
+
+/*
+ * Return the current fileset based BufFile size.
+ *
+ * Counts any holes left behind by BufFileAppend as part of the size.
+ * ereport()s on failure.
+ */
+int64
+BufFileSize(BufFile *file)
+{
+	int64		lastFileSize;
+
+	Assert(file->fileset != NULL);
+
+	/* Get the size of the last physical file. */
+	lastFileSize = FileSize(file->files[file->numFiles - 1]);
+	if (lastFileSize < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m",
+						FilePathName(file->files[file->numFiles - 1]),
+						file->name)));
+
+	return ((file->numFiles - 1) * (int64) MAX_PHYSICAL_FILESIZE) +
+		lastFileSize;
+}
+
+/*
+ * Append the contents of source file (managed within fileset) to
+ * end of target file (managed within same fileset).
+ *
+ * Note that operation subsumes ownership of underlying resources from
+ * "source".  Caller should never call BufFileClose against source having
+ * called here first.  Resource owners for source and target must match,
+ * too.
+ *
+ * This operation works by manipulating lists of segment files, so the
+ * file content is always appended at a MAX_PHYSICAL_FILESIZE-aligned
+ * boundary, typically creating empty holes before the boundary.  These
+ * areas do not contain any interesting data, and cannot be read from by
+ * caller.
+ *
+ * Returns the block number within target where the contents of source
+ * begins.  Caller should apply this as an offset when working off block
+ * positions that are in terms of the original BufFile space.
+ */
+long
+BufFileAppend(BufFile *target, BufFile *source)
+{
+	long		startBlock = target->numFiles * BUFFILE_SEG_SIZE;
+	int			newNumFiles = target->numFiles + source->numFiles;
+	int			i;
+
+	Assert(target->fileset != NULL);
+	Assert(source->readOnly);
+	Assert(!source->dirty);
+	Assert(source->fileset != NULL);
+
+	if (target->resowner != source->resowner)
+		elog(ERROR, "could not append BufFile with non-matching resource owner");
+
+	target->files = (File *)
+		repalloc(target->files, sizeof(File) * newNumFiles);
+	for (i = target->numFiles; i < newNumFiles; i++)
+		target->files[i] = source->files[i - target->numFiles];
+	target->numFiles = newNumFiles;
+
+	return startBlock;
+}
+
+/*
+ * Truncate a BufFile created by BufFileCreateFileSet up to the given fileno
+ * and the offset.
+ */
+void
+BufFileTruncateFileSet(BufFile *file, int fileno, off_t offset)
+{
+	int			numFiles = file->numFiles;
+	int			newFile = fileno;
+	off_t		newOffset = file->curOffset;
+	char		segment_name[MAXPGPATH];
+	int			i;
+
+	/*
+	 * Loop over all the files up to the given fileno and remove the files
+	 * that are greater than the fileno and truncate the given file up to the
+	 * offset. Note that we also remove the given fileno if the offset is 0
+	 * provided it is not the first file in which we truncate it.
+	 */
+	for (i = file->numFiles - 1; i >= fileno; i--)
+	{
+		if ((i != fileno || offset == 0) && i != 0)
+		{
+			FileSetSegmentName(segment_name, file->name, i);
+			FileClose(file->files[i]);
+			if (!FileSetDelete(file->fileset, segment_name, true))
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not delete fileset \"%s\": %m",
+								segment_name)));
+			numFiles--;
+			newOffset = MAX_PHYSICAL_FILESIZE;
+
+			/*
+			 * This is required to indicate that we have deleted the given
+			 * fileno.
+			 */
+			if (i == fileno)
+				newFile--;
+		}
+		else
+		{
+			if (FileTruncate(file->files[i], offset,
+							 WAIT_EVENT_BUFFILE_TRUNCATE) < 0)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not truncate file \"%s\": %m",
+								FilePathName(file->files[i]))));
+			newOffset = offset;
+		}
+	}
+
+	file->numFiles = numFiles;
+
+	/*
+	 * If the truncate point is within existing buffer then we can just adjust
+	 * pos within buffer.
+	 */
+	if (newFile == file->curFile &&
+		newOffset >= file->curOffset &&
+		newOffset <= file->curOffset + file->nbytes)
+	{
+		/* No need to reset the current pos if the new pos is greater. */
+		if (newOffset <= file->curOffset + file->pos)
+			file->pos = (int) (newOffset - file->curOffset);
+
+		/* Adjust the nbytes for the current buffer. */
+		file->nbytes = (int) (newOffset - file->curOffset);
+	}
+	else if (newFile == file->curFile &&
+			 newOffset < file->curOffset)
+	{
+		/*
+		 * The truncate point is within the existing file but prior to the
+		 * current position, so we can forget the current buffer and reset the
+		 * current position.
+		 */
+		file->curOffset = newOffset;
+		file->pos = 0;
+		file->nbytes = 0;
+	}
+	else if (newFile < file->curFile)
+	{
+		/*
+		 * The truncate point is prior to the current file, so need to reset
+		 * the current position accordingly.
+		 */
+		file->curFile = newFile;
+		file->curOffset = newOffset;
+		file->pos = 0;
+		file->nbytes = 0;
+	}
+	/* Nothing to do, if the truncate point is beyond current file. */
+}
diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c
new file mode 100644
index 0000000..e04bc39
--- /dev/null
+++ b/src/backend/storage/file/copydir.c
@@ -0,0 +1,216 @@
+/*-------------------------------------------------------------------------
+ *
+ * copydir.c
+ *	  copies a directory
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *	While "xcopy /e /i /q" works fine for copying directories, on Windows XP
+ *	it requires a Window handle which prevents it from working when invoked
+ *	as a service.
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/file/copydir.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "common/file_utils.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/copydir.h"
+#include "storage/fd.h"
+
+/*
+ * copydir: copy a directory
+ *
+ * If recurse is false, subdirectories are ignored.  Anything that's not
+ * a directory or a regular file is ignored.
+ */
+void
+copydir(const char *fromdir, const char *todir, bool recurse)
+{
+	DIR		   *xldir;
+	struct dirent *xlde;
+	char		fromfile[MAXPGPATH * 2];
+	char		tofile[MAXPGPATH * 2];
+
+	if (MakePGDirectory(todir) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not create directory \"%s\": %m", todir)));
+
+	xldir = AllocateDir(fromdir);
+
+	while ((xlde = ReadDir(xldir, fromdir)) != NULL)
+	{
+		PGFileType	xlde_type;
+
+		/* If we got a cancel signal during the copy of the directory, quit */
+		CHECK_FOR_INTERRUPTS();
+
+		if (strcmp(xlde->d_name, ".") == 0 ||
+			strcmp(xlde->d_name, "..") == 0)
+			continue;
+
+		snprintf(fromfile, sizeof(fromfile), "%s/%s", fromdir, xlde->d_name);
+		snprintf(tofile, sizeof(tofile), "%s/%s", todir, xlde->d_name);
+
+		xlde_type = get_dirent_type(fromfile, xlde, false, ERROR);
+
+		if (xlde_type == PGFILETYPE_DIR)
+		{
+			/* recurse to handle subdirectories */
+			if (recurse)
+				copydir(fromfile, tofile, true);
+		}
+		else if (xlde_type == PGFILETYPE_REG)
+			copy_file(fromfile, tofile);
+	}
+	FreeDir(xldir);
+
+	/*
+	 * Be paranoid here and fsync all files to ensure the copy is really done.
+	 * But if fsync is disabled, we're done.
+	 */
+	if (!enableFsync)
+		return;
+
+	xldir = AllocateDir(todir);
+
+	while ((xlde = ReadDir(xldir, todir)) != NULL)
+	{
+		if (strcmp(xlde->d_name, ".") == 0 ||
+			strcmp(xlde->d_name, "..") == 0)
+			continue;
+
+		snprintf(tofile, sizeof(tofile), "%s/%s", todir, xlde->d_name);
+
+		/*
+		 * We don't need to sync subdirectories here since the recursive
+		 * copydir will do it before it returns
+		 */
+		if (get_dirent_type(tofile, xlde, false, ERROR) == PGFILETYPE_REG)
+			fsync_fname(tofile, false);
+	}
+	FreeDir(xldir);
+
+	/*
+	 * It's important to fsync the destination directory itself as individual
+	 * file fsyncs don't guarantee that the directory entry for the file is
+	 * synced. Recent versions of ext4 have made the window much wider but
+	 * it's been true for ext3 and other filesystems in the past.
+	 */
+	fsync_fname(todir, true);
+}
+
+/*
+ * copy one file
+ */
+void
+copy_file(const char *fromfile, const char *tofile)
+{
+	char	   *buffer;
+	int			srcfd;
+	int			dstfd;
+	int			nbytes;
+	off_t		offset;
+	off_t		flush_offset;
+
+	/* Size of copy buffer (read and write requests) */
+#define COPY_BUF_SIZE (8 * BLCKSZ)
+
+	/*
+	 * Size of data flush requests.  It seems beneficial on most platforms to
+	 * do this every 1MB or so.  But macOS, at least with early releases of
+	 * APFS, is really unfriendly to small mmap/msync requests, so there do it
+	 * only every 32MB.
+	 */
+#if defined(__darwin__)
+#define FLUSH_DISTANCE (32 * 1024 * 1024)
+#else
+#define FLUSH_DISTANCE (1024 * 1024)
+#endif
+
+	/* Use palloc to ensure we get a maxaligned buffer */
+	buffer = palloc(COPY_BUF_SIZE);
+
+	/*
+	 * Open the files
+	 */
+	srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY);
+	if (srcfd < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", fromfile)));
+
+	dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+	if (dstfd < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not create file \"%s\": %m", tofile)));
+
+	/*
+	 * Do the data copying.
+	 */
+	flush_offset = 0;
+	for (offset = 0;; offset += nbytes)
+	{
+		/* If we got a cancel signal during the copy of the file, quit */
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * We fsync the files later, but during the copy, flush them every so
+		 * often to avoid spamming the cache and hopefully get the kernel to
+		 * start writing them out before the fsync comes.
+		 */
+		if (offset - flush_offset >= FLUSH_DISTANCE)
+		{
+			pg_flush_data(dstfd, flush_offset, offset - flush_offset);
+			flush_offset = offset;
+		}
+
+		pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_READ);
+		nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
+		pgstat_report_wait_end();
+		if (nbytes < 0)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not read file \"%s\": %m", fromfile)));
+		if (nbytes == 0)
+			break;
+		errno = 0;
+		pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_WRITE);
+		if ((int) write(dstfd, buffer, nbytes) != nbytes)
+		{
+			/* if write didn't set errno, assume problem is no disk space */
+			if (errno == 0)
+				errno = ENOSPC;
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not write to file \"%s\": %m", tofile)));
+		}
+		pgstat_report_wait_end();
+	}
+
+	if (offset > flush_offset)
+		pg_flush_data(dstfd, flush_offset, offset - flush_offset);
+
+	if (CloseTransientFile(dstfd) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not close file \"%s\": %m", tofile)));
+
+	if (CloseTransientFile(srcfd) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not close file \"%s\": %m", fromfile)));
+
+	pfree(buffer);
+}
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
new file mode 100644
index 0000000..16b3e8f
--- /dev/null
+++ b/src/backend/storage/file/fd.c
@@ -0,0 +1,3976 @@
+/*-------------------------------------------------------------------------
+ *
+ * fd.c
+ *	  Virtual file descriptor code.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/file/fd.c
+ *
+ * NOTES:
+ *
+ * This code manages a cache of 'virtual' file descriptors (VFDs).
+ * The server opens many file descriptors for a variety of reasons,
+ * including base tables, scratch files (e.g., sort and hash spool
+ * files), and random calls to C library routines like system(3); it
+ * is quite easy to exceed system limits on the number of open files a
+ * single process can have.  (This is around 1024 on many modern
+ * operating systems, but may be lower on others.)
+ *
+ * VFDs are managed as an LRU pool, with actual OS file descriptors
+ * being opened and closed as needed.  Obviously, if a routine is
+ * opened using these interfaces, all subsequent operations must also
+ * be through these interfaces (the File type is not a real file
+ * descriptor).
+ *
+ * For this scheme to work, most (if not all) routines throughout the
+ * server should use these interfaces instead of calling the C library
+ * routines (e.g., open(2) and fopen(3)) themselves.  Otherwise, we
+ * may find ourselves short of real file descriptors anyway.
+ *
+ * INTERFACE ROUTINES
+ *
+ * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
+ * A File opened with OpenTemporaryFile is automatically deleted when the
+ * File is closed, either explicitly or implicitly at end of transaction or
+ * process exit. PathNameOpenFile is intended for files that are held open
+ * for a long time, like relation files. It is the caller's responsibility
+ * to close them, there is no automatic mechanism in fd.c for that.
+ *
+ * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
+ * temporary files that have names so that they can be shared between
+ * backends.  Such files are automatically closed and count against the
+ * temporary file limit of the backend that creates them, but unlike anonymous
+ * files they are not automatically deleted.  See sharedfileset.c for a shared
+ * ownership mechanism that provides automatic cleanup for shared files when
+ * the last of a group of backends detaches.
+ *
+ * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
+ * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
+ * They behave like the corresponding native functions, except that the handle
+ * is registered with the current subtransaction, and will be automatically
+ * closed at abort. These are intended mainly for short operations like
+ * reading a configuration file; there is a limit on the number of files that
+ * can be opened using these functions at any one time.
+ *
+ * Finally, BasicOpenFile is just a thin wrapper around open() that can
+ * release file descriptors in use by the virtual file descriptors if
+ * necessary. There is no automatic cleanup of file descriptors returned by
+ * BasicOpenFile, it is solely the caller's responsibility to close the file
+ * descriptor by calling close(2).
+ *
+ * If a non-virtual file descriptor needs to be held open for any length of
+ * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
+ * (and eventually ReleaseExternalFD), so that we can take it into account
+ * while deciding how many VFDs can be open.  This applies to FDs obtained
+ * with BasicOpenFile as well as those obtained without use of any fd.c API.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <dirent.h>
+#include <sys/file.h>
+#include <sys/param.h>
+#include <sys/resource.h>		/* for getrlimit */
+#include <sys/stat.h>
+#include <sys/types.h>
+#ifndef WIN32
+#include <sys/mman.h>
+#endif
+#include <limits.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "catalog/pg_tablespace.h"
+#include "common/file_perm.h"
+#include "common/file_utils.h"
+#include "common/pg_prng.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "portability/mem.h"
+#include "postmaster/startup.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "utils/guc.h"
+#include "utils/guc_hooks.h"
+#include "utils/resowner_private.h"
+#include "utils/varlena.h"
+
+/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
+#if defined(HAVE_SYNC_FILE_RANGE)
+#define PG_FLUSH_DATA_WORKS 1
+#elif !defined(WIN32) && defined(MS_ASYNC)
+#define PG_FLUSH_DATA_WORKS 1
+#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+#define PG_FLUSH_DATA_WORKS 1
+#endif
+
+/*
+ * We must leave some file descriptors free for system(), the dynamic loader,
+ * and other code that tries to open files without consulting fd.c.  This
+ * is the number left free.  (While we try fairly hard to prevent EMFILE
+ * errors, there's never any guarantee that we won't get ENFILE due to
+ * other processes chewing up FDs.  So it's a bad idea to try to open files
+ * without consulting fd.c.  Nonetheless we cannot control all code.)
+ *
+ * Because this is just a fixed setting, we are effectively assuming that
+ * no such code will leave FDs open over the long term; otherwise the slop
+ * is likely to be insufficient.  Note in particular that we expect that
+ * loading a shared library does not result in any permanent increase in
+ * the number of open files.  (This appears to be true on most if not
+ * all platforms as of Feb 2004.)
+ */
+#define NUM_RESERVED_FDS		10
+
+/*
+ * If we have fewer than this many usable FDs after allowing for the reserved
+ * ones, choke.  (This value is chosen to work with "ulimit -n 64", but not
+ * much less than that.  Note that this value ensures numExternalFDs can be
+ * at least 16; as of this writing, the contrib/postgres_fdw regression tests
+ * will not pass unless that can grow to at least 14.)
+ */
+#define FD_MINFREE				48
+
+/*
+ * A number of platforms allow individual processes to open many more files
+ * than they can really support when *many* processes do the same thing.
+ * This GUC parameter lets the DBA limit max_safe_fds to something less than
+ * what the postmaster's initial probe suggests will work.
+ */
+int			max_files_per_process = 1000;
+
+/*
+ * Maximum number of file descriptors to open for operations that fd.c knows
+ * about (VFDs, AllocateFile etc, or "external" FDs).  This is initialized
+ * to a conservative value, and remains that way indefinitely in bootstrap or
+ * standalone-backend cases.  In normal postmaster operation, the postmaster
+ * calls set_max_safe_fds() late in initialization to update the value, and
+ * that value is then inherited by forked subprocesses.
+ *
+ * Note: the value of max_files_per_process is taken into account while
+ * setting this variable, and so need not be tested separately.
+ */
+int			max_safe_fds = FD_MINFREE;	/* default if not changed */
+
+/* Whether it is safe to continue running after fsync() fails. */
+bool		data_sync_retry = false;
+
+/* How SyncDataDirectory() should do its job. */
+int			recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC;
+
+/* Which kinds of files should be opened with PG_O_DIRECT. */
+int			io_direct_flags;
+
+/* Debugging.... */
+
+#ifdef FDDEBUG
+#define DO_DB(A) \
+	do { \
+		int			_do_db_save_errno = errno; \
+		A; \
+		errno = _do_db_save_errno; \
+	} while (0)
+#else
+#define DO_DB(A) \
+	((void) 0)
+#endif
+
+#define VFD_CLOSED (-1)
+
+#define FileIsValid(file) \
+	((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
+
+#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
+
+/* these are the assigned bits in fdstate below: */
+#define FD_DELETE_AT_CLOSE	(1 << 0)	/* T = delete when closed */
+#define FD_CLOSE_AT_EOXACT	(1 << 1)	/* T = close at eoXact */
+#define FD_TEMP_FILE_LIMIT	(1 << 2)	/* T = respect temp_file_limit */
+
+typedef struct vfd
+{
+	int			fd;				/* current FD, or VFD_CLOSED if none */
+	unsigned short fdstate;		/* bitflags for VFD's state */
+	ResourceOwner resowner;		/* owner, for automatic cleanup */
+	File		nextFree;		/* link to next free VFD, if in freelist */
+	File		lruMoreRecently;	/* doubly linked recency-of-use list */
+	File		lruLessRecently;
+	off_t		fileSize;		/* current size of file (0 if not temporary) */
+	char	   *fileName;		/* name of file, or NULL for unused VFD */
+	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
+	int			fileFlags;		/* open(2) flags for (re)opening the file */
+	mode_t		fileMode;		/* mode to pass to open(2) */
+} Vfd;
+
+/*
+ * Virtual File Descriptor array pointer and size.  This grows as
+ * needed.  'File' values are indexes into this array.
+ * Note that VfdCache[0] is not a usable VFD, just a list header.
+ */
+static Vfd *VfdCache;
+static Size SizeVfdCache = 0;
+
+/*
+ * Number of file descriptors known to be in use by VFD entries.
+ */
+static int	nfile = 0;
+
+/*
+ * Flag to tell whether it's worth scanning VfdCache looking for temp files
+ * to close
+ */
+static bool have_xact_temporary_files = false;
+
+/*
+ * Tracks the total size of all temporary files.  Note: when temp_file_limit
+ * is being enforced, this cannot overflow since the limit cannot be more
+ * than INT_MAX kilobytes.  When not enforcing, it could theoretically
+ * overflow, but we don't care.
+ */
+static uint64 temporary_files_size = 0;
+
+/* Temporary file access initialized and not yet shut down? */
+#ifdef USE_ASSERT_CHECKING
+static bool temporary_files_allowed = false;
+#endif
+
+/*
+ * List of OS handles opened with AllocateFile, AllocateDir and
+ * OpenTransientFile.
+ */
+typedef enum
+{
+	AllocateDescFile,
+	AllocateDescPipe,
+	AllocateDescDir,
+	AllocateDescRawFD
+} AllocateDescKind;
+
+typedef struct
+{
+	AllocateDescKind kind;
+	SubTransactionId create_subid;
+	union
+	{
+		FILE	   *file;
+		DIR		   *dir;
+		int			fd;
+	}			desc;
+} AllocateDesc;
+
+static int	numAllocatedDescs = 0;
+static int	maxAllocatedDescs = 0;
+static AllocateDesc *allocatedDescs = NULL;
+
+/*
+ * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
+ */
+static int	numExternalFDs = 0;
+
+/*
+ * Number of temporary files opened during the current session;
+ * this is used in generation of tempfile names.
+ */
+static long tempFileCounter = 0;
+
+/*
+ * Array of OIDs of temp tablespaces.  (Some entries may be InvalidOid,
+ * indicating that the current database's default tablespace should be used.)
+ * When numTempTableSpaces is -1, this has not been set in the current
+ * transaction.
+ */
+static Oid *tempTableSpaces = NULL;
+static int	numTempTableSpaces = -1;
+static int	nextTempTableSpace = 0;
+
+
+/*--------------------
+ *
+ * Private Routines
+ *
+ * Delete		   - delete a file from the Lru ring
+ * LruDelete	   - remove a file from the Lru ring and close its FD
+ * Insert		   - put a file at the front of the Lru ring
+ * LruInsert	   - put a file at the front of the Lru ring and open it
+ * ReleaseLruFile  - Release an fd by closing the last entry in the Lru ring
+ * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
+ * AllocateVfd	   - grab a free (or new) file record (from VfdCache)
+ * FreeVfd		   - free a file record
+ *
+ * The Least Recently Used ring is a doubly linked list that begins and
+ * ends on element zero.  Element zero is special -- it doesn't represent
+ * a file and its "fd" field always == VFD_CLOSED.  Element zero is just an
+ * anchor that shows us the beginning/end of the ring.
+ * Only VFD elements that are currently really open (have an FD assigned) are
+ * in the Lru ring.  Elements that are "virtually" open can be recognized
+ * by having a non-null fileName field.
+ *
+ * example:
+ *
+ *	   /--less----\				   /---------\
+ *	   v		   \			  v			  \
+ *	 #0 --more---> LeastRecentlyUsed --more-\ \
+ *	  ^\									| |
+ *	   \\less--> MostRecentlyUsedFile	<---/ |
+ *		\more---/					 \--less--/
+ *
+ *--------------------
+ */
+static void Delete(File file);
+static void LruDelete(File file);
+static void Insert(File file);
+static int	LruInsert(File file);
+static bool ReleaseLruFile(void);
+static void ReleaseLruFiles(void);
+static File AllocateVfd(void);
+static void FreeVfd(File file);
+
+static int	FileAccess(File file);
+static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
+static bool reserveAllocatedDesc(void);
+static int	FreeDesc(AllocateDesc *desc);
+
+static void BeforeShmemExit_Files(int code, Datum arg);
+static void CleanupTempFiles(bool isCommit, bool isProcExit);
+static void RemovePgTempRelationFiles(const char *tsdirname);
+static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
+
+static void walkdir(const char *path,
+					void (*action) (const char *fname, bool isdir, int elevel),
+					bool process_symlinks,
+					int elevel);
+#ifdef PG_FLUSH_DATA_WORKS
+static void pre_sync_fname(const char *fname, bool isdir, int elevel);
+#endif
+static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
+static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
+
+static int	fsync_parent_path(const char *fname, int elevel);
+
+
+/*
+ * pg_fsync --- do fsync with or without writethrough
+ */
+int
+pg_fsync(int fd)
+{
+#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
+	struct stat st;
+
+	/*
+	 * Some operating system implementations of fsync() have requirements
+	 * about the file access modes that were used when their file descriptor
+	 * argument was opened, and these requirements differ depending on whether
+	 * the file descriptor is for a directory.
+	 *
+	 * For any file descriptor that may eventually be handed to fsync(), we
+	 * should have opened it with access modes that are compatible with
+	 * fsync() on all supported systems, otherwise the code may not be
+	 * portable, even if it runs ok on the current system.
+	 *
+	 * We assert here that a descriptor for a file was opened with write
+	 * permissions (either O_RDWR or O_WRONLY) and for a directory without
+	 * write permissions (O_RDONLY).
+	 *
+	 * Ignore any fstat errors and let the follow-up fsync() do its work.
+	 * Doing this sanity check here counts for the case where fsync() is
+	 * disabled.
+	 */
+	if (fstat(fd, &st) == 0)
+	{
+		int			desc_flags = fcntl(fd, F_GETFL);
+
+		/*
+		 * O_RDONLY is historically 0, so just make sure that for directories
+		 * no write flags are used.
+		 */
+		if (S_ISDIR(st.st_mode))
+			Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
+		else
+			Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
+	}
+	errno = 0;
+#endif
+
+	/* #if is to skip the sync_method test if there's no need for it */
+#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
+	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
+		return pg_fsync_writethrough(fd);
+	else
+#endif
+		return pg_fsync_no_writethrough(fd);
+}
+
+
+/*
+ * pg_fsync_no_writethrough --- same as fsync except does nothing if
+ *	enableFsync is off
+ */
+int
+pg_fsync_no_writethrough(int fd)
+{
+	int			rc;
+
+	if (!enableFsync)
+		return 0;
+
+retry:
+	rc = fsync(fd);
+
+	if (rc == -1 && errno == EINTR)
+		goto retry;
+
+	return rc;
+}
+
+/*
+ * pg_fsync_writethrough
+ */
+int
+pg_fsync_writethrough(int fd)
+{
+	if (enableFsync)
+	{
+#ifdef WIN32
+		return _commit(fd);
+#elif defined(F_FULLFSYNC)
+		return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
+#else
+		errno = ENOSYS;
+		return -1;
+#endif
+	}
+	else
+		return 0;
+}
+
+/*
+ * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
+ */
+int
+pg_fdatasync(int fd)
+{
+	int			rc;
+
+	if (!enableFsync)
+		return 0;
+
+retry:
+	rc = fdatasync(fd);
+
+	if (rc == -1 && errno == EINTR)
+		goto retry;
+
+	return rc;
+}
+
+/*
+ * pg_flush_data --- advise OS that the described dirty data should be flushed
+ *
+ * offset of 0 with nbytes 0 means that the entire file should be flushed
+ */
+void
+pg_flush_data(int fd, off_t offset, off_t nbytes)
+{
+	/*
+	 * Right now file flushing is primarily used to avoid making later
+	 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
+	 * if fsyncs are disabled - that's a decision we might want to make
+	 * configurable at some point.
+	 */
+	if (!enableFsync)
+		return;
+
+	/*
+	 * We compile all alternatives that are supported on the current platform,
+	 * to find portability problems more easily.
+	 */
+#if defined(HAVE_SYNC_FILE_RANGE)
+	{
+		int			rc;
+		static bool not_implemented_by_kernel = false;
+
+		if (not_implemented_by_kernel)
+			return;
+
+retry:
+
+		/*
+		 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
+		 * tells the OS that writeback for the specified blocks should be
+		 * started, but that we don't want to wait for completion.  Note that
+		 * this call might block if too much dirty data exists in the range.
+		 * This is the preferable method on OSs supporting it, as it works
+		 * reliably when available (contrast to msync()) and doesn't flush out
+		 * clean data (like FADV_DONTNEED).
+		 */
+		rc = sync_file_range(fd, offset, nbytes,
+							 SYNC_FILE_RANGE_WRITE);
+		if (rc != 0)
+		{
+			int			elevel;
+
+			if (rc == EINTR)
+				goto retry;
+
+			/*
+			 * For systems that don't have an implementation of
+			 * sync_file_range() such as Windows WSL, generate only one
+			 * warning and then suppress all further attempts by this process.
+			 */
+			if (errno == ENOSYS)
+			{
+				elevel = WARNING;
+				not_implemented_by_kernel = true;
+			}
+			else
+				elevel = data_sync_elevel(WARNING);
+
+			ereport(elevel,
+					(errcode_for_file_access(),
+					 errmsg("could not flush dirty data: %m")));
+		}
+
+		return;
+	}
+#endif
+#if !defined(WIN32) && defined(MS_ASYNC)
+	{
+		void	   *p;
+		static int	pagesize = 0;
+
+		/*
+		 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
+		 * writeback. On linux it only does so if MS_SYNC is specified, but
+		 * then it does the writeback synchronously. Luckily all common linux
+		 * systems have sync_file_range().  This is preferable over
+		 * FADV_DONTNEED because it doesn't flush out clean data.
+		 *
+		 * We map the file (mmap()), tell the kernel to sync back the contents
+		 * (msync()), and then remove the mapping again (munmap()).
+		 */
+
+		/* mmap() needs actual length if we want to map whole file */
+		if (offset == 0 && nbytes == 0)
+		{
+			nbytes = lseek(fd, 0, SEEK_END);
+			if (nbytes < 0)
+			{
+				ereport(WARNING,
+						(errcode_for_file_access(),
+						 errmsg("could not determine dirty data size: %m")));
+				return;
+			}
+		}
+
+		/*
+		 * Some platforms reject partial-page mmap() attempts.  To deal with
+		 * that, just truncate the request to a page boundary.  If any extra
+		 * bytes don't get flushed, well, it's only a hint anyway.
+		 */
+
+		/* fetch pagesize only once */
+		if (pagesize == 0)
+			pagesize = sysconf(_SC_PAGESIZE);
+
+		/* align length to pagesize, dropping any fractional page */
+		if (pagesize > 0)
+			nbytes = (nbytes / pagesize) * pagesize;
+
+		/* fractional-page request is a no-op */
+		if (nbytes <= 0)
+			return;
+
+		/*
+		 * mmap could well fail, particularly on 32-bit platforms where there
+		 * may simply not be enough address space.  If so, silently fall
+		 * through to the next implementation.
+		 */
+		if (nbytes <= (off_t) SSIZE_MAX)
+			p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
+		else
+			p = MAP_FAILED;
+
+		if (p != MAP_FAILED)
+		{
+			int			rc;
+
+			rc = msync(p, (size_t) nbytes, MS_ASYNC);
+			if (rc != 0)
+			{
+				ereport(data_sync_elevel(WARNING),
+						(errcode_for_file_access(),
+						 errmsg("could not flush dirty data: %m")));
+				/* NB: need to fall through to munmap()! */
+			}
+
+			rc = munmap(p, (size_t) nbytes);
+			if (rc != 0)
+			{
+				/* FATAL error because mapping would remain */
+				ereport(FATAL,
+						(errcode_for_file_access(),
+						 errmsg("could not munmap() while flushing data: %m")));
+			}
+
+			return;
+		}
+	}
+#endif
+#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+	{
+		int			rc;
+
+		/*
+		 * Signal the kernel that the passed in range should not be cached
+		 * anymore. This has the, desired, side effect of writing out dirty
+		 * data, and the, undesired, side effect of likely discarding useful
+		 * clean cached blocks.  For the latter reason this is the least
+		 * preferable method.
+		 */
+
+		rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
+
+		if (rc != 0)
+		{
+			/* don't error out, this is just a performance optimization */
+			ereport(WARNING,
+					(errcode_for_file_access(),
+					 errmsg("could not flush dirty data: %m")));
+		}
+
+		return;
+	}
+#endif
+}
+
+/*
+ * Truncate an open file to a given length.
+ */
+static int
+pg_ftruncate(int fd, off_t length)
+{
+	int			ret;
+
+retry:
+	ret = ftruncate(fd, length);
+
+	if (ret == -1 && errno == EINTR)
+		goto retry;
+
+	return ret;
+}
+
+/*
+ * Truncate a file to a given length by name.
+ */
+int
+pg_truncate(const char *path, off_t length)
+{
+	int			ret;
+#ifdef WIN32
+	int			save_errno;
+	int			fd;
+
+	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
+	if (fd >= 0)
+	{
+		ret = pg_ftruncate(fd, length);
+		save_errno = errno;
+		CloseTransientFile(fd);
+		errno = save_errno;
+	}
+	else
+		ret = -1;
+#else
+
+retry:
+	ret = truncate(path, length);
+
+	if (ret == -1 && errno == EINTR)
+		goto retry;
+#endif
+
+	return ret;
+}
+
+/*
+ * fsync_fname -- fsync a file or directory, handling errors properly
+ *
+ * Try to fsync a file or directory. When doing the latter, ignore errors that
+ * indicate the OS just doesn't allow/require fsyncing directories.
+ */
+void
+fsync_fname(const char *fname, bool isdir)
+{
+	fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
+}
+
+/*
+ * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
+ *
+ * This routine ensures that, after returning, the effect of renaming file
+ * persists in case of a crash. A crash while this routine is running will
+ * leave you with either the pre-existing or the moved file in place of the
+ * new file; no mixed state or truncated files are possible.
+ *
+ * It does so by using fsync on the old filename and the possibly existing
+ * target filename before the rename, and the target file and directory after.
+ *
+ * Note that rename() cannot be used across arbitrary directories, as they
+ * might not be on the same filesystem. Therefore this routine does not
+ * support renaming across directories.
+ *
+ * Log errors with the caller specified severity.
+ *
+ * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
+ * valid upon return.
+ */
+int
+durable_rename(const char *oldfile, const char *newfile, int elevel)
+{
+	int			fd;
+
+	/*
+	 * First fsync the old and target path (if it exists), to ensure that they
+	 * are properly persistent on disk. Syncing the target file is not
+	 * strictly necessary, but it makes it easier to reason about crashes;
+	 * because it's then guaranteed that either source or target file exists
+	 * after a crash.
+	 */
+	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
+		return -1;
+
+	fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
+	if (fd < 0)
+	{
+		if (errno != ENOENT)
+		{
+			ereport(elevel,
+					(errcode_for_file_access(),
+					 errmsg("could not open file \"%s\": %m", newfile)));
+			return -1;
+		}
+	}
+	else
+	{
+		if (pg_fsync(fd) != 0)
+		{
+			int			save_errno;
+
+			/* close file upon error, might not be in transaction context */
+			save_errno = errno;
+			CloseTransientFile(fd);
+			errno = save_errno;
+
+			ereport(elevel,
+					(errcode_for_file_access(),
+					 errmsg("could not fsync file \"%s\": %m", newfile)));
+			return -1;
+		}
+
+		if (CloseTransientFile(fd) != 0)
+		{
+			ereport(elevel,
+					(errcode_for_file_access(),
+					 errmsg("could not close file \"%s\": %m", newfile)));
+			return -1;
+		}
+	}
+
+	/* Time to do the real deal... */
+	if (rename(oldfile, newfile) < 0)
+	{
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not rename file \"%s\" to \"%s\": %m",
+						oldfile, newfile)));
+		return -1;
+	}
+
+	/*
+	 * To guarantee renaming the file is persistent, fsync the file with its
+	 * new name, and its containing directory.
+	 */
+	if (fsync_fname_ext(newfile, false, false, elevel) != 0)
+		return -1;
+
+	if (fsync_parent_path(newfile, elevel) != 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * durable_unlink -- remove a file in a durable manner
+ *
+ * This routine ensures that, after returning, the effect of removing file
+ * persists in case of a crash. A crash while this routine is running will
+ * leave the system in no mixed state.
+ *
+ * It does so by using fsync on the parent directory of the file after the
+ * actual removal is done.
+ *
+ * Log errors with the severity specified by caller.
+ *
+ * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
+ * valid upon return.
+ */
+int
+durable_unlink(const char *fname, int elevel)
+{
+	if (unlink(fname) < 0)
+	{
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not remove file \"%s\": %m",
+						fname)));
+		return -1;
+	}
+
+	/*
+	 * To guarantee that the removal of the file is persistent, fsync its
+	 * parent directory.
+	 */
+	if (fsync_parent_path(fname, elevel) != 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * InitFileAccess --- initialize this module during backend startup
+ *
+ * This is called during either normal or standalone backend start.
+ * It is *not* called in the postmaster.
+ *
+ * Note that this does not initialize temporary file access, that is
+ * separately initialized via InitTemporaryFileAccess().
+ */
+void
+InitFileAccess(void)
+{
+	Assert(SizeVfdCache == 0);	/* call me only once */
+
+	/* initialize cache header entry */
+	VfdCache = (Vfd *) malloc(sizeof(Vfd));
+	if (VfdCache == NULL)
+		ereport(FATAL,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+
+	MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
+	VfdCache->fd = VFD_CLOSED;
+
+	SizeVfdCache = 1;
+}
+
+/*
+ * InitTemporaryFileAccess --- initialize temporary file access during startup
+ *
+ * This is called during either normal or standalone backend start.
+ * It is *not* called in the postmaster.
+ *
+ * This is separate from InitFileAccess() because temporary file cleanup can
+ * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
+ * our reporting has to happen before that. Low level file access should be
+ * available for longer, hence the separate initialization / shutdown of
+ * temporary file handling.
+ */
+void
+InitTemporaryFileAccess(void)
+{
+	Assert(SizeVfdCache != 0);	/* InitFileAccess() needs to have run */
+	Assert(!temporary_files_allowed);	/* call me only once */
+
+	/*
+	 * Register before-shmem-exit hook to ensure temp files are dropped while
+	 * we can still report stats.
+	 */
+	before_shmem_exit(BeforeShmemExit_Files, 0);
+
+#ifdef USE_ASSERT_CHECKING
+	temporary_files_allowed = true;
+#endif
+}
+
+/*
+ * count_usable_fds --- count how many FDs the system will let us open,
+ *		and estimate how many are already open.
+ *
+ * We stop counting if usable_fds reaches max_to_probe.  Note: a small
+ * value of max_to_probe might result in an underestimate of already_open;
+ * we must fill in any "gaps" in the set of used FDs before the calculation
+ * of already_open will give the right answer.  In practice, max_to_probe
+ * of a couple of dozen should be enough to ensure good results.
+ *
+ * We assume stderr (FD 2) is available for dup'ing.  While the calling
+ * script could theoretically close that, it would be a really bad idea,
+ * since then one risks loss of error messages from, e.g., libc.
+ */
+static void
+count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
+{
+	int		   *fd;
+	int			size;
+	int			used = 0;
+	int			highestfd = 0;
+	int			j;
+
+#ifdef HAVE_GETRLIMIT
+	struct rlimit rlim;
+	int			getrlimit_status;
+#endif
+
+	size = 1024;
+	fd = (int *) palloc(size * sizeof(int));
+
+#ifdef HAVE_GETRLIMIT
+	getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
+	if (getrlimit_status != 0)
+		ereport(WARNING, (errmsg("getrlimit failed: %m")));
+#endif							/* HAVE_GETRLIMIT */
+
+	/* dup until failure or probe limit reached */
+	for (;;)
+	{
+		int			thisfd;
+
+#ifdef HAVE_GETRLIMIT
+
+		/*
+		 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
+		 * some platforms
+		 */
+		if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
+			break;
+#endif
+
+		thisfd = dup(2);
+		if (thisfd < 0)
+		{
+			/* Expect EMFILE or ENFILE, else it's fishy */
+			if (errno != EMFILE && errno != ENFILE)
+				elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
+			break;
+		}
+
+		if (used >= size)
+		{
+			size *= 2;
+			fd = (int *) repalloc(fd, size * sizeof(int));
+		}
+		fd[used++] = thisfd;
+
+		if (highestfd < thisfd)
+			highestfd = thisfd;
+
+		if (used >= max_to_probe)
+			break;
+	}
+
+	/* release the files we opened */
+	for (j = 0; j < used; j++)
+		close(fd[j]);
+
+	pfree(fd);
+
+	/*
+	 * Return results.  usable_fds is just the number of successful dups. We
+	 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
+	 * number) and so already_open is highestfd+1 - usable_fds.
+	 */
+	*usable_fds = used;
+	*already_open = highestfd + 1 - used;
+}
+
+/*
+ * set_max_safe_fds
+ *		Determine number of file descriptors that fd.c is allowed to use
+ */
+void
+set_max_safe_fds(void)
+{
+	int			usable_fds;
+	int			already_open;
+
+	/*----------
+	 * We want to set max_safe_fds to
+	 *			MIN(usable_fds, max_files_per_process - already_open)
+	 * less the slop factor for files that are opened without consulting
+	 * fd.c.  This ensures that we won't exceed either max_files_per_process
+	 * or the experimentally-determined EMFILE limit.
+	 *----------
+	 */
+	count_usable_fds(max_files_per_process,
+					 &usable_fds, &already_open);
+
+	max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
+
+	/*
+	 * Take off the FDs reserved for system() etc.
+	 */
+	max_safe_fds -= NUM_RESERVED_FDS;
+
+	/*
+	 * Make sure we still have enough to get by.
+	 */
+	if (max_safe_fds < FD_MINFREE)
+		ereport(FATAL,
+				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+				 errmsg("insufficient file descriptors available to start server process"),
+				 errdetail("System allows %d, server needs at least %d.",
+						   max_safe_fds + NUM_RESERVED_FDS,
+						   FD_MINFREE + NUM_RESERVED_FDS)));
+
+	elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
+		 max_safe_fds, usable_fds, already_open);
+}
+
+/*
+ * Open a file with BasicOpenFilePerm() and pass default file mode for the
+ * fileMode parameter.
+ */
+int
+BasicOpenFile(const char *fileName, int fileFlags)
+{
+	return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
+}
+
+/*
+ * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
+ *
+ * This is exported for use by places that really want a plain kernel FD,
+ * but need to be proof against running out of FDs.  Once an FD has been
+ * successfully returned, it is the caller's responsibility to ensure that
+ * it will not be leaked on ereport()!	Most users should *not* call this
+ * routine directly, but instead use the VFD abstraction level, which
+ * provides protection against descriptor leaks as well as management of
+ * files that need to be open for more than a short period of time.
+ *
+ * Ideally this should be the *only* direct call of open() in the backend.
+ * In practice, the postmaster calls open() directly, and there are some
+ * direct open() calls done early in backend startup.  Those are OK since
+ * this module wouldn't have any open files to close at that point anyway.
+ */
+int
+BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
+{
+	int			fd;
+
+tryAgain:
+#ifdef PG_O_DIRECT_USE_F_NOCACHE
+
+	/*
+	 * The value we defined to stand in for O_DIRECT when simulating it with
+	 * F_NOCACHE had better not collide with any of the standard flags.
+	 */
+	StaticAssertStmt((PG_O_DIRECT &
+					  (O_APPEND |
+					   O_CLOEXEC |
+					   O_CREAT |
+					   O_DSYNC |
+					   O_EXCL |
+					   O_RDWR |
+					   O_RDONLY |
+					   O_SYNC |
+					   O_TRUNC |
+					   O_WRONLY)) == 0,
+					 "PG_O_DIRECT value collides with standard flag");
+	fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
+#else
+	fd = open(fileName, fileFlags, fileMode);
+#endif
+
+	if (fd >= 0)
+	{
+#ifdef PG_O_DIRECT_USE_F_NOCACHE
+		if (fileFlags & PG_O_DIRECT)
+		{
+			if (fcntl(fd, F_NOCACHE, 1) < 0)
+			{
+				int			save_errno = errno;
+
+				close(fd);
+				errno = save_errno;
+				return -1;
+			}
+		}
+#endif
+
+		return fd;				/* success! */
+	}
+
+	if (errno == EMFILE || errno == ENFILE)
+	{
+		int			save_errno = errno;
+
+		ereport(LOG,
+				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+				 errmsg("out of file descriptors: %m; release and retry")));
+		errno = 0;
+		if (ReleaseLruFile())
+			goto tryAgain;
+		errno = save_errno;
+	}
+
+	return -1;					/* failure */
+}
+
+/*
+ * AcquireExternalFD - attempt to reserve an external file descriptor
+ *
+ * This should be used by callers that need to hold a file descriptor open
+ * over more than a short interval, but cannot use any of the other facilities
+ * provided by this module.
+ *
+ * The difference between this and the underlying ReserveExternalFD function
+ * is that this will report failure (by setting errno and returning false)
+ * if "too many" external FDs are already reserved.  This should be used in
+ * any code where the total number of FDs to be reserved is not predictable
+ * and small.
+ */
+bool
+AcquireExternalFD(void)
+{
+	/*
+	 * We don't want more than max_safe_fds / 3 FDs to be consumed for
+	 * "external" FDs.
+	 */
+	if (numExternalFDs < max_safe_fds / 3)
+	{
+		ReserveExternalFD();
+		return true;
+	}
+	errno = EMFILE;
+	return false;
+}
+
+/*
+ * ReserveExternalFD - report external consumption of a file descriptor
+ *
+ * This should be used by callers that need to hold a file descriptor open
+ * over more than a short interval, but cannot use any of the other facilities
+ * provided by this module.  This just tracks the use of the FD and closes
+ * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
+ *
+ * Call this directly only in code where failure to reserve the FD would be
+ * fatal; for example, the WAL-writing code does so, since the alternative is
+ * session failure.  Also, it's very unwise to do so in code that could
+ * consume more than one FD per process.
+ *
+ * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
+ * available, it doesn't matter too much whether this is called before or
+ * after actually opening the FD; but doing so beforehand reduces the risk of
+ * an EMFILE failure if not everybody played nice.  In any case, it's solely
+ * caller's responsibility to keep the external-FD count in sync with reality.
+ */
+void
+ReserveExternalFD(void)
+{
+	/*
+	 * Release VFDs if needed to stay safe.  Because we do this before
+	 * incrementing numExternalFDs, the final state will be as desired, i.e.,
+	 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
+	 */
+	ReleaseLruFiles();
+
+	numExternalFDs++;
+}
+
+/*
+ * ReleaseExternalFD - report release of an external file descriptor
+ *
+ * This is guaranteed not to change errno, so it can be used in failure paths.
+ */
+void
+ReleaseExternalFD(void)
+{
+	Assert(numExternalFDs > 0);
+	numExternalFDs--;
+}
+
+
+#if defined(FDDEBUG)
+
+static void
+_dump_lru(void)
+{
+	int			mru = VfdCache[0].lruLessRecently;
+	Vfd		   *vfdP = &VfdCache[mru];
+	char		buf[2048];
+
+	snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
+	while (mru != 0)
+	{
+		mru = vfdP->lruLessRecently;
+		vfdP = &VfdCache[mru];
+		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
+	}
+	snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
+	elog(LOG, "%s", buf);
+}
+#endif							/* FDDEBUG */
+
+static void
+Delete(File file)
+{
+	Vfd		   *vfdP;
+
+	Assert(file != 0);
+
+	DO_DB(elog(LOG, "Delete %d (%s)",
+			   file, VfdCache[file].fileName));
+	DO_DB(_dump_lru());
+
+	vfdP = &VfdCache[file];
+
+	VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
+	VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
+
+	DO_DB(_dump_lru());
+}
+
+static void
+LruDelete(File file)
+{
+	Vfd		   *vfdP;
+
+	Assert(file != 0);
+
+	DO_DB(elog(LOG, "LruDelete %d (%s)",
+			   file, VfdCache[file].fileName));
+
+	vfdP = &VfdCache[file];
+
+	/*
+	 * Close the file.  We aren't expecting this to fail; if it does, better
+	 * to leak the FD than to mess up our internal state.
+	 */
+	if (close(vfdP->fd) != 0)
+		elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
+			 "could not close file \"%s\": %m", vfdP->fileName);
+	vfdP->fd = VFD_CLOSED;
+	--nfile;
+
+	/* delete the vfd record from the LRU ring */
+	Delete(file);
+}
+
+static void
+Insert(File file)
+{
+	Vfd		   *vfdP;
+
+	Assert(file != 0);
+
+	DO_DB(elog(LOG, "Insert %d (%s)",
+			   file, VfdCache[file].fileName));
+	DO_DB(_dump_lru());
+
+	vfdP = &VfdCache[file];
+
+	vfdP->lruMoreRecently = 0;
+	vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
+	VfdCache[0].lruLessRecently = file;
+	VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
+
+	DO_DB(_dump_lru());
+}
+
+/* returns 0 on success, -1 on re-open failure (with errno set) */
+static int
+LruInsert(File file)
+{
+	Vfd		   *vfdP;
+
+	Assert(file != 0);
+
+	DO_DB(elog(LOG, "LruInsert %d (%s)",
+			   file, VfdCache[file].fileName));
+
+	vfdP = &VfdCache[file];
+
+	if (FileIsNotOpen(file))
+	{
+		/* Close excess kernel FDs. */
+		ReleaseLruFiles();
+
+		/*
+		 * The open could still fail for lack of file descriptors, eg due to
+		 * overall system file table being full.  So, be prepared to release
+		 * another FD if necessary...
+		 */
+		vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
+									 vfdP->fileMode);
+		if (vfdP->fd < 0)
+		{
+			DO_DB(elog(LOG, "re-open failed: %m"));
+			return -1;
+		}
+		else
+		{
+			++nfile;
+		}
+	}
+
+	/*
+	 * put it at the head of the Lru ring
+	 */
+
+	Insert(file);
+
+	return 0;
+}
+
+/*
+ * Release one kernel FD by closing the least-recently-used VFD.
+ */
+static bool
+ReleaseLruFile(void)
+{
+	DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
+
+	if (nfile > 0)
+	{
+		/*
+		 * There are opened files and so there should be at least one used vfd
+		 * in the ring.
+		 */
+		Assert(VfdCache[0].lruMoreRecently != 0);
+		LruDelete(VfdCache[0].lruMoreRecently);
+		return true;			/* freed a file */
+	}
+	return false;				/* no files available to free */
+}
+
+/*
+ * Release kernel FDs as needed to get under the max_safe_fds limit.
+ * After calling this, it's OK to try to open another file.
+ */
+static void
+ReleaseLruFiles(void)
+{
+	while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
+	{
+		if (!ReleaseLruFile())
+			break;
+	}
+}
+
+static File
+AllocateVfd(void)
+{
+	Index		i;
+	File		file;
+
+	DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
+
+	Assert(SizeVfdCache > 0);	/* InitFileAccess not called? */
+
+	if (VfdCache[0].nextFree == 0)
+	{
+		/*
+		 * The free list is empty so it is time to increase the size of the
+		 * array.  We choose to double it each time this happens. However,
+		 * there's not much point in starting *real* small.
+		 */
+		Size		newCacheSize = SizeVfdCache * 2;
+		Vfd		   *newVfdCache;
+
+		if (newCacheSize < 32)
+			newCacheSize = 32;
+
+		/*
+		 * Be careful not to clobber VfdCache ptr if realloc fails.
+		 */
+		newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
+		if (newVfdCache == NULL)
+			ereport(ERROR,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of memory")));
+		VfdCache = newVfdCache;
+
+		/*
+		 * Initialize the new entries and link them into the free list.
+		 */
+		for (i = SizeVfdCache; i < newCacheSize; i++)
+		{
+			MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
+			VfdCache[i].nextFree = i + 1;
+			VfdCache[i].fd = VFD_CLOSED;
+		}
+		VfdCache[newCacheSize - 1].nextFree = 0;
+		VfdCache[0].nextFree = SizeVfdCache;
+
+		/*
+		 * Record the new size
+		 */
+		SizeVfdCache = newCacheSize;
+	}
+
+	file = VfdCache[0].nextFree;
+
+	VfdCache[0].nextFree = VfdCache[file].nextFree;
+
+	return file;
+}
+
+static void
+FreeVfd(File file)
+{
+	Vfd		   *vfdP = &VfdCache[file];
+
+	DO_DB(elog(LOG, "FreeVfd: %d (%s)",
+			   file, vfdP->fileName ? vfdP->fileName : ""));
+
+	if (vfdP->fileName != NULL)
+	{
+		free(vfdP->fileName);
+		vfdP->fileName = NULL;
+	}
+	vfdP->fdstate = 0x0;
+
+	vfdP->nextFree = VfdCache[0].nextFree;
+	VfdCache[0].nextFree = file;
+}
+
+/* returns 0 on success, -1 on re-open failure (with errno set) */
+static int
+FileAccess(File file)
+{
+	int			returnValue;
+
+	DO_DB(elog(LOG, "FileAccess %d (%s)",
+			   file, VfdCache[file].fileName));
+
+	/*
+	 * Is the file open?  If not, open it and put it at the head of the LRU
+	 * ring (possibly closing the least recently used file to get an FD).
+	 */
+
+	if (FileIsNotOpen(file))
+	{
+		returnValue = LruInsert(file);
+		if (returnValue != 0)
+			return returnValue;
+	}
+	else if (VfdCache[0].lruLessRecently != file)
+	{
+		/*
+		 * We now know that the file is open and that it is not the last one
+		 * accessed, so we need to move it to the head of the Lru ring.
+		 */
+
+		Delete(file);
+		Insert(file);
+	}
+
+	return 0;
+}
+
+/*
+ * Called whenever a temporary file is deleted to report its size.
+ */
+static void
+ReportTemporaryFileUsage(const char *path, off_t size)
+{
+	pgstat_report_tempfile(size);
+
+	if (log_temp_files >= 0)
+	{
+		if ((size / 1024) >= log_temp_files)
+			ereport(LOG,
+					(errmsg("temporary file: path \"%s\", size %lu",
+							path, (unsigned long) size)));
+	}
+}
+
+/*
+ * Called to register a temporary file for automatic close.
+ * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
+ * before the file was opened.
+ */
+static void
+RegisterTemporaryFile(File file)
+{
+	ResourceOwnerRememberFile(CurrentResourceOwner, file);
+	VfdCache[file].resowner = CurrentResourceOwner;
+
+	/* Backup mechanism for closing at end of xact. */
+	VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
+	have_xact_temporary_files = true;
+}
+
+/*
+ *	Called when we get a shared invalidation message on some relation.
+ */
+#ifdef NOT_USED
+void
+FileInvalidate(File file)
+{
+	Assert(FileIsValid(file));
+	if (!FileIsNotOpen(file))
+		LruDelete(file);
+}
+#endif
+
+/*
+ * Open a file with PathNameOpenFilePerm() and pass default file mode for the
+ * fileMode parameter.
+ */
+File
+PathNameOpenFile(const char *fileName, int fileFlags)
+{
+	return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
+}
+
+/*
+ * open a file in an arbitrary directory
+ *
+ * NB: if the passed pathname is relative (which it usually is),
+ * it will be interpreted relative to the process' working directory
+ * (which should always be $PGDATA when this code is running).
+ */
+File
+PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
+{
+	char	   *fnamecopy;
+	File		file;
+	Vfd		   *vfdP;
+
+	DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
+			   fileName, fileFlags, fileMode));
+
+	/*
+	 * We need a malloc'd copy of the file name; fail cleanly if no room.
+	 */
+	fnamecopy = strdup(fileName);
+	if (fnamecopy == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+
+	file = AllocateVfd();
+	vfdP = &VfdCache[file];
+
+	/* Close excess kernel FDs. */
+	ReleaseLruFiles();
+
+	/*
+	 * Descriptors managed by VFDs are implicitly marked O_CLOEXEC.  The
+	 * client shouldn't be expected to know which kernel descriptors are
+	 * currently open, so it wouldn't make sense for them to be inherited by
+	 * executed subprograms.
+	 */
+	fileFlags |= O_CLOEXEC;
+
+	vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
+
+	if (vfdP->fd < 0)
+	{
+		int			save_errno = errno;
+
+		FreeVfd(file);
+		free(fnamecopy);
+		errno = save_errno;
+		return -1;
+	}
+	++nfile;
+	DO_DB(elog(LOG, "PathNameOpenFile: success %d",
+			   vfdP->fd));
+
+	vfdP->fileName = fnamecopy;
+	/* Saved flags are adjusted to be OK for re-opening file */
+	vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
+	vfdP->fileMode = fileMode;
+	vfdP->fileSize = 0;
+	vfdP->fdstate = 0x0;
+	vfdP->resowner = NULL;
+
+	Insert(file);
+
+	return file;
+}
+
+/*
+ * Create directory 'directory'.  If necessary, create 'basedir', which must
+ * be the directory above it.  This is designed for creating the top-level
+ * temporary directory on demand before creating a directory underneath it.
+ * Do nothing if the directory already exists.
+ *
+ * Directories created within the top-level temporary directory should begin
+ * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
+ * deleted at startup by RemovePgTempFiles().  Further subdirectories below
+ * that do not need any particular prefix.
+*/
+void
+PathNameCreateTemporaryDir(const char *basedir, const char *directory)
+{
+	if (MakePGDirectory(directory) < 0)
+	{
+		if (errno == EEXIST)
+			return;
+
+		/*
+		 * Failed.  Try to create basedir first in case it's missing. Tolerate
+		 * EEXIST to close a race against another process following the same
+		 * algorithm.
+		 */
+		if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("cannot create temporary directory \"%s\": %m",
+							basedir)));
+
+		/* Try again. */
+		if (MakePGDirectory(directory) < 0 && errno != EEXIST)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("cannot create temporary subdirectory \"%s\": %m",
+							directory)));
+	}
+}
+
+/*
+ * Delete a directory and everything in it, if it exists.
+ */
+void
+PathNameDeleteTemporaryDir(const char *dirname)
+{
+	struct stat statbuf;
+
+	/* Silently ignore missing directory. */
+	if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
+		return;
+
+	/*
+	 * Currently, walkdir doesn't offer a way for our passed in function to
+	 * maintain state.  Perhaps it should, so that we could tell the caller
+	 * whether this operation succeeded or failed.  Since this operation is
+	 * used in a cleanup path, we wouldn't actually behave differently: we'll
+	 * just log failures.
+	 */
+	walkdir(dirname, unlink_if_exists_fname, false, LOG);
+}
+
+/*
+ * Open a temporary file that will disappear when we close it.
+ *
+ * This routine takes care of generating an appropriate tempfile name.
+ * There's no need to pass in fileFlags or fileMode either, since only
+ * one setting makes any sense for a temp file.
+ *
+ * Unless interXact is true, the file is remembered by CurrentResourceOwner
+ * to ensure it's closed and deleted when it's no longer needed, typically at
+ * the end-of-transaction. In most cases, you don't want temporary files to
+ * outlive the transaction that created them, so this should be false -- but
+ * if you need "somewhat" temporary storage, this might be useful. In either
+ * case, the file is removed when the File is explicitly closed.
+ */
+File
+OpenTemporaryFile(bool interXact)
+{
+	File		file = 0;
+
+	Assert(temporary_files_allowed);	/* check temp file access is up */
+
+	/*
+	 * Make sure the current resource owner has space for this File before we
+	 * open it, if we'll be registering it below.
+	 */
+	if (!interXact)
+		ResourceOwnerEnlargeFiles(CurrentResourceOwner);
+
+	/*
+	 * If some temp tablespace(s) have been given to us, try to use the next
+	 * one.  If a given tablespace can't be found, we silently fall back to
+	 * the database's default tablespace.
+	 *
+	 * BUT: if the temp file is slated to outlive the current transaction,
+	 * force it into the database's default tablespace, so that it will not
+	 * pose a threat to possible tablespace drop attempts.
+	 */
+	if (numTempTableSpaces > 0 && !interXact)
+	{
+		Oid			tblspcOid = GetNextTempTableSpace();
+
+		if (OidIsValid(tblspcOid))
+			file = OpenTemporaryFileInTablespace(tblspcOid, false);
+	}
+
+	/*
+	 * If not, or if tablespace is bad, create in database's default
+	 * tablespace.  MyDatabaseTableSpace should normally be set before we get
+	 * here, but just in case it isn't, fall back to pg_default tablespace.
+	 */
+	if (file <= 0)
+		file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
+											 MyDatabaseTableSpace :
+											 DEFAULTTABLESPACE_OID,
+											 true);
+
+	/* Mark it for deletion at close and temporary file size limit */
+	VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
+
+	/* Register it with the current resource owner */
+	if (!interXact)
+		RegisterTemporaryFile(file);
+
+	return file;
+}
+
+/*
+ * Return the path of the temp directory in a given tablespace.
+ */
+void
+TempTablespacePath(char *path, Oid tablespace)
+{
+	/*
+	 * Identify the tempfile directory for this tablespace.
+	 *
+	 * If someone tries to specify pg_global, use pg_default instead.
+	 */
+	if (tablespace == InvalidOid ||
+		tablespace == DEFAULTTABLESPACE_OID ||
+		tablespace == GLOBALTABLESPACE_OID)
+		snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
+	else
+	{
+		/* All other tablespaces are accessed via symlinks */
+		snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
+				 tablespace, TABLESPACE_VERSION_DIRECTORY,
+				 PG_TEMP_FILES_DIR);
+	}
+}
+
+/*
+ * Open a temporary file in a specific tablespace.
+ * Subroutine for OpenTemporaryFile, which see for details.
+ */
+static File
+OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
+{
+	char		tempdirpath[MAXPGPATH];
+	char		tempfilepath[MAXPGPATH];
+	File		file;
+
+	TempTablespacePath(tempdirpath, tblspcOid);
+
+	/*
+	 * Generate a tempfile name that should be unique within the current
+	 * database instance.
+	 */
+	snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
+			 tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
+
+	/*
+	 * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
+	 * temp file that can be reused.
+	 */
+	file = PathNameOpenFile(tempfilepath,
+							O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
+	if (file <= 0)
+	{
+		/*
+		 * We might need to create the tablespace's tempfile directory, if no
+		 * one has yet done so.
+		 *
+		 * Don't check for an error from MakePGDirectory; it could fail if
+		 * someone else just did the same thing.  If it doesn't work then
+		 * we'll bomb out on the second create attempt, instead.
+		 */
+		(void) MakePGDirectory(tempdirpath);
+
+		file = PathNameOpenFile(tempfilepath,
+								O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
+		if (file <= 0 && rejectError)
+			elog(ERROR, "could not create temporary file \"%s\": %m",
+				 tempfilepath);
+	}
+
+	return file;
+}
+
+
+/*
+ * Create a new file.  The directory containing it must already exist.  Files
+ * created this way are subject to temp_file_limit and are automatically
+ * closed at end of transaction, but are not automatically deleted on close
+ * because they are intended to be shared between cooperating backends.
+ *
+ * If the file is inside the top-level temporary directory, its name should
+ * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
+ * and deleted at startup by RemovePgTempFiles().  Alternatively, it can be
+ * inside a directory created with PathNameCreateTemporaryDir(), in which case
+ * the prefix isn't needed.
+ */
+File
+PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
+{
+	File		file;
+
+	Assert(temporary_files_allowed);	/* check temp file access is up */
+
+	ResourceOwnerEnlargeFiles(CurrentResourceOwner);
+
+	/*
+	 * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
+	 * temp file that can be reused.
+	 */
+	file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
+	if (file <= 0)
+	{
+		if (error_on_failure)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not create temporary file \"%s\": %m",
+							path)));
+		else
+			return file;
+	}
+
+	/* Mark it for temp_file_limit accounting. */
+	VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
+
+	/* Register it for automatic close. */
+	RegisterTemporaryFile(file);
+
+	return file;
+}
+
+/*
+ * Open a file that was created with PathNameCreateTemporaryFile, possibly in
+ * another backend.  Files opened this way don't count against the
+ * temp_file_limit of the caller, are automatically closed at the end of the
+ * transaction but are not deleted on close.
+ */
+File
+PathNameOpenTemporaryFile(const char *path, int mode)
+{
+	File		file;
+
+	Assert(temporary_files_allowed);	/* check temp file access is up */
+
+	ResourceOwnerEnlargeFiles(CurrentResourceOwner);
+
+	file = PathNameOpenFile(path, mode | PG_BINARY);
+
+	/* If no such file, then we don't raise an error. */
+	if (file <= 0 && errno != ENOENT)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open temporary file \"%s\": %m",
+						path)));
+
+	if (file > 0)
+	{
+		/* Register it for automatic close. */
+		RegisterTemporaryFile(file);
+	}
+
+	return file;
+}
+
+/*
+ * Delete a file by pathname.  Return true if the file existed, false if
+ * didn't.
+ */
+bool
+PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
+{
+	struct stat filestats;
+	int			stat_errno;
+
+	/* Get the final size for pgstat reporting. */
+	if (stat(path, &filestats) != 0)
+		stat_errno = errno;
+	else
+		stat_errno = 0;
+
+	/*
+	 * Unlike FileClose's automatic file deletion code, we tolerate
+	 * non-existence to support BufFileDeleteFileSet which doesn't know how
+	 * many segments it has to delete until it runs out.
+	 */
+	if (stat_errno == ENOENT)
+		return false;
+
+	if (unlink(path) < 0)
+	{
+		if (errno != ENOENT)
+			ereport(error_on_failure ? ERROR : LOG,
+					(errcode_for_file_access(),
+					 errmsg("could not unlink temporary file \"%s\": %m",
+							path)));
+		return false;
+	}
+
+	if (stat_errno == 0)
+		ReportTemporaryFileUsage(path, filestats.st_size);
+	else
+	{
+		errno = stat_errno;
+		ereport(LOG,
+				(errcode_for_file_access(),
+				 errmsg("could not stat file \"%s\": %m", path)));
+	}
+
+	return true;
+}
+
+/*
+ * close a file when done with it
+ */
+void
+FileClose(File file)
+{
+	Vfd		   *vfdP;
+
+	Assert(FileIsValid(file));
+
+	DO_DB(elog(LOG, "FileClose: %d (%s)",
+			   file, VfdCache[file].fileName));
+
+	vfdP = &VfdCache[file];
+
+	if (!FileIsNotOpen(file))
+	{
+		/* close the file */
+		if (close(vfdP->fd) != 0)
+		{
+			/*
+			 * We may need to panic on failure to close non-temporary files;
+			 * see LruDelete.
+			 */
+			elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
+				 "could not close file \"%s\": %m", vfdP->fileName);
+		}
+
+		--nfile;
+		vfdP->fd = VFD_CLOSED;
+
+		/* remove the file from the lru ring */
+		Delete(file);
+	}
+
+	if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
+	{
+		/* Subtract its size from current usage (do first in case of error) */
+		temporary_files_size -= vfdP->fileSize;
+		vfdP->fileSize = 0;
+	}
+
+	/*
+	 * Delete the file if it was temporary, and make a log entry if wanted
+	 */
+	if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
+	{
+		struct stat filestats;
+		int			stat_errno;
+
+		/*
+		 * If we get an error, as could happen within the ereport/elog calls,
+		 * we'll come right back here during transaction abort.  Reset the
+		 * flag to ensure that we can't get into an infinite loop.  This code
+		 * is arranged to ensure that the worst-case consequence is failing to
+		 * emit log message(s), not failing to attempt the unlink.
+		 */
+		vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
+
+
+		/* first try the stat() */
+		if (stat(vfdP->fileName, &filestats))
+			stat_errno = errno;
+		else
+			stat_errno = 0;
+
+		/* in any case do the unlink */
+		if (unlink(vfdP->fileName))
+			ereport(LOG,
+					(errcode_for_file_access(),
+					 errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
+
+		/* and last report the stat results */
+		if (stat_errno == 0)
+			ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
+		else
+		{
+			errno = stat_errno;
+			ereport(LOG,
+					(errcode_for_file_access(),
+					 errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
+		}
+	}
+
+	/* Unregister it from the resource owner */
+	if (vfdP->resowner)
+		ResourceOwnerForgetFile(vfdP->resowner, file);
+
+	/*
+	 * Return the Vfd slot to the free list
+	 */
+	FreeVfd(file);
+}
+
+/*
+ * FilePrefetch - initiate asynchronous read of a given range of the file.
+ *
+ * Currently the only implementation of this function is using posix_fadvise
+ * which is the simplest standardized interface that accomplishes this.
+ * We could add an implementation using libaio in the future; but note that
+ * this API is inappropriate for libaio, which wants to have a buffer provided
+ * to read into.
+ */
+int
+FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
+{
+#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
+	int			returnCode;
+
+	Assert(FileIsValid(file));
+
+	DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
+			   file, VfdCache[file].fileName,
+			   (int64) offset, (int64) amount));
+
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
+
+retry:
+	pgstat_report_wait_start(wait_event_info);
+	returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
+							   POSIX_FADV_WILLNEED);
+	pgstat_report_wait_end();
+
+	if (returnCode == EINTR)
+		goto retry;
+
+	return returnCode;
+#else
+	Assert(FileIsValid(file));
+	return 0;
+#endif
+}
+
+void
+FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
+{
+	int			returnCode;
+
+	Assert(FileIsValid(file));
+
+	DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
+			   file, VfdCache[file].fileName,
+			   (int64) offset, (int64) nbytes));
+
+	if (nbytes <= 0)
+		return;
+
+	if (VfdCache[file].fileFlags & PG_O_DIRECT)
+		return;
+
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return;
+
+	pgstat_report_wait_start(wait_event_info);
+	pg_flush_data(VfdCache[file].fd, offset, nbytes);
+	pgstat_report_wait_end();
+}
+
+int
+FileRead(File file, void *buffer, size_t amount, off_t offset,
+		 uint32 wait_event_info)
+{
+	int			returnCode;
+	Vfd		   *vfdP;
+
+	Assert(FileIsValid(file));
+
+	DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %zu %p",
+			   file, VfdCache[file].fileName,
+			   (int64) offset,
+			   amount, buffer));
+
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
+
+	vfdP = &VfdCache[file];
+
+retry:
+	pgstat_report_wait_start(wait_event_info);
+	returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
+	pgstat_report_wait_end();
+
+	if (returnCode < 0)
+	{
+		/*
+		 * Windows may run out of kernel buffers and return "Insufficient
+		 * system resources" error.  Wait a bit and retry to solve it.
+		 *
+		 * It is rumored that EINTR is also possible on some Unix filesystems,
+		 * in which case immediate retry is indicated.
+		 */
+#ifdef WIN32
+		DWORD		error = GetLastError();
+
+		switch (error)
+		{
+			case ERROR_NO_SYSTEM_RESOURCES:
+				pg_usleep(1000L);
+				errno = EINTR;
+				break;
+			default:
+				_dosmaperr(error);
+				break;
+		}
+#endif
+		/* OK to retry if interrupted */
+		if (errno == EINTR)
+			goto retry;
+	}
+
+	return returnCode;
+}
+
+int
+FileWrite(File file, const void *buffer, size_t amount, off_t offset,
+		  uint32 wait_event_info)
+{
+	int			returnCode;
+	Vfd		   *vfdP;
+
+	Assert(FileIsValid(file));
+
+	DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %zu %p",
+			   file, VfdCache[file].fileName,
+			   (int64) offset,
+			   amount, buffer));
+
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
+
+	vfdP = &VfdCache[file];
+
+	/*
+	 * If enforcing temp_file_limit and it's a temp file, check to see if the
+	 * write would overrun temp_file_limit, and throw error if so.  Note: it's
+	 * really a modularity violation to throw error here; we should set errno
+	 * and return -1.  However, there's no way to report a suitable error
+	 * message if we do that.  All current callers would just throw error
+	 * immediately anyway, so this is safe at present.
+	 */
+	if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
+	{
+		off_t		past_write = offset + amount;
+
+		if (past_write > vfdP->fileSize)
+		{
+			uint64		newTotal = temporary_files_size;
+
+			newTotal += past_write - vfdP->fileSize;
+			if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
+				ereport(ERROR,
+						(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
+						 errmsg("temporary file size exceeds temp_file_limit (%dkB)",
+								temp_file_limit)));
+		}
+	}
+
+retry:
+	errno = 0;
+	pgstat_report_wait_start(wait_event_info);
+	returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
+	pgstat_report_wait_end();
+
+	/* if write didn't set errno, assume problem is no disk space */
+	if (returnCode != amount && errno == 0)
+		errno = ENOSPC;
+
+	if (returnCode >= 0)
+	{
+		/*
+		 * Maintain fileSize and temporary_files_size if it's a temp file.
+		 */
+		if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
+		{
+			off_t		past_write = offset + amount;
+
+			if (past_write > vfdP->fileSize)
+			{
+				temporary_files_size += past_write - vfdP->fileSize;
+				vfdP->fileSize = past_write;
+			}
+		}
+	}
+	else
+	{
+		/*
+		 * See comments in FileRead()
+		 */
+#ifdef WIN32
+		DWORD		error = GetLastError();
+
+		switch (error)
+		{
+			case ERROR_NO_SYSTEM_RESOURCES:
+				pg_usleep(1000L);
+				errno = EINTR;
+				break;
+			default:
+				_dosmaperr(error);
+				break;
+		}
+#endif
+		/* OK to retry if interrupted */
+		if (errno == EINTR)
+			goto retry;
+	}
+
+	return returnCode;
+}
+
+int
+FileSync(File file, uint32 wait_event_info)
+{
+	int			returnCode;
+
+	Assert(FileIsValid(file));
+
+	DO_DB(elog(LOG, "FileSync: %d (%s)",
+			   file, VfdCache[file].fileName));
+
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
+
+	pgstat_report_wait_start(wait_event_info);
+	returnCode = pg_fsync(VfdCache[file].fd);
+	pgstat_report_wait_end();
+
+	return returnCode;
+}
+
+/*
+ * Zero a region of the file.
+ *
+ * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
+ * appropriate error.
+ */
+int
+FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
+{
+	int			returnCode;
+	ssize_t		written;
+
+	Assert(FileIsValid(file));
+
+	DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
+			   file, VfdCache[file].fileName,
+			   (int64) offset, (int64) amount));
+
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
+
+	pgstat_report_wait_start(wait_event_info);
+	written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
+	pgstat_report_wait_end();
+
+	if (written < 0)
+		return -1;
+	else if (written != amount)
+	{
+		/* if errno is unset, assume problem is no disk space */
+		if (errno == 0)
+			errno = ENOSPC;
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
+ * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
+ * use FileZero() instead.
+ *
+ * Note that at least glibc() implements posix_fallocate() in userspace if not
+ * implemented by the filesystem. That's not the case for all environments
+ * though.
+ *
+ * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
+ * appropriate error.
+ */
+int
+FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
+{
+#ifdef HAVE_POSIX_FALLOCATE
+	int			returnCode;
+
+	Assert(FileIsValid(file));
+
+	DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
+			   file, VfdCache[file].fileName,
+			   (int64) offset, (int64) amount));
+
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return -1;
+
+retry:
+	pgstat_report_wait_start(wait_event_info);
+	returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
+	pgstat_report_wait_end();
+
+	if (returnCode == 0)
+		return 0;
+	else if (returnCode == EINTR)
+		goto retry;
+
+	/* for compatibility with %m printing etc */
+	errno = returnCode;
+
+	/*
+	 * Return in cases of a "real" failure, if fallocate is not supported,
+	 * fall through to the FileZero() backed implementation.
+	 */
+	if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
+		return -1;
+#endif
+
+	return FileZero(file, offset, amount, wait_event_info);
+}
+
+off_t
+FileSize(File file)
+{
+	Assert(FileIsValid(file));
+
+	DO_DB(elog(LOG, "FileSize %d (%s)",
+			   file, VfdCache[file].fileName));
+
+	if (FileIsNotOpen(file))
+	{
+		if (FileAccess(file) < 0)
+			return (off_t) -1;
+	}
+
+	return lseek(VfdCache[file].fd, 0, SEEK_END);
+}
+
+int
+FileTruncate(File file, off_t offset, uint32 wait_event_info)
+{
+	int			returnCode;
+
+	Assert(FileIsValid(file));
+
+	DO_DB(elog(LOG, "FileTruncate %d (%s)",
+			   file, VfdCache[file].fileName));
+
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
+
+	pgstat_report_wait_start(wait_event_info);
+	returnCode = pg_ftruncate(VfdCache[file].fd, offset);
+	pgstat_report_wait_end();
+
+	if (returnCode == 0 && VfdCache[file].fileSize > offset)
+	{
+		/* adjust our state for truncation of a temp file */
+		Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
+		temporary_files_size -= VfdCache[file].fileSize - offset;
+		VfdCache[file].fileSize = offset;
+	}
+
+	return returnCode;
+}
+
+/*
+ * Return the pathname associated with an open file.
+ *
+ * The returned string points to an internal buffer, which is valid until
+ * the file is closed.
+ */
+char *
+FilePathName(File file)
+{
+	Assert(FileIsValid(file));
+
+	return VfdCache[file].fileName;
+}
+
+/*
+ * Return the raw file descriptor of an opened file.
+ *
+ * The returned file descriptor will be valid until the file is closed, but
+ * there are a lot of things that can make that happen.  So the caller should
+ * be careful not to do much of anything else before it finishes using the
+ * returned file descriptor.
+ */
+int
+FileGetRawDesc(File file)
+{
+	Assert(FileIsValid(file));
+	return VfdCache[file].fd;
+}
+
+/*
+ * FileGetRawFlags - returns the file flags on open(2)
+ */
+int
+FileGetRawFlags(File file)
+{
+	Assert(FileIsValid(file));
+	return VfdCache[file].fileFlags;
+}
+
+/*
+ * FileGetRawMode - returns the mode bitmask passed to open(2)
+ */
+mode_t
+FileGetRawMode(File file)
+{
+	Assert(FileIsValid(file));
+	return VfdCache[file].fileMode;
+}
+
+/*
+ * Make room for another allocatedDescs[] array entry if needed and possible.
+ * Returns true if an array element is available.
+ */
+static bool
+reserveAllocatedDesc(void)
+{
+	AllocateDesc *newDescs;
+	int			newMax;
+
+	/* Quick out if array already has a free slot. */
+	if (numAllocatedDescs < maxAllocatedDescs)
+		return true;
+
+	/*
+	 * If the array hasn't yet been created in the current process, initialize
+	 * it with FD_MINFREE / 3 elements.  In many scenarios this is as many as
+	 * we will ever need, anyway.  We don't want to look at max_safe_fds
+	 * immediately because set_max_safe_fds() may not have run yet.
+	 */
+	if (allocatedDescs == NULL)
+	{
+		newMax = FD_MINFREE / 3;
+		newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
+		/* Out of memory already?  Treat as fatal error. */
+		if (newDescs == NULL)
+			ereport(ERROR,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of memory")));
+		allocatedDescs = newDescs;
+		maxAllocatedDescs = newMax;
+		return true;
+	}
+
+	/*
+	 * Consider enlarging the array beyond the initial allocation used above.
+	 * By the time this happens, max_safe_fds should be known accurately.
+	 *
+	 * We mustn't let allocated descriptors hog all the available FDs, and in
+	 * practice we'd better leave a reasonable number of FDs for VFD use.  So
+	 * set the maximum to max_safe_fds / 3.  (This should certainly be at
+	 * least as large as the initial size, FD_MINFREE / 3, so we aren't
+	 * tightening the restriction here.)  Recall that "external" FDs are
+	 * allowed to consume another third of max_safe_fds.
+	 */
+	newMax = max_safe_fds / 3;
+	if (newMax > maxAllocatedDescs)
+	{
+		newDescs = (AllocateDesc *) realloc(allocatedDescs,
+											newMax * sizeof(AllocateDesc));
+		/* Treat out-of-memory as a non-fatal error. */
+		if (newDescs == NULL)
+			return false;
+		allocatedDescs = newDescs;
+		maxAllocatedDescs = newMax;
+		return true;
+	}
+
+	/* Can't enlarge allocatedDescs[] any more. */
+	return false;
+}
+
+/*
+ * Routines that want to use stdio (ie, FILE*) should use AllocateFile
+ * rather than plain fopen().  This lets fd.c deal with freeing FDs if
+ * necessary to open the file.  When done, call FreeFile rather than fclose.
+ *
+ * Note that files that will be open for any significant length of time
+ * should NOT be handled this way, since they cannot share kernel file
+ * descriptors with other files; there is grave risk of running out of FDs
+ * if anyone locks down too many FDs.  Most callers of this routine are
+ * simply reading a config file that they will read and close immediately.
+ *
+ * fd.c will automatically close all files opened with AllocateFile at
+ * transaction commit or abort; this prevents FD leakage if a routine
+ * that calls AllocateFile is terminated prematurely by ereport(ERROR).
+ *
+ * Ideally this should be the *only* direct call of fopen() in the backend.
+ */
+FILE *
+AllocateFile(const char *name, const char *mode)
+{
+	FILE	   *file;
+
+	DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
+			   numAllocatedDescs, name));
+
+	/* Can we allocate another non-virtual FD? */
+	if (!reserveAllocatedDesc())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
+						maxAllocatedDescs, name)));
+
+	/* Close excess kernel FDs. */
+	ReleaseLruFiles();
+
+TryAgain:
+	if ((file = fopen(name, mode)) != NULL)
+	{
+		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+		desc->kind = AllocateDescFile;
+		desc->desc.file = file;
+		desc->create_subid = GetCurrentSubTransactionId();
+		numAllocatedDescs++;
+		return desc->desc.file;
+	}
+
+	if (errno == EMFILE || errno == ENFILE)
+	{
+		int			save_errno = errno;
+
+		ereport(LOG,
+				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+				 errmsg("out of file descriptors: %m; release and retry")));
+		errno = 0;
+		if (ReleaseLruFile())
+			goto TryAgain;
+		errno = save_errno;
+	}
+
+	return NULL;
+}
+
+/*
+ * Open a file with OpenTransientFilePerm() and pass default file mode for
+ * the fileMode parameter.
+ */
+int
+OpenTransientFile(const char *fileName, int fileFlags)
+{
+	return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
+}
+
+/*
+ * Like AllocateFile, but returns an unbuffered fd like open(2)
+ */
+int
+OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
+{
+	int			fd;
+
+	DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
+			   numAllocatedDescs, fileName));
+
+	/* Can we allocate another non-virtual FD? */
+	if (!reserveAllocatedDesc())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
+						maxAllocatedDescs, fileName)));
+
+	/* Close excess kernel FDs. */
+	ReleaseLruFiles();
+
+	fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
+
+	if (fd >= 0)
+	{
+		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+		desc->kind = AllocateDescRawFD;
+		desc->desc.fd = fd;
+		desc->create_subid = GetCurrentSubTransactionId();
+		numAllocatedDescs++;
+
+		return fd;
+	}
+
+	return -1;					/* failure */
+}
+
+/*
+ * Routines that want to initiate a pipe stream should use OpenPipeStream
+ * rather than plain popen().  This lets fd.c deal with freeing FDs if
+ * necessary.  When done, call ClosePipeStream rather than pclose.
+ *
+ * This function also ensures that the popen'd program is run with default
+ * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
+ * uses.  This ensures desirable response to, eg, closing a read pipe early.
+ */
+FILE *
+OpenPipeStream(const char *command, const char *mode)
+{
+	FILE	   *file;
+	int			save_errno;
+
+	DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
+			   numAllocatedDescs, command));
+
+	/* Can we allocate another non-virtual FD? */
+	if (!reserveAllocatedDesc())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+				 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
+						maxAllocatedDescs, command)));
+
+	/* Close excess kernel FDs. */
+	ReleaseLruFiles();
+
+TryAgain:
+	fflush(NULL);
+	pqsignal(SIGPIPE, SIG_DFL);
+	errno = 0;
+	file = popen(command, mode);
+	save_errno = errno;
+	pqsignal(SIGPIPE, SIG_IGN);
+	errno = save_errno;
+	if (file != NULL)
+	{
+		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+		desc->kind = AllocateDescPipe;
+		desc->desc.file = file;
+		desc->create_subid = GetCurrentSubTransactionId();
+		numAllocatedDescs++;
+		return desc->desc.file;
+	}
+
+	if (errno == EMFILE || errno == ENFILE)
+	{
+		ereport(LOG,
+				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+				 errmsg("out of file descriptors: %m; release and retry")));
+		if (ReleaseLruFile())
+			goto TryAgain;
+		errno = save_errno;
+	}
+
+	return NULL;
+}
+
+/*
+ * Free an AllocateDesc of any type.
+ *
+ * The argument *must* point into the allocatedDescs[] array.
+ */
+static int
+FreeDesc(AllocateDesc *desc)
+{
+	int			result;
+
+	/* Close the underlying object */
+	switch (desc->kind)
+	{
+		case AllocateDescFile:
+			result = fclose(desc->desc.file);
+			break;
+		case AllocateDescPipe:
+			result = pclose(desc->desc.file);
+			break;
+		case AllocateDescDir:
+			result = closedir(desc->desc.dir);
+			break;
+		case AllocateDescRawFD:
+			result = close(desc->desc.fd);
+			break;
+		default:
+			elog(ERROR, "AllocateDesc kind not recognized");
+			result = 0;			/* keep compiler quiet */
+			break;
+	}
+
+	/* Compact storage in the allocatedDescs array */
+	numAllocatedDescs--;
+	*desc = allocatedDescs[numAllocatedDescs];
+
+	return result;
+}
+
+/*
+ * Close a file returned by AllocateFile.
+ *
+ * Note we do not check fclose's return value --- it is up to the caller
+ * to handle close errors.
+ */
+int
+FreeFile(FILE *file)
+{
+	int			i;
+
+	DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
+
+	/* Remove file from list of allocated files, if it's present */
+	for (i = numAllocatedDescs; --i >= 0;)
+	{
+		AllocateDesc *desc = &allocatedDescs[i];
+
+		if (desc->kind == AllocateDescFile && desc->desc.file == file)
+			return FreeDesc(desc);
+	}
+
+	/* Only get here if someone passes us a file not in allocatedDescs */
+	elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
+
+	return fclose(file);
+}
+
+/*
+ * Close a file returned by OpenTransientFile.
+ *
+ * Note we do not check close's return value --- it is up to the caller
+ * to handle close errors.
+ */
+int
+CloseTransientFile(int fd)
+{
+	int			i;
+
+	DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
+
+	/* Remove fd from list of allocated files, if it's present */
+	for (i = numAllocatedDescs; --i >= 0;)
+	{
+		AllocateDesc *desc = &allocatedDescs[i];
+
+		if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
+			return FreeDesc(desc);
+	}
+
+	/* Only get here if someone passes us a file not in allocatedDescs */
+	elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
+
+	return close(fd);
+}
+
+/*
+ * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
+ * rather than plain opendir().  This lets fd.c deal with freeing FDs if
+ * necessary to open the directory, and with closing it after an elog.
+ * When done, call FreeDir rather than closedir.
+ *
+ * Returns NULL, with errno set, on failure.  Note that failure detection
+ * is commonly left to the following call of ReadDir or ReadDirExtended;
+ * see the comments for ReadDir.
+ *
+ * Ideally this should be the *only* direct call of opendir() in the backend.
+ */
+DIR *
+AllocateDir(const char *dirname)
+{
+	DIR		   *dir;
+
+	DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
+			   numAllocatedDescs, dirname));
+
+	/* Can we allocate another non-virtual FD? */
+	if (!reserveAllocatedDesc())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+				 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
+						maxAllocatedDescs, dirname)));
+
+	/* Close excess kernel FDs. */
+	ReleaseLruFiles();
+
+TryAgain:
+	if ((dir = opendir(dirname)) != NULL)
+	{
+		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+		desc->kind = AllocateDescDir;
+		desc->desc.dir = dir;
+		desc->create_subid = GetCurrentSubTransactionId();
+		numAllocatedDescs++;
+		return desc->desc.dir;
+	}
+
+	if (errno == EMFILE || errno == ENFILE)
+	{
+		int			save_errno = errno;
+
+		ereport(LOG,
+				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+				 errmsg("out of file descriptors: %m; release and retry")));
+		errno = 0;
+		if (ReleaseLruFile())
+			goto TryAgain;
+		errno = save_errno;
+	}
+
+	return NULL;
+}
+
+/*
+ * Read a directory opened with AllocateDir, ereport'ing any error.
+ *
+ * This is easier to use than raw readdir() since it takes care of some
+ * otherwise rather tedious and error-prone manipulation of errno.  Also,
+ * if you are happy with a generic error message for AllocateDir failure,
+ * you can just do
+ *
+ *		dir = AllocateDir(path);
+ *		while ((dirent = ReadDir(dir, path)) != NULL)
+ *			process dirent;
+ *		FreeDir(dir);
+ *
+ * since a NULL dir parameter is taken as indicating AllocateDir failed.
+ * (Make sure errno isn't changed between AllocateDir and ReadDir if you
+ * use this shortcut.)
+ *
+ * The pathname passed to AllocateDir must be passed to this routine too,
+ * but it is only used for error reporting.
+ */
+struct dirent *
+ReadDir(DIR *dir, const char *dirname)
+{
+	return ReadDirExtended(dir, dirname, ERROR);
+}
+
+/*
+ * Alternate version of ReadDir that allows caller to specify the elevel
+ * for any error report (whether it's reporting an initial failure of
+ * AllocateDir or a subsequent directory read failure).
+ *
+ * If elevel < ERROR, returns NULL after any error.  With the normal coding
+ * pattern, this will result in falling out of the loop immediately as
+ * though the directory contained no (more) entries.
+ */
+struct dirent *
+ReadDirExtended(DIR *dir, const char *dirname, int elevel)
+{
+	struct dirent *dent;
+
+	/* Give a generic message for AllocateDir failure, if caller didn't */
+	if (dir == NULL)
+	{
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not open directory \"%s\": %m",
+						dirname)));
+		return NULL;
+	}
+
+	errno = 0;
+	if ((dent = readdir(dir)) != NULL)
+		return dent;
+
+	if (errno)
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not read directory \"%s\": %m",
+						dirname)));
+	return NULL;
+}
+
+/*
+ * Close a directory opened with AllocateDir.
+ *
+ * Returns closedir's return value (with errno set if it's not 0).
+ * Note we do not check the return value --- it is up to the caller
+ * to handle close errors if wanted.
+ *
+ * Does nothing if dir == NULL; we assume that directory open failure was
+ * already reported if desired.
+ */
+int
+FreeDir(DIR *dir)
+{
+	int			i;
+
+	/* Nothing to do if AllocateDir failed */
+	if (dir == NULL)
+		return 0;
+
+	DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
+
+	/* Remove dir from list of allocated dirs, if it's present */
+	for (i = numAllocatedDescs; --i >= 0;)
+	{
+		AllocateDesc *desc = &allocatedDescs[i];
+
+		if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
+			return FreeDesc(desc);
+	}
+
+	/* Only get here if someone passes us a dir not in allocatedDescs */
+	elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
+
+	return closedir(dir);
+}
+
+
+/*
+ * Close a pipe stream returned by OpenPipeStream.
+ */
+int
+ClosePipeStream(FILE *file)
+{
+	int			i;
+
+	DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
+
+	/* Remove file from list of allocated files, if it's present */
+	for (i = numAllocatedDescs; --i >= 0;)
+	{
+		AllocateDesc *desc = &allocatedDescs[i];
+
+		if (desc->kind == AllocateDescPipe && desc->desc.file == file)
+			return FreeDesc(desc);
+	}
+
+	/* Only get here if someone passes us a file not in allocatedDescs */
+	elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
+
+	return pclose(file);
+}
+
+/*
+ * closeAllVfds
+ *
+ * Force all VFDs into the physically-closed state, so that the fewest
+ * possible number of kernel file descriptors are in use.  There is no
+ * change in the logical state of the VFDs.
+ */
+void
+closeAllVfds(void)
+{
+	Index		i;
+
+	if (SizeVfdCache > 0)
+	{
+		Assert(FileIsNotOpen(0));	/* Make sure ring not corrupted */
+		for (i = 1; i < SizeVfdCache; i++)
+		{
+			if (!FileIsNotOpen(i))
+				LruDelete(i);
+		}
+	}
+}
+
+
+/*
+ * SetTempTablespaces
+ *
+ * Define a list (actually an array) of OIDs of tablespaces to use for
+ * temporary files.  This list will be used until end of transaction,
+ * unless this function is called again before then.  It is caller's
+ * responsibility that the passed-in array has adequate lifespan (typically
+ * it'd be allocated in TopTransactionContext).
+ *
+ * Some entries of the array may be InvalidOid, indicating that the current
+ * database's default tablespace should be used.
+ */
+void
+SetTempTablespaces(Oid *tableSpaces, int numSpaces)
+{
+	Assert(numSpaces >= 0);
+	tempTableSpaces = tableSpaces;
+	numTempTableSpaces = numSpaces;
+
+	/*
+	 * Select a random starting point in the list.  This is to minimize
+	 * conflicts between backends that are most likely sharing the same list
+	 * of temp tablespaces.  Note that if we create multiple temp files in the
+	 * same transaction, we'll advance circularly through the list --- this
+	 * ensures that large temporary sort files are nicely spread across all
+	 * available tablespaces.
+	 */
+	if (numSpaces > 1)
+		nextTempTableSpace = pg_prng_uint64_range(&pg_global_prng_state,
+												  0, numSpaces - 1);
+	else
+		nextTempTableSpace = 0;
+}
+
+/*
+ * TempTablespacesAreSet
+ *
+ * Returns true if SetTempTablespaces has been called in current transaction.
+ * (This is just so that tablespaces.c doesn't need its own per-transaction
+ * state.)
+ */
+bool
+TempTablespacesAreSet(void)
+{
+	return (numTempTableSpaces >= 0);
+}
+
+/*
+ * GetTempTablespaces
+ *
+ * Populate an array with the OIDs of the tablespaces that should be used for
+ * temporary files.  (Some entries may be InvalidOid, indicating that the
+ * current database's default tablespace should be used.)  At most numSpaces
+ * entries will be filled.
+ * Returns the number of OIDs that were copied into the output array.
+ */
+int
+GetTempTablespaces(Oid *tableSpaces, int numSpaces)
+{
+	int			i;
+
+	Assert(TempTablespacesAreSet());
+	for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
+		tableSpaces[i] = tempTableSpaces[i];
+
+	return i;
+}
+
+/*
+ * GetNextTempTableSpace
+ *
+ * Select the next temp tablespace to use.  A result of InvalidOid means
+ * to use the current database's default tablespace.
+ */
+Oid
+GetNextTempTableSpace(void)
+{
+	if (numTempTableSpaces > 0)
+	{
+		/* Advance nextTempTableSpace counter with wraparound */
+		if (++nextTempTableSpace >= numTempTableSpaces)
+			nextTempTableSpace = 0;
+		return tempTableSpaces[nextTempTableSpace];
+	}
+	return InvalidOid;
+}
+
+
+/*
+ * AtEOSubXact_Files
+ *
+ * Take care of subtransaction commit/abort.  At abort, we close temp files
+ * that the subtransaction may have opened.  At commit, we reassign the
+ * files that were opened to the parent subtransaction.
+ */
+void
+AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
+				  SubTransactionId parentSubid)
+{
+	Index		i;
+
+	for (i = 0; i < numAllocatedDescs; i++)
+	{
+		if (allocatedDescs[i].create_subid == mySubid)
+		{
+			if (isCommit)
+				allocatedDescs[i].create_subid = parentSubid;
+			else
+			{
+				/* have to recheck the item after FreeDesc (ugly) */
+				FreeDesc(&allocatedDescs[i--]);
+			}
+		}
+	}
+}
+
+/*
+ * AtEOXact_Files
+ *
+ * This routine is called during transaction commit or abort.  All still-open
+ * per-transaction temporary file VFDs are closed, which also causes the
+ * underlying files to be deleted (although they should've been closed already
+ * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
+ * closed. We also forget any transaction-local temp tablespace list.
+ *
+ * The isCommit flag is used only to decide whether to emit warnings about
+ * unclosed files.
+ */
+void
+AtEOXact_Files(bool isCommit)
+{
+	CleanupTempFiles(isCommit, false);
+	tempTableSpaces = NULL;
+	numTempTableSpaces = -1;
+}
+
+/*
+ * BeforeShmemExit_Files
+ *
+ * before_shmem_exit hook to clean up temp files during backend shutdown.
+ * Here, we want to clean up *all* temp files including interXact ones.
+ */
+static void
+BeforeShmemExit_Files(int code, Datum arg)
+{
+	CleanupTempFiles(false, true);
+
+	/* prevent further temp files from being created */
+#ifdef USE_ASSERT_CHECKING
+	temporary_files_allowed = false;
+#endif
+}
+
+/*
+ * Close temporary files and delete their underlying files.
+ *
+ * isCommit: if true, this is normal transaction commit, and we don't
+ * expect any remaining files; warn if there are some.
+ *
+ * isProcExit: if true, this is being called as the backend process is
+ * exiting. If that's the case, we should remove all temporary files; if
+ * that's not the case, we are being called for transaction commit/abort
+ * and should only remove transaction-local temp files.  In either case,
+ * also clean up "allocated" stdio files, dirs and fds.
+ */
+static void
+CleanupTempFiles(bool isCommit, bool isProcExit)
+{
+	Index		i;
+
+	/*
+	 * Careful here: at proc_exit we need extra cleanup, not just
+	 * xact_temporary files.
+	 */
+	if (isProcExit || have_xact_temporary_files)
+	{
+		Assert(FileIsNotOpen(0));	/* Make sure ring not corrupted */
+		for (i = 1; i < SizeVfdCache; i++)
+		{
+			unsigned short fdstate = VfdCache[i].fdstate;
+
+			if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
+				VfdCache[i].fileName != NULL)
+			{
+				/*
+				 * If we're in the process of exiting a backend process, close
+				 * all temporary files. Otherwise, only close temporary files
+				 * local to the current transaction. They should be closed by
+				 * the ResourceOwner mechanism already, so this is just a
+				 * debugging cross-check.
+				 */
+				if (isProcExit)
+					FileClose(i);
+				else if (fdstate & FD_CLOSE_AT_EOXACT)
+				{
+					elog(WARNING,
+						 "temporary file %s not closed at end-of-transaction",
+						 VfdCache[i].fileName);
+					FileClose(i);
+				}
+			}
+		}
+
+		have_xact_temporary_files = false;
+	}
+
+	/* Complain if any allocated files remain open at commit. */
+	if (isCommit && numAllocatedDescs > 0)
+		elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
+			 numAllocatedDescs);
+
+	/* Clean up "allocated" stdio files, dirs and fds. */
+	while (numAllocatedDescs > 0)
+		FreeDesc(&allocatedDescs[0]);
+}
+
+
+/*
+ * Remove temporary and temporary relation files left over from a prior
+ * postmaster session
+ *
+ * This should be called during postmaster startup.  It will forcibly
+ * remove any leftover files created by OpenTemporaryFile and any leftover
+ * temporary relation files created by mdcreate.
+ *
+ * During post-backend-crash restart cycle, this routine is called when
+ * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
+ * queries are using temp files could result in useless storage usage that can
+ * only be reclaimed by a service restart. The argument against enabling it is
+ * that someone might want to examine the temporary files for debugging
+ * purposes. This does however mean that OpenTemporaryFile had better allow for
+ * collision with an existing temp file name.
+ *
+ * NOTE: this function and its subroutines generally report syscall failures
+ * with ereport(LOG) and keep going.  Removing temp files is not so critical
+ * that we should fail to start the database when we can't do it.
+ */
+void
+RemovePgTempFiles(void)
+{
+	char		temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
+	DIR		   *spc_dir;
+	struct dirent *spc_de;
+
+	/*
+	 * First process temp files in pg_default ($PGDATA/base)
+	 */
+	snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
+	RemovePgTempFilesInDir(temp_path, true, false);
+	RemovePgTempRelationFiles("base");
+
+	/*
+	 * Cycle through temp directories for all non-default tablespaces.
+	 */
+	spc_dir = AllocateDir("pg_tblspc");
+
+	while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
+	{
+		if (strcmp(spc_de->d_name, ".") == 0 ||
+			strcmp(spc_de->d_name, "..") == 0)
+			continue;
+
+		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
+				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
+		RemovePgTempFilesInDir(temp_path, true, false);
+
+		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
+				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
+		RemovePgTempRelationFiles(temp_path);
+	}
+
+	FreeDir(spc_dir);
+
+	/*
+	 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
+	 * DataDir as well.  However, that is *not* cleaned here because doing so
+	 * would create a race condition.  It's done separately, earlier in
+	 * postmaster startup.
+	 */
+}
+
+/*
+ * Process one pgsql_tmp directory for RemovePgTempFiles.
+ *
+ * If missing_ok is true, it's all right for the named directory to not exist.
+ * Any other problem results in a LOG message.  (missing_ok should be true at
+ * the top level, since pgsql_tmp directories are not created until needed.)
+ *
+ * At the top level, this should be called with unlink_all = false, so that
+ * only files matching the temporary name prefix will be unlinked.  When
+ * recursing it will be called with unlink_all = true to unlink everything
+ * under a top-level temporary directory.
+ *
+ * (These two flags could be replaced by one, but it seems clearer to keep
+ * them separate.)
+ */
+void
+RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
+{
+	DIR		   *temp_dir;
+	struct dirent *temp_de;
+	char		rm_path[MAXPGPATH * 2];
+
+	temp_dir = AllocateDir(tmpdirname);
+
+	if (temp_dir == NULL && errno == ENOENT && missing_ok)
+		return;
+
+	while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
+	{
+		if (strcmp(temp_de->d_name, ".") == 0 ||
+			strcmp(temp_de->d_name, "..") == 0)
+			continue;
+
+		snprintf(rm_path, sizeof(rm_path), "%s/%s",
+				 tmpdirname, temp_de->d_name);
+
+		if (unlink_all ||
+			strncmp(temp_de->d_name,
+					PG_TEMP_FILE_PREFIX,
+					strlen(PG_TEMP_FILE_PREFIX)) == 0)
+		{
+			PGFileType	type = get_dirent_type(rm_path, temp_de, false, LOG);
+
+			if (type == PGFILETYPE_ERROR)
+				continue;
+			else if (type == PGFILETYPE_DIR)
+			{
+				/* recursively remove contents, then directory itself */
+				RemovePgTempFilesInDir(rm_path, false, true);
+
+				if (rmdir(rm_path) < 0)
+					ereport(LOG,
+							(errcode_for_file_access(),
+							 errmsg("could not remove directory \"%s\": %m",
+									rm_path)));
+			}
+			else
+			{
+				if (unlink(rm_path) < 0)
+					ereport(LOG,
+							(errcode_for_file_access(),
+							 errmsg("could not remove file \"%s\": %m",
+									rm_path)));
+			}
+		}
+		else
+			ereport(LOG,
+					(errmsg("unexpected file found in temporary-files directory: \"%s\"",
+							rm_path)));
+	}
+
+	FreeDir(temp_dir);
+}
+
+/* Process one tablespace directory, look for per-DB subdirectories */
+static void
+RemovePgTempRelationFiles(const char *tsdirname)
+{
+	DIR		   *ts_dir;
+	struct dirent *de;
+	char		dbspace_path[MAXPGPATH * 2];
+
+	ts_dir = AllocateDir(tsdirname);
+
+	while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
+	{
+		/*
+		 * We're only interested in the per-database directories, which have
+		 * numeric names.  Note that this code will also (properly) ignore "."
+		 * and "..".
+		 */
+		if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
+			continue;
+
+		snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
+				 tsdirname, de->d_name);
+		RemovePgTempRelationFilesInDbspace(dbspace_path);
+	}
+
+	FreeDir(ts_dir);
+}
+
+/* Process one per-dbspace directory for RemovePgTempRelationFiles */
+static void
+RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
+{
+	DIR		   *dbspace_dir;
+	struct dirent *de;
+	char		rm_path[MAXPGPATH * 2];
+
+	dbspace_dir = AllocateDir(dbspacedirname);
+
+	while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
+	{
+		if (!looks_like_temp_rel_name(de->d_name))
+			continue;
+
+		snprintf(rm_path, sizeof(rm_path), "%s/%s",
+				 dbspacedirname, de->d_name);
+
+		if (unlink(rm_path) < 0)
+			ereport(LOG,
+					(errcode_for_file_access(),
+					 errmsg("could not remove file \"%s\": %m",
+							rm_path)));
+	}
+
+	FreeDir(dbspace_dir);
+}
+
+/* t<digits>_<digits>, or t<digits>_<digits>_<forkname> */
+bool
+looks_like_temp_rel_name(const char *name)
+{
+	int			pos;
+	int			savepos;
+
+	/* Must start with "t". */
+	if (name[0] != 't')
+		return false;
+
+	/* Followed by a non-empty string of digits and then an underscore. */
+	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
+		;
+	if (pos == 1 || name[pos] != '_')
+		return false;
+
+	/* Followed by another nonempty string of digits. */
+	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
+		;
+	if (savepos == pos)
+		return false;
+
+	/* We might have _forkname or .segment or both. */
+	if (name[pos] == '_')
+	{
+		int			forkchar = forkname_chars(&name[pos + 1], NULL);
+
+		if (forkchar <= 0)
+			return false;
+		pos += forkchar + 1;
+	}
+	if (name[pos] == '.')
+	{
+		int			segchar;
+
+		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
+			;
+		if (segchar <= 1)
+			return false;
+		pos += segchar;
+	}
+
+	/* Now we should be at the end. */
+	if (name[pos] != '\0')
+		return false;
+	return true;
+}
+
+#ifdef HAVE_SYNCFS
+static void
+do_syncfs(const char *path)
+{
+	int			fd;
+
+	ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
+							 path);
+
+	fd = OpenTransientFile(path, O_RDONLY);
+	if (fd < 0)
+	{
+		ereport(LOG,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", path)));
+		return;
+	}
+	if (syncfs(fd) < 0)
+		ereport(LOG,
+				(errcode_for_file_access(),
+				 errmsg("could not synchronize file system for file \"%s\": %m", path)));
+	CloseTransientFile(fd);
+}
+#endif
+
+/*
+ * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
+ * all potential filesystem, depending on recovery_init_sync_method setting.
+ *
+ * We fsync regular files and directories wherever they are, but we
+ * follow symlinks only for pg_wal and immediately under pg_tblspc.
+ * Other symlinks are presumed to point at files we're not responsible
+ * for fsyncing, and might not have privileges to write at all.
+ *
+ * Errors are logged but not considered fatal; that's because this is used
+ * only during database startup, to deal with the possibility that there are
+ * issued-but-unsynced writes pending against the data directory.  We want to
+ * ensure that such writes reach disk before anything that's done in the new
+ * run.  However, aborting on error would result in failure to start for
+ * harmless cases such as read-only files in the data directory, and that's
+ * not good either.
+ *
+ * Note that if we previously crashed due to a PANIC on fsync(), we'll be
+ * rewriting all changes again during recovery.
+ *
+ * Note we assume we're chdir'd into PGDATA to begin with.
+ */
+void
+SyncDataDirectory(void)
+{
+	bool		xlog_is_symlink;
+
+	/* We can skip this whole thing if fsync is disabled. */
+	if (!enableFsync)
+		return;
+
+	/*
+	 * If pg_wal is a symlink, we'll need to recurse into it separately,
+	 * because the first walkdir below will ignore it.
+	 */
+	xlog_is_symlink = false;
+
+	{
+		struct stat st;
+
+		if (lstat("pg_wal", &st) < 0)
+			ereport(LOG,
+					(errcode_for_file_access(),
+					 errmsg("could not stat file \"%s\": %m",
+							"pg_wal")));
+		else if (S_ISLNK(st.st_mode))
+			xlog_is_symlink = true;
+	}
+
+#ifdef HAVE_SYNCFS
+	if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_SYNCFS)
+	{
+		DIR		   *dir;
+		struct dirent *de;
+
+		/*
+		 * On Linux, we don't have to open every single file one by one.  We
+		 * can use syncfs() to sync whole filesystems.  We only expect
+		 * filesystem boundaries to exist where we tolerate symlinks, namely
+		 * pg_wal and the tablespaces, so we call syncfs() for each of those
+		 * directories.
+		 */
+
+		/* Prepare to report progress syncing the data directory via syncfs. */
+		begin_startup_progress_phase();
+
+		/* Sync the top level pgdata directory. */
+		do_syncfs(".");
+		/* If any tablespaces are configured, sync each of those. */
+		dir = AllocateDir("pg_tblspc");
+		while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
+		{
+			char		path[MAXPGPATH];
+
+			if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+				continue;
+
+			snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
+			do_syncfs(path);
+		}
+		FreeDir(dir);
+		/* If pg_wal is a symlink, process that too. */
+		if (xlog_is_symlink)
+			do_syncfs("pg_wal");
+		return;
+	}
+#endif							/* !HAVE_SYNCFS */
+
+#ifdef PG_FLUSH_DATA_WORKS
+	/* Prepare to report progress of the pre-fsync phase. */
+	begin_startup_progress_phase();
+
+	/*
+	 * If possible, hint to the kernel that we're soon going to fsync the data
+	 * directory and its contents.  Errors in this step are even less
+	 * interesting than normal, so log them only at DEBUG1.
+	 */
+	walkdir(".", pre_sync_fname, false, DEBUG1);
+	if (xlog_is_symlink)
+		walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
+	walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
+#endif
+
+	/* Prepare to report progress syncing the data directory via fsync. */
+	begin_startup_progress_phase();
+
+	/*
+	 * Now we do the fsync()s in the same order.
+	 *
+	 * The main call ignores symlinks, so in addition to specially processing
+	 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
+	 * process_symlinks = true.  Note that if there are any plain directories
+	 * in pg_tblspc, they'll get fsync'd twice.  That's not an expected case
+	 * so we don't worry about optimizing it.
+	 */
+	walkdir(".", datadir_fsync_fname, false, LOG);
+	if (xlog_is_symlink)
+		walkdir("pg_wal", datadir_fsync_fname, false, LOG);
+	walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
+}
+
+/*
+ * walkdir: recursively walk a directory, applying the action to each
+ * regular file and directory (including the named directory itself).
+ *
+ * If process_symlinks is true, the action and recursion are also applied
+ * to regular files and directories that are pointed to by symlinks in the
+ * given directory; otherwise symlinks are ignored.  Symlinks are always
+ * ignored in subdirectories, ie we intentionally don't pass down the
+ * process_symlinks flag to recursive calls.
+ *
+ * Errors are reported at level elevel, which might be ERROR or less.
+ *
+ * See also walkdir in file_utils.c, which is a frontend version of this
+ * logic.
+ */
+static void
+walkdir(const char *path,
+		void (*action) (const char *fname, bool isdir, int elevel),
+		bool process_symlinks,
+		int elevel)
+{
+	DIR		   *dir;
+	struct dirent *de;
+
+	dir = AllocateDir(path);
+
+	while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
+	{
+		char		subpath[MAXPGPATH * 2];
+
+		CHECK_FOR_INTERRUPTS();
+
+		if (strcmp(de->d_name, ".") == 0 ||
+			strcmp(de->d_name, "..") == 0)
+			continue;
+
+		snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
+
+		switch (get_dirent_type(subpath, de, process_symlinks, elevel))
+		{
+			case PGFILETYPE_REG:
+				(*action) (subpath, false, elevel);
+				break;
+			case PGFILETYPE_DIR:
+				walkdir(subpath, action, false, elevel);
+				break;
+			default:
+
+				/*
+				 * Errors are already reported directly by get_dirent_type(),
+				 * and any remaining symlinks and unknown file types are
+				 * ignored.
+				 */
+				break;
+		}
+	}
+
+	FreeDir(dir);				/* we ignore any error here */
+
+	/*
+	 * It's important to fsync the destination directory itself as individual
+	 * file fsyncs don't guarantee that the directory entry for the file is
+	 * synced.  However, skip this if AllocateDir failed; the action function
+	 * might not be robust against that.
+	 */
+	if (dir)
+		(*action) (path, true, elevel);
+}
+
+
+/*
+ * Hint to the OS that it should get ready to fsync() this file.
+ *
+ * Ignores errors trying to open unreadable files, and logs other errors at a
+ * caller-specified level.
+ */
+#ifdef PG_FLUSH_DATA_WORKS
+
+static void
+pre_sync_fname(const char *fname, bool isdir, int elevel)
+{
+	int			fd;
+
+	/* Don't try to flush directories, it'll likely just fail */
+	if (isdir)
+		return;
+
+	ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
+							 fname);
+
+	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
+
+	if (fd < 0)
+	{
+		if (errno == EACCES)
+			return;
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", fname)));
+		return;
+	}
+
+	/*
+	 * pg_flush_data() ignores errors, which is ok because this is only a
+	 * hint.
+	 */
+	pg_flush_data(fd, 0, 0);
+
+	if (CloseTransientFile(fd) != 0)
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not close file \"%s\": %m", fname)));
+}
+
+#endif							/* PG_FLUSH_DATA_WORKS */
+
+static void
+datadir_fsync_fname(const char *fname, bool isdir, int elevel)
+{
+	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
+							 fname);
+
+	/*
+	 * We want to silently ignoring errors about unreadable files.  Pass that
+	 * desire on to fsync_fname_ext().
+	 */
+	fsync_fname_ext(fname, isdir, true, elevel);
+}
+
+static void
+unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
+{
+	if (isdir)
+	{
+		if (rmdir(fname) != 0 && errno != ENOENT)
+			ereport(elevel,
+					(errcode_for_file_access(),
+					 errmsg("could not remove directory \"%s\": %m", fname)));
+	}
+	else
+	{
+		/* Use PathNameDeleteTemporaryFile to report filesize */
+		PathNameDeleteTemporaryFile(fname, false);
+	}
+}
+
+/*
+ * fsync_fname_ext -- Try to fsync a file or directory
+ *
+ * If ignore_perm is true, ignore errors upon trying to open unreadable
+ * files. Logs other errors at a caller-specified level.
+ *
+ * Returns 0 if the operation succeeded, -1 otherwise.
+ */
+int
+fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
+{
+	int			fd;
+	int			flags;
+	int			returncode;
+
+	/*
+	 * Some OSs require directories to be opened read-only whereas other
+	 * systems don't allow us to fsync files opened read-only; so we need both
+	 * cases here.  Using O_RDWR will cause us to fail to fsync files that are
+	 * not writable by our userid, but we assume that's OK.
+	 */
+	flags = PG_BINARY;
+	if (!isdir)
+		flags |= O_RDWR;
+	else
+		flags |= O_RDONLY;
+
+	fd = OpenTransientFile(fname, flags);
+
+	/*
+	 * Some OSs don't allow us to open directories at all (Windows returns
+	 * EACCES), just ignore the error in that case.  If desired also silently
+	 * ignoring errors about unreadable files. Log others.
+	 */
+	if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
+		return 0;
+	else if (fd < 0 && ignore_perm && errno == EACCES)
+		return 0;
+	else if (fd < 0)
+	{
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", fname)));
+		return -1;
+	}
+
+	returncode = pg_fsync(fd);
+
+	/*
+	 * Some OSes don't allow us to fsync directories at all, so we can ignore
+	 * those errors. Anything else needs to be logged.
+	 */
+	if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
+	{
+		int			save_errno;
+
+		/* close file upon error, might not be in transaction context */
+		save_errno = errno;
+		(void) CloseTransientFile(fd);
+		errno = save_errno;
+
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not fsync file \"%s\": %m", fname)));
+		return -1;
+	}
+
+	if (CloseTransientFile(fd) != 0)
+	{
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not close file \"%s\": %m", fname)));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * fsync_parent_path -- fsync the parent path of a file or directory
+ *
+ * This is aimed at making file operations persistent on disk in case of
+ * an OS crash or power failure.
+ */
+static int
+fsync_parent_path(const char *fname, int elevel)
+{
+	char		parentpath[MAXPGPATH];
+
+	strlcpy(parentpath, fname, MAXPGPATH);
+	get_parent_directory(parentpath);
+
+	/*
+	 * get_parent_directory() returns an empty string if the input argument is
+	 * just a file name (see comments in path.c), so handle that as being the
+	 * current directory.
+	 */
+	if (strlen(parentpath) == 0)
+		strlcpy(parentpath, ".", MAXPGPATH);
+
+	if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Create a PostgreSQL data sub-directory
+ *
+ * The data directory itself, and most of its sub-directories, are created at
+ * initdb time, but we do have some occasions when we create directories in
+ * the backend (CREATE TABLESPACE, for example).  In those cases, we want to
+ * make sure that those directories are created consistently.  Today, that means
+ * making sure that the created directory has the correct permissions, which is
+ * what pg_dir_create_mode tracks for us.
+ *
+ * Note that we also set the umask() based on what we understand the correct
+ * permissions to be (see file_perm.c).
+ *
+ * For permissions other than the default, mkdir() can be used directly, but
+ * be sure to consider carefully such cases -- a sub-directory with incorrect
+ * permissions in a PostgreSQL data directory could cause backups and other
+ * processes to fail.
+ */
+int
+MakePGDirectory(const char *directoryName)
+{
+	return mkdir(directoryName, pg_dir_create_mode);
+}
+
+/*
+ * Return the passed-in error level, or PANIC if data_sync_retry is off.
+ *
+ * Failure to fsync any data file is cause for immediate panic, unless
+ * data_sync_retry is enabled.  Data may have been written to the operating
+ * system and removed from our buffer pool already, and if we are running on
+ * an operating system that forgets dirty data on write-back failure, there
+ * may be only one copy of the data remaining: in the WAL.  A later attempt to
+ * fsync again might falsely report success.  Therefore we must not allow any
+ * further checkpoints to be attempted.  data_sync_retry can in theory be
+ * enabled on systems known not to drop dirty buffered data on write-back
+ * failure (with the likely outcome that checkpoints will continue to fail
+ * until the underlying problem is fixed).
+ *
+ * Any code that reports a failure from fsync() or related functions should
+ * filter the error level with this function.
+ */
+int
+data_sync_elevel(int elevel)
+{
+	return data_sync_retry ? elevel : PANIC;
+}
+
+bool
+check_debug_io_direct(char **newval, void **extra, GucSource source)
+{
+	bool		result = true;
+	int			flags;
+
+#if PG_O_DIRECT == 0
+	if (strcmp(*newval, "") != 0)
+	{
+		GUC_check_errdetail("debug_io_direct is not supported on this platform.");
+		result = false;
+	}
+	flags = 0;
+#else
+	List	   *elemlist;
+	ListCell   *l;
+	char	   *rawstring;
+
+	/* Need a modifiable copy of string */
+	rawstring = pstrdup(*newval);
+
+	if (!SplitGUCList(rawstring, ',', &elemlist))
+	{
+		GUC_check_errdetail("invalid list syntax in parameter \"%s\"",
+							"debug_io_direct");
+		pfree(rawstring);
+		list_free(elemlist);
+		return false;
+	}
+
+	flags = 0;
+	foreach(l, elemlist)
+	{
+		char	   *item = (char *) lfirst(l);
+
+		if (pg_strcasecmp(item, "data") == 0)
+			flags |= IO_DIRECT_DATA;
+		else if (pg_strcasecmp(item, "wal") == 0)
+			flags |= IO_DIRECT_WAL;
+		else if (pg_strcasecmp(item, "wal_init") == 0)
+			flags |= IO_DIRECT_WAL_INIT;
+		else
+		{
+			GUC_check_errdetail("invalid option \"%s\"", item);
+			result = false;
+			break;
+		}
+	}
+
+	/*
+	 * It's possible to configure block sizes smaller than our assumed I/O
+	 * alignment size, which could result in invalid I/O requests.
+	 */
+#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
+	if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
+	{
+		GUC_check_errdetail("debug_io_direct is not supported for WAL because XLOG_BLCKSZ is too small");
+		result = false;
+	}
+#endif
+#if BLCKSZ < PG_IO_ALIGN_SIZE
+	if (result && (flags & IO_DIRECT_DATA))
+	{
+		GUC_check_errdetail("debug_io_direct is not supported for data because BLCKSZ is too small");
+		result = false;
+	}
+#endif
+
+	pfree(rawstring);
+	list_free(elemlist);
+#endif
+
+	if (!result)
+		return result;
+
+	/* Save the flags in *extra, for use by assign_debug_io_direct */
+	*extra = guc_malloc(ERROR, sizeof(int));
+	*((int *) *extra) = flags;
+
+	return result;
+}
+
+extern void
+assign_debug_io_direct(const char *newval, void *extra)
+{
+	int		   *flags = (int *) extra;
+
+	io_direct_flags = *flags;
+}
diff --git a/src/backend/storage/file/fileset.c b/src/backend/storage/file/fileset.c
new file mode 100644
index 0000000..e9951b0
--- /dev/null
+++ b/src/backend/storage/file/fileset.c
@@ -0,0 +1,205 @@
+/*-------------------------------------------------------------------------
+ *
+ * fileset.c
+ *	  Management of named temporary files.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/file/fileset.c
+ *
+ * FileSets provide a temporary namespace (think directory) so that files can
+ * be discovered by name.
+ *
+ * FileSets can be used by backends when the temporary files need to be
+ * opened/closed multiple times and the underlying files need to survive across
+ * transactions.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "catalog/pg_tablespace.h"
+#include "commands/tablespace.h"
+#include "common/hashfn.h"
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/fileset.h"
+#include "utils/builtins.h"
+
+static void FileSetPath(char *path, FileSet *fileset, Oid tablespace);
+static void FilePath(char *path, FileSet *fileset, const char *name);
+static Oid	ChooseTablespace(const FileSet *fileset, const char *name);
+
+/*
+ * Initialize a space for temporary files. This API can be used by shared
+ * fileset as well as if the temporary files are used only by single backend
+ * but the files need to be opened and closed multiple times and also the
+ * underlying files need to survive across transactions.
+ *
+ * The callers are expected to explicitly remove such files by using
+ * FileSetDelete/FileSetDeleteAll.
+ *
+ * Files will be distributed over the tablespaces configured in
+ * temp_tablespaces.
+ *
+ * Under the covers the set is one or more directories which will eventually
+ * be deleted.
+ */
+void
+FileSetInit(FileSet *fileset)
+{
+	static uint32 counter = 0;
+
+	fileset->creator_pid = MyProcPid;
+	fileset->number = counter;
+	counter = (counter + 1) % INT_MAX;
+
+	/* Capture the tablespace OIDs so that all backends agree on them. */
+	PrepareTempTablespaces();
+	fileset->ntablespaces =
+		GetTempTablespaces(&fileset->tablespaces[0],
+						   lengthof(fileset->tablespaces));
+	if (fileset->ntablespaces == 0)
+	{
+		/* If the GUC is empty, use current database's default tablespace */
+		fileset->tablespaces[0] = MyDatabaseTableSpace;
+		fileset->ntablespaces = 1;
+	}
+	else
+	{
+		int			i;
+
+		/*
+		 * An entry of InvalidOid means use the default tablespace for the
+		 * current database.  Replace that now, to be sure that all users of
+		 * the FileSet agree on what to do.
+		 */
+		for (i = 0; i < fileset->ntablespaces; i++)
+		{
+			if (fileset->tablespaces[i] == InvalidOid)
+				fileset->tablespaces[i] = MyDatabaseTableSpace;
+		}
+	}
+}
+
+/*
+ * Create a new file in the given set.
+ */
+File
+FileSetCreate(FileSet *fileset, const char *name)
+{
+	char		path[MAXPGPATH];
+	File		file;
+
+	FilePath(path, fileset, name);
+	file = PathNameCreateTemporaryFile(path, false);
+
+	/* If we failed, see if we need to create the directory on demand. */
+	if (file <= 0)
+	{
+		char		tempdirpath[MAXPGPATH];
+		char		filesetpath[MAXPGPATH];
+		Oid			tablespace = ChooseTablespace(fileset, name);
+
+		TempTablespacePath(tempdirpath, tablespace);
+		FileSetPath(filesetpath, fileset, tablespace);
+		PathNameCreateTemporaryDir(tempdirpath, filesetpath);
+		file = PathNameCreateTemporaryFile(path, true);
+	}
+
+	return file;
+}
+
+/*
+ * Open a file that was created with FileSetCreate() */
+File
+FileSetOpen(FileSet *fileset, const char *name, int mode)
+{
+	char		path[MAXPGPATH];
+	File		file;
+
+	FilePath(path, fileset, name);
+	file = PathNameOpenTemporaryFile(path, mode);
+
+	return file;
+}
+
+/*
+ * Delete a file that was created with FileSetCreate().
+ *
+ * Return true if the file existed, false if didn't.
+ */
+bool
+FileSetDelete(FileSet *fileset, const char *name,
+			  bool error_on_failure)
+{
+	char		path[MAXPGPATH];
+
+	FilePath(path, fileset, name);
+
+	return PathNameDeleteTemporaryFile(path, error_on_failure);
+}
+
+/*
+ * Delete all files in the set.
+ */
+void
+FileSetDeleteAll(FileSet *fileset)
+{
+	char		dirpath[MAXPGPATH];
+	int			i;
+
+	/*
+	 * Delete the directory we created in each tablespace.  Doesn't fail
+	 * because we use this in error cleanup paths, but can generate LOG
+	 * message on IO error.
+	 */
+	for (i = 0; i < fileset->ntablespaces; ++i)
+	{
+		FileSetPath(dirpath, fileset, fileset->tablespaces[i]);
+		PathNameDeleteTemporaryDir(dirpath);
+	}
+}
+
+/*
+ * Build the path for the directory holding the files backing a FileSet in a
+ * given tablespace.
+ */
+static void
+FileSetPath(char *path, FileSet *fileset, Oid tablespace)
+{
+	char		tempdirpath[MAXPGPATH];
+
+	TempTablespacePath(tempdirpath, tablespace);
+	snprintf(path, MAXPGPATH, "%s/%s%lu.%u.fileset",
+			 tempdirpath, PG_TEMP_FILE_PREFIX,
+			 (unsigned long) fileset->creator_pid, fileset->number);
+}
+
+/*
+ * Sorting has to determine which tablespace a given temporary file belongs in.
+ */
+static Oid
+ChooseTablespace(const FileSet *fileset, const char *name)
+{
+	uint32		hash = hash_any((const unsigned char *) name, strlen(name));
+
+	return fileset->tablespaces[hash % fileset->ntablespaces];
+}
+
+/*
+ * Compute the full path of a file in a FileSet.
+ */
+static void
+FilePath(char *path, FileSet *fileset, const char *name)
+{
+	char		dirpath[MAXPGPATH];
+
+	FileSetPath(dirpath, fileset, ChooseTablespace(fileset, name));
+	snprintf(path, MAXPGPATH, "%s/%s", dirpath, name);
+}
diff --git a/src/backend/storage/file/meson.build b/src/backend/storage/file/meson.build
new file mode 100644
index 0000000..e7fe850
--- /dev/null
+++ b/src/backend/storage/file/meson.build
@@ -0,0 +1,10 @@
+# Copyright (c) 2022-2023, PostgreSQL Global Development Group
+
+backend_sources += files(
+  'buffile.c',
+  'copydir.c',
+  'fd.c',
+  'fileset.c',
+  'reinit.c',
+  'sharedfileset.c',
+)
diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c
new file mode 100644
index 0000000..fb55371
--- /dev/null
+++ b/src/backend/storage/file/reinit.c
@@ -0,0 +1,422 @@
+/*-------------------------------------------------------------------------
+ *
+ * reinit.c
+ *	  Reinitialization of unlogged relations
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/file/reinit.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "common/relpath.h"
+#include "postmaster/startup.h"
+#include "storage/copydir.h"
+#include "storage/fd.h"
+#include "storage/reinit.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+
+static void ResetUnloggedRelationsInTablespaceDir(const char *tsdirname,
+												  int op);
+static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname,
+											   int op);
+
+typedef struct
+{
+	Oid			reloid;			/* hash key */
+} unlogged_relation_entry;
+
+/*
+ * Reset unlogged relations from before the last restart.
+ *
+ * If op includes UNLOGGED_RELATION_CLEANUP, we remove all forks of any
+ * relation with an "init" fork, except for the "init" fork itself.
+ *
+ * If op includes UNLOGGED_RELATION_INIT, we copy the "init" fork to the main
+ * fork.
+ */
+void
+ResetUnloggedRelations(int op)
+{
+	char		temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY)];
+	DIR		   *spc_dir;
+	struct dirent *spc_de;
+	MemoryContext tmpctx,
+				oldctx;
+
+	/* Log it. */
+	elog(DEBUG1, "resetting unlogged relations: cleanup %d init %d",
+		 (op & UNLOGGED_RELATION_CLEANUP) != 0,
+		 (op & UNLOGGED_RELATION_INIT) != 0);
+
+	/*
+	 * Just to be sure we don't leak any memory, let's create a temporary
+	 * memory context for this operation.
+	 */
+	tmpctx = AllocSetContextCreate(CurrentMemoryContext,
+								   "ResetUnloggedRelations",
+								   ALLOCSET_DEFAULT_SIZES);
+	oldctx = MemoryContextSwitchTo(tmpctx);
+
+	/* Prepare to report progress resetting unlogged relations. */
+	begin_startup_progress_phase();
+
+	/*
+	 * First process unlogged files in pg_default ($PGDATA/base)
+	 */
+	ResetUnloggedRelationsInTablespaceDir("base", op);
+
+	/*
+	 * Cycle through directories for all non-default tablespaces.
+	 */
+	spc_dir = AllocateDir("pg_tblspc");
+
+	while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
+	{
+		if (strcmp(spc_de->d_name, ".") == 0 ||
+			strcmp(spc_de->d_name, "..") == 0)
+			continue;
+
+		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
+				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
+		ResetUnloggedRelationsInTablespaceDir(temp_path, op);
+	}
+
+	FreeDir(spc_dir);
+
+	/*
+	 * Restore memory context.
+	 */
+	MemoryContextSwitchTo(oldctx);
+	MemoryContextDelete(tmpctx);
+}
+
+/*
+ * Process one tablespace directory for ResetUnloggedRelations
+ */
+static void
+ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op)
+{
+	DIR		   *ts_dir;
+	struct dirent *de;
+	char		dbspace_path[MAXPGPATH * 2];
+
+	ts_dir = AllocateDir(tsdirname);
+
+	/*
+	 * If we get ENOENT on a tablespace directory, log it and return.  This
+	 * can happen if a previous DROP TABLESPACE crashed between removing the
+	 * tablespace directory and removing the symlink in pg_tblspc.  We don't
+	 * really want to prevent database startup in that scenario, so let it
+	 * pass instead.  Any other type of error will be reported by ReadDir
+	 * (causing a startup failure).
+	 */
+	if (ts_dir == NULL && errno == ENOENT)
+	{
+		ereport(LOG,
+				(errcode_for_file_access(),
+				 errmsg("could not open directory \"%s\": %m",
+						tsdirname)));
+		return;
+	}
+
+	while ((de = ReadDir(ts_dir, tsdirname)) != NULL)
+	{
+		/*
+		 * We're only interested in the per-database directories, which have
+		 * numeric names.  Note that this code will also (properly) ignore "."
+		 * and "..".
+		 */
+		if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
+			continue;
+
+		snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
+				 tsdirname, de->d_name);
+
+		if (op & UNLOGGED_RELATION_INIT)
+			ereport_startup_progress("resetting unlogged relations (init), elapsed time: %ld.%02d s, current path: %s",
+									 dbspace_path);
+		else if (op & UNLOGGED_RELATION_CLEANUP)
+			ereport_startup_progress("resetting unlogged relations (cleanup), elapsed time: %ld.%02d s, current path: %s",
+									 dbspace_path);
+
+		ResetUnloggedRelationsInDbspaceDir(dbspace_path, op);
+	}
+
+	FreeDir(ts_dir);
+}
+
+/*
+ * Process one per-dbspace directory for ResetUnloggedRelations
+ */
+static void
+ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
+{
+	DIR		   *dbspace_dir;
+	struct dirent *de;
+	char		rm_path[MAXPGPATH * 2];
+
+	/* Caller must specify at least one operation. */
+	Assert((op & (UNLOGGED_RELATION_CLEANUP | UNLOGGED_RELATION_INIT)) != 0);
+
+	/*
+	 * Cleanup is a two-pass operation.  First, we go through and identify all
+	 * the files with init forks.  Then, we go through again and nuke
+	 * everything with the same OID except the init fork.
+	 */
+	if ((op & UNLOGGED_RELATION_CLEANUP) != 0)
+	{
+		HTAB	   *hash;
+		HASHCTL		ctl;
+
+		/*
+		 * It's possible that someone could create a ton of unlogged relations
+		 * in the same database & tablespace, so we'd better use a hash table
+		 * rather than an array or linked list to keep track of which files
+		 * need to be reset.  Otherwise, this cleanup operation would be
+		 * O(n^2).
+		 */
+		ctl.keysize = sizeof(Oid);
+		ctl.entrysize = sizeof(unlogged_relation_entry);
+		ctl.hcxt = CurrentMemoryContext;
+		hash = hash_create("unlogged relation OIDs", 32, &ctl,
+						   HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+
+		/* Scan the directory. */
+		dbspace_dir = AllocateDir(dbspacedirname);
+		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
+		{
+			ForkNumber	forkNum;
+			int			relnumchars;
+			unlogged_relation_entry ent;
+
+			/* Skip anything that doesn't look like a relation data file. */
+			if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars,
+													 &forkNum))
+				continue;
+
+			/* Also skip it unless this is the init fork. */
+			if (forkNum != INIT_FORKNUM)
+				continue;
+
+			/*
+			 * Put the OID portion of the name into the hash table, if it
+			 * isn't already.
+			 */
+			ent.reloid = atooid(de->d_name);
+			(void) hash_search(hash, &ent, HASH_ENTER, NULL);
+		}
+
+		/* Done with the first pass. */
+		FreeDir(dbspace_dir);
+
+		/*
+		 * If we didn't find any init forks, there's no point in continuing;
+		 * we can bail out now.
+		 */
+		if (hash_get_num_entries(hash) == 0)
+		{
+			hash_destroy(hash);
+			return;
+		}
+
+		/*
+		 * Now, make a second pass and remove anything that matches.
+		 */
+		dbspace_dir = AllocateDir(dbspacedirname);
+		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
+		{
+			ForkNumber	forkNum;
+			int			relnumchars;
+			unlogged_relation_entry ent;
+
+			/* Skip anything that doesn't look like a relation data file. */
+			if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars,
+													 &forkNum))
+				continue;
+
+			/* We never remove the init fork. */
+			if (forkNum == INIT_FORKNUM)
+				continue;
+
+			/*
+			 * See whether the OID portion of the name shows up in the hash
+			 * table.  If so, nuke it!
+			 */
+			ent.reloid = atooid(de->d_name);
+			if (hash_search(hash, &ent, HASH_FIND, NULL))
+			{
+				snprintf(rm_path, sizeof(rm_path), "%s/%s",
+						 dbspacedirname, de->d_name);
+				if (unlink(rm_path) < 0)
+					ereport(ERROR,
+							(errcode_for_file_access(),
+							 errmsg("could not remove file \"%s\": %m",
+									rm_path)));
+				else
+					elog(DEBUG2, "unlinked file \"%s\"", rm_path);
+			}
+		}
+
+		/* Cleanup is complete. */
+		FreeDir(dbspace_dir);
+		hash_destroy(hash);
+	}
+
+	/*
+	 * Initialization happens after cleanup is complete: we copy each init
+	 * fork file to the corresponding main fork file.  Note that if we are
+	 * asked to do both cleanup and init, we may never get here: if the
+	 * cleanup code determines that there are no init forks in this dbspace,
+	 * it will return before we get to this point.
+	 */
+	if ((op & UNLOGGED_RELATION_INIT) != 0)
+	{
+		/* Scan the directory. */
+		dbspace_dir = AllocateDir(dbspacedirname);
+		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
+		{
+			ForkNumber	forkNum;
+			int			relnumchars;
+			char		relnumbuf[OIDCHARS + 1];
+			char		srcpath[MAXPGPATH * 2];
+			char		dstpath[MAXPGPATH];
+
+			/* Skip anything that doesn't look like a relation data file. */
+			if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars,
+													 &forkNum))
+				continue;
+
+			/* Also skip it unless this is the init fork. */
+			if (forkNum != INIT_FORKNUM)
+				continue;
+
+			/* Construct source pathname. */
+			snprintf(srcpath, sizeof(srcpath), "%s/%s",
+					 dbspacedirname, de->d_name);
+
+			/* Construct destination pathname. */
+			memcpy(relnumbuf, de->d_name, relnumchars);
+			relnumbuf[relnumchars] = '\0';
+			snprintf(dstpath, sizeof(dstpath), "%s/%s%s",
+					 dbspacedirname, relnumbuf, de->d_name + relnumchars + 1 +
+					 strlen(forkNames[INIT_FORKNUM]));
+
+			/* OK, we're ready to perform the actual copy. */
+			elog(DEBUG2, "copying %s to %s", srcpath, dstpath);
+			copy_file(srcpath, dstpath);
+		}
+
+		FreeDir(dbspace_dir);
+
+		/*
+		 * copy_file() above has already called pg_flush_data() on the files
+		 * it created. Now we need to fsync those files, because a checkpoint
+		 * won't do it for us while we're in recovery. We do this in a
+		 * separate pass to allow the kernel to perform all the flushes
+		 * (especially the metadata ones) at once.
+		 */
+		dbspace_dir = AllocateDir(dbspacedirname);
+		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
+		{
+			ForkNumber	forkNum;
+			int			relnumchars;
+			char		relnumbuf[OIDCHARS + 1];
+			char		mainpath[MAXPGPATH];
+
+			/* Skip anything that doesn't look like a relation data file. */
+			if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars,
+													 &forkNum))
+				continue;
+
+			/* Also skip it unless this is the init fork. */
+			if (forkNum != INIT_FORKNUM)
+				continue;
+
+			/* Construct main fork pathname. */
+			memcpy(relnumbuf, de->d_name, relnumchars);
+			relnumbuf[relnumchars] = '\0';
+			snprintf(mainpath, sizeof(mainpath), "%s/%s%s",
+					 dbspacedirname, relnumbuf, de->d_name + relnumchars + 1 +
+					 strlen(forkNames[INIT_FORKNUM]));
+
+			fsync_fname(mainpath, false);
+		}
+
+		FreeDir(dbspace_dir);
+
+		/*
+		 * Lastly, fsync the database directory itself, ensuring the
+		 * filesystem remembers the file creations and deletions we've done.
+		 * We don't bother with this during a call that does only
+		 * UNLOGGED_RELATION_CLEANUP, because if recovery crashes before we
+		 * get to doing UNLOGGED_RELATION_INIT, we'll redo the cleanup step
+		 * too at the next startup attempt.
+		 */
+		fsync_fname(dbspacedirname, true);
+	}
+}
+
+/*
+ * Basic parsing of putative relation filenames.
+ *
+ * This function returns true if the file appears to be in the correct format
+ * for a non-temporary relation and false otherwise.
+ *
+ * NB: If this function returns true, the caller is entitled to assume that
+ * *relnumchars has been set to a value no more than OIDCHARS, and thus
+ * that a buffer of OIDCHARS+1 characters is sufficient to hold the
+ * RelFileNumber portion of the filename.  This is critical to protect against
+ * a possible buffer overrun.
+ */
+bool
+parse_filename_for_nontemp_relation(const char *name, int *relnumchars,
+									ForkNumber *fork)
+{
+	int			pos;
+
+	/* Look for a non-empty string of digits (that isn't too long). */
+	for (pos = 0; isdigit((unsigned char) name[pos]); ++pos)
+		;
+	if (pos == 0 || pos > OIDCHARS)
+		return false;
+	*relnumchars = pos;
+
+	/* Check for a fork name. */
+	if (name[pos] != '_')
+		*fork = MAIN_FORKNUM;
+	else
+	{
+		int			forkchar;
+
+		forkchar = forkname_chars(&name[pos + 1], fork);
+		if (forkchar <= 0)
+			return false;
+		pos += forkchar + 1;
+	}
+
+	/* Check for a segment number. */
+	if (name[pos] == '.')
+	{
+		int			segchar;
+
+		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
+			;
+		if (segchar <= 1)
+			return false;
+		pos += segchar;
+	}
+
+	/* Now we should be at the end. */
+	if (name[pos] != '\0')
+		return false;
+	return true;
+}
diff --git a/src/backend/storage/file/sharedfileset.c b/src/backend/storage/file/sharedfileset.c
new file mode 100644
index 0000000..a13c8ed
--- /dev/null
+++ b/src/backend/storage/file/sharedfileset.c
@@ -0,0 +1,120 @@
+/*-------------------------------------------------------------------------
+ *
+ * sharedfileset.c
+ *	  Shared temporary file management.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/file/sharedfileset.c
+ *
+ * SharedFileSets provide a temporary namespace (think directory) so that
+ * files can be discovered by name, and a shared ownership semantics so that
+ * shared files survive until the last user detaches.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "catalog/pg_tablespace.h"
+#include "commands/tablespace.h"
+#include "common/hashfn.h"
+#include "miscadmin.h"
+#include "storage/dsm.h"
+#include "storage/ipc.h"
+#include "storage/sharedfileset.h"
+#include "utils/builtins.h"
+
+static void SharedFileSetOnDetach(dsm_segment *segment, Datum datum);
+
+/*
+ * Initialize a space for temporary files that can be opened by other backends.
+ * Other backends must attach to it before accessing it.  Associate this
+ * SharedFileSet with 'seg'.  Any contained files will be deleted when the
+ * last backend detaches.
+ *
+ * Under the covers the set is one or more directories which will eventually
+ * be deleted.
+ */
+void
+SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg)
+{
+	/* Initialize the shared fileset specific members. */
+	SpinLockInit(&fileset->mutex);
+	fileset->refcnt = 1;
+
+	/* Initialize the fileset. */
+	FileSetInit(&fileset->fs);
+
+	/* Register our cleanup callback. */
+	if (seg)
+		on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset));
+}
+
+/*
+ * Attach to a set of directories that was created with SharedFileSetInit.
+ */
+void
+SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg)
+{
+	bool		success;
+
+	SpinLockAcquire(&fileset->mutex);
+	if (fileset->refcnt == 0)
+		success = false;
+	else
+	{
+		++fileset->refcnt;
+		success = true;
+	}
+	SpinLockRelease(&fileset->mutex);
+
+	if (!success)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("could not attach to a SharedFileSet that is already destroyed")));
+
+	/* Register our cleanup callback. */
+	on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset));
+}
+
+/*
+ * Delete all files in the set.
+ */
+void
+SharedFileSetDeleteAll(SharedFileSet *fileset)
+{
+	FileSetDeleteAll(&fileset->fs);
+}
+
+/*
+ * Callback function that will be invoked when this backend detaches from a
+ * DSM segment holding a SharedFileSet that it has created or attached to.  If
+ * we are the last to detach, then try to remove the directories and
+ * everything in them.  We can't raise an error on failures, because this runs
+ * in error cleanup paths.
+ */
+static void
+SharedFileSetOnDetach(dsm_segment *segment, Datum datum)
+{
+	bool		unlink_all = false;
+	SharedFileSet *fileset = (SharedFileSet *) DatumGetPointer(datum);
+
+	SpinLockAcquire(&fileset->mutex);
+	Assert(fileset->refcnt > 0);
+	if (--fileset->refcnt == 0)
+		unlink_all = true;
+	SpinLockRelease(&fileset->mutex);
+
+	/*
+	 * If we are the last to detach, we delete the directory in all
+	 * tablespaces.  Note that we are still actually attached for the rest of
+	 * this function so we can safely access its data.
+	 */
+	if (unlink_all)
+		FileSetDeleteAll(&fileset->fs);
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-13 13:44:03 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-13 13:44:03 +0000
commit	293913568e6a7a86fd1479e1cff8e2ecb58d6568 (patch)
tree	fc3b469a3ec5ab71b36ea97cc7aaddb838423a0c /src/backend/storage/file
parent	Initial commit. (diff)
download	postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.tar.xz postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.zip