/*------------------------------------------------------------------------- * * fd.c * Virtual file descriptor code. * * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * src/backend/storage/file/fd.c * * NOTES: * * This code manages a cache of 'virtual' file descriptors (VFDs). * The server opens many file descriptors for a variety of reasons, * including base tables, scratch files (e.g., sort and hash spool * files), and random calls to C library routines like system(3); it * is quite easy to exceed system limits on the number of open files a * single process can have. (This is around 1024 on many modern * operating systems, but may be lower on others.) * * VFDs are managed as an LRU pool, with actual OS file descriptors * being opened and closed as needed. Obviously, if a routine is * opened using these interfaces, all subsequent operations must also * be through these interfaces (the File type is not a real file * descriptor). * * For this scheme to work, most (if not all) routines throughout the * server should use these interfaces instead of calling the C library * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we * may find ourselves short of real file descriptors anyway. * * INTERFACE ROUTINES * * PathNameOpenFile and OpenTemporaryFile are used to open virtual files. * A File opened with OpenTemporaryFile is automatically deleted when the * File is closed, either explicitly or implicitly at end of transaction or * process exit. PathNameOpenFile is intended for files that are held open * for a long time, like relation files. It is the caller's responsibility * to close them, there is no automatic mechanism in fd.c for that. * * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage * temporary files that have names so that they can be shared between * backends. 
Such files are automatically closed and count against the * temporary file limit of the backend that creates them, but unlike anonymous * files they are not automatically deleted. See sharedfileset.c for a shared * ownership mechanism that provides automatic cleanup for shared files when * the last of a group of backends detaches. * * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively. * They behave like the corresponding native functions, except that the handle * is registered with the current subtransaction, and will be automatically * closed at abort. These are intended mainly for short operations like * reading a configuration file; there is a limit on the number of files that * can be opened using these functions at any one time. * * Finally, BasicOpenFile is just a thin wrapper around open() that can * release file descriptors in use by the virtual file descriptors if * necessary. There is no automatic cleanup of file descriptors returned by * BasicOpenFile, it is solely the caller's responsibility to close the file * descriptor by calling close(2). * * If a non-virtual file descriptor needs to be held open for any length of * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD * (and eventually ReleaseExternalFD), so that we can take it into account * while deciding how many VFDs can be open. This applies to FDs obtained * with BasicOpenFile as well as those obtained without use of any fd.c API. 
* *------------------------------------------------------------------------- */ #include "postgres.h" #include #include #include #include #include #ifndef WIN32 #include #endif #include #include #include #ifdef HAVE_SYS_RESOURCE_H #include /* for getrlimit */ #endif #include "access/xact.h" #include "access/xlog.h" #include "catalog/pg_tablespace.h" #include "common/file_perm.h" #include "common/file_utils.h" #include "miscadmin.h" #include "pgstat.h" #include "port/pg_iovec.h" #include "portability/mem.h" #include "storage/fd.h" #include "storage/ipc.h" #include "utils/guc.h" #include "utils/resowner_private.h" /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */ #if defined(HAVE_SYNC_FILE_RANGE) #define PG_FLUSH_DATA_WORKS 1 #elif !defined(WIN32) && defined(MS_ASYNC) #define PG_FLUSH_DATA_WORKS 1 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) #define PG_FLUSH_DATA_WORKS 1 #endif /* * We must leave some file descriptors free for system(), the dynamic loader, * and other code that tries to open files without consulting fd.c. This * is the number left free. (While we try fairly hard to prevent EMFILE * errors, there's never any guarantee that we won't get ENFILE due to * other processes chewing up FDs. So it's a bad idea to try to open files * without consulting fd.c. Nonetheless we cannot control all code.) * * Because this is just a fixed setting, we are effectively assuming that * no such code will leave FDs open over the long term; otherwise the slop * is likely to be insufficient. Note in particular that we expect that * loading a shared library does not result in any permanent increase in * the number of open files. (This appears to be true on most if not * all platforms as of Feb 2004.) */ #define NUM_RESERVED_FDS 10 /* * If we have fewer than this many usable FDs after allowing for the reserved * ones, choke. (This value is chosen to work with "ulimit -n 64", but not * much less than that. 
Note that this value ensures numExternalFDs can be
 * at least 16; as of this writing, the contrib/postgres_fdw regression tests
 * will not pass unless that can grow to at least 14.)
 *
 * (The figure of 16 follows from AcquireExternalFD(), which admits external
 * FDs only while their count is below max_safe_fds / 3, and 48 / 3 = 16.)
 */
#define FD_MINFREE				48

/*
 * A number of platforms allow individual processes to open many more files
 * than they can really support when *many* processes do the same thing.
 * This GUC parameter lets the DBA limit max_safe_fds to something less than
 * what the postmaster's initial probe suggests will work.
 */
int			max_files_per_process = 1000;

/*
 * Maximum number of file descriptors to open for operations that fd.c knows
 * about (VFDs, AllocateFile etc, or "external" FDs).  This is initialized
 * to a conservative value, and remains that way indefinitely in bootstrap or
 * standalone-backend cases.  In normal postmaster operation, the postmaster
 * calls set_max_safe_fds() late in initialization to update the value, and
 * that value is then inherited by forked subprocesses.
 *
 * Note: the value of max_files_per_process is taken into account while
 * setting this variable, and so need not be tested separately.
 */
int			max_safe_fds = FD_MINFREE;	/* default if not changed */

/* Whether it is safe to continue running after fsync() fails. */
bool		data_sync_retry = false;

/* How SyncDataDirectory() should do its job. */
int			recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC;

/* Debugging....
*/

/*
 * DO_DB(A) runs the debugging statement A when FDDEBUG is defined, saving
 * errno beforehand and restoring it afterwards so that the debug output
 * cannot disturb the caller's error state.  Without FDDEBUG it expands to a
 * no-op and A is not evaluated at all.
 */
#ifdef FDDEBUG
#define DO_DB(A) \
	do { \
		int			_do_db_save_errno = errno; \
		A; \
		errno = _do_db_save_errno; \
	} while (0)
#else
#define DO_DB(A) \
	((void) 0)
#endif

/* Value stored in Vfd.fd while the VFD has no kernel descriptor assigned */
#define VFD_CLOSED (-1)

/*
 * FileIsValid: true if "file" indexes an in-use VFD.  Index 0 is rejected
 * because VfdCache[0] is only the list header; a slot counts as in use when
 * its fileName is non-NULL.
 */
#define FileIsValid(file) \
	((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)

/* FileIsNotOpen: true if the VFD currently holds no kernel descriptor */
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)

/* these are the assigned bits in fdstate below: */
#define FD_DELETE_AT_CLOSE	(1 << 0)	/* T = delete when closed */
#define FD_CLOSE_AT_EOXACT	(1 << 1)	/* T = close at eoXact */
#define FD_TEMP_FILE_LIMIT	(1 << 2)	/* T = respect temp_file_limit */

/*
 * One entry of the virtual file descriptor cache.  fileName, fileFlags and
 * fileMode are what LruInsert() passes to BasicOpenFilePerm() when it has to
 * re-open a VFD whose kernel descriptor was closed.
 */
typedef struct vfd
{
	int			fd;				/* current FD, or VFD_CLOSED if none */
	unsigned short fdstate;		/* bitflags for VFD's state */
	ResourceOwner resowner;		/* owner, for automatic cleanup */
	File		nextFree;		/* link to next free VFD, if in freelist */
	File		lruMoreRecently;	/* doubly linked recency-of-use list */
	File		lruLessRecently;
	off_t		fileSize;		/* current size of file (0 if not temporary) */
	char	   *fileName;		/* name of file, or NULL for unused VFD */
	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
	int			fileFlags;		/* open(2) flags for (re)opening the file */
	mode_t		fileMode;		/* mode to pass to open(2) */
} Vfd;

/*
 * Virtual File Descriptor array pointer and size.  This grows as
 * needed.  'File' values are indexes into this array.
 * Note that VfdCache[0] is not a usable VFD, just a list header.
 */
static Vfd *VfdCache;
static Size SizeVfdCache = 0;

/*
 * Number of file descriptors known to be in use by VFD entries.
 */
static int	nfile = 0;

/*
 * Flag to tell whether it's worth scanning VfdCache looking for temp files
 * to close
 */
static bool have_xact_temporary_files = false;

/*
 * Tracks the total size of all temporary files.  Note: when temp_file_limit
 * is being enforced, this cannot overflow since the limit cannot be more
 * than INT_MAX kilobytes.  When not enforcing, it could theoretically
 * overflow, but we don't care.
*/
static uint64 temporary_files_size = 0;

/*
 * List of OS handles opened with AllocateFile, AllocateDir and
 * OpenTransientFile.
 *
 * The kind tag tells which member of the "desc" union is in use.
 * NOTE(review): the enum also has a Pipe kind, presumably for handles from
 * OpenPipeStream (the popen(3) wrapper named in the file header) -- confirm.
 */
typedef enum
{
	AllocateDescFile,
	AllocateDescPipe,
	AllocateDescDir,
	AllocateDescRawFD
} AllocateDescKind;

typedef struct
{
	AllocateDescKind kind;
	SubTransactionId create_subid;
	union
	{
		FILE	   *file;
		DIR		   *dir;
		int			fd;
	}			desc;
} AllocateDesc;

/*
 * These counters feed the FD budget: ReleaseLruFiles() adds
 * numAllocatedDescs to nfile and numExternalFDs when comparing against
 * max_safe_fds.
 */
static int	numAllocatedDescs = 0;
static int	maxAllocatedDescs = 0;
static AllocateDesc *allocatedDescs = NULL;

/*
 * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
 */
static int	numExternalFDs = 0;

/*
 * Number of temporary files opened during the current session;
 * this is used in generation of tempfile names.
 */
static long tempFileCounter = 0;

/*
 * Array of OIDs of temp tablespaces.  (Some entries may be InvalidOid,
 * indicating that the current database's default tablespace should be used.)
 * When numTempTableSpaces is -1, this has not been set in the current
 * transaction.
 */
static Oid *tempTableSpaces = NULL;
static int	numTempTableSpaces = -1;
static int	nextTempTableSpace = 0;


/*--------------------
 *
 * Private Routines
 *
 * Delete		   - delete a file from the Lru ring
 * LruDelete	   - remove a file from the Lru ring and close its FD
 * Insert		   - put a file at the front of the Lru ring
 * LruInsert	   - put a file at the front of the Lru ring and open it
 * ReleaseLruFile  - Release an fd by closing the last entry in the Lru ring
 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
 * AllocateVfd	   - grab a free (or new) file record (from VfdCache)
 * FreeVfd		   - free a file record
 *
 * The Least Recently Used ring is a doubly linked list that begins and
 * ends on element zero.  Element zero is special -- it doesn't represent
 * a file and its "fd" field always == VFD_CLOSED.  Element zero is just an
 * anchor that shows us the beginning/end of the ring.
 * Only VFD elements that are currently really open (have an FD assigned) are
 * in the Lru ring.
Elements that are "virtually" open can be recognized * by having a non-null fileName field. * * example: * * /--less----\ /---------\ * v \ v \ * #0 --more---> LeastRecentlyUsed --more-\ \ * ^\ | | * \\less--> MostRecentlyUsedFile <---/ | * \more---/ \--less--/ * *-------------------- */ static void Delete(File file); static void LruDelete(File file); static void Insert(File file); static int LruInsert(File file); static bool ReleaseLruFile(void); static void ReleaseLruFiles(void); static File AllocateVfd(void); static void FreeVfd(File file); static int FileAccess(File file); static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError); static bool reserveAllocatedDesc(void); static int FreeDesc(AllocateDesc *desc); static void AtProcExit_Files(int code, Datum arg); static void CleanupTempFiles(bool isCommit, bool isProcExit); static void RemovePgTempRelationFiles(const char *tsdirname); static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname); static void walkdir(const char *path, void (*action) (const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel); #ifdef PG_FLUSH_DATA_WORKS static void pre_sync_fname(const char *fname, bool isdir, int elevel); #endif static void datadir_fsync_fname(const char *fname, bool isdir, int elevel); static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel); static int fsync_parent_path(const char *fname, int elevel); /* * pg_fsync --- do fsync with or without writethrough */ int pg_fsync(int fd) { #if !defined(WIN32) && defined(USE_ASSERT_CHECKING) struct stat st; /* * Some operating system implementations of fsync() have requirements * about the file access modes that were used when their file descriptor * argument was opened, and these requirements differ depending on whether * the file descriptor is for a directory. 
* * For any file descriptor that may eventually be handed to fsync(), we * should have opened it with access modes that are compatible with * fsync() on all supported systems, otherwise the code may not be * portable, even if it runs ok on the current system. * * We assert here that a descriptor for a file was opened with write * permissions (either O_RDWR or O_WRONLY) and for a directory without * write permissions (O_RDONLY). * * Ignore any fstat errors and let the follow-up fsync() do its work. * Doing this sanity check here counts for the case where fsync() is * disabled. */ if (fstat(fd, &st) == 0) { int desc_flags = fcntl(fd, F_GETFL); /* * O_RDONLY is historically 0, so just make sure that for directories * no write flags are used. */ if (S_ISDIR(st.st_mode)) Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0); else Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0); } errno = 0; #endif /* #if is to skip the sync_method test if there's no need for it */ #if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC) if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH) return pg_fsync_writethrough(fd); else #endif return pg_fsync_no_writethrough(fd); } /* * pg_fsync_no_writethrough --- same as fsync except does nothing if * enableFsync is off */ int pg_fsync_no_writethrough(int fd) { if (enableFsync) return fsync(fd); else return 0; } /* * pg_fsync_writethrough */ int pg_fsync_writethrough(int fd) { if (enableFsync) { #ifdef WIN32 return _commit(fd); #elif defined(F_FULLFSYNC) return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0; #else errno = ENOSYS; return -1; #endif } else return 0; } /* * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off * * Not all platforms have fdatasync; treat as fsync if not available. 
*/ int pg_fdatasync(int fd) { if (enableFsync) { #ifdef HAVE_FDATASYNC return fdatasync(fd); #else return fsync(fd); #endif } else return 0; } /* * pg_flush_data --- advise OS that the described dirty data should be flushed * * offset of 0 with nbytes 0 means that the entire file should be flushed */ void pg_flush_data(int fd, off_t offset, off_t nbytes) { /* * Right now file flushing is primarily used to avoid making later * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes * if fsyncs are disabled - that's a decision we might want to make * configurable at some point. */ if (!enableFsync) return; /* * We compile all alternatives that are supported on the current platform, * to find portability problems more easily. */ #if defined(HAVE_SYNC_FILE_RANGE) { int rc; static bool not_implemented_by_kernel = false; if (not_implemented_by_kernel) return; /* * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific, * tells the OS that writeback for the specified blocks should be * started, but that we don't want to wait for completion. Note that * this call might block if too much dirty data exists in the range. * This is the preferable method on OSs supporting it, as it works * reliably when available (contrast to msync()) and doesn't flush out * clean data (like FADV_DONTNEED). */ rc = sync_file_range(fd, offset, nbytes, SYNC_FILE_RANGE_WRITE); if (rc != 0) { int elevel; /* * For systems that don't have an implementation of * sync_file_range() such as Windows WSL, generate only one * warning and then suppress all further attempts by this process. */ if (errno == ENOSYS) { elevel = WARNING; not_implemented_by_kernel = true; } else elevel = data_sync_elevel(WARNING); ereport(elevel, (errcode_for_file_access(), errmsg("could not flush dirty data: %m"))); } return; } #endif #if !defined(WIN32) && defined(MS_ASYNC) { void *p; static int pagesize = 0; /* * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers * writeback. 
On linux it only does so if MS_SYNC is specified, but * then it does the writeback synchronously. Luckily all common linux * systems have sync_file_range(). This is preferable over * FADV_DONTNEED because it doesn't flush out clean data. * * We map the file (mmap()), tell the kernel to sync back the contents * (msync()), and then remove the mapping again (munmap()). */ /* mmap() needs actual length if we want to map whole file */ if (offset == 0 && nbytes == 0) { nbytes = lseek(fd, 0, SEEK_END); if (nbytes < 0) { ereport(WARNING, (errcode_for_file_access(), errmsg("could not determine dirty data size: %m"))); return; } } /* * Some platforms reject partial-page mmap() attempts. To deal with * that, just truncate the request to a page boundary. If any extra * bytes don't get flushed, well, it's only a hint anyway. */ /* fetch pagesize only once */ if (pagesize == 0) pagesize = sysconf(_SC_PAGESIZE); /* align length to pagesize, dropping any fractional page */ if (pagesize > 0) nbytes = (nbytes / pagesize) * pagesize; /* fractional-page request is a no-op */ if (nbytes <= 0) return; /* * mmap could well fail, particularly on 32-bit platforms where there * may simply not be enough address space. If so, silently fall * through to the next implementation. */ if (nbytes <= (off_t) SSIZE_MAX) p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset); else p = MAP_FAILED; if (p != MAP_FAILED) { int rc; rc = msync(p, (size_t) nbytes, MS_ASYNC); if (rc != 0) { ereport(data_sync_elevel(WARNING), (errcode_for_file_access(), errmsg("could not flush dirty data: %m"))); /* NB: need to fall through to munmap()! */ } rc = munmap(p, (size_t) nbytes); if (rc != 0) { /* FATAL error because mapping would remain */ ereport(FATAL, (errcode_for_file_access(), errmsg("could not munmap() while flushing data: %m"))); } return; } } #endif #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) { int rc; /* * Signal the kernel that the passed in range should not be cached * anymore. 
This has the, desired, side effect of writing out dirty * data, and the, undesired, side effect of likely discarding useful * clean cached blocks. For the latter reason this is the least * preferable method. */ rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED); if (rc != 0) { /* don't error out, this is just a performance optimization */ ereport(WARNING, (errcode_for_file_access(), errmsg("could not flush dirty data: %m"))); } return; } #endif } /* * Truncate a file to a given length by name. */ int pg_truncate(const char *path, off_t length) { #ifdef WIN32 int save_errno; int ret; int fd; fd = OpenTransientFile(path, O_RDWR | PG_BINARY); if (fd >= 0) { ret = ftruncate(fd, 0); save_errno = errno; CloseTransientFile(fd); errno = save_errno; } else ret = -1; return ret; #else return truncate(path, length); #endif } /* * fsync_fname -- fsync a file or directory, handling errors properly * * Try to fsync a file or directory. When doing the latter, ignore errors that * indicate the OS just doesn't allow/require fsyncing directories. */ void fsync_fname(const char *fname, bool isdir) { fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR)); } /* * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability * * This routine ensures that, after returning, the effect of renaming file * persists in case of a crash. A crash while this routine is running will * leave you with either the pre-existing or the moved file in place of the * new file; no mixed state or truncated files are possible. * * It does so by using fsync on the old filename and the possibly existing * target filename before the rename, and the target file and directory after. * * Note that rename() cannot be used across arbitrary directories, as they * might not be on the same filesystem. Therefore this routine does not * support renaming across directories. * * Log errors with the caller specified severity. * * Returns 0 if the operation succeeded, -1 otherwise. 
Note that errno is not
 * valid upon return.
 */
int
durable_rename(const char *oldfile, const char *newfile, int elevel)
{
	int			target_fd;

	/*
	 * Before renaming, make sure both the source and (if there is one) the
	 * existing target are safely on disk.  Flushing the target is not
	 * strictly required, but it simplifies crash analysis: afterwards we can
	 * rely on either the source or the target file existing.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

	target_fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
	if (target_fd >= 0)
	{
		if (pg_fsync(target_fd) != 0)
		{
			/* close file upon error, might not be in transaction context */
			int			fsync_errno = errno;

			CloseTransientFile(target_fd);
			errno = fsync_errno;

			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not fsync file \"%s\": %m", newfile)));
			return -1;
		}

		if (CloseTransientFile(target_fd) != 0)
		{
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not close file \"%s\": %m", newfile)));
			return -1;
		}
	}
	else if (errno != ENOENT)
	{
		/* a missing target is fine; any other open failure is not */
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", newfile)));
		return -1;
	}

	/* Now perform the rename itself */
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}

	/*
	 * Make the rename durable: flush the file under its new name first, then
	 * the directory that contains it.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0 ||
		fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}

/*
 * durable_unlink -- remove a file in a durable manner
 *
 * This routine ensures that, after returning, the effect of removing file
 * persists in case of a crash. A crash while this routine is running will
 * leave the system in no mixed state.
 *
 * It does so by using fsync on the parent directory of the file after the
 * actual removal is done.
 *
 * Log errors with the severity specified by caller.
*
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/*
	 * To guarantee that the removal of the file is persistent, fsync its
	 * parent directory.
	 */
	if (fsync_parent_path(fname, elevel) != 0)
		return -1;

	return 0;
}

/*
 * durable_rename_excl -- rename a file in a durable manner.
 *
 * Similar to durable_rename(), except that this routine tries (but does not
 * guarantee) not to overwrite the target file.
 *
 * Note that a crash in an unfortunate moment can leave you with two links to
 * the target file.
 *
 * Log errors with the caller specified severity.
 *
 * On Windows, using a hard link followed by unlink() causes concurrency
 * issues, while a simple rename() does not cause that, so be careful when
 * changing the logic of this routine.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_rename_excl(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Ensure that, if we crash directly after the rename/link, a file with
	 * valid contents is moved into place.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

	/*
	 * With a working link(), create the new name first and drop the old one
	 * afterwards; link() fails rather than replacing an existing target.
	 * Otherwise fall back to a plain rename().
	 */
#ifdef HAVE_WORKING_LINK
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile),
				 (AmCheckpointerProcess() ?
				  errhint("This is known to fail occasionally during archive recovery, where it is harmless.") :
				  0)));
		return -1;
	}
	/* NOTE(review): unlink()'s result is not checked; on failure both names remain */
	unlink(oldfile);
#else
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile),
				 (AmCheckpointerProcess() ?
				  errhint("This is known to fail occasionally during archive recovery, where it is harmless.") :
				  0)));
		return -1;
	}
#endif

	/*
	 * Make change persistent in case of an OS crash, both the new entry and
	 * its parent directory need to be flushed.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0)
		return -1;

	/* Same for parent directory */
	if (fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}

/*
 * InitFileAccess --- initialize this module during backend startup
 *
 * This is called during either normal or standalone backend start.
 * It is *not* called in the postmaster.
 *
 * Allocates the one-element VfdCache holding only the list header (slot 0,
 * marked VFD_CLOSED) and registers AtProcExit_Files as a proc-exit callback.
 * Failure to allocate is reported as FATAL.
 */
void
InitFileAccess(void)
{
	Assert(SizeVfdCache == 0);	/* call me only once */

	/* initialize cache header entry */
	VfdCache = (Vfd *) malloc(sizeof(Vfd));
	if (VfdCache == NULL)
		ereport(FATAL,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));

	/* zeroing also leaves the header's free-list and LRU links at 0 (empty) */
	MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
	VfdCache->fd = VFD_CLOSED;

	SizeVfdCache = 1;

	/* register proc-exit hook to ensure temp files are dropped at exit */
	on_proc_exit(AtProcExit_Files, 0);
}

/*
 * count_usable_fds --- count how many FDs the system will let us open,
 *		and estimate how many are already open.
 *
 * We stop counting if usable_fds reaches max_to_probe.  Note: a small
 * value of max_to_probe might result in an underestimate of already_open;
 * we must fill in any "gaps" in the set of used FDs before the calculation
 * of already_open will give the right answer.  In practice, max_to_probe
 * of a couple of dozen should be enough to ensure good results.
 *
 * We assume stderr (FD 2) is available for dup'ing.  While the calling
 * script could theoretically close that, it would be a really bad idea,
 * since then one risks loss of error messages from, e.g., libc.
*/
static void
count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
{
	int		   *fd;				/* descriptors obtained by the probe */
	int			size;			/* allocated length of fd[] */
	int			used = 0;		/* number of entries filled in fd[] */
	int			highestfd = 0;	/* largest descriptor number seen */
	int			j;

#ifdef HAVE_GETRLIMIT
	struct rlimit rlim;
	int			getrlimit_status;
#endif

	size = 1024;
	fd = (int *) palloc(size * sizeof(int));

#ifdef HAVE_GETRLIMIT
#ifdef RLIMIT_NOFILE			/* most platforms use RLIMIT_NOFILE */
	getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
#else							/* but BSD doesn't ... */
	getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
#endif							/* RLIMIT_NOFILE */
	/* a failed getrlimit() only disables the soft-limit check in the loop */
	if (getrlimit_status != 0)
		ereport(WARNING, (errmsg("getrlimit failed: %m")));
#endif							/* HAVE_GETRLIMIT */

	/* dup until failure or probe limit reached */
	for (;;)
	{
		int			thisfd;

#ifdef HAVE_GETRLIMIT

		/*
		 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
		 * some platforms
		 */
		if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
			break;
#endif

		thisfd = dup(2);
		if (thisfd < 0)
		{
			/* Expect EMFILE or ENFILE, else it's fishy */
			if (errno != EMFILE && errno != ENFILE)
				elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
			break;
		}

		/* grow the array by doubling when it is full */
		if (used >= size)
		{
			size *= 2;
			fd = (int *) repalloc(fd, size * sizeof(int));
		}
		fd[used++] = thisfd;

		if (highestfd < thisfd)
			highestfd = thisfd;

		if (used >= max_to_probe)
			break;
	}

	/* release the files we opened */
	for (j = 0; j < used; j++)
		close(fd[j]);

	pfree(fd);

	/*
	 * Return results.  usable_fds is just the number of successful dups. We
	 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
	 * number) and so already_open is highestfd+1 - usable_fds.
	 */
	*usable_fds = used;
	*already_open = highestfd + 1 - used;
}

/*
 * set_max_safe_fds
 *		Determine number of file descriptors that fd.c is allowed to use
 *
 * Stores the result in max_safe_fds.  Reports FATAL if the result would be
 * below FD_MINFREE.
 */
void
set_max_safe_fds(void)
{
	int			usable_fds;
	int			already_open;

	/*----------
	 * We want to set max_safe_fds to
	 *			MIN(usable_fds, max_files_per_process - already_open)
	 * less the slop factor for files that are opened without consulting
	 * fd.c.  This ensures that we won't exceed either max_files_per_process
	 * or the experimentally-determined EMFILE limit.
	 *----------
	 */
	count_usable_fds(max_files_per_process,
					 &usable_fds, &already_open);

	max_safe_fds = Min(usable_fds, max_files_per_process - already_open);

	/*
	 * Take off the FDs reserved for system() etc.
	 */
	max_safe_fds -= NUM_RESERVED_FDS;

	/*
	 * Make sure we still have enough to get by.
	 */
	if (max_safe_fds < FD_MINFREE)
		ereport(FATAL,
				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
				 errmsg("insufficient file descriptors available to start server process"),
				 errdetail("System allows %d, we need at least %d.",
						   max_safe_fds + NUM_RESERVED_FDS,
						   FD_MINFREE + NUM_RESERVED_FDS)));

	elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
		 max_safe_fds, usable_fds, already_open);
}

/*
 * Open a file with BasicOpenFilePerm() and pass default file mode for the
 * fileMode parameter.
 *
 * Returns the kernel file descriptor, or -1 on failure, exactly as
 * BasicOpenFilePerm() does; the mode passed is pg_file_create_mode.
 */
int
BasicOpenFile(const char *fileName, int fileFlags)
{
	return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
}

/*
 * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
 *
 * This is exported for use by places that really want a plain kernel FD,
 * but need to be proof against running out of FDs.  Once an FD has been
 * successfully returned, it is the caller's responsibility to ensure that
 * it will not be leaked on ereport()!	Most users should *not* call this
 * routine directly, but instead use the VFD abstraction level, which
 * provides protection against descriptor leaks as well as management of
 * files that need to be open for more than a short period of time.
 *
 * Ideally this should be the *only* direct call of open() in the backend.
 * In practice, the postmaster calls open() directly, and there are some
 * direct open() calls done early in backend startup.  Those are OK since
 * this module wouldn't have any open files to close at that point anyway.
*/ int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode) { int fd; tryAgain: fd = open(fileName, fileFlags, fileMode); if (fd >= 0) return fd; /* success! */ if (errno == EMFILE || errno == ENFILE) { int save_errno = errno; ereport(LOG, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("out of file descriptors: %m; release and retry"))); errno = 0; if (ReleaseLruFile()) goto tryAgain; errno = save_errno; } return -1; /* failure */ } /* * AcquireExternalFD - attempt to reserve an external file descriptor * * This should be used by callers that need to hold a file descriptor open * over more than a short interval, but cannot use any of the other facilities * provided by this module. * * The difference between this and the underlying ReserveExternalFD function * is that this will report failure (by setting errno and returning false) * if "too many" external FDs are already reserved. This should be used in * any code where the total number of FDs to be reserved is not predictable * and small. */ bool AcquireExternalFD(void) { /* * We don't want more than max_safe_fds / 3 FDs to be consumed for * "external" FDs. */ if (numExternalFDs < max_safe_fds / 3) { ReserveExternalFD(); return true; } errno = EMFILE; return false; } /* * ReserveExternalFD - report external consumption of a file descriptor * * This should be used by callers that need to hold a file descriptor open * over more than a short interval, but cannot use any of the other facilities * provided by this module. This just tracks the use of the FD and closes * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available. * * Call this directly only in code where failure to reserve the FD would be * fatal; for example, the WAL-writing code does so, since the alternative is * session failure. Also, it's very unwise to do so in code that could * consume more than one FD per process. 
*
 * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
 * available, it doesn't matter too much whether this is called before or
 * after actually opening the FD; but doing so beforehand reduces the risk of
 * an EMFILE failure if not everybody played nice.  In any case, it's solely
 * caller's responsibility to keep the external-FD count in sync with reality.
 */
void
ReserveExternalFD(void)
{
	/*
	 * Release VFDs if needed to stay safe.  Because we do this before
	 * incrementing numExternalFDs, the final state will be as desired, i.e.,
	 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
	 */
	ReleaseLruFiles();

	numExternalFDs++;
}

/*
 * ReleaseExternalFD - report release of an external file descriptor
 *
 * This is guaranteed not to change errno, so it can be used in failure paths.
 */
void
ReleaseExternalFD(void)
{
	Assert(numExternalFDs > 0);
	numExternalFDs--;
}


#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid: log the LRU ring as one line
 *
 * Starting from the header's lruLessRecently link (the most recently used
 * entry), follows lruLessRecently links until it arrives back at element 0,
 * appending each index to a fixed-size buffer that is then emitted at LOG.
 */
static void
_dump_lru(void)
{
	int			mru = VfdCache[0].lruLessRecently;
	Vfd		   *vfdP = &VfdCache[mru];
	char		buf[2048];

	snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
	while (mru != 0)
	{
		mru = vfdP->lruLessRecently;
		vfdP = &VfdCache[mru];
		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
	}
	snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
	elog(LOG, "%s", buf);
}
#endif							/* FDDEBUG */

/*
 * Delete --- unlink a VFD from the LRU ring
 *
 * Only the two neighbours' links are rewritten so that they point at each
 * other; the entry's own links, its fd and nfile are left untouched.
 */
static void
Delete(File file)
{
	Vfd		   *vfdP;

	Assert(file != 0);

	DO_DB(elog(LOG, "Delete %d (%s)",
			   file, VfdCache[file].fileName));
	DO_DB(_dump_lru());

	vfdP = &VfdCache[file];

	VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
	VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;

	DO_DB(_dump_lru());
}

/*
 * LruDelete --- close a VFD's kernel descriptor and take it out of the ring
 *
 * The VFD itself stays allocated: its fd becomes VFD_CLOSED and nfile is
 * decremented, whether or not close() succeeded.
 */
static void
LruDelete(File file)
{
	Vfd		   *vfdP;

	Assert(file != 0);

	DO_DB(elog(LOG, "LruDelete %d (%s)",
			   file, VfdCache[file].fileName));

	vfdP = &VfdCache[file];

	/*
	 * Close the file.  We aren't expecting this to fail; if it does, better
	 * to leak the FD than to mess up our internal state.
	 */
	/* files flagged FD_TEMP_FILE_LIMIT always log at LOG; others use data_sync_elevel() */
	if (close(vfdP->fd) != 0)
		elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
			 "could not close file \"%s\": %m", vfdP->fileName);
	vfdP->fd = VFD_CLOSED;
	--nfile;

	/* delete the vfd record from the LRU ring */
	Delete(file);
}

/*
 * Insert --- link a VFD into the LRU ring as the most recently used entry
 *
 * The entry is placed between the header (element 0) and the entry that was
 * previously reachable through the header's lruLessRecently link.
 */
static void
Insert(File file)
{
	Vfd		   *vfdP;

	Assert(file != 0);

	DO_DB(elog(LOG, "Insert %d (%s)",
			   file, VfdCache[file].fileName));
	DO_DB(_dump_lru());

	vfdP = &VfdCache[file];

	vfdP->lruMoreRecently = 0;
	vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
	VfdCache[0].lruLessRecently = file;
	VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;

	DO_DB(_dump_lru());
}

/*
 * LruInsert --- (re)open a VFD if necessary and put it at the head of the ring
 *
 * If the VFD has no kernel descriptor, one is obtained with
 * BasicOpenFilePerm() using the stored name, flags and mode, after first
 * closing excess descriptors.  If that open fails, the VFD is not inserted
 * into the ring.
 */
/* returns 0 on success, -1 on re-open failure (with errno set) */
static int
LruInsert(File file)
{
	Vfd		   *vfdP;

	Assert(file != 0);

	DO_DB(elog(LOG, "LruInsert %d (%s)",
			   file, VfdCache[file].fileName));

	vfdP = &VfdCache[file];

	if (FileIsNotOpen(file))
	{
		/* Close excess kernel FDs. */
		ReleaseLruFiles();

		/*
		 * The open could still fail for lack of file descriptors, eg due to
		 * overall system file table being full.  So, be prepared to release
		 * another FD if necessary...
		 */
		vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
									 vfdP->fileMode);
		if (vfdP->fd < 0)
		{
			DO_DB(elog(LOG, "re-open failed: %m"));
			return -1;
		}
		else
		{
			++nfile;
		}
	}

	/*
	 * put it at the head of the Lru ring
	 */

	Insert(file);

	return 0;
}

/*
 * Release one kernel FD by closing the least-recently-used VFD.
 *
 * Returns true if a descriptor was closed, false if no VFD currently holds
 * one (nfile is zero).
 */
static bool
ReleaseLruFile(void)
{
	DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));

	if (nfile > 0)
	{
		/*
		 * There are opened files and so there should be at least one used vfd
		 * in the ring.
		 */
		Assert(VfdCache[0].lruMoreRecently != 0);
		LruDelete(VfdCache[0].lruMoreRecently);
		return true;			/* freed a file */
	}
	return false;				/* no files available to free */
}

/*
 * Release kernel FDs as needed to get under the max_safe_fds limit.
 * After calling this, it's OK to try to open another file.
*/ static void ReleaseLruFiles(void) { while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds) { if (!ReleaseLruFile()) break; } } static File AllocateVfd(void) { Index i; File file; DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache)); Assert(SizeVfdCache > 0); /* InitFileAccess not called? */ if (VfdCache[0].nextFree == 0) { /* * The free list is empty so it is time to increase the size of the * array. We choose to double it each time this happens. However, * there's not much point in starting *real* small. */ Size newCacheSize = SizeVfdCache * 2; Vfd *newVfdCache; if (newCacheSize < 32) newCacheSize = 32; /* * Be careful not to clobber VfdCache ptr if realloc fails. */ newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize); if (newVfdCache == NULL) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); VfdCache = newVfdCache; /* * Initialize the new entries and link them into the free list. */ for (i = SizeVfdCache; i < newCacheSize; i++) { MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd)); VfdCache[i].nextFree = i + 1; VfdCache[i].fd = VFD_CLOSED; } VfdCache[newCacheSize - 1].nextFree = 0; VfdCache[0].nextFree = SizeVfdCache; /* * Record the new size */ SizeVfdCache = newCacheSize; } file = VfdCache[0].nextFree; VfdCache[0].nextFree = VfdCache[file].nextFree; return file; } static void FreeVfd(File file) { Vfd *vfdP = &VfdCache[file]; DO_DB(elog(LOG, "FreeVfd: %d (%s)", file, vfdP->fileName ? vfdP->fileName : "")); if (vfdP->fileName != NULL) { free(vfdP->fileName); vfdP->fileName = NULL; } vfdP->fdstate = 0x0; vfdP->nextFree = VfdCache[0].nextFree; VfdCache[0].nextFree = file; } /* returns 0 on success, -1 on re-open failure (with errno set) */ static int FileAccess(File file) { int returnValue; DO_DB(elog(LOG, "FileAccess %d (%s)", file, VfdCache[file].fileName)); /* * Is the file open? If not, open it and put it at the head of the LRU * ring (possibly closing the least recently used file to get an FD). 
*/ if (FileIsNotOpen(file)) { returnValue = LruInsert(file); if (returnValue != 0) return returnValue; } else if (VfdCache[0].lruLessRecently != file) { /* * We now know that the file is open and that it is not the last one * accessed, so we need to move it to the head of the Lru ring. */ Delete(file); Insert(file); } return 0; } /* * Called whenever a temporary file is deleted to report its size. */ static void ReportTemporaryFileUsage(const char *path, off_t size) { pgstat_report_tempfile(size); if (log_temp_files >= 0) { if ((size / 1024) >= log_temp_files) ereport(LOG, (errmsg("temporary file: path \"%s\", size %lu", path, (unsigned long) size))); } } /* * Called to register a temporary file for automatic close. * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called * before the file was opened. */ static void RegisterTemporaryFile(File file) { ResourceOwnerRememberFile(CurrentResourceOwner, file); VfdCache[file].resowner = CurrentResourceOwner; /* Backup mechanism for closing at end of xact. */ VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT; have_xact_temporary_files = true; } /* * Called when we get a shared invalidation message on some relation. */ #ifdef NOT_USED void FileInvalidate(File file) { Assert(FileIsValid(file)); if (!FileIsNotOpen(file)) LruDelete(file); } #endif /* * Open a file with PathNameOpenFilePerm() and pass default file mode for the * fileMode parameter. */ File PathNameOpenFile(const char *fileName, int fileFlags) { return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode); } /* * open a file in an arbitrary directory * * NB: if the passed pathname is relative (which it usually is), * it will be interpreted relative to the process' working directory * (which should always be $PGDATA when this code is running). 
*/ File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode) { char *fnamecopy; File file; Vfd *vfdP; DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o", fileName, fileFlags, fileMode)); /* * We need a malloc'd copy of the file name; fail cleanly if no room. */ fnamecopy = strdup(fileName); if (fnamecopy == NULL) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); file = AllocateVfd(); vfdP = &VfdCache[file]; /* Close excess kernel FDs. */ ReleaseLruFiles(); vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode); if (vfdP->fd < 0) { int save_errno = errno; FreeVfd(file); free(fnamecopy); errno = save_errno; return -1; } ++nfile; DO_DB(elog(LOG, "PathNameOpenFile: success %d", vfdP->fd)); vfdP->fileName = fnamecopy; /* Saved flags are adjusted to be OK for re-opening file */ vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL); vfdP->fileMode = fileMode; vfdP->fileSize = 0; vfdP->fdstate = 0x0; vfdP->resowner = NULL; Insert(file); return file; } /* * Create directory 'directory'. If necessary, create 'basedir', which must * be the directory above it. This is designed for creating the top-level * temporary directory on demand before creating a directory underneath it. * Do nothing if the directory already exists. * * Directories created within the top-level temporary directory should begin * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and * deleted at startup by RemovePgTempFiles(). Further subdirectories below * that do not need any particular prefix. */ void PathNameCreateTemporaryDir(const char *basedir, const char *directory) { if (MakePGDirectory(directory) < 0) { if (errno == EEXIST) return; /* * Failed. Try to create basedir first in case it's missing. Tolerate * EEXIST to close a race against another process following the same * algorithm. 
*/ if (MakePGDirectory(basedir) < 0 && errno != EEXIST) ereport(ERROR, (errcode_for_file_access(), errmsg("cannot create temporary directory \"%s\": %m", basedir))); /* Try again. */ if (MakePGDirectory(directory) < 0 && errno != EEXIST) ereport(ERROR, (errcode_for_file_access(), errmsg("cannot create temporary subdirectory \"%s\": %m", directory))); } } /* * Delete a directory and everything in it, if it exists. */ void PathNameDeleteTemporaryDir(const char *dirname) { struct stat statbuf; /* Silently ignore missing directory. */ if (stat(dirname, &statbuf) != 0 && errno == ENOENT) return; /* * Currently, walkdir doesn't offer a way for our passed in function to * maintain state. Perhaps it should, so that we could tell the caller * whether this operation succeeded or failed. Since this operation is * used in a cleanup path, we wouldn't actually behave differently: we'll * just log failures. */ walkdir(dirname, unlink_if_exists_fname, false, LOG); } /* * Open a temporary file that will disappear when we close it. * * This routine takes care of generating an appropriate tempfile name. * There's no need to pass in fileFlags or fileMode either, since only * one setting makes any sense for a temp file. * * Unless interXact is true, the file is remembered by CurrentResourceOwner * to ensure it's closed and deleted when it's no longer needed, typically at * the end-of-transaction. In most cases, you don't want temporary files to * outlive the transaction that created them, so this should be false -- but * if you need "somewhat" temporary storage, this might be useful. In either * case, the file is removed when the File is explicitly closed. */ File OpenTemporaryFile(bool interXact) { File file = 0; /* * Make sure the current resource owner has space for this File before we * open it, if we'll be registering it below. */ if (!interXact) ResourceOwnerEnlargeFiles(CurrentResourceOwner); /* * If some temp tablespace(s) have been given to us, try to use the next * one. 
If a given tablespace can't be found, we silently fall back to * the database's default tablespace. * * BUT: if the temp file is slated to outlive the current transaction, * force it into the database's default tablespace, so that it will not * pose a threat to possible tablespace drop attempts. */ if (numTempTableSpaces > 0 && !interXact) { Oid tblspcOid = GetNextTempTableSpace(); if (OidIsValid(tblspcOid)) file = OpenTemporaryFileInTablespace(tblspcOid, false); } /* * If not, or if tablespace is bad, create in database's default * tablespace. MyDatabaseTableSpace should normally be set before we get * here, but just in case it isn't, fall back to pg_default tablespace. */ if (file <= 0) file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ? MyDatabaseTableSpace : DEFAULTTABLESPACE_OID, true); /* Mark it for deletion at close and temporary file size limit */ VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT; /* Register it with the current resource owner */ if (!interXact) RegisterTemporaryFile(file); return file; } /* * Return the path of the temp directory in a given tablespace. */ void TempTablespacePath(char *path, Oid tablespace) { /* * Identify the tempfile directory for this tablespace. * * If someone tries to specify pg_global, use pg_default instead. */ if (tablespace == InvalidOid || tablespace == DEFAULTTABLESPACE_OID || tablespace == GLOBALTABLESPACE_OID) snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR); else { /* All other tablespaces are accessed via symlinks */ snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s", tablespace, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR); } } /* * Open a temporary file in a specific tablespace. * Subroutine for OpenTemporaryFile, which see for details. 
*/
static File
OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
{
	char		tempdirpath[MAXPGPATH];
	char		tempfilepath[MAXPGPATH];
	File		file;

	TempTablespacePath(tempdirpath, tblspcOid);

	/*
	 * Generate a tempfile name that should be unique within the current
	 * database instance.
	 */
	snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
			 tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);

	/*
	 * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
	 * temp file that can be reused.
	 */
	file = PathNameOpenFile(tempfilepath,
							O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
	if (file <= 0)
	{
		/*
		 * We might need to create the tablespace's tempfile directory, if no
		 * one has yet done so.
		 *
		 * Don't check for an error from MakePGDirectory; it could fail if
		 * someone else just did the same thing.  If it doesn't work then
		 * we'll bomb out on the second create attempt, instead.
		 */
		(void) MakePGDirectory(tempdirpath);

		file = PathNameOpenFile(tempfilepath,
								O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
		/* with rejectError false, the failure is returned to the caller */
		if (file <= 0 && rejectError)
			elog(ERROR, "could not create temporary file \"%s\": %m",
				 tempfilepath);
	}

	return file;
}


/*
 * Create a new file.  The directory containing it must already exist.  Files
 * created this way are subject to temp_file_limit and are automatically
 * closed at end of transaction, but are not automatically deleted on close
 * because they are intended to be shared between cooperating backends.
 *
 * If the file is inside the top-level temporary directory, its name should
 * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
 * and deleted at startup by RemovePgTempFiles().  Alternatively, it can be
 * inside a directory created with PathNameCreateTemporaryDir(), in which case
 * the prefix isn't needed.
 *
 * On failure this raises ERROR if error_on_failure is true; otherwise the
 * failing result of PathNameOpenFile() is returned.
 */
File
PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
{
	File		file;

	/* make room in the resource owner before the file exists */
	ResourceOwnerEnlargeFiles(CurrentResourceOwner);

	/*
	 * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
	 * temp file that can be reused.
	 */
	file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
	if (file <= 0)
	{
		if (error_on_failure)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not create temporary file \"%s\": %m",
							path)));
		else
			return file;
	}

	/* Mark it for temp_file_limit accounting. */
	VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;

	/* Register it for automatic close. */
	RegisterTemporaryFile(file);

	return file;
}

/*
 * Open a file that was created with PathNameCreateTemporaryFile, possibly in
 * another backend.  Files opened this way don't count against the
 * temp_file_limit of the caller, are automatically closed at the end of the
 * transaction but are not deleted on close.
 *
 * A missing file (ENOENT) is not an error: the failing result is returned.
 * Any other open failure raises ERROR.
 */
File
PathNameOpenTemporaryFile(const char *path, int mode)
{
	File		file;

	ResourceOwnerEnlargeFiles(CurrentResourceOwner);

	file = PathNameOpenFile(path, mode | PG_BINARY);

	/* If no such file, then we don't raise an error. */
	if (file <= 0 && errno != ENOENT)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open temporary file \"%s\": %m",
						path)));

	if (file > 0)
	{
		/* Register it for automatic close. */
		RegisterTemporaryFile(file);
	}

	return file;
}

/*
 * Delete a file by pathname.  Return true if the file existed, false if it
 * didn't.
 *
 * An unlink() failure other than ENOENT is reported at ERROR or LOG level
 * depending on error_on_failure; in the LOG case false is returned.
 */
bool
PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
{
	struct stat filestats;
	int			stat_errno;

	/* Get the final size for pgstat reporting. */
	if (stat(path, &filestats) != 0)
		stat_errno = errno;
	else
		stat_errno = 0;

	/*
	 * Unlike FileClose's automatic file deletion code, we tolerate
	 * non-existence to support BufFileDeleteShared which doesn't know how
	 * many segments it has to delete until it runs out.
	 */
	if (stat_errno == ENOENT)
		return false;

	if (unlink(path) < 0)
	{
		/* the file vanishing between stat() and unlink() is not reported */
		if (errno != ENOENT)
			ereport(error_on_failure ? ERROR : LOG,
					(errcode_for_file_access(),
					 errmsg("could not unlink temporary file \"%s\": %m",
							path)));
		return false;
	}

	if (stat_errno == 0)
		ReportTemporaryFileUsage(path, filestats.st_size);
	else
	{
		/* restore stat()'s errno so that %m describes the stat failure */
		errno = stat_errno;
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not stat file \"%s\": %m", path)));
	}

	return true;
}

/*
 * close a file when done with it
 *
 * Closes the kernel FD if the VFD currently has one, undoes the temp-file
 * size accounting, unlinks the file if it is marked FD_DELETE_AT_CLOSE,
 * forgets it in its resource owner (if any) and finally returns the VFD slot
 * to the free list.
 */
void
FileClose(File file)
{
	Vfd		   *vfdP;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileClose: %d (%s)",
			   file, VfdCache[file].fileName));

	vfdP = &VfdCache[file];

	if (!FileIsNotOpen(file))
	{
		/* close the file */
		if (close(vfdP->fd) != 0)
		{
			/*
			 * We may need to panic on failure to close non-temporary files;
			 * see LruDelete.
			 */
			elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
				 "could not close file \"%s\": %m", vfdP->fileName);
		}

		--nfile;
		vfdP->fd = VFD_CLOSED;

		/* remove the file from the lru ring */
		Delete(file);
	}

	if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
	{
		/* Subtract its size from current usage (do first in case of error) */
		temporary_files_size -= vfdP->fileSize;
		vfdP->fileSize = 0;
	}

	/*
	 * Delete the file if it was temporary, and make a log entry if wanted
	 */
	if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
	{
		struct stat filestats;
		int			stat_errno;

		/*
		 * If we get an error, as could happen within the ereport/elog calls,
		 * we'll come right back here during transaction abort.  Reset the
		 * flag to ensure that we can't get into an infinite loop.  This code
		 * is arranged to ensure that the worst-case consequence is failing to
		 * emit log message(s), not failing to attempt the unlink.
		 */
		vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;


		/* first try the stat() */
		if (stat(vfdP->fileName, &filestats))
			stat_errno = errno;
		else
			stat_errno = 0;

		/* in any case do the unlink */
		if (unlink(vfdP->fileName))
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not delete file \"%s\": %m", vfdP->fileName)));

		/* and last report the stat results */
		if (stat_errno == 0)
			ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
		else
		{
			errno = stat_errno;
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
		}
	}

	/* Unregister it from the resource owner */
	if (vfdP->resowner)
		ResourceOwnerForgetFile(vfdP->resowner, file);

	/*
	 * Return the Vfd slot to the free list
	 */
	FreeVfd(file);
}

/*
 * FilePrefetch - initiate asynchronous read of a given range of the file.
 *
 * Currently the only implementation of this function is using posix_fadvise
 * which is the simplest standardized interface that accomplishes this.
 * We could add an implementation using libaio in the future; but note that
 * this API is inappropriate for libaio, which wants to have a buffer provided
 * to read into.
*/
int
FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
{
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
	int			returnCode;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
			   file, VfdCache[file].fileName,
			   (int64) offset, amount));

	/* make sure the VFD has a kernel FD before advising on it */
	returnCode = FileAccess(file);
	if (returnCode < 0)
		return returnCode;

	pgstat_report_wait_start(wait_event_info);
	returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
							   POSIX_FADV_WILLNEED);
	pgstat_report_wait_end();

	/* the posix_fadvise() result is handed back unchanged */
	return returnCode;
#else
	/* no prefetch support compiled in: succeed without doing anything */
	Assert(FileIsValid(file));
	return 0;
#endif
}

/*
 * FileWriteback - pass the given range of the file to pg_flush_data().
 *
 * Does nothing when nbytes <= 0 or when the file cannot be (re)opened; no
 * status is returned to the caller.
 */
void
FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
{
	int			returnCode;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
			   file, VfdCache[file].fileName,
			   (int64) offset, (int64) nbytes));

	if (nbytes <= 0)
		return;

	returnCode = FileAccess(file);
	if (returnCode < 0)
		return;

	pgstat_report_wait_start(wait_event_info);
	pg_flush_data(VfdCache[file].fd, offset, nbytes);
	pgstat_report_wait_end();
}

/*
 * FileRead - read up to 'amount' bytes at 'offset' into 'buffer'.
 *
 * Returns the pg_pread() result, or the negative FileAccess() result if the
 * file could not be (re)opened.  A read that fails with EINTR is retried.
 */
int
FileRead(File file, char *buffer, int amount, off_t offset,
		 uint32 wait_event_info)
{
	int			returnCode;
	Vfd		   *vfdP;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
			   file, VfdCache[file].fileName,
			   (int64) offset,
			   amount, buffer));

	returnCode = FileAccess(file);
	if (returnCode < 0)
		return returnCode;

	vfdP = &VfdCache[file];

retry:
	pgstat_report_wait_start(wait_event_info);
	returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
	pgstat_report_wait_end();

	if (returnCode < 0)
	{
		/*
		 * Windows may run out of kernel buffers and return "Insufficient
		 * system resources" error.  Wait a bit and retry to solve it.
		 *
		 * It is rumored that EINTR is also possible on some Unix filesystems,
		 * in which case immediate retry is indicated.
		 */
#ifdef WIN32
		DWORD		error = GetLastError();

		switch (error)
		{
			case ERROR_NO_SYSTEM_RESOURCES:
				pg_usleep(1000L);
				/* pretend we were interrupted so the code below retries */
				errno = EINTR;
				break;
			default:
				_dosmaperr(error);
				break;
		}
#endif
		/* OK to retry if interrupted */
		if (errno == EINTR)
			goto retry;
	}

	return returnCode;
}

/*
 * FileWrite - write 'amount' bytes from 'buffer' at 'offset'.
 *
 * Returns the pg_pwrite() result, or the negative FileAccess() result if the
 * file could not be (re)opened.  If fewer bytes than requested were written
 * and errno was left at zero, errno is set to ENOSPC.  Raises ERROR when the
 * write would push a temp file past temp_file_limit.
 */
int
FileWrite(File file, char *buffer, int amount, off_t offset,
		  uint32 wait_event_info)
{
	int			returnCode;
	Vfd		   *vfdP;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
			   file, VfdCache[file].fileName,
			   (int64) offset,
			   amount, buffer));

	returnCode = FileAccess(file);
	if (returnCode < 0)
		return returnCode;

	vfdP = &VfdCache[file];

	/*
	 * If enforcing temp_file_limit and it's a temp file, check to see if the
	 * write would overrun temp_file_limit, and throw error if so.  Note: it's
	 * really a modularity violation to throw error here; we should set errno
	 * and return -1.  However, there's no way to report a suitable error
	 * message if we do that.  All current callers would just throw error
	 * immediately anyway, so this is safe at present.
	 */
	if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
	{
		off_t		past_write = offset + amount;

		/* only a write that extends the file can add to the total */
		if (past_write > vfdP->fileSize)
		{
			uint64		newTotal = temporary_files_size;

			newTotal += past_write - vfdP->fileSize;
			/* temp_file_limit is in kB, newTotal in bytes */
			if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
				ereport(ERROR,
						(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
						 errmsg("temporary file size exceeds temp_file_limit (%dkB)",
								temp_file_limit)));
		}
	}

retry:
	errno = 0;
	pgstat_report_wait_start(wait_event_info);
	returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
	pgstat_report_wait_end();

	/* if write didn't set errno, assume problem is no disk space */
	if (returnCode != amount && errno == 0)
		errno = ENOSPC;

	if (returnCode >= 0)
	{
		/*
		 * Maintain fileSize and temporary_files_size if it's a temp file.
		 */
		if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
		{
			off_t		past_write = offset + amount;

			if (past_write > vfdP->fileSize)
			{
				temporary_files_size += past_write - vfdP->fileSize;
				vfdP->fileSize = past_write;
			}
		}
	}
	else
	{
		/*
		 * See comments in FileRead()
		 */
#ifdef WIN32
		DWORD		error = GetLastError();

		switch (error)
		{
			case ERROR_NO_SYSTEM_RESOURCES:
				pg_usleep(1000L);
				errno = EINTR;
				break;
			default:
				_dosmaperr(error);
				break;
		}
#endif
		/* OK to retry if interrupted */
		if (errno == EINTR)
			goto retry;
	}

	return returnCode;
}

/*
 * FileSync - run pg_fsync() on the file.
 *
 * Returns the pg_fsync() result, or the negative FileAccess() result if the
 * file could not be (re)opened.
 */
int
FileSync(File file, uint32 wait_event_info)
{
	int			returnCode;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileSync: %d (%s)",
			   file, VfdCache[file].fileName));

	returnCode = FileAccess(file);
	if (returnCode < 0)
		return returnCode;

	pgstat_report_wait_start(wait_event_info);
	returnCode = pg_fsync(VfdCache[file].fd);
	pgstat_report_wait_end();

	return returnCode;
}

/*
 * FileSize - return the result of lseek(fd, 0, SEEK_END) for the file.
 *
 * Returns (off_t) -1 if the file had to be reopened and that failed.
 */
off_t
FileSize(File file)
{
	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileSize %d (%s)",
			   file, VfdCache[file].fileName));

	/* FileAccess is only called when the VFD has no kernel FD */
	if (FileIsNotOpen(file))
	{
		if (FileAccess(file) < 0)
			return (off_t) -1;
	}

	return lseek(VfdCache[file].fd, 0, SEEK_END);
}

/*
 * FileTruncate - ftruncate() the file to 'offset' bytes.
 *
 * Returns the ftruncate() result, or the negative FileAccess() result if the
 * file could not be (re)opened.  On success the tracked fileSize and
 * temporary_files_size are reduced if the tracked size was larger.
 */
int
FileTruncate(File file, off_t offset, uint32 wait_event_info)
{
	int			returnCode;

	Assert(FileIsValid(file));

	DO_DB(elog(LOG, "FileTruncate %d (%s)",
			   file, VfdCache[file].fileName));

	returnCode = FileAccess(file);
	if (returnCode < 0)
		return returnCode;

	pgstat_report_wait_start(wait_event_info);
	returnCode = ftruncate(VfdCache[file].fd, offset);
	pgstat_report_wait_end();

	if (returnCode == 0 && VfdCache[file].fileSize > offset)
	{
		/* adjust our state for truncation of a temp file */
		Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
		temporary_files_size -= VfdCache[file].fileSize - offset;
		VfdCache[file].fileSize = offset;
	}

	return returnCode;
}

/*
 * Return the pathname associated with an open file.
 *
 * The returned string points to an internal buffer, which is valid until
 * the file is closed.
*/ char * FilePathName(File file) { Assert(FileIsValid(file)); return VfdCache[file].fileName; } /* * Return the raw file descriptor of an opened file. * * The returned file descriptor will be valid until the file is closed, but * there are a lot of things that can make that happen. So the caller should * be careful not to do much of anything else before it finishes using the * returned file descriptor. */ int FileGetRawDesc(File file) { Assert(FileIsValid(file)); return VfdCache[file].fd; } /* * FileGetRawFlags - returns the file flags on open(2) */ int FileGetRawFlags(File file) { Assert(FileIsValid(file)); return VfdCache[file].fileFlags; } /* * FileGetRawMode - returns the mode bitmask passed to open(2) */ mode_t FileGetRawMode(File file) { Assert(FileIsValid(file)); return VfdCache[file].fileMode; } /* * Make room for another allocatedDescs[] array entry if needed and possible. * Returns true if an array element is available. */ static bool reserveAllocatedDesc(void) { AllocateDesc *newDescs; int newMax; /* Quick out if array already has a free slot. */ if (numAllocatedDescs < maxAllocatedDescs) return true; /* * If the array hasn't yet been created in the current process, initialize * it with FD_MINFREE / 3 elements. In many scenarios this is as many as * we will ever need, anyway. We don't want to look at max_safe_fds * immediately because set_max_safe_fds() may not have run yet. */ if (allocatedDescs == NULL) { newMax = FD_MINFREE / 3; newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc)); /* Out of memory already? Treat as fatal error. */ if (newDescs == NULL) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); allocatedDescs = newDescs; maxAllocatedDescs = newMax; return true; } /* * Consider enlarging the array beyond the initial allocation used above. * By the time this happens, max_safe_fds should be known accurately. 
* * We mustn't let allocated descriptors hog all the available FDs, and in * practice we'd better leave a reasonable number of FDs for VFD use. So * set the maximum to max_safe_fds / 3. (This should certainly be at * least as large as the initial size, FD_MINFREE / 3, so we aren't * tightening the restriction here.) Recall that "external" FDs are * allowed to consume another third of max_safe_fds. */ newMax = max_safe_fds / 3; if (newMax > maxAllocatedDescs) { newDescs = (AllocateDesc *) realloc(allocatedDescs, newMax * sizeof(AllocateDesc)); /* Treat out-of-memory as a non-fatal error. */ if (newDescs == NULL) return false; allocatedDescs = newDescs; maxAllocatedDescs = newMax; return true; } /* Can't enlarge allocatedDescs[] any more. */ return false; } /* * Routines that want to use stdio (ie, FILE*) should use AllocateFile * rather than plain fopen(). This lets fd.c deal with freeing FDs if * necessary to open the file. When done, call FreeFile rather than fclose. * * Note that files that will be open for any significant length of time * should NOT be handled this way, since they cannot share kernel file * descriptors with other files; there is grave risk of running out of FDs * if anyone locks down too many FDs. Most callers of this routine are * simply reading a config file that they will read and close immediately. * * fd.c will automatically close all files opened with AllocateFile at * transaction commit or abort; this prevents FD leakage if a routine * that calls AllocateFile is terminated prematurely by ereport(ERROR). * * Ideally this should be the *only* direct call of fopen() in the backend. */ FILE * AllocateFile(const char *name, const char *mode) { FILE *file; DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)", numAllocatedDescs, name)); /* Can we allocate another non-virtual FD? 
*/ if (!reserveAllocatedDesc()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"", maxAllocatedDescs, name))); /* Close excess kernel FDs. */ ReleaseLruFiles(); TryAgain: if ((file = fopen(name, mode)) != NULL) { AllocateDesc *desc = &allocatedDescs[numAllocatedDescs]; desc->kind = AllocateDescFile; desc->desc.file = file; desc->create_subid = GetCurrentSubTransactionId(); numAllocatedDescs++; return desc->desc.file; } if (errno == EMFILE || errno == ENFILE) { int save_errno = errno; ereport(LOG, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("out of file descriptors: %m; release and retry"))); errno = 0; if (ReleaseLruFile()) goto TryAgain; errno = save_errno; } return NULL; } /* * Open a file with OpenTransientFilePerm() and pass default file mode for * the fileMode parameter. */ int OpenTransientFile(const char *fileName, int fileFlags) { return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode); } /* * Like AllocateFile, but returns an unbuffered fd like open(2) */ int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode) { int fd; DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)", numAllocatedDescs, fileName)); /* Can we allocate another non-virtual FD? */ if (!reserveAllocatedDesc()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"", maxAllocatedDescs, fileName))); /* Close excess kernel FDs. */ ReleaseLruFiles(); fd = BasicOpenFilePerm(fileName, fileFlags, fileMode); if (fd >= 0) { AllocateDesc *desc = &allocatedDescs[numAllocatedDescs]; desc->kind = AllocateDescRawFD; desc->desc.fd = fd; desc->create_subid = GetCurrentSubTransactionId(); numAllocatedDescs++; return fd; } return -1; /* failure */ } /* * Routines that want to initiate a pipe stream should use OpenPipeStream * rather than plain popen(). This lets fd.c deal with freeing FDs if * necessary. 
When done, call ClosePipeStream rather than pclose. * * This function also ensures that the popen'd program is run with default * SIGPIPE processing, rather than the SIG_IGN setting the backend normally * uses. This ensures desirable response to, eg, closing a read pipe early. */ FILE * OpenPipeStream(const char *command, const char *mode) { FILE *file; int save_errno; DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)", numAllocatedDescs, command)); /* Can we allocate another non-virtual FD? */ if (!reserveAllocatedDesc()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"", maxAllocatedDescs, command))); /* Close excess kernel FDs. */ ReleaseLruFiles(); TryAgain: fflush(stdout); fflush(stderr); pqsignal(SIGPIPE, SIG_DFL); errno = 0; file = popen(command, mode); save_errno = errno; pqsignal(SIGPIPE, SIG_IGN); errno = save_errno; if (file != NULL) { AllocateDesc *desc = &allocatedDescs[numAllocatedDescs]; desc->kind = AllocateDescPipe; desc->desc.file = file; desc->create_subid = GetCurrentSubTransactionId(); numAllocatedDescs++; return desc->desc.file; } if (errno == EMFILE || errno == ENFILE) { ereport(LOG, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("out of file descriptors: %m; release and retry"))); if (ReleaseLruFile()) goto TryAgain; errno = save_errno; } return NULL; } /* * Free an AllocateDesc of any type. * * The argument *must* point into the allocatedDescs[] array. 
*/ static int FreeDesc(AllocateDesc *desc) { int result; /* Close the underlying object */ switch (desc->kind) { case AllocateDescFile: result = fclose(desc->desc.file); break; case AllocateDescPipe: result = pclose(desc->desc.file); break; case AllocateDescDir: result = closedir(desc->desc.dir); break; case AllocateDescRawFD: result = close(desc->desc.fd); break; default: elog(ERROR, "AllocateDesc kind not recognized"); result = 0; /* keep compiler quiet */ break; } /* Compact storage in the allocatedDescs array */ numAllocatedDescs--; *desc = allocatedDescs[numAllocatedDescs]; return result; } /* * Close a file returned by AllocateFile. * * Note we do not check fclose's return value --- it is up to the caller * to handle close errors. */ int FreeFile(FILE *file) { int i; DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs)); /* Remove file from list of allocated files, if it's present */ for (i = numAllocatedDescs; --i >= 0;) { AllocateDesc *desc = &allocatedDescs[i]; if (desc->kind == AllocateDescFile && desc->desc.file == file) return FreeDesc(desc); } /* Only get here if someone passes us a file not in allocatedDescs */ elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile"); return fclose(file); } /* * Close a file returned by OpenTransientFile. * * Note we do not check close's return value --- it is up to the caller * to handle close errors. 
*/ int CloseTransientFile(int fd) { int i; DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs)); /* Remove fd from list of allocated files, if it's present */ for (i = numAllocatedDescs; --i >= 0;) { AllocateDesc *desc = &allocatedDescs[i]; if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd) return FreeDesc(desc); } /* Only get here if someone passes us a file not in allocatedDescs */ elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile"); return close(fd); } /* * Routines that want to use (ie, DIR*) should use AllocateDir * rather than plain opendir(). This lets fd.c deal with freeing FDs if * necessary to open the directory, and with closing it after an elog. * When done, call FreeDir rather than closedir. * * Returns NULL, with errno set, on failure. Note that failure detection * is commonly left to the following call of ReadDir or ReadDirExtended; * see the comments for ReadDir. * * Ideally this should be the *only* direct call of opendir() in the backend. */ DIR * AllocateDir(const char *dirname) { DIR *dir; DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)", numAllocatedDescs, dirname)); /* Can we allocate another non-virtual FD? */ if (!reserveAllocatedDesc()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"", maxAllocatedDescs, dirname))); /* Close excess kernel FDs. 
*/ ReleaseLruFiles(); TryAgain: if ((dir = opendir(dirname)) != NULL) { AllocateDesc *desc = &allocatedDescs[numAllocatedDescs]; desc->kind = AllocateDescDir; desc->desc.dir = dir; desc->create_subid = GetCurrentSubTransactionId(); numAllocatedDescs++; return desc->desc.dir; } if (errno == EMFILE || errno == ENFILE) { int save_errno = errno; ereport(LOG, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("out of file descriptors: %m; release and retry"))); errno = 0; if (ReleaseLruFile()) goto TryAgain; errno = save_errno; } return NULL; } /* * Read a directory opened with AllocateDir, ereport'ing any error. * * This is easier to use than raw readdir() since it takes care of some * otherwise rather tedious and error-prone manipulation of errno. Also, * if you are happy with a generic error message for AllocateDir failure, * you can just do * * dir = AllocateDir(path); * while ((dirent = ReadDir(dir, path)) != NULL) * process dirent; * FreeDir(dir); * * since a NULL dir parameter is taken as indicating AllocateDir failed. * (Make sure errno isn't changed between AllocateDir and ReadDir if you * use this shortcut.) * * The pathname passed to AllocateDir must be passed to this routine too, * but it is only used for error reporting. */ struct dirent * ReadDir(DIR *dir, const char *dirname) { return ReadDirExtended(dir, dirname, ERROR); } /* * Alternate version of ReadDir that allows caller to specify the elevel * for any error report (whether it's reporting an initial failure of * AllocateDir or a subsequent directory read failure). * * If elevel < ERROR, returns NULL after any error. With the normal coding * pattern, this will result in falling out of the loop immediately as * though the directory contained no (more) entries. 
*/ struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel) { struct dirent *dent; /* Give a generic message for AllocateDir failure, if caller didn't */ if (dir == NULL) { ereport(elevel, (errcode_for_file_access(), errmsg("could not open directory \"%s\": %m", dirname))); return NULL; } errno = 0; if ((dent = readdir(dir)) != NULL) return dent; if (errno) ereport(elevel, (errcode_for_file_access(), errmsg("could not read directory \"%s\": %m", dirname))); return NULL; } /* * Close a directory opened with AllocateDir. * * Returns closedir's return value (with errno set if it's not 0). * Note we do not check the return value --- it is up to the caller * to handle close errors if wanted. * * Does nothing if dir == NULL; we assume that directory open failure was * already reported if desired. */ int FreeDir(DIR *dir) { int i; /* Nothing to do if AllocateDir failed */ if (dir == NULL) return 0; DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs)); /* Remove dir from list of allocated dirs, if it's present */ for (i = numAllocatedDescs; --i >= 0;) { AllocateDesc *desc = &allocatedDescs[i]; if (desc->kind == AllocateDescDir && desc->desc.dir == dir) return FreeDesc(desc); } /* Only get here if someone passes us a dir not in allocatedDescs */ elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir"); return closedir(dir); } /* * Close a pipe stream returned by OpenPipeStream. 
*/ int ClosePipeStream(FILE *file) { int i; DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs)); /* Remove file from list of allocated files, if it's present */ for (i = numAllocatedDescs; --i >= 0;) { AllocateDesc *desc = &allocatedDescs[i]; if (desc->kind == AllocateDescPipe && desc->desc.file == file) return FreeDesc(desc); } /* Only get here if someone passes us a file not in allocatedDescs */ elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream"); return pclose(file); } /* * closeAllVfds * * Force all VFDs into the physically-closed state, so that the fewest * possible number of kernel file descriptors are in use. There is no * change in the logical state of the VFDs. */ void closeAllVfds(void) { Index i; if (SizeVfdCache > 0) { Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */ for (i = 1; i < SizeVfdCache; i++) { if (!FileIsNotOpen(i)) LruDelete(i); } } } /* * SetTempTablespaces * * Define a list (actually an array) of OIDs of tablespaces to use for * temporary files. This list will be used until end of transaction, * unless this function is called again before then. It is caller's * responsibility that the passed-in array has adequate lifespan (typically * it'd be allocated in TopTransactionContext). * * Some entries of the array may be InvalidOid, indicating that the current * database's default tablespace should be used. */ void SetTempTablespaces(Oid *tableSpaces, int numSpaces) { Assert(numSpaces >= 0); tempTableSpaces = tableSpaces; numTempTableSpaces = numSpaces; /* * Select a random starting point in the list. This is to minimize * conflicts between backends that are most likely sharing the same list * of temp tablespaces. Note that if we create multiple temp files in the * same transaction, we'll advance circularly through the list --- this * ensures that large temporary sort files are nicely spread across all * available tablespaces. 
*/ if (numSpaces > 1) nextTempTableSpace = random() % numSpaces; else nextTempTableSpace = 0; } /* * TempTablespacesAreSet * * Returns true if SetTempTablespaces has been called in current transaction. * (This is just so that tablespaces.c doesn't need its own per-transaction * state.) */ bool TempTablespacesAreSet(void) { return (numTempTableSpaces >= 0); } /* * GetTempTablespaces * * Populate an array with the OIDs of the tablespaces that should be used for * temporary files. (Some entries may be InvalidOid, indicating that the * current database's default tablespace should be used.) At most numSpaces * entries will be filled. * Returns the number of OIDs that were copied into the output array. */ int GetTempTablespaces(Oid *tableSpaces, int numSpaces) { int i; Assert(TempTablespacesAreSet()); for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i) tableSpaces[i] = tempTableSpaces[i]; return i; } /* * GetNextTempTableSpace * * Select the next temp tablespace to use. A result of InvalidOid means * to use the current database's default tablespace. */ Oid GetNextTempTableSpace(void) { if (numTempTableSpaces > 0) { /* Advance nextTempTableSpace counter with wraparound */ if (++nextTempTableSpace >= numTempTableSpaces) nextTempTableSpace = 0; return tempTableSpaces[nextTempTableSpace]; } return InvalidOid; } /* * AtEOSubXact_Files * * Take care of subtransaction commit/abort. At abort, we close temp files * that the subtransaction may have opened. At commit, we reassign the * files that were opened to the parent subtransaction. */ void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid) { Index i; for (i = 0; i < numAllocatedDescs; i++) { if (allocatedDescs[i].create_subid == mySubid) { if (isCommit) allocatedDescs[i].create_subid = parentSubid; else { /* have to recheck the item after FreeDesc (ugly) */ FreeDesc(&allocatedDescs[i--]); } } } } /* * AtEOXact_Files * * This routine is called during transaction commit or abort. 
All still-open * per-transaction temporary file VFDs are closed, which also causes the * underlying files to be deleted (although they should've been closed already * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are * closed. We also forget any transaction-local temp tablespace list. * * The isCommit flag is used only to decide whether to emit warnings about * unclosed files. */ void AtEOXact_Files(bool isCommit) { CleanupTempFiles(isCommit, false); tempTableSpaces = NULL; numTempTableSpaces = -1; } /* * AtProcExit_Files * * on_proc_exit hook to clean up temp files during backend shutdown. * Here, we want to clean up *all* temp files including interXact ones. */ static void AtProcExit_Files(int code, Datum arg) { CleanupTempFiles(false, true); } /* * Close temporary files and delete their underlying files. * * isCommit: if true, this is normal transaction commit, and we don't * expect any remaining files; warn if there are some. * * isProcExit: if true, this is being called as the backend process is * exiting. If that's the case, we should remove all temporary files; if * that's not the case, we are being called for transaction commit/abort * and should only remove transaction-local temp files. In either case, * also clean up "allocated" stdio files, dirs and fds. */ static void CleanupTempFiles(bool isCommit, bool isProcExit) { Index i; /* * Careful here: at proc_exit we need extra cleanup, not just * xact_temporary files. */ if (isProcExit || have_xact_temporary_files) { Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */ for (i = 1; i < SizeVfdCache; i++) { unsigned short fdstate = VfdCache[i].fdstate; if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) && VfdCache[i].fileName != NULL) { /* * If we're in the process of exiting a backend process, close * all temporary files. Otherwise, only close temporary files * local to the current transaction. 
They should be closed by * the ResourceOwner mechanism already, so this is just a * debugging cross-check. */ if (isProcExit) FileClose(i); else if (fdstate & FD_CLOSE_AT_EOXACT) { elog(WARNING, "temporary file %s not closed at end-of-transaction", VfdCache[i].fileName); FileClose(i); } } } have_xact_temporary_files = false; } /* Complain if any allocated files remain open at commit. */ if (isCommit && numAllocatedDescs > 0) elog(WARNING, "%d temporary files and directories not closed at end-of-transaction", numAllocatedDescs); /* Clean up "allocated" stdio files, dirs and fds. */ while (numAllocatedDescs > 0) FreeDesc(&allocatedDescs[0]); } /* * Remove temporary and temporary relation files left over from a prior * postmaster session * * This should be called during postmaster startup. It will forcibly * remove any leftover files created by OpenTemporaryFile and any leftover * temporary relation files created by mdcreate. * * During post-backend-crash restart cycle, this routine is called when * remove_temp_files_after_crash GUC is enabled. Multiple crashes while * queries are using temp files could result in useless storage usage that can * only be reclaimed by a service restart. The argument against enabling it is * that someone might want to examine the temporary files for debugging * purposes. This does however mean that OpenTemporaryFile had better allow for * collision with an existing temp file name. * * NOTE: this function and its subroutines generally report syscall failures * with ereport(LOG) and keep going. Removing temp files is not so critical * that we should fail to start the database when we can't do it. 
*/
void
RemovePgTempFiles(void)
{
	char		path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
	DIR		   *tblspc_dir;
	struct dirent *tblspc_ent;

	/*
	 * Start with pg_default ($PGDATA/base): its temp-files directory, then
	 * temp relation files in the per-database directories.
	 */
	snprintf(path, sizeof(path), "base/%s", PG_TEMP_FILES_DIR);
	RemovePgTempFilesInDir(path, true, false);
	RemovePgTempRelationFiles("base");

	/*
	 * Then do the same for every non-default tablespace listed under
	 * pg_tblspc.
	 */
	tblspc_dir = AllocateDir("pg_tblspc");

	for (;;)
	{
		tblspc_ent = ReadDirExtended(tblspc_dir, "pg_tblspc", LOG);
		if (tblspc_ent == NULL)
			break;

		if (strcmp(tblspc_ent->d_name, ".") == 0 ||
			strcmp(tblspc_ent->d_name, "..") == 0)
			continue;

		/* The tablespace's temp-files directory ... */
		snprintf(path, sizeof(path), "pg_tblspc/%s/%s/%s",
				 tblspc_ent->d_name, TABLESPACE_VERSION_DIRECTORY,
				 PG_TEMP_FILES_DIR);
		RemovePgTempFilesInDir(path, true, false);

		/* ... and temp relation files in its per-database directories */
		snprintf(path, sizeof(path), "pg_tblspc/%s/%s",
				 tblspc_ent->d_name, TABLESPACE_VERSION_DIRECTORY);
		RemovePgTempRelationFiles(path);
	}

	FreeDir(tblspc_dir);

	/*
	 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
	 * DataDir as well.  However, that is *not* cleaned here because doing so
	 * would create a race condition.  It's done separately, earlier in
	 * postmaster startup.
	 */
}

/*
 * Process one pgsql_tmp directory for RemovePgTempFiles.
 *
 * If missing_ok is true, it's all right for the named directory to not exist.
 * Any other problem results in a LOG message.  (missing_ok should be true at
 * the top level, since pgsql_tmp directories are not created until needed.)
 *
 * At the top level, this should be called with unlink_all = false, so that
 * only files matching the temporary name prefix will be unlinked.  When
 * recursing it will be called with unlink_all = true to unlink everything
 * under a top-level temporary directory.
 *
 * (These two flags could be replaced by one, but it seems clearer to keep
 * them separate.)
*/
void
RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
{
	DIR		   *temp_dir;
	struct dirent *temp_de;
	char		rm_path[MAXPGPATH * 2];

	temp_dir = AllocateDir(tmpdirname);

	/* A nonexistent directory is silently accepted when missing_ok is set */
	if (temp_dir == NULL && errno == ENOENT && missing_ok)
		return;

	/*
	 * For any other AllocateDir failure temp_dir is NULL here;
	 * ReadDirExtended then logs the failure and returns NULL, so the loop
	 * body is skipped.
	 */
	while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
	{
		if (strcmp(temp_de->d_name, ".") == 0 ||
			strcmp(temp_de->d_name, "..") == 0)
			continue;

		snprintf(rm_path, sizeof(rm_path), "%s/%s",
				 tmpdirname, temp_de->d_name);

		/*
		 * Remove the entry if we were told to remove everything, or if its
		 * name starts with the temporary-file prefix.
		 */
		if (unlink_all ||
			strncmp(temp_de->d_name,
					PG_TEMP_FILE_PREFIX,
					strlen(PG_TEMP_FILE_PREFIX)) == 0)
		{
			struct stat statbuf;

			/* lstat, so a symlink is examined itself rather than followed */
			if (lstat(rm_path, &statbuf) < 0)
			{
				ereport(LOG,
						(errcode_for_file_access(),
						 errmsg("could not stat file \"%s\": %m", rm_path)));
				continue;
			}

			if (S_ISDIR(statbuf.st_mode))
			{
				/* recursively remove contents, then directory itself */
				RemovePgTempFilesInDir(rm_path, false, true);

				if (rmdir(rm_path) < 0)
					ereport(LOG,
							(errcode_for_file_access(),
							 errmsg("could not remove directory \"%s\": %m",
									rm_path)));
			}
			else
			{
				if (unlink(rm_path) < 0)
					ereport(LOG,
							(errcode_for_file_access(),
							 errmsg("could not remove file \"%s\": %m",
									rm_path)));
			}
		}
		else
			ereport(LOG,
					(errmsg("unexpected file found in temporary-files directory: \"%s\"",
							rm_path)));
	}

	FreeDir(temp_dir);
}

/*
 * Process one tablespace directory, look for per-DB subdirectories
 *
 * Each entry of tsdirname whose name consists only of digits is passed to
 * RemovePgTempRelationFilesInDbspace.  Failures to open or read the
 * directory are reported at LOG level by ReadDirExtended.
 */
static void
RemovePgTempRelationFiles(const char *tsdirname)
{
	DIR		   *ts_dir;
	struct dirent *de;
	char		dbspace_path[MAXPGPATH * 2];

	ts_dir = AllocateDir(tsdirname);

	while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
	{
		/*
		 * We're only interested in the per-database directories, which have
		 * numeric names.  Note that this code will also (properly) ignore "."
		 * and "..".
		 */
		if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
			continue;

		snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
				 tsdirname, de->d_name);
		RemovePgTempRelationFilesInDbspace(dbspace_path);
	}

	FreeDir(ts_dir);
}

/*
 * Process one per-dbspace directory for RemovePgTempRelationFiles
 *
 * Unlinks every entry of dbspacedirname whose name is accepted by
 * looks_like_temp_rel_name.  Failures are reported at LOG level and the scan
 * continues.
 */
static void
RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
{
	DIR		   *dbspace_dir;
	struct dirent *de;
	char		rm_path[MAXPGPATH * 2];

	dbspace_dir = AllocateDir(dbspacedirname);

	while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
	{
		if (!looks_like_temp_rel_name(de->d_name))
			continue;

		snprintf(rm_path, sizeof(rm_path), "%s/%s",
				 dbspacedirname, de->d_name);

		if (unlink(rm_path) < 0)
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not remove file \"%s\": %m",
							rm_path)));
	}

	FreeDir(dbspace_dir);
}

/*
 * Does "name" look like the file name of a temporary relation?
 *
 * Accepted forms are t<digits>_<digits>, optionally followed by _<forkname>
 * and/or .<digits> (a segment number), with nothing after that.
 *
 * Returns true if the whole name matches, false otherwise.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		/*
		 * A non-positive result is treated as "no valid fork name here";
		 * otherwise the result is used as the number of characters to skip
		 * (presumably the fork name's length --- verify against
		 * forkname_chars).
		 */
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		if (forkchar <= 0)
			return false;
		pos += forkchar + 1;
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar counts the '.' plus the digits that follow it */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}

#ifdef HAVE_SYNCFS
/*
 * Open "path" read-only and call syncfs() on the resulting descriptor.
 *
 * Failure to open the path or to syncfs() it is reported at LOG level; the
 * result of closing the descriptor is not checked.
 */
static void
do_syncfs(const char *path)
{
	int			fd;

	fd = OpenTransientFile(path, O_RDONLY);
	if (fd < 0)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
		return;
	}
	if (syncfs(fd) < 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not synchronize file system for file \"%s\": %m", path)));
	CloseTransientFile(fd);
}
#endif

/*
 * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
 * all potential filesystems, depending on recovery_init_sync_method setting.
 *
 * We fsync regular files and directories wherever they are, but we
 * follow symlinks only for pg_wal and immediately under pg_tblspc.
 * Other symlinks are presumed to point at files we're not responsible
 * for fsyncing, and might not have privileges to write at all.
 *
 * Errors are logged but not considered fatal; that's because this is used
 * only during database startup, to deal with the possibility that there are
 * issued-but-unsynced writes pending against the data directory.  We want to
 * ensure that such writes reach disk before anything that's done in the new
 * run.  However, aborting on error would result in failure to start for
 * harmless cases such as read-only files in the data directory, and that's
 * not good either.
 *
 * Note that if we previously crashed due to a PANIC on fsync(), we'll be
 * rewriting all changes again during recovery.
 *
 * Note we assume we're chdir'd into PGDATA to begin with.
 */
void
SyncDataDirectory(void)
{
	bool		xlog_is_symlink;

	/* We can skip this whole thing if fsync is disabled. */
	if (!enableFsync)
		return;

	/*
	 * If pg_wal is a symlink, we'll need to recurse into it separately,
	 * because the first walkdir below will ignore it.
	 */
	xlog_is_symlink = false;

#ifndef WIN32
	{
		struct stat st;

		if (lstat("pg_wal", &st) < 0)
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not stat file \"%s\": %m",
							"pg_wal")));
		else if (S_ISLNK(st.st_mode))
			xlog_is_symlink = true;
	}
#else
	if (pgwin32_is_junction("pg_wal"))
		xlog_is_symlink = true;
#endif

#ifdef HAVE_SYNCFS
	if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_SYNCFS)
	{
		DIR		   *dir;
		struct dirent *de;

		/*
		 * On Linux, we don't have to open every single file one by one.  We
		 * can use syncfs() to sync whole filesystems.  We only expect
		 * filesystem boundaries to exist where we tolerate symlinks, namely
		 * pg_wal and the tablespaces, so we call syncfs() for each of those
		 * directories.
		 */

		/* Sync the top level pgdata directory. */
		do_syncfs(".");
		/* If any tablespaces are configured, sync each of those. */
		dir = AllocateDir("pg_tblspc");
		while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
		{
			char		path[MAXPGPATH];

			if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
				continue;

			snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
			do_syncfs(path);
		}
		FreeDir(dir);
		/* If pg_wal is a symlink, process that too. */
		if (xlog_is_symlink)
			do_syncfs("pg_wal");
		/* The syncfs method is complete; skip the per-file walk below */
		return;
	}
#endif							/* HAVE_SYNCFS */

	/*
	 * If possible, hint to the kernel that we're soon going to fsync the data
	 * directory and its contents.  Errors in this step are even less
	 * interesting than normal, so log them only at DEBUG1.
	 */
#ifdef PG_FLUSH_DATA_WORKS
	walkdir(".", pre_sync_fname, false, DEBUG1);
	if (xlog_is_symlink)
		walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
	walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
#endif

	/*
	 * Now we do the fsync()s in the same order.
	 *
	 * The main call ignores symlinks, so in addition to specially processing
	 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
	 * process_symlinks = true.  Note that if there are any plain directories
	 * in pg_tblspc, they'll get fsync'd twice.  That's not an expected case
	 * so we don't worry about optimizing it.
	 */
	walkdir(".", datadir_fsync_fname, false, LOG);
	if (xlog_is_symlink)
		walkdir("pg_wal", datadir_fsync_fname, false, LOG);
	walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
}

/*
 * walkdir: recursively walk a directory, applying the action to each
 * regular file and directory (including the named directory itself).
 *
 * If process_symlinks is true, the action and recursion are also applied
 * to regular files and directories that are pointed to by symlinks in the
 * given directory; otherwise symlinks are ignored.  Symlinks are always
 * ignored in subdirectories, ie we intentionally don't pass down the
 * process_symlinks flag to recursive calls.
 *
 * Errors are reported at level elevel, which might be ERROR or less.
 *
 * The action is invoked as action(fname, isdir, elevel): with isdir = false
 * for each regular file, and with isdir = true for a directory once its
 * contents have been processed.
 *
 * See also walkdir in file_utils.c, which is a frontend version of this
 * logic.
 */
static void
walkdir(const char *path,
		void (*action) (const char *fname, bool isdir, int elevel),
		bool process_symlinks,
		int elevel)
{
	DIR		   *dir;
	struct dirent *de;

	dir = AllocateDir(path);

	while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
	{
		char		subpath[MAXPGPATH * 2];

		CHECK_FOR_INTERRUPTS();

		if (strcmp(de->d_name, ".") == 0 ||
			strcmp(de->d_name, "..") == 0)
			continue;

		snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);

		switch (get_dirent_type(subpath, de, process_symlinks, elevel))
		{
			case PGFILETYPE_REG:
				(*action) (subpath, false, elevel);
				break;
			case PGFILETYPE_DIR:
				/* process_symlinks is deliberately not passed down */
				walkdir(subpath, action, false, elevel);
				break;
			default:

				/*
				 * Errors are already reported directly by get_dirent_type(),
				 * and any remaining symlinks and unknown file types are
				 * ignored.
				 */
				break;
		}
	}

	FreeDir(dir);				/* we ignore any error here */

	/*
	 * It's important to fsync the destination directory itself as individual
	 * file fsyncs don't guarantee that the directory entry for the file is
	 * synced.  However, skip this if AllocateDir failed; the action function
	 * might not be robust against that.
	 */
	if (dir)
		(*action) (path, true, elevel);
}


/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Ignores errors trying to open unreadable files, and logs other errors at a
 * caller-specified level.
 *
 * Has the signature of a walkdir action; directories are skipped.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* Unreadable files are skipped without comment */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */

/*
 * walkdir action used by SyncDataDirectory: fsync one file or directory via
 * fsync_fname_ext, discarding its result.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * We want to silently ignore errors about unreadable files.  Pass that
	 * desire on to fsync_fname_ext().
	 */
	fsync_fname_ext(fname, isdir, true, elevel);
}

/*
 * Remove one file or directory.  Has the signature of a walkdir action.
 *
 * A directory is removed with rmdir(); failure is reported at elevel unless
 * errno is ENOENT.  Anything else is handed to PathNameDeleteTemporaryFile.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}

/*
 * fsync_fname_ext -- Try to fsync a file or directory
 *
 * If ignore_perm is true, ignore errors upon trying to open unreadable
 * files.  Logs other errors at a caller-specified level.
 *
 * Returns 0 if the operation succeeded or the failure was one of the cases
 * deliberately ignored below, -1 otherwise.
 */
int
fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
{
	int			fd;
	int			flags;
	int			returncode;

	/*
	 * Some OSs require directories to be opened read-only whereas other
	 * systems don't allow us to fsync files opened read-only; so we need both
	 * cases here.  Using O_RDWR will cause us to fail to fsync files that are
	 * not writable by our userid, but we assume that's OK.
	 */
	flags = PG_BINARY;
	if (!isdir)
		flags |= O_RDWR;
	else
		flags |= O_RDONLY;

	fd = OpenTransientFile(fname, flags);

	/*
	 * Some OSs don't allow us to open directories at all (Windows returns
	 * EACCES), just ignore the error in that case.  If desired, also silently
	 * ignore errors about unreadable files.  Log others.
	 */
	if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
		return 0;
	else if (fd < 0 && ignore_perm && errno == EACCES)
		return 0;
	else if (fd < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return -1;
	}

	returncode = pg_fsync(fd);

	/*
	 * Some OSes don't allow us to fsync directories at all, so we can ignore
	 * those errors. Anything else needs to be logged.
	 */
	if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
	{
		int			save_errno;

		/* close file upon error, might not be in transaction context */
		save_errno = errno;
		(void) CloseTransientFile(fd);
		/* restore errno so that %m below describes the fsync failure */
		errno = save_errno;

		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", fname)));
		return -1;
	}

	if (CloseTransientFile(fd) != 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
		return -1;
	}

	return 0;
}

/*
 * fsync_parent_path -- fsync the parent path of a file or directory
 *
 * This is aimed at making file operations persistent on disk in case of
 * an OS crash or power failure.
 *
 * Returns 0 if fsync_fname_ext reported success for the parent directory,
 * -1 otherwise.
 */
static int
fsync_parent_path(const char *fname, int elevel)
{
	char		parentpath[MAXPGPATH];

	strlcpy(parentpath, fname, MAXPGPATH);
	get_parent_directory(parentpath);

	/*
	 * get_parent_directory() returns an empty string if the input argument is
	 * just a file name (see comments in path.c), so handle that as being the
	 * current directory.
	 */
	if (strlen(parentpath) == 0)
		strlcpy(parentpath, ".", MAXPGPATH);

	/* ignore_perm is false here, so permission failures are reported too */
	if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
		return -1;

	return 0;
}

/*
 * Create a PostgreSQL data sub-directory
 *
 * The data directory itself, and most of its sub-directories, are created at
 * initdb time, but we do have some occasions when we create directories in
 * the backend (CREATE TABLESPACE, for example).  In those cases, we want to
 * make sure that those directories are created consistently.  Today, that means
 * making sure that the created directory has the correct permissions, which is
 * what pg_dir_create_mode tracks for us.
 *
 * Note that we also set the umask() based on what we understand the correct
 * permissions to be (see file_perm.c).
 *
 * For permissions other than the default, mkdir() can be used directly, but
 * be sure to consider carefully such cases -- a sub-directory with incorrect
 * permissions in a PostgreSQL data directory could cause backups and other
 * processes to fail.
 *
 * Returns the result of mkdir().
 */
int
MakePGDirectory(const char *directoryName)
{
	return mkdir(directoryName, pg_dir_create_mode);
}

/*
 * Return the passed-in error level, or PANIC if data_sync_retry is off.
 *
 * Failure to fsync any data file is cause for immediate panic, unless
 * data_sync_retry is enabled.  Data may have been written to the operating
 * system and removed from our buffer pool already, and if we are running on
 * an operating system that forgets dirty data on write-back failure, there
 * may be only one copy of the data remaining: in the WAL.  A later attempt to
 * fsync again might falsely report success.  Therefore we must not allow any
 * further checkpoints to be attempted.  data_sync_retry can in theory be
 * enabled on systems known not to drop dirty buffered data on write-back
 * failure (with the likely outcome that checkpoints will continue to fail
 * until the underlying problem is fixed).
*
 * Any code that reports a failure from fsync() or related functions should
 * filter the error level with this function.
 */
int
data_sync_elevel(int elevel)
{
	return data_sync_retry ? elevel : PANIC;
}

/*
 * A convenience wrapper for pg_pwritev() that retries on partial write.  If an
 * error is returned, it is unspecified how much has been written.
 *
 * fd, iov, iovcnt and offset are passed to pg_pwritev(); the caller's iov
 * array is never modified (retries work on a private copy).
 *
 * Returns the total number of bytes written, which on success covers all of
 * the iovecs, or -1 with errno set:
 *	- EINVAL if iovcnt exceeds PG_IOV_MAX,
 *	- ENOSPC if pg_pwritev() wrote nothing although data remained,
 *	- otherwise whatever pg_pwritev() left in errno.
 */
ssize_t
pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
{
	struct iovec iov_copy[PG_IOV_MAX];
	ssize_t		sum = 0;
	ssize_t		part;

	/* We'd better have space to make a copy, in case we need to retry. */
	if (iovcnt > PG_IOV_MAX)
	{
		errno = EINVAL;
		return -1;
	}

	for (;;)
	{
		ssize_t		written;

		/* Write as much as we can. */
		part = pg_pwritev(fd, iov, iovcnt, offset);
		if (part < 0)
			return -1;

#ifdef SIMULATE_SHORT_WRITE
		part = Min(part, 4096);
#endif

		/* Remember this call's byte count; "part" is consumed below. */
		written = part;

		/* Count our progress. */
		sum += part;
		offset += part;

		/* Step over iovecs that are done. */
		while (iovcnt > 0 && iov->iov_len <= part)
		{
			part -= iov->iov_len;
			++iov;
			--iovcnt;
		}

		/* Are they all done? */
		if (iovcnt == 0)
		{
			/* We don't expect the kernel to write more than requested. */
			Assert(part == 0);
			break;
		}

		/*
		 * Data remains, yet the last call wrote nothing and reported no
		 * error.  Retrying would most likely just repeat that forever, so
		 * fail instead.  As elsewhere when a write comes up short without
		 * setting errno, assume the problem is lack of disk space.
		 */
		if (written == 0)
		{
			errno = ENOSPC;
			return -1;
		}

		/*
		 * Move whatever's left to the front of our mutable copy and adjust
		 * the leading iovec.  memmove is required because after the first
		 * retry "iov" points into iov_copy itself.
		 */
		Assert(iovcnt > 0);
		memmove(iov_copy, iov, sizeof(*iov) * iovcnt);
		Assert(iov->iov_len > part);
		iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part;
		iov_copy[0].iov_len -= part;
		iov = iov_copy;
	}

	return sum;
}