Diffstat (limited to 'src/backend/storage/sync/sync.c')
-rw-r--r-- | src/backend/storage/sync/sync.c | 624
1 files changed, 624 insertions, 0 deletions
diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c new file mode 100644 index 0000000..04fcb06 --- /dev/null +++ b/src/backend/storage/sync/sync.c @@ -0,0 +1,624 @@ +/*------------------------------------------------------------------------- + * + * sync.c + * File synchronization management code. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/sync/sync.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <unistd.h> +#include <fcntl.h> +#include <sys/file.h> + +#include "access/commit_ts.h" +#include "access/clog.h" +#include "access/multixact.h" +#include "access/xlog.h" +#include "access/xlogutils.h" +#include "commands/tablespace.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "portability/instr_time.h" +#include "postmaster/bgwriter.h" +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/md.h" +#include "utils/hsearch.h" +#include "utils/inval.h" +#include "utils/memutils.h" + +/* + * In some contexts (currently, standalone backends and the checkpointer) + * we keep track of pending fsync operations: we need to remember all relation + * segments that have been written since the last checkpoint, so that we can + * fsync them down to disk before completing the next checkpoint. This hash + * table remembers the pending operations. We use a hash table mostly as + * a convenient way of merging duplicate requests. + * + * We use a similar mechanism to remember no-longer-needed files that can + * be deleted after the next checkpoint, but we use a linked list instead of + * a hash table, because we don't expect there to be any duplicate requests. + * + * These mechanisms are only used for non-temp relations; we never fsync + * temp rels, nor do we need to postpone their deletion (see comments in + * mdunlink). + * + * (Regular backends do not track pending operations locally, but forward + * them to the checkpointer.) + */ +typedef uint16 CycleCtr; /* can be any convenient integer size */ + +typedef struct +{ + FileTag tag; /* identifies handler and file */ + CycleCtr cycle_ctr; /* sync_cycle_ctr of oldest request */ + bool canceled; /* canceled is true if we canceled "recently" */ +} PendingFsyncEntry; + +typedef struct +{ + FileTag tag; /* identifies handler and file */ + CycleCtr cycle_ctr; /* checkpoint_cycle_ctr when request was made */ + bool canceled; /* true if request has been canceled */ +} PendingUnlinkEntry; + +static HTAB *pendingOps = NULL; +static List *pendingUnlinks = NIL; +static MemoryContext pendingOpsCxt; /* context for the above */ + +static CycleCtr sync_cycle_ctr = 0; +static CycleCtr checkpoint_cycle_ctr = 0; + +/* Intervals for calling AbsorbSyncRequests */ +#define FSYNCS_PER_ABSORB 10 +#define UNLINKS_PER_ABSORB 10 + +/* + * Function pointers for handling sync and unlink requests. + */ +typedef struct SyncOps +{ + int (*sync_syncfiletag) (const FileTag *ftag, char *path); + int (*sync_unlinkfiletag) (const FileTag *ftag, char *path); + bool (*sync_filetagmatches) (const FileTag *ftag, + const FileTag *candidate); +} SyncOps; + +/* + * These indexes must correspond to the values of the SyncRequestHandler enum. 
+ */ +static const SyncOps syncsw[] = { + /* magnetic disk */ + [SYNC_HANDLER_MD] = { + .sync_syncfiletag = mdsyncfiletag, + .sync_unlinkfiletag = mdunlinkfiletag, + .sync_filetagmatches = mdfiletagmatches + }, + /* pg_xact */ + [SYNC_HANDLER_CLOG] = { + .sync_syncfiletag = clogsyncfiletag + }, + /* pg_commit_ts */ + [SYNC_HANDLER_COMMIT_TS] = { + .sync_syncfiletag = committssyncfiletag + }, + /* pg_multixact/offsets */ + [SYNC_HANDLER_MULTIXACT_OFFSET] = { + .sync_syncfiletag = multixactoffsetssyncfiletag + }, + /* pg_multixact/members */ + [SYNC_HANDLER_MULTIXACT_MEMBER] = { + .sync_syncfiletag = multixactmemberssyncfiletag + } +}; + +/* + * Initialize data structures for the file sync tracking. + */ +void +InitSync(void) +{ + /* + * Create pending-operations hashtable if we need it. Currently, we need + * it if we are standalone (not under a postmaster) or if we are a + * checkpointer auxiliary process. + */ + if (!IsUnderPostmaster || AmCheckpointerProcess()) + { + HASHCTL hash_ctl; + + /* + * XXX: The checkpointer needs to add entries to the pending ops table + * when absorbing fsync requests. That is done within a critical + * section, which isn't usually allowed, but we make an exception. It + * means that there's a theoretical possibility that you run out of + * memory while absorbing fsync requests, which leads to a PANIC. + * Fortunately the hash table is small so that's unlikely to happen in + * practice. + */ + pendingOpsCxt = AllocSetContextCreate(TopMemoryContext, + "Pending ops context", + ALLOCSET_DEFAULT_SIZES); + MemoryContextAllowInCriticalSection(pendingOpsCxt, true); + + hash_ctl.keysize = sizeof(FileTag); + hash_ctl.entrysize = sizeof(PendingFsyncEntry); + hash_ctl.hcxt = pendingOpsCxt; + pendingOps = hash_create("Pending Ops Table", + 100L, + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + pendingUnlinks = NIL; + } +} + +/* + * SyncPreCheckpoint() -- Do pre-checkpoint work + * + * To distinguish unlink requests that arrived before this checkpoint + * started from those that arrived during the checkpoint, we use a cycle + * counter similar to the one we use for fsync requests. That cycle + * counter is incremented here. + * + * This must be called *before* the checkpoint REDO point is determined. + * That ensures that we won't delete files too soon. Since this calls + * AbsorbSyncRequests(), which performs memory allocations, it cannot be + * called within a critical section. + * + * Note that we can't do anything here that depends on the assumption + * that the checkpoint will be completed. + */ +void +SyncPreCheckpoint(void) +{ + /* + * Operations such as DROP TABLESPACE assume that the next checkpoint will + * process all recently forwarded unlink requests, but if they aren't + * absorbed prior to advancing the cycle counter, they won't be processed + * until a future checkpoint. The following absorb ensures that any + * unlink requests forwarded before the checkpoint began will be processed + * in the current checkpoint. + */ + AbsorbSyncRequests(); + + /* + * Any unlink requests arriving after this point will be assigned the next + * cycle counter, and won't be unlinked until next checkpoint. + */ + checkpoint_cycle_ctr++; +} + +/* + * SyncPostCheckpoint() -- Do post-checkpoint work + * + * Remove any lingering files that can now be safely removed. 
+ */ +void +SyncPostCheckpoint(void) +{ + int absorb_counter; + ListCell *lc; + + absorb_counter = UNLINKS_PER_ABSORB; + foreach(lc, pendingUnlinks) + { + PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(lc); + char path[MAXPGPATH]; + + /* Skip over any canceled entries */ + if (entry->canceled) + continue; + + /* + * New entries are appended to the end, so if the entry is new we've + * reached the end of old entries. + * + * Note: if just the right number of consecutive checkpoints fail, we + * could be fooled here by cycle_ctr wraparound. However, the only + * consequence is that we'd delay unlinking for one more checkpoint, + * which is perfectly tolerable. + */ + if (entry->cycle_ctr == checkpoint_cycle_ctr) + break; + + /* Unlink the file */ + if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag, + path) < 0) + { + /* + * There's a race condition, when the database is dropped at the + * same time that we process the pending unlink requests. If the + * DROP DATABASE deletes the file before we do, we will get ENOENT + * here. rmtree() also has to ignore ENOENT errors, to deal with + * the possibility that we delete the file first. + */ + if (errno != ENOENT) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", path))); + } + + /* Mark the list entry as canceled, just in case */ + entry->canceled = true; + + /* + * As in ProcessSyncRequests, we don't want to stop absorbing fsync + * requests for a long time when there are many deletions to be done. + * We can safely call AbsorbSyncRequests() at this point in the loop. + */ + if (--absorb_counter <= 0) + { + AbsorbSyncRequests(); + absorb_counter = UNLINKS_PER_ABSORB; + } + } + + /* + * If we reached the end of the list, we can just remove the whole list + * (remembering to pfree all the PendingUnlinkEntry objects). Otherwise, + * we must keep the entries at or after "lc". + */ + if (lc == NULL) + { + list_free_deep(pendingUnlinks); + pendingUnlinks = NIL; + } + else + { + int ntodelete = list_cell_number(pendingUnlinks, lc); + + for (int i = 0; i < ntodelete; i++) + pfree(list_nth(pendingUnlinks, i)); + + pendingUnlinks = list_delete_first_n(pendingUnlinks, ntodelete); + } +} + +/* + * ProcessSyncRequests() -- Process queued fsync requests. + */ +void +ProcessSyncRequests(void) +{ + static bool sync_in_progress = false; + + HASH_SEQ_STATUS hstat; + PendingFsyncEntry *entry; + int absorb_counter; + + /* Statistics on sync times */ + int processed = 0; + instr_time sync_start, + sync_end, + sync_diff; + uint64 elapsed; + uint64 longest = 0; + uint64 total_elapsed = 0; + + /* + * This is only called during checkpoints, and checkpoints should only + * occur in processes that have created a pendingOps. + */ + if (!pendingOps) + elog(ERROR, "cannot sync without a pendingOps table"); + + /* + * If we are in the checkpointer, the sync had better include all fsync + * requests that were queued by backends up to this point. The tightest + * race condition that could occur is that a buffer that must be written + * and fsync'd for the checkpoint could have been dumped by a backend just + * before it was visited by BufferSync(). We know the backend will have + * queued an fsync request before clearing the buffer's dirtybit, so we + * are safe as long as we do an Absorb after completing BufferSync(). 
+ */ + AbsorbSyncRequests(); + + /* + * To avoid excess fsync'ing (in the worst case, maybe a never-terminating + * checkpoint), we want to ignore fsync requests that are entered into the + * hashtable after this point --- they should be processed next time, + * instead. We use sync_cycle_ctr to tell old entries apart from new + * ones: new ones will have cycle_ctr equal to the incremented value of + * sync_cycle_ctr. + * + * In normal circumstances, all entries present in the table at this point + * will have cycle_ctr exactly equal to the current (about to be old) + * value of sync_cycle_ctr. However, if we fail partway through the + * fsync'ing loop, then older values of cycle_ctr might remain when we + * come back here to try again. Repeated checkpoint failures would + * eventually wrap the counter around to the point where an old entry + * might appear new, causing us to skip it, possibly allowing a checkpoint + * to succeed that should not have. To forestall wraparound, any time the + * previous ProcessSyncRequests() failed to complete, run through the + * table and forcibly set cycle_ctr = sync_cycle_ctr. + * + * Think not to merge this loop with the main loop, as the problem is + * exactly that that loop may fail before having visited all the entries. + * From a performance point of view it doesn't matter anyway, as this path + * will never be taken in a system that's functioning normally. + */ + if (sync_in_progress) + { + /* prior try failed, so update any stale cycle_ctr values */ + hash_seq_init(&hstat, pendingOps); + while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL) + { + entry->cycle_ctr = sync_cycle_ctr; + } + } + + /* Advance counter so that new hashtable entries are distinguishable */ + sync_cycle_ctr++; + + /* Set flag to detect failure if we don't reach the end of the loop */ + sync_in_progress = true; + + /* Now scan the hashtable for fsync requests to process */ + absorb_counter = FSYNCS_PER_ABSORB; + hash_seq_init(&hstat, pendingOps); + while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL) + { + int failures; + + /* + * If the entry is new then don't process it this time; it is new. + * Note "continue" bypasses the hash-remove call at the bottom of the + * loop. + */ + if (entry->cycle_ctr == sync_cycle_ctr) + continue; + + /* Else assert we haven't missed it */ + Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr); + + /* + * If fsync is off then we don't have to bother opening the file at + * all. (We delay checking until this point so that changing fsync on + * the fly behaves sensibly.) + */ + if (enableFsync) + { + /* + * If in checkpointer, we want to absorb pending requests every so + * often to prevent overflow of the fsync request queue. It is + * unspecified whether newly-added entries will be visited by + * hash_seq_search, but we don't care since we don't need to + * process them anyway. + */ + if (--absorb_counter <= 0) + { + AbsorbSyncRequests(); + absorb_counter = FSYNCS_PER_ABSORB; + } + + /* + * The fsync table could contain requests to fsync segments that + * have been deleted (unlinked) by the time we get to them. Rather + * than just hoping an ENOENT (or EACCES on Windows) error can be + * ignored, what we do on error is absorb pending requests and + * then retry. Since mdunlink() queues a "cancel" message before + * actually unlinking, the fsync request is guaranteed to be + * marked canceled after the absorb if it really was this case. 
+ * DROP DATABASE likewise has to tell us to forget fsync requests + * before it starts deletions. + */ + for (failures = 0; !entry->canceled; failures++) + { + char path[MAXPGPATH]; + + INSTR_TIME_SET_CURRENT(sync_start); + if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag, + path) == 0) + { + /* Success; update statistics about sync timing */ + INSTR_TIME_SET_CURRENT(sync_end); + sync_diff = sync_end; + INSTR_TIME_SUBTRACT(sync_diff, sync_start); + elapsed = INSTR_TIME_GET_MICROSEC(sync_diff); + if (elapsed > longest) + longest = elapsed; + total_elapsed += elapsed; + processed++; + + if (log_checkpoints) + elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f ms", + processed, + path, + (double) elapsed / 1000); + + break; /* out of retry loop */ + } + + /* + * It is possible that the relation has been dropped or + * truncated since the fsync request was entered. Therefore, + * allow ENOENT, but only if we didn't fail already on this + * file. + */ + if (!FILE_POSSIBLY_DELETED(errno) || failures > 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + path))); + else + ereport(DEBUG1, + (errcode_for_file_access(), + errmsg_internal("could not fsync file \"%s\" but retrying: %m", + path))); + + /* + * Absorb incoming requests and check to see if a cancel + * arrived for this relation fork. + */ + AbsorbSyncRequests(); + absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */ + } /* end retry loop */ + } + + /* We are done with this entry, remove it */ + if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL) + elog(ERROR, "pendingOps corrupted"); + } /* end loop over hashtable entries */ + + /* Return sync performance metrics for report at checkpoint end */ + CheckpointStats.ckpt_sync_rels = processed; + CheckpointStats.ckpt_longest_sync = longest; + CheckpointStats.ckpt_agg_sync_time = total_elapsed; + + /* Flag successful completion of ProcessSyncRequests */ + sync_in_progress = false; +} + +/* + * RememberSyncRequest() -- callback from checkpointer side of sync request + * + * We stuff fsync requests into the local hash table for execution + * during the checkpointer's next checkpoint. UNLINK requests go into a + * separate linked list, however, because they get processed separately. + * + * See sync.h for more information on the types of sync requests supported. 
+ */ +void +RememberSyncRequest(const FileTag *ftag, SyncRequestType type) +{ + Assert(pendingOps); + + if (type == SYNC_FORGET_REQUEST) + { + PendingFsyncEntry *entry; + + /* Cancel previously entered request */ + entry = (PendingFsyncEntry *) hash_search(pendingOps, + ftag, + HASH_FIND, + NULL); + if (entry != NULL) + entry->canceled = true; + } + else if (type == SYNC_FILTER_REQUEST) + { + HASH_SEQ_STATUS hstat; + PendingFsyncEntry *pfe; + ListCell *cell; + + /* Cancel matching fsync requests */ + hash_seq_init(&hstat, pendingOps); + while ((pfe = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL) + { + if (pfe->tag.handler == ftag->handler && + syncsw[ftag->handler].sync_filetagmatches(ftag, &pfe->tag)) + pfe->canceled = true; + } + + /* Cancel matching unlink requests */ + foreach(cell, pendingUnlinks) + { + PendingUnlinkEntry *pue = (PendingUnlinkEntry *) lfirst(cell); + + if (pue->tag.handler == ftag->handler && + syncsw[ftag->handler].sync_filetagmatches(ftag, &pue->tag)) + pue->canceled = true; + } + } + else if (type == SYNC_UNLINK_REQUEST) + { + /* Unlink request: put it in the linked list */ + MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); + PendingUnlinkEntry *entry; + + entry = palloc(sizeof(PendingUnlinkEntry)); + entry->tag = *ftag; + entry->cycle_ctr = checkpoint_cycle_ctr; + entry->canceled = false; + + pendingUnlinks = lappend(pendingUnlinks, entry); + + MemoryContextSwitchTo(oldcxt); + } + else + { + /* Normal case: enter a request to fsync this segment */ + MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); + PendingFsyncEntry *entry; + bool found; + + Assert(type == SYNC_REQUEST); + + entry = (PendingFsyncEntry *) hash_search(pendingOps, + ftag, + HASH_ENTER, + &found); + /* if new entry, or was previously canceled, initialize it */ + if (!found || entry->canceled) + { + entry->cycle_ctr = sync_cycle_ctr; + entry->canceled = false; + } + + /* + * NB: it's intentional that we don't change cycle_ctr if the entry + * already exists. The cycle_ctr must represent the oldest fsync + * request that could be in the entry. + */ + + MemoryContextSwitchTo(oldcxt); + } +} + +/* + * Register the sync request locally, or forward it to the checkpointer. + * + * If retryOnError is true, we'll keep trying if there is no space in the + * queue. Return true if we succeeded, or false if there wasn't space. + */ +bool +RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, + bool retryOnError) +{ + bool ret; + + if (pendingOps != NULL) + { + /* standalone backend or startup process: fsync state is local */ + RememberSyncRequest(ftag, type); + return true; + } + + for (;;) + { + /* + * Notify the checkpointer about it. If we fail to queue a message in + * retryOnError mode, we have to sleep and try again ... ugly, but + * hopefully won't happen often. + * + * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an + * error in the case of SYNC_UNLINK_REQUEST would leave the + * no-longer-used file still present on disk, which would be bad, so + * I'm inclined to assume that the checkpointer will always empty the + * queue soon. + */ + ret = ForwardSyncRequest(ftag, type); + + /* + * If we are successful in queueing the request, or we failed and were + * instructed not to retry on error, break. + */ + if (ret || (!ret && !retryOnError)) + break; + + WaitLatch(NULL, WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, 10, + WAIT_EVENT_REGISTER_SYNC_REQUEST); + } + + return ret; +} |
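The syncsw[] table above relies on C99 designated array initializers so that each SyncRequestHandler value indexes directly to its callbacks; handlers that leave sync_unlinkfiletag or sync_filetagmatches unset get NULL function pointers, which is why only the md handler can service unlink and filter requests. Below is a minimal, self-contained sketch of that dispatch pattern; the enum, struct, and function names are illustrative stand-ins, not PostgreSQL's.

#include <stdio.h>

typedef enum { HANDLER_A, HANDLER_B, NUM_HANDLERS } HandlerId;

typedef struct
{
	int			(*sync_file) (const char *path);
	int			(*unlink_file) (const char *path);	/* optional */
} HandlerOps;

static int a_sync(const char *path)   { printf("A sync %s\n", path); return 0; }
static int a_unlink(const char *path) { printf("A unlink %s\n", path); return 0; }
static int b_sync(const char *path)   { printf("B sync %s\n", path); return 0; }

/* Indexes must line up with HandlerId, mirroring syncsw[] above. */
static const HandlerOps ops[] = {
	[HANDLER_A] = {.sync_file = a_sync, .unlink_file = a_unlink},
	[HANDLER_B] = {.sync_file = b_sync},	/* unlink_file left NULL */
};

int
main(void)
{
	ops[HANDLER_B].sync_file("b.seg");

	/* Callers must check optional callbacks for NULL before dispatching. */
	if (ops[HANDLER_B].unlink_file == NULL)
		printf("handler B does not support unlink\n");
	return 0;
}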
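The comments in ProcessSyncRequests() describe how a small wrap-around counter (CycleCtr, a uint16) separates requests queued before the current pass from ones that arrive while the pass is running: the counter is bumped once per pass, and any entry stamped with the new value is skipped until the next pass. A stripped-down illustration of that idea follows; the types and names are illustrative, and the real code additionally handles stale counter values left behind by failed passes.

#include <stdint.h>
#include <stdio.h>

typedef uint16_t CycleCtr;		/* deliberately small; wraparound is expected */

typedef struct
{
	CycleCtr	cycle_ctr;		/* counter value when the request was queued */
	const char *path;
} Request;

static CycleCtr current_ctr = 0;

/* Process only requests stamped with an older counter value. */
static void
process_pass(Request *reqs, int n)
{
	current_ctr++;				/* everything queued from now on is "new" */

	for (int i = 0; i < n; i++)
	{
		if (reqs[i].cycle_ctr == current_ctr)
			continue;			/* arrived during this pass; handle next time */
		printf("fsync %s\n", reqs[i].path);
	}
}

int
main(void)
{
	Request		reqs[2] = {
		{.cycle_ctr = 0, .path = "old.seg"},	/* queued before the pass */
		{.cycle_ctr = 1, .path = "new.seg"},	/* queued during the pass */
	};

	process_pass(reqs, 2);		/* syncs old.seg only */
	return 0;
}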
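FSYNCS_PER_ABSORB and UNLINKS_PER_ABSORB cap how many slow operations are performed between calls to AbsorbSyncRequests(), so the checkpointer keeps draining the shared request queue even in the middle of a long batch of fsyncs or unlinks. A self-contained sketch of that interleaving pattern is below; absorb_pending() and WORK_PER_ABSORB are illustrative stand-ins for the real queue-draining call and interval.

#include <stdio.h>

#define WORK_PER_ABSORB 10		/* mirrors FSYNCS_PER_ABSORB / UNLINKS_PER_ABSORB */

/* Stand-in for AbsorbSyncRequests(): drain whatever producers queued so far. */
static void
absorb_pending(void)
{
	printf("draining incoming request queue\n");
}

int
main(void)
{
	int			absorb_counter = WORK_PER_ABSORB;

	for (int i = 0; i < 35; i++)
	{
		/* ... one unit of slow work here (an fsync or an unlink) ... */

		/*
		 * Interleave queue draining with the slow work so producers never
		 * find the queue full while a long batch is being processed.
		 */
		if (--absorb_counter <= 0)
		{
			absorb_pending();
			absorb_counter = WORK_PER_ABSORB;
		}
	}
	return 0;
}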