Diffstat (limited to 'src/backend/storage/sync/sync.c')
-rw-r--r--	src/backend/storage/sync/sync.c	624
1 file changed, 624 insertions, 0 deletions
diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c
new file mode 100644
index 0000000..04fcb06
--- /dev/null
+++ b/src/backend/storage/sync/sync.c
@@ -0,0 +1,624 @@
+/*-------------------------------------------------------------------------
+ *
+ * sync.c
+ * File synchronization management code.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/sync/sync.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/file.h>
+
+#include "access/commit_ts.h"
+#include "access/clog.h"
+#include "access/multixact.h"
+#include "access/xlog.h"
+#include "access/xlogutils.h"
+#include "commands/tablespace.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "portability/instr_time.h"
+#include "postmaster/bgwriter.h"
+#include "storage/bufmgr.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/md.h"
+#include "utils/hsearch.h"
+#include "utils/inval.h"
+#include "utils/memutils.h"
+
+/*
+ * In some contexts (currently, standalone backends and the checkpointer)
+ * we keep track of pending fsync operations: we need to remember all relation
+ * segments that have been written since the last checkpoint, so that we can
+ * fsync them down to disk before completing the next checkpoint. This hash
+ * table remembers the pending operations. We use a hash table mostly as
+ * a convenient way of merging duplicate requests.
+ *
+ * We use a similar mechanism to remember no-longer-needed files that can
+ * be deleted after the next checkpoint, but we use a linked list instead of
+ * a hash table, because we don't expect there to be any duplicate requests.
+ *
+ * These mechanisms are only used for non-temp relations; we never fsync
+ * temp rels, nor do we need to postpone their deletion (see comments in
+ * mdunlink).
+ *
+ * (Regular backends do not track pending operations locally, but forward
+ * them to the checkpointer.)
+ */
+typedef uint16 CycleCtr; /* can be any convenient integer size */
+
+typedef struct
+{
+ FileTag tag; /* identifies handler and file */
+ CycleCtr cycle_ctr; /* sync_cycle_ctr of oldest request */
+ bool canceled; /* true if request was canceled "recently" */
+} PendingFsyncEntry;
+
+typedef struct
+{
+ FileTag tag; /* identifies handler and file */
+ CycleCtr cycle_ctr; /* checkpoint_cycle_ctr when request was made */
+ bool canceled; /* true if request has been canceled */
+} PendingUnlinkEntry;
+
+static HTAB *pendingOps = NULL;
+static List *pendingUnlinks = NIL;
+static MemoryContext pendingOpsCxt; /* context for the above */
+
+static CycleCtr sync_cycle_ctr = 0;
+static CycleCtr checkpoint_cycle_ctr = 0;
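+
+/*
+ * The normal life cycle of an fsync request, as a sketch (the md.c caller
+ * shown is illustrative; every sync handler follows the same path):
+ *
+ *    backend:      RegisterSyncRequest(&tag, SYNC_REQUEST, false)
+ *                  -> ForwardSyncRequest() queues the tag in shared memory
+ *    checkpointer: AbsorbSyncRequests()
+ *                  -> RememberSyncRequest() enters the tag into pendingOps
+ *    checkpoint:   ProcessSyncRequests()
+ *                  -> syncsw[tag.handler].sync_syncfiletag(&tag, path)
+ */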
+
+/* Intervals for calling AbsorbSyncRequests */
+#define FSYNCS_PER_ABSORB 10
+#define UNLINKS_PER_ABSORB 10
+
+/*
+ * Function pointers for handling sync and unlink requests.
+ */
+typedef struct SyncOps
+{
+ int (*sync_syncfiletag) (const FileTag *ftag, char *path);
+ int (*sync_unlinkfiletag) (const FileTag *ftag, char *path);
+ bool (*sync_filetagmatches) (const FileTag *ftag,
+ const FileTag *candidate);
+} SyncOps;
+
+/*
+ * These indexes must correspond to the values of the SyncRequestHandler enum.
+ */
+static const SyncOps syncsw[] = {
+ /* magnetic disk */
+ [SYNC_HANDLER_MD] = {
+ .sync_syncfiletag = mdsyncfiletag,
+ .sync_unlinkfiletag = mdunlinkfiletag,
+ .sync_filetagmatches = mdfiletagmatches
+ },
+ /* pg_xact */
+ [SYNC_HANDLER_CLOG] = {
+ .sync_syncfiletag = clogsyncfiletag
+ },
+ /* pg_commit_ts */
+ [SYNC_HANDLER_COMMIT_TS] = {
+ .sync_syncfiletag = committssyncfiletag
+ },
+ /* pg_multixact/offsets */
+ [SYNC_HANDLER_MULTIXACT_OFFSET] = {
+ .sync_syncfiletag = multixactoffsetssyncfiletag
+ },
+ /* pg_multixact/members */
+ [SYNC_HANDLER_MULTIXACT_MEMBER] = {
+ .sync_syncfiletag = multixactmemberssyncfiletag
+ }
+};
+
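+/*
+ * For example, a FileTag whose handler field is SYNC_HANDLER_CLOG is
+ * synced by calling syncsw[SYNC_HANDLER_CLOG].sync_syncfiletag, which is
+ * clogsyncfiletag() per the table above.  Handlers that leave
+ * sync_unlinkfiletag or sync_filetagmatches unset are never expected to
+ * receive unlink or filter requests.
+ */
+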
+/*
+ * Initialize data structures for the file sync tracking.
+ */
+void
+InitSync(void)
+{
+ /*
+ * Create pending-operations hashtable if we need it. Currently, we need
+ * it if we are standalone (not under a postmaster) or if we are a
+ * checkpointer auxiliary process.
+ */
+ if (!IsUnderPostmaster || AmCheckpointerProcess())
+ {
+ HASHCTL hash_ctl;
+
+ /*
+ * XXX: The checkpointer needs to add entries to the pending ops table
+ * when absorbing fsync requests. That is done within a critical
+ * section, which isn't usually allowed, but we make an exception. It
+ * means that there's a theoretical possibility that you run out of
+ * memory while absorbing fsync requests, which leads to a PANIC.
+ * Fortunately the hash table is small so that's unlikely to happen in
+ * practice.
+ */
+ pendingOpsCxt = AllocSetContextCreate(TopMemoryContext,
+ "Pending ops context",
+ ALLOCSET_DEFAULT_SIZES);
+ MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
+
+ hash_ctl.keysize = sizeof(FileTag);
+ hash_ctl.entrysize = sizeof(PendingFsyncEntry);
+ hash_ctl.hcxt = pendingOpsCxt;
+ pendingOps = hash_create("Pending Ops Table",
+ 100L,
+ &hash_ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+ pendingUnlinks = NIL;
+ }
+}
+
+/*
+ * SyncPreCheckpoint() -- Do pre-checkpoint work
+ *
+ * To distinguish unlink requests that arrived before this checkpoint
+ * started from those that arrived during the checkpoint, we use a cycle
+ * counter similar to the one we use for fsync requests. That cycle
+ * counter is incremented here.
+ *
+ * This must be called *before* the checkpoint REDO point is determined.
+ * That ensures that we won't delete files too soon. Since this calls
+ * AbsorbSyncRequests(), which performs memory allocations, it cannot be
+ * called within a critical section.
+ *
+ * Note that we can't do anything here that depends on the assumption
+ * that the checkpoint will be completed.
+ */
+void
+SyncPreCheckpoint(void)
+{
+ /*
+ * Operations such as DROP TABLESPACE assume that the next checkpoint will
+ * process all recently forwarded unlink requests, but if they aren't
+ * absorbed prior to advancing the cycle counter, they won't be processed
+ * until a future checkpoint. The following absorb ensures that any
+ * unlink requests forwarded before the checkpoint began will be processed
+ * in the current checkpoint.
+ */
+ AbsorbSyncRequests();
+
+ /*
+ * Any unlink requests arriving after this point will be assigned the next
+ * cycle counter, and won't be unlinked until next checkpoint.
+ */
+ checkpoint_cycle_ctr++;
+}
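+
+/*
+ * Worked example: suppose checkpoint_cycle_ctr is 41 when a relation is
+ * dropped and its unlink request forwarded.  SyncPreCheckpoint() absorbs
+ * that request (it is entered with cycle 41) and then bumps the counter
+ * to 42, so the current checkpoint's SyncPostCheckpoint() will unlink
+ * the file; a request arriving after the bump is entered with cycle 42
+ * and waits for the following checkpoint.
+ */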
+
+/*
+ * SyncPostCheckpoint() -- Do post-checkpoint work
+ *
+ * Remove any lingering files that can now be safely removed.
+ */
+void
+SyncPostCheckpoint(void)
+{
+ int absorb_counter;
+ ListCell *lc;
+
+ absorb_counter = UNLINKS_PER_ABSORB;
+ foreach(lc, pendingUnlinks)
+ {
+ PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(lc);
+ char path[MAXPGPATH];
+
+ /* Skip over any canceled entries */
+ if (entry->canceled)
+ continue;
+
+ /*
+ * New entries are appended to the end, so if the entry is new we've
+ * reached the end of old entries.
+ *
+ * Note: if just the right number of consecutive checkpoints fail, we
+ * could be fooled here by cycle_ctr wraparound. However, the only
+ * consequence is that we'd delay unlinking for one more checkpoint,
+ * which is perfectly tolerable.
+ */
+ if (entry->cycle_ctr == checkpoint_cycle_ctr)
+ break;
+
+ /* Unlink the file */
+ if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag,
+ path) < 0)
+ {
+ /*
+ * There's a race condition when the database is dropped at the
+ * same time that we process the pending unlink requests. If the
+ * DROP DATABASE deletes the file before we do, we will get ENOENT
+ * here. rmtree() also has to ignore ENOENT errors, to deal with
+ * the possibility that we delete the file first.
+ */
+ if (errno != ENOENT)
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m", path)));
+ }
+
+ /* Mark the list entry as canceled, just in case */
+ entry->canceled = true;
+
+ /*
+ * As in ProcessSyncRequests, we don't want to stop absorbing fsync
+ * requests for a long time when there are many deletions to be done.
+ * We can safely call AbsorbSyncRequests() at this point in the loop.
+ */
+ if (--absorb_counter <= 0)
+ {
+ AbsorbSyncRequests();
+ absorb_counter = UNLINKS_PER_ABSORB;
+ }
+ }
+
+ /*
+ * If we reached the end of the list, we can just remove the whole list
+ * (remembering to pfree all the PendingUnlinkEntry objects). Otherwise,
+ * we must keep the entries at or after "lc".
+ */
+ if (lc == NULL)
+ {
+ list_free_deep(pendingUnlinks);
+ pendingUnlinks = NIL;
+ }
+ else
+ {
+ int ntodelete = list_cell_number(pendingUnlinks, lc);
+
+ for (int i = 0; i < ntodelete; i++)
+ pfree(list_nth(pendingUnlinks, i));
+
+ pendingUnlinks = list_delete_first_n(pendingUnlinks, ntodelete);
+ }
+}
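+
+/*
+ * For example, if the loop above broke with "lc" at the third cell,
+ * list_cell_number() returns 2, the two already-unlinked entries ahead
+ * of it are pfree'd, and list_delete_first_n() drops them while keeping
+ * the newer entries for the next checkpoint.
+ */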
+
+/*
+ * ProcessSyncRequests() -- Process queued fsync requests.
+ */
+void
+ProcessSyncRequests(void)
+{
+ static bool sync_in_progress = false;
+
+ HASH_SEQ_STATUS hstat;
+ PendingFsyncEntry *entry;
+ int absorb_counter;
+
+ /* Statistics on sync times */
+ int processed = 0;
+ instr_time sync_start,
+ sync_end,
+ sync_diff;
+ uint64 elapsed;
+ uint64 longest = 0;
+ uint64 total_elapsed = 0;
+
+ /*
+ * This is only called during checkpoints, and checkpoints should only
+ * occur in processes that have created a pendingOps table.
+ */
+ if (!pendingOps)
+ elog(ERROR, "cannot sync without a pendingOps table");
+
+ /*
+ * If we are in the checkpointer, the sync had better include all fsync
+ * requests that were queued by backends up to this point. The tightest
+ * race condition that could occur is that a buffer that must be written
+ * and fsync'd for the checkpoint could have been dumped by a backend just
+ * before it was visited by BufferSync(). We know the backend will have
+ * queued an fsync request before clearing the buffer's dirtybit, so we
+ * are safe as long as we do an Absorb after completing BufferSync().
+ */
+ AbsorbSyncRequests();
+
+ /*
+ * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
+ * checkpoint), we want to ignore fsync requests that are entered into the
+ * hashtable after this point --- they should be processed next time,
+ * instead. We use sync_cycle_ctr to tell old entries apart from new
+ * ones: new ones will have cycle_ctr equal to the incremented value of
+ * sync_cycle_ctr.
+ *
+ * In normal circumstances, all entries present in the table at this point
+ * will have cycle_ctr exactly equal to the current (about to be old)
+ * value of sync_cycle_ctr. However, if we fail partway through the
+ * fsync'ing loop, then older values of cycle_ctr might remain when we
+ * come back here to try again. Repeated checkpoint failures would
+ * eventually wrap the counter around to the point where an old entry
+ * might appear new, causing us to skip it, possibly allowing a checkpoint
+ * to succeed that should not have. To forestall wraparound, any time the
+ * previous ProcessSyncRequests() failed to complete, run through the
+ * table and forcibly set cycle_ctr = sync_cycle_ctr.
+ *
+ * Think not to merge this loop with the main loop, as the problem is
+ * exactly that that loop may fail before having visited all the entries.
+ * From a performance point of view it doesn't matter anyway, as this path
+ * will never be taken in a system that's functioning normally.
+ */
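+ /*
+ * (Concretely: CycleCtr is 16 bits, so without this reset an old entry
+ * could masquerade as new only after 65536 consecutive checkpoint
+ * failures; vanishingly unlikely, but cheap to rule out.)
+ */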
+ if (sync_in_progress)
+ {
+ /* prior try failed, so update any stale cycle_ctr values */
+ hash_seq_init(&hstat, pendingOps);
+ while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ entry->cycle_ctr = sync_cycle_ctr;
+ }
+ }
+
+ /* Advance counter so that new hashtable entries are distinguishable */
+ sync_cycle_ctr++;
+
+ /* Set flag to detect failure if we don't reach the end of the loop */
+ sync_in_progress = true;
+
+ /* Now scan the hashtable for fsync requests to process */
+ absorb_counter = FSYNCS_PER_ABSORB;
+ hash_seq_init(&hstat, pendingOps);
+ while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ int failures;
+
+ /*
+ * If the entry is new then don't process it this time; it was entered
+ * after this sync cycle began and will be processed next time.  Note
+ * "continue" bypasses the hash-remove call at the bottom of the loop.
+ */
+ if (entry->cycle_ctr == sync_cycle_ctr)
+ continue;
+
+ /* Else assert we haven't missed it */
+ Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);
+
+ /*
+ * If fsync is off then we don't have to bother opening the file at
+ * all. (We delay checking until this point so that changing fsync on
+ * the fly behaves sensibly.)
+ */
+ if (enableFsync)
+ {
+ /*
+ * If in checkpointer, we want to absorb pending requests every so
+ * often to prevent overflow of the fsync request queue. It is
+ * unspecified whether newly-added entries will be visited by
+ * hash_seq_search, but we don't care since we don't need to
+ * process them anyway.
+ */
+ if (--absorb_counter <= 0)
+ {
+ AbsorbSyncRequests();
+ absorb_counter = FSYNCS_PER_ABSORB;
+ }
+
+ /*
+ * The fsync table could contain requests to fsync segments that
+ * have been deleted (unlinked) by the time we get to them. Rather
+ * than just hoping an ENOENT (or EACCES on Windows) error can be
+ * ignored, what we do on error is absorb pending requests and
+ * then retry. Since mdunlink() queues a "cancel" message before
+ * actually unlinking, the fsync request is guaranteed to be
+ * marked canceled after the absorb if it really was this case.
+ * DROP DATABASE likewise has to tell us to forget fsync requests
+ * before it starts deletions.
+ */
+ for (failures = 0; !entry->canceled; failures++)
+ {
+ char path[MAXPGPATH];
+
+ INSTR_TIME_SET_CURRENT(sync_start);
+ if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag,
+ path) == 0)
+ {
+ /* Success; update statistics about sync timing */
+ INSTR_TIME_SET_CURRENT(sync_end);
+ sync_diff = sync_end;
+ INSTR_TIME_SUBTRACT(sync_diff, sync_start);
+ elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
+ if (elapsed > longest)
+ longest = elapsed;
+ total_elapsed += elapsed;
+ processed++;
+
+ if (log_checkpoints)
+ elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f ms",
+ processed,
+ path,
+ (double) elapsed / 1000);
+
+ break; /* out of retry loop */
+ }
+
+ /*
+ * It is possible that the relation has been dropped or
+ * truncated since the fsync request was entered. Therefore,
+ * allow ENOENT, but only if we didn't fail already on this
+ * file.
+ */
+ if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
+ ereport(data_sync_elevel(ERROR),
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m",
+ path)));
+ else
+ ereport(DEBUG1,
+ (errcode_for_file_access(),
+ errmsg_internal("could not fsync file \"%s\" but retrying: %m",
+ path)));
+
+ /*
+ * Absorb incoming requests and check to see if a cancel
+ * arrived for this relation fork.
+ */
+ AbsorbSyncRequests();
+ absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
+ } /* end retry loop */
+ }
+
+ /* We are done with this entry, remove it */
+ if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL)
+ elog(ERROR, "pendingOps corrupted");
+ } /* end loop over hashtable entries */
+
+ /* Return sync performance metrics for report at checkpoint end */
+ CheckpointStats.ckpt_sync_rels = processed;
+ CheckpointStats.ckpt_longest_sync = longest;
+ CheckpointStats.ckpt_agg_sync_time = total_elapsed;
+
+ /* Flag successful completion of ProcessSyncRequests */
+ sync_in_progress = false;
+}
+
+/*
+ * RememberSyncRequest() -- callback from checkpointer side of sync request
+ *
+ * We stuff fsync requests into the local hash table for execution
+ * during the checkpointer's next checkpoint. UNLINK requests go into a
+ * separate linked list, however, because they get processed separately.
+ *
+ * See sync.h for more information on the types of sync requests supported.
+ */
+void
+RememberSyncRequest(const FileTag *ftag, SyncRequestType type)
+{
+ Assert(pendingOps);
+
+ if (type == SYNC_FORGET_REQUEST)
+ {
+ PendingFsyncEntry *entry;
+
+ /* Cancel previously entered request */
+ entry = (PendingFsyncEntry *) hash_search(pendingOps,
+ ftag,
+ HASH_FIND,
+ NULL);
+ if (entry != NULL)
+ entry->canceled = true;
+ }
+ else if (type == SYNC_FILTER_REQUEST)
+ {
+ HASH_SEQ_STATUS hstat;
+ PendingFsyncEntry *pfe;
+ ListCell *cell;
+
+ /* Cancel matching fsync requests */
+ hash_seq_init(&hstat, pendingOps);
+ while ((pfe = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ if (pfe->tag.handler == ftag->handler &&
+ syncsw[ftag->handler].sync_filetagmatches(ftag, &pfe->tag))
+ pfe->canceled = true;
+ }
+
+ /* Cancel matching unlink requests */
+ foreach(cell, pendingUnlinks)
+ {
+ PendingUnlinkEntry *pue = (PendingUnlinkEntry *) lfirst(cell);
+
+ if (pue->tag.handler == ftag->handler &&
+ syncsw[ftag->handler].sync_filetagmatches(ftag, &pue->tag))
+ pue->canceled = true;
+ }
+ }
+ else if (type == SYNC_UNLINK_REQUEST)
+ {
+ /* Unlink request: put it in the linked list */
+ MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
+ PendingUnlinkEntry *entry;
+
+ entry = palloc(sizeof(PendingUnlinkEntry));
+ entry->tag = *ftag;
+ entry->cycle_ctr = checkpoint_cycle_ctr;
+ entry->canceled = false;
+
+ pendingUnlinks = lappend(pendingUnlinks, entry);
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+ else
+ {
+ /* Normal case: enter a request to fsync this segment */
+ MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
+ PendingFsyncEntry *entry;
+ bool found;
+
+ Assert(type == SYNC_REQUEST);
+
+ entry = (PendingFsyncEntry *) hash_search(pendingOps,
+ ftag,
+ HASH_ENTER,
+ &found);
+ /* if new entry, or was previously canceled, initialize it */
+ if (!found || entry->canceled)
+ {
+ entry->cycle_ctr = sync_cycle_ctr;
+ entry->canceled = false;
+ }
+
+ /*
+ * NB: it's intentional that we don't change cycle_ctr if the entry
+ * already exists. The cycle_ctr must represent the oldest fsync
+ * request that could be in the entry.
+ */
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+}
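+
+/*
+ * Illustrative call sequences for the request types (md.c and DROP
+ * DATABASE are the real callers; the ordering is what makes the
+ * cancellations safe):
+ *
+ *    mdunlink() first cancels any pending fsyncs for the relation:
+ *        RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true);
+ *    then defers removal of the first segment to the next checkpoint:
+ *        RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true);
+ *    DROP DATABASE cancels everything for the database before deleting:
+ *        RegisterSyncRequest(&db_tag, SYNC_FILTER_REQUEST, true);
+ */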
+
+/*
+ * Register the sync request locally, or forward it to the checkpointer.
+ *
+ * If retryOnError is true, we'll keep trying if there is no space in the
+ * queue. Return true if we succeeded, or false if there wasn't space.
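+ *
+ * For example (a sketch modeled on md.c's register_dirty_segment), a
+ * caller that cannot afford to block passes retryOnError = false and
+ * falls back to fsyncing the file itself:
+ *
+ *    if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
+ *        if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
+ *            ereport(data_sync_elevel(ERROR), ...);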
+ */
+bool
+RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
+ bool retryOnError)
+{
+ bool ret;
+
+ if (pendingOps != NULL)
+ {
+ /* standalone backend or checkpointer: fsync state is local */
+ RememberSyncRequest(ftag, type);
+ return true;
+ }
+
+ for (;;)
+ {
+ /*
+ * Notify the checkpointer about it. If we fail to queue a message in
+ * retryOnError mode, we have to sleep and try again ... ugly, but
+ * hopefully won't happen often.
+ *
+ * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an
+ * error in the case of SYNC_UNLINK_REQUEST would leave the
+ * no-longer-used file still present on disk, which would be bad, so
+ * I'm inclined to assume that the checkpointer will always empty the
+ * queue soon.
+ */
+ ret = ForwardSyncRequest(ftag, type);
+
+ /*
+ * If we are successful in queueing the request, or we failed and were
+ * instructed not to retry on error, break.
+ */
+ if (ret || !retryOnError)
+ break;
+
+ WaitLatch(NULL, WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, 10,
+ WAIT_EVENT_REGISTER_SYNC_REQUEST);
+ }
+
+ return ret;
+}