summaryrefslogtreecommitdiffstats
path: root/contrib/pg_prewarm
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/pg_prewarm')
-rw-r--r--contrib/pg_prewarm/Makefile22
-rw-r--r--contrib/pg_prewarm/autoprewarm.c919
-rw-r--r--contrib/pg_prewarm/pg_prewarm--1.0--1.1.sql6
-rw-r--r--contrib/pg_prewarm/pg_prewarm--1.1--1.2.sql14
-rw-r--r--contrib/pg_prewarm/pg_prewarm--1.1.sql14
-rw-r--r--contrib/pg_prewarm/pg_prewarm.c204
-rw-r--r--contrib/pg_prewarm/pg_prewarm.control5
7 files changed, 1184 insertions, 0 deletions
diff --git a/contrib/pg_prewarm/Makefile b/contrib/pg_prewarm/Makefile
new file mode 100644
index 0000000..b13ac3c
--- /dev/null
+++ b/contrib/pg_prewarm/Makefile
@@ -0,0 +1,22 @@
+# contrib/pg_prewarm/Makefile
+
+MODULE_big = pg_prewarm
+OBJS = \
+ $(WIN32RES) \
+ autoprewarm.o \
+ pg_prewarm.o
+
+EXTENSION = pg_prewarm
+DATA = pg_prewarm--1.1--1.2.sql pg_prewarm--1.1.sql pg_prewarm--1.0--1.1.sql
+PGFILEDESC = "pg_prewarm - preload relation data into system buffer cache"
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/pg_prewarm
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
new file mode 100644
index 0000000..c0c4f5d
--- /dev/null
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -0,0 +1,919 @@
+/*-------------------------------------------------------------------------
+ *
+ * autoprewarm.c
+ * Periodically dump information about the blocks present in
+ * shared_buffers, and reload them on server restart.
+ *
+ * Due to locking considerations, we can't actually begin prewarming
+ * until the server reaches a consistent state. We need the catalogs
+ * to be consistent so that we can figure out which relation to lock,
+ * and we need to lock the relations so that we don't try to prewarm
+ * pages from a relation that is in the process of being dropped.
+ *
+ * While prewarming, autoprewarm will use two workers. There's a
+ * leader worker that reads and sorts the list of blocks to be
+ * prewarmed and then launches a per-database worker for each
+ * relevant database in turn. The former keeps running after the
+ * initial prewarm is complete to update the dump file periodically.
+ *
+ * Copyright (c) 2016-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/pg_prewarm/autoprewarm.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "access/relation.h"
+#include "access/xact.h"
+#include "catalog/pg_class.h"
+#include "catalog/pg_type.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/bgworker.h"
+#include "postmaster/interrupt.h"
+#include "storage/buf_internals.h"
+#include "storage/dsm.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/lwlock.h"
+#include "storage/proc.h"
+#include "storage/procsignal.h"
+#include "storage/shmem.h"
+#include "storage/smgr.h"
+#include "tcop/tcopprot.h"
+#include "utils/acl.h"
+#include "utils/datetime.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/relfilenodemap.h"
+#include "utils/resowner.h"
+
+#define AUTOPREWARM_FILE "autoprewarm.blocks"
+
+/* Metadata for each block we dump. */
+typedef struct BlockInfoRecord
+{
+ Oid database;
+ Oid tablespace;
+ Oid filenode;
+ ForkNumber forknum;
+ BlockNumber blocknum;
+} BlockInfoRecord;
+
+/* Shared state information for autoprewarm bgworker. */
+typedef struct AutoPrewarmSharedState
+{
+ LWLock lock; /* mutual exclusion */
+ pid_t bgworker_pid; /* for main bgworker */
+ pid_t pid_using_dumpfile; /* for autoprewarm or block dump */
+
+ /* Following items are for communication with per-database worker */
+ dsm_handle block_info_handle;
+ Oid database;
+ int prewarm_start_idx;
+ int prewarm_stop_idx;
+ int prewarmed_blocks;
+} AutoPrewarmSharedState;
+
+void _PG_init(void);
+void autoprewarm_main(Datum main_arg);
+void autoprewarm_database_main(Datum main_arg);
+
+PG_FUNCTION_INFO_V1(autoprewarm_start_worker);
+PG_FUNCTION_INFO_V1(autoprewarm_dump_now);
+
+static void apw_load_buffers(void);
+static int apw_dump_now(bool is_bgworker, bool dump_unlogged);
+static void apw_start_leader_worker(void);
+static void apw_start_database_worker(void);
+static bool apw_init_shmem(void);
+static void apw_detach_shmem(int code, Datum arg);
+static int apw_compare_blockinfo(const void *p, const void *q);
+static void autoprewarm_shmem_request(void);
+static shmem_request_hook_type prev_shmem_request_hook = NULL;
+
+/* Pointer to shared-memory state. */
+static AutoPrewarmSharedState *apw_state = NULL;
+
+/* GUC variables. */
+static bool autoprewarm = true; /* start worker? */
+static int autoprewarm_interval; /* dump interval */
+
+/*
+ * Module load callback.
+ */
+void
+_PG_init(void)
+{
+ DefineCustomIntVariable("pg_prewarm.autoprewarm_interval",
+ "Sets the interval between dumps of shared buffers",
+ "If set to zero, time-based dumping is disabled.",
+ &autoprewarm_interval,
+ 300,
+ 0, INT_MAX / 1000,
+ PGC_SIGHUP,
+ GUC_UNIT_S,
+ NULL,
+ NULL,
+ NULL);
+
+ if (!process_shared_preload_libraries_in_progress)
+ return;
+
+ /* can't define PGC_POSTMASTER variable after startup */
+ DefineCustomBoolVariable("pg_prewarm.autoprewarm",
+ "Starts the autoprewarm worker.",
+ NULL,
+ &autoprewarm,
+ true,
+ PGC_POSTMASTER,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
+ MarkGUCPrefixReserved("pg_prewarm");
+
+ prev_shmem_request_hook = shmem_request_hook;
+ shmem_request_hook = autoprewarm_shmem_request;
+
+ /* Register autoprewarm worker, if enabled. */
+ if (autoprewarm)
+ apw_start_leader_worker();
+}
+
+/*
+ * Requests any additional shared memory required for autoprewarm.
+ */
+static void
+autoprewarm_shmem_request(void)
+{
+ if (prev_shmem_request_hook)
+ prev_shmem_request_hook();
+
+ RequestAddinShmemSpace(MAXALIGN(sizeof(AutoPrewarmSharedState)));
+}
+
+/*
+ * Main entry point for the leader autoprewarm process. Per-database workers
+ * have a separate entry point.
+ */
+void
+autoprewarm_main(Datum main_arg)
+{
+ bool first_time = true;
+ bool final_dump_allowed = true;
+ TimestampTz last_dump_time = 0;
+
+ /* Establish signal handlers; once that's done, unblock signals. */
+ pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
+ pqsignal(SIGHUP, SignalHandlerForConfigReload);
+ pqsignal(SIGUSR1, procsignal_sigusr1_handler);
+ BackgroundWorkerUnblockSignals();
+
+ /* Create (if necessary) and attach to our shared memory area. */
+ if (apw_init_shmem())
+ first_time = false;
+
+ /* Set on-detach hook so that our PID will be cleared on exit. */
+ on_shmem_exit(apw_detach_shmem, 0);
+
+ /*
+ * Store our PID in the shared memory area --- unless there's already
+ * another worker running, in which case just exit.
+ */
+ LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
+ if (apw_state->bgworker_pid != InvalidPid)
+ {
+ LWLockRelease(&apw_state->lock);
+ ereport(LOG,
+ (errmsg("autoprewarm worker is already running under PID %lu",
+ (unsigned long) apw_state->bgworker_pid)));
+ return;
+ }
+ apw_state->bgworker_pid = MyProcPid;
+ LWLockRelease(&apw_state->lock);
+
+ /*
+ * Preload buffers from the dump file only if we just created the shared
+ * memory region. Otherwise, it's either already been done or shouldn't
+ * be done - e.g. because the old dump file has been overwritten since the
+ * server was started.
+ *
+ * There's not much point in performing a dump immediately after we finish
+ * preloading; so, if we do end up preloading, consider the last dump time
+ * to be equal to the current time.
+ *
+ * If apw_load_buffers() is terminated early by a shutdown request,
+ * prevent dumping out our state below the loop, because we'd effectively
+ * just truncate the saved state to however much we'd managed to preload.
+ */
+ if (first_time)
+ {
+ apw_load_buffers();
+ final_dump_allowed = !ShutdownRequestPending;
+ last_dump_time = GetCurrentTimestamp();
+ }
+
+ /* Periodically dump buffers until terminated. */
+ while (!ShutdownRequestPending)
+ {
+ /* In case of a SIGHUP, just reload the configuration. */
+ if (ConfigReloadPending)
+ {
+ ConfigReloadPending = false;
+ ProcessConfigFile(PGC_SIGHUP);
+ }
+
+ if (autoprewarm_interval <= 0)
+ {
+ /* We're only dumping at shutdown, so just wait forever. */
+ (void) WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
+ -1L,
+ PG_WAIT_EXTENSION);
+ }
+ else
+ {
+ TimestampTz next_dump_time;
+ long delay_in_ms;
+
+ /* Compute the next dump time. */
+ next_dump_time =
+ TimestampTzPlusMilliseconds(last_dump_time,
+ autoprewarm_interval * 1000);
+ delay_in_ms =
+ TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
+ next_dump_time);
+
+ /* Perform a dump if it's time. */
+ if (delay_in_ms <= 0)
+ {
+ last_dump_time = GetCurrentTimestamp();
+ apw_dump_now(true, false);
+ continue;
+ }
+
+ /* Sleep until the next dump time. */
+ (void) WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+ delay_in_ms,
+ PG_WAIT_EXTENSION);
+ }
+
+ /* Reset the latch, loop. */
+ ResetLatch(MyLatch);
+ }
+
+ /*
+ * Dump one last time. We assume this is probably the result of a system
+ * shutdown, although it's possible that we've merely been terminated.
+ */
+ if (final_dump_allowed)
+ apw_dump_now(true, true);
+}
+
+/*
+ * Read the dump file and launch per-database workers one at a time to
+ * prewarm the buffers found there.
+ */
+static void
+apw_load_buffers(void)
+{
+ FILE *file = NULL;
+ int num_elements,
+ i;
+ BlockInfoRecord *blkinfo;
+ dsm_segment *seg;
+
+ /*
+ * Skip the prewarm if the dump file is in use; otherwise, prevent any
+ * other process from writing it while we're using it.
+ */
+ LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
+ if (apw_state->pid_using_dumpfile == InvalidPid)
+ apw_state->pid_using_dumpfile = MyProcPid;
+ else
+ {
+ LWLockRelease(&apw_state->lock);
+ ereport(LOG,
+ (errmsg("skipping prewarm because block dump file is being written by PID %lu",
+ (unsigned long) apw_state->pid_using_dumpfile)));
+ return;
+ }
+ LWLockRelease(&apw_state->lock);
+
+ /*
+ * Open the block dump file. Exit quietly if it doesn't exist, but report
+ * any other error.
+ */
+ file = AllocateFile(AUTOPREWARM_FILE, "r");
+ if (!file)
+ {
+ if (errno == ENOENT)
+ {
+ LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
+ apw_state->pid_using_dumpfile = InvalidPid;
+ LWLockRelease(&apw_state->lock);
+ return; /* No file to load. */
+ }
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m",
+ AUTOPREWARM_FILE)));
+ }
+
+ /* First line of the file is a record count. */
+ if (fscanf(file, "<<%d>>\n", &num_elements) != 1)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read from file \"%s\": %m",
+ AUTOPREWARM_FILE)));
+
+ /* Allocate a dynamic shared memory segment to store the record data. */
+ seg = dsm_create(sizeof(BlockInfoRecord) * num_elements, 0);
+ blkinfo = (BlockInfoRecord *) dsm_segment_address(seg);
+
+ /* Read records, one per line. */
+ for (i = 0; i < num_elements; i++)
+ {
+ unsigned forknum;
+
+ if (fscanf(file, "%u,%u,%u,%u,%u\n", &blkinfo[i].database,
+ &blkinfo[i].tablespace, &blkinfo[i].filenode,
+ &forknum, &blkinfo[i].blocknum) != 5)
+ ereport(ERROR,
+ (errmsg("autoprewarm block dump file is corrupted at line %d",
+ i + 1)));
+ blkinfo[i].forknum = forknum;
+ }
+
+ FreeFile(file);
+
+ /* Sort the blocks to be loaded. */
+ pg_qsort(blkinfo, num_elements, sizeof(BlockInfoRecord),
+ apw_compare_blockinfo);
+
+ /* Populate shared memory state. */
+ apw_state->block_info_handle = dsm_segment_handle(seg);
+ apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx = 0;
+ apw_state->prewarmed_blocks = 0;
+
+ /* Get the info position of the first block of the next database. */
+ while (apw_state->prewarm_start_idx < num_elements)
+ {
+ int j = apw_state->prewarm_start_idx;
+ Oid current_db = blkinfo[j].database;
+
+ /*
+ * Advance the prewarm_stop_idx to the first BlockInfoRecord that does
+ * not belong to this database.
+ */
+ j++;
+ while (j < num_elements)
+ {
+ if (current_db != blkinfo[j].database)
+ {
+ /*
+ * Combine BlockInfoRecords for global objects with those of
+ * the database.
+ */
+ if (current_db != InvalidOid)
+ break;
+ current_db = blkinfo[j].database;
+ }
+
+ j++;
+ }
+
+ /*
+ * If we reach this point with current_db == InvalidOid, then only
+ * BlockInfoRecords belonging to global objects exist. We can't
+ * prewarm without a database connection, so just bail out.
+ */
+ if (current_db == InvalidOid)
+ break;
+
+ /* Configure stop point and database for next per-database worker. */
+ apw_state->prewarm_stop_idx = j;
+ apw_state->database = current_db;
+ Assert(apw_state->prewarm_start_idx < apw_state->prewarm_stop_idx);
+
+ /* If we've run out of free buffers, don't launch another worker. */
+ if (!have_free_buffer())
+ break;
+
+ /*
+ * Likewise, don't launch if we've already been told to shut down.
+ * (The launch would fail anyway, but we might as well skip it.)
+ */
+ if (ShutdownRequestPending)
+ break;
+
+ /*
+ * Start a per-database worker to load blocks for this database; this
+ * function will return once the per-database worker exits.
+ */
+ apw_start_database_worker();
+
+ /* Prepare for next database. */
+ apw_state->prewarm_start_idx = apw_state->prewarm_stop_idx;
+ }
+
+ /* Clean up. */
+ dsm_detach(seg);
+ LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
+ apw_state->block_info_handle = DSM_HANDLE_INVALID;
+ apw_state->pid_using_dumpfile = InvalidPid;
+ LWLockRelease(&apw_state->lock);
+
+ /* Report our success, if we were able to finish. */
+ if (!ShutdownRequestPending)
+ ereport(LOG,
+ (errmsg("autoprewarm successfully prewarmed %d of %d previously-loaded blocks",
+ apw_state->prewarmed_blocks, num_elements)));
+}
+
+/*
+ * Prewarm all blocks for one database (and possibly also global objects, if
+ * those got grouped with this database).
+ */
+void
+autoprewarm_database_main(Datum main_arg)
+{
+ int pos;
+ BlockInfoRecord *block_info;
+ Relation rel = NULL;
+ BlockNumber nblocks = 0;
+ BlockInfoRecord *old_blk = NULL;
+ dsm_segment *seg;
+
+ /* Establish signal handlers; once that's done, unblock signals. */
+ pqsignal(SIGTERM, die);
+ BackgroundWorkerUnblockSignals();
+
+ /* Connect to correct database and get block information. */
+ apw_init_shmem();
+ seg = dsm_attach(apw_state->block_info_handle);
+ if (seg == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("could not map dynamic shared memory segment")));
+ BackgroundWorkerInitializeConnectionByOid(apw_state->database, InvalidOid, 0);
+ block_info = (BlockInfoRecord *) dsm_segment_address(seg);
+ pos = apw_state->prewarm_start_idx;
+
+ /*
+ * Loop until we run out of blocks to prewarm or until we run out of free
+ * buffers.
+ */
+ while (pos < apw_state->prewarm_stop_idx && have_free_buffer())
+ {
+ BlockInfoRecord *blk = &block_info[pos++];
+ Buffer buf;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * Quit if we've reached records for another database. If previous
+ * blocks are of some global objects, then continue pre-warming.
+ */
+ if (old_blk != NULL && old_blk->database != blk->database &&
+ old_blk->database != 0)
+ break;
+
+ /*
+ * As soon as we encounter a block of a new relation, close the old
+ * relation. Note that rel will be NULL if try_relation_open failed
+ * previously; in that case, there is nothing to close.
+ */
+ if (old_blk != NULL && old_blk->filenode != blk->filenode &&
+ rel != NULL)
+ {
+ relation_close(rel, AccessShareLock);
+ rel = NULL;
+ CommitTransactionCommand();
+ }
+
+ /*
+ * Try to open each new relation, but only once, when we first
+ * encounter it. If it's been dropped, skip the associated blocks.
+ */
+ if (old_blk == NULL || old_blk->filenode != blk->filenode)
+ {
+ Oid reloid;
+
+ Assert(rel == NULL);
+ StartTransactionCommand();
+ reloid = RelidByRelfilenode(blk->tablespace, blk->filenode);
+ if (OidIsValid(reloid))
+ rel = try_relation_open(reloid, AccessShareLock);
+
+ if (!rel)
+ CommitTransactionCommand();
+ }
+ if (!rel)
+ {
+ old_blk = blk;
+ continue;
+ }
+
+ /* Once per fork, check for fork existence and size. */
+ if (old_blk == NULL ||
+ old_blk->filenode != blk->filenode ||
+ old_blk->forknum != blk->forknum)
+ {
+ /*
+ * smgrexists is not safe for illegal forknum, hence check whether
+ * the passed forknum is valid before using it in smgrexists.
+ */
+ if (blk->forknum > InvalidForkNumber &&
+ blk->forknum <= MAX_FORKNUM &&
+ smgrexists(RelationGetSmgr(rel), blk->forknum))
+ nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum);
+ else
+ nblocks = 0;
+ }
+
+ /* Check whether blocknum is valid and within fork file size. */
+ if (blk->blocknum >= nblocks)
+ {
+ /* Move to next forknum. */
+ old_blk = blk;
+ continue;
+ }
+
+ /* Prewarm buffer. */
+ buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL,
+ NULL);
+ if (BufferIsValid(buf))
+ {
+ apw_state->prewarmed_blocks++;
+ ReleaseBuffer(buf);
+ }
+
+ old_blk = blk;
+ }
+
+ dsm_detach(seg);
+
+ /* Release lock on previous relation. */
+ if (rel)
+ {
+ relation_close(rel, AccessShareLock);
+ CommitTransactionCommand();
+ }
+}
+
+/*
+ * Dump information on blocks in shared buffers. We use a text format here
+ * so that it's easy to understand and even change the file contents if
+ * necessary.
+ * Returns the number of blocks dumped.
+ */
+static int
+apw_dump_now(bool is_bgworker, bool dump_unlogged)
+{
+ int num_blocks;
+ int i;
+ int ret;
+ BlockInfoRecord *block_info_array;
+ BufferDesc *bufHdr;
+ FILE *file;
+ char transient_dump_file_path[MAXPGPATH];
+ pid_t pid;
+
+ LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
+ pid = apw_state->pid_using_dumpfile;
+ if (apw_state->pid_using_dumpfile == InvalidPid)
+ apw_state->pid_using_dumpfile = MyProcPid;
+ LWLockRelease(&apw_state->lock);
+
+ if (pid != InvalidPid)
+ {
+ if (!is_bgworker)
+ ereport(ERROR,
+ (errmsg("could not perform block dump because dump file is being used by PID %lu",
+ (unsigned long) apw_state->pid_using_dumpfile)));
+
+ ereport(LOG,
+ (errmsg("skipping block dump because it is already being performed by PID %lu",
+ (unsigned long) apw_state->pid_using_dumpfile)));
+ return 0;
+ }
+
+ block_info_array =
+ (BlockInfoRecord *) palloc(sizeof(BlockInfoRecord) * NBuffers);
+
+ for (num_blocks = 0, i = 0; i < NBuffers; i++)
+ {
+ uint32 buf_state;
+
+ CHECK_FOR_INTERRUPTS();
+
+ bufHdr = GetBufferDescriptor(i);
+
+ /* Lock each buffer header before inspecting. */
+ buf_state = LockBufHdr(bufHdr);
+
+ /*
+ * Unlogged tables will be automatically truncated after a crash or
+ * unclean shutdown. In such cases we need not prewarm them. Dump them
+ * only if requested by caller.
+ */
+ if (buf_state & BM_TAG_VALID &&
+ ((buf_state & BM_PERMANENT) || dump_unlogged))
+ {
+ block_info_array[num_blocks].database = bufHdr->tag.rnode.dbNode;
+ block_info_array[num_blocks].tablespace = bufHdr->tag.rnode.spcNode;
+ block_info_array[num_blocks].filenode = bufHdr->tag.rnode.relNode;
+ block_info_array[num_blocks].forknum = bufHdr->tag.forkNum;
+ block_info_array[num_blocks].blocknum = bufHdr->tag.blockNum;
+ ++num_blocks;
+ }
+
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+
+ snprintf(transient_dump_file_path, MAXPGPATH, "%s.tmp", AUTOPREWARM_FILE);
+ file = AllocateFile(transient_dump_file_path, "w");
+ if (!file)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m",
+ transient_dump_file_path)));
+
+ ret = fprintf(file, "<<%d>>\n", num_blocks);
+ if (ret < 0)
+ {
+ int save_errno = errno;
+
+ FreeFile(file);
+ unlink(transient_dump_file_path);
+ errno = save_errno;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\": %m",
+ transient_dump_file_path)));
+ }
+
+ for (i = 0; i < num_blocks; i++)
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ ret = fprintf(file, "%u,%u,%u,%u,%u\n",
+ block_info_array[i].database,
+ block_info_array[i].tablespace,
+ block_info_array[i].filenode,
+ (uint32) block_info_array[i].forknum,
+ block_info_array[i].blocknum);
+ if (ret < 0)
+ {
+ int save_errno = errno;
+
+ FreeFile(file);
+ unlink(transient_dump_file_path);
+ errno = save_errno;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\": %m",
+ transient_dump_file_path)));
+ }
+ }
+
+ pfree(block_info_array);
+
+ /*
+ * Rename transient_dump_file_path to AUTOPREWARM_FILE to make things
+ * permanent.
+ */
+ ret = FreeFile(file);
+ if (ret != 0)
+ {
+ int save_errno = errno;
+
+ unlink(transient_dump_file_path);
+ errno = save_errno;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m",
+ transient_dump_file_path)));
+ }
+
+ (void) durable_rename(transient_dump_file_path, AUTOPREWARM_FILE, ERROR);
+ apw_state->pid_using_dumpfile = InvalidPid;
+
+ ereport(DEBUG1,
+ (errmsg_internal("wrote block details for %d blocks", num_blocks)));
+ return num_blocks;
+}
+
+/*
+ * SQL-callable function to launch autoprewarm.
+ */
+Datum
+autoprewarm_start_worker(PG_FUNCTION_ARGS)
+{
+ pid_t pid;
+
+ if (!autoprewarm)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("autoprewarm is disabled")));
+
+ apw_init_shmem();
+ LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
+ pid = apw_state->bgworker_pid;
+ LWLockRelease(&apw_state->lock);
+
+ if (pid != InvalidPid)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("autoprewarm worker is already running under PID %lu",
+ (unsigned long) pid)));
+
+ apw_start_leader_worker();
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * SQL-callable function to perform an immediate block dump.
+ *
+ * Note: this is declared to return int8, as insurance against some
+ * very distant day when we might make NBuffers wider than int.
+ */
+Datum
+autoprewarm_dump_now(PG_FUNCTION_ARGS)
+{
+ int num_blocks;
+
+ apw_init_shmem();
+
+ PG_ENSURE_ERROR_CLEANUP(apw_detach_shmem, 0);
+ {
+ num_blocks = apw_dump_now(false, true);
+ }
+ PG_END_ENSURE_ERROR_CLEANUP(apw_detach_shmem, 0);
+
+ PG_RETURN_INT64((int64) num_blocks);
+}
+
+/*
+ * Allocate and initialize autoprewarm related shared memory, if not already
+ * done, and set up backend-local pointer to that state. Returns true if an
+ * existing shared memory segment was found.
+ */
+static bool
+apw_init_shmem(void)
+{
+ bool found;
+
+ LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+ apw_state = ShmemInitStruct("autoprewarm",
+ sizeof(AutoPrewarmSharedState),
+ &found);
+ if (!found)
+ {
+ /* First time through ... */
+ LWLockInitialize(&apw_state->lock, LWLockNewTrancheId());
+ apw_state->bgworker_pid = InvalidPid;
+ apw_state->pid_using_dumpfile = InvalidPid;
+ }
+ LWLockRelease(AddinShmemInitLock);
+
+ LWLockRegisterTranche(apw_state->lock.tranche, "autoprewarm");
+
+ return found;
+}
+
+/*
+ * Clear our PID from autoprewarm shared state.
+ */
+static void
+apw_detach_shmem(int code, Datum arg)
+{
+ LWLockAcquire(&apw_state->lock, LW_EXCLUSIVE);
+ if (apw_state->pid_using_dumpfile == MyProcPid)
+ apw_state->pid_using_dumpfile = InvalidPid;
+ if (apw_state->bgworker_pid == MyProcPid)
+ apw_state->bgworker_pid = InvalidPid;
+ LWLockRelease(&apw_state->lock);
+}
+
+/*
+ * Start autoprewarm leader worker process.
+ */
+static void
+apw_start_leader_worker(void)
+{
+ BackgroundWorker worker;
+ BackgroundWorkerHandle *handle;
+ BgwHandleStatus status;
+ pid_t pid;
+
+ MemSet(&worker, 0, sizeof(BackgroundWorker));
+ worker.bgw_flags = BGWORKER_SHMEM_ACCESS;
+ worker.bgw_start_time = BgWorkerStart_ConsistentState;
+ strcpy(worker.bgw_library_name, "pg_prewarm");
+ strcpy(worker.bgw_function_name, "autoprewarm_main");
+ strcpy(worker.bgw_name, "autoprewarm leader");
+ strcpy(worker.bgw_type, "autoprewarm leader");
+
+ if (process_shared_preload_libraries_in_progress)
+ {
+ RegisterBackgroundWorker(&worker);
+ return;
+ }
+
+ /* must set notify PID to wait for startup */
+ worker.bgw_notify_pid = MyProcPid;
+
+ if (!RegisterDynamicBackgroundWorker(&worker, &handle))
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("could not register background process"),
+ errhint("You may need to increase max_worker_processes.")));
+
+ status = WaitForBackgroundWorkerStartup(handle, &pid);
+ if (status != BGWH_STARTED)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("could not start background process"),
+ errhint("More details may be available in the server log.")));
+}
+
+/*
+ * Start autoprewarm per-database worker process.
+ */
+static void
+apw_start_database_worker(void)
+{
+ BackgroundWorker worker;
+ BackgroundWorkerHandle *handle;
+
+ MemSet(&worker, 0, sizeof(BackgroundWorker));
+ worker.bgw_flags =
+ BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
+ worker.bgw_start_time = BgWorkerStart_ConsistentState;
+ worker.bgw_restart_time = BGW_NEVER_RESTART;
+ strcpy(worker.bgw_library_name, "pg_prewarm");
+ strcpy(worker.bgw_function_name, "autoprewarm_database_main");
+ strcpy(worker.bgw_name, "autoprewarm worker");
+ strcpy(worker.bgw_type, "autoprewarm worker");
+
+ /* must set notify PID to wait for shutdown */
+ worker.bgw_notify_pid = MyProcPid;
+
+ if (!RegisterDynamicBackgroundWorker(&worker, &handle))
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("registering dynamic bgworker autoprewarm failed"),
+ errhint("Consider increasing configuration parameter \"max_worker_processes\".")));
+
+ /*
+ * Ignore return value; if it fails, postmaster has died, but we have
+ * checks for that elsewhere.
+ */
+ WaitForBackgroundWorkerShutdown(handle);
+}
+
+/* Compare member elements to check whether they are not equal. */
+#define cmp_member_elem(fld) \
+do { \
+ if (a->fld < b->fld) \
+ return -1; \
+ else if (a->fld > b->fld) \
+ return 1; \
+} while(0)
+
+/*
+ * apw_compare_blockinfo
+ *
+ * We depend on all records for a particular database being consecutive
+ * in the dump file; each per-database worker will preload blocks until
+ * it sees a block for some other database. Sorting by tablespace,
+ * filenode, forknum, and blocknum isn't critical for correctness, but
+ * helps us get a sequential I/O pattern.
+ */
+static int
+apw_compare_blockinfo(const void *p, const void *q)
+{
+ const BlockInfoRecord *a = (const BlockInfoRecord *) p;
+ const BlockInfoRecord *b = (const BlockInfoRecord *) q;
+
+ cmp_member_elem(database);
+ cmp_member_elem(tablespace);
+ cmp_member_elem(filenode);
+ cmp_member_elem(forknum);
+ cmp_member_elem(blocknum);
+
+ return 0;
+}
diff --git a/contrib/pg_prewarm/pg_prewarm--1.0--1.1.sql b/contrib/pg_prewarm/pg_prewarm--1.0--1.1.sql
new file mode 100644
index 0000000..9966054
--- /dev/null
+++ b/contrib/pg_prewarm/pg_prewarm--1.0--1.1.sql
@@ -0,0 +1,6 @@
+/* contrib/pg_prewarm/pg_prewarm--1.0--1.1.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_prewarm UPDATE TO '1.1'" to load this file. \quit
+
+ALTER FUNCTION pg_prewarm(regclass, text, text, int8, int8) PARALLEL SAFE;
diff --git a/contrib/pg_prewarm/pg_prewarm--1.1--1.2.sql b/contrib/pg_prewarm/pg_prewarm--1.1--1.2.sql
new file mode 100644
index 0000000..2381c06
--- /dev/null
+++ b/contrib/pg_prewarm/pg_prewarm--1.1--1.2.sql
@@ -0,0 +1,14 @@
+/* contrib/pg_prewarm/pg_prewarm--1.1--1.2.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_prewarm UPDATE TO '1.2'" to load this file. \quit
+
+CREATE FUNCTION autoprewarm_start_worker()
+RETURNS VOID STRICT
+AS 'MODULE_PATHNAME', 'autoprewarm_start_worker'
+LANGUAGE C;
+
+CREATE FUNCTION autoprewarm_dump_now()
+RETURNS pg_catalog.int8 STRICT
+AS 'MODULE_PATHNAME', 'autoprewarm_dump_now'
+LANGUAGE C;
diff --git a/contrib/pg_prewarm/pg_prewarm--1.1.sql b/contrib/pg_prewarm/pg_prewarm--1.1.sql
new file mode 100644
index 0000000..b150895
--- /dev/null
+++ b/contrib/pg_prewarm/pg_prewarm--1.1.sql
@@ -0,0 +1,14 @@
+/* contrib/pg_prewarm/pg_prewarm--1.1.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION pg_prewarm" to load this file. \quit
+
+-- Register the function.
+CREATE FUNCTION pg_prewarm(regclass,
+ mode text default 'buffer',
+ fork text default 'main',
+ first_block int8 default null,
+ last_block int8 default null)
+RETURNS int8
+AS 'MODULE_PATHNAME', 'pg_prewarm'
+LANGUAGE C PARALLEL SAFE;
diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c
new file mode 100644
index 0000000..caff5c4
--- /dev/null
+++ b/contrib/pg_prewarm/pg_prewarm.c
@@ -0,0 +1,204 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_prewarm.c
+ * prewarming utilities
+ *
+ * Copyright (c) 2010-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * contrib/pg_prewarm/pg_prewarm.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "access/relation.h"
+#include "fmgr.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/smgr.h"
+#include "utils/acl.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/rel.h"
+
+PG_MODULE_MAGIC;
+
+PG_FUNCTION_INFO_V1(pg_prewarm);
+
+typedef enum
+{
+ PREWARM_PREFETCH,
+ PREWARM_READ,
+ PREWARM_BUFFER
+} PrewarmType;
+
+static PGAlignedBlock blockbuffer;
+
+/*
+ * pg_prewarm(regclass, mode text, fork text,
+ * first_block int8, last_block int8)
+ *
+ * The first argument is the relation to be prewarmed; the second controls
+ * how prewarming is done; legal options are 'prefetch', 'read', and 'buffer'.
+ * The third is the name of the relation fork to be prewarmed. The fourth
+ * and fifth arguments specify the first and last block to be prewarmed.
+ * If the fourth argument is NULL, it will be taken as 0; if the fifth argument
+ * is NULL, it will be taken as the number of blocks in the relation. The
+ * return value is the number of blocks successfully prewarmed.
+ */
+Datum
+pg_prewarm(PG_FUNCTION_ARGS)
+{
+ Oid relOid;
+ text *forkName;
+ text *type;
+ int64 first_block;
+ int64 last_block;
+ int64 nblocks;
+ int64 blocks_done = 0;
+ int64 block;
+ Relation rel;
+ ForkNumber forkNumber;
+ char *forkString;
+ char *ttype;
+ PrewarmType ptype;
+ AclResult aclresult;
+
+ /* Basic sanity checking. */
+ if (PG_ARGISNULL(0))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("relation cannot be null")));
+ relOid = PG_GETARG_OID(0);
+ if (PG_ARGISNULL(1))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("prewarm type cannot be null")));
+ type = PG_GETARG_TEXT_PP(1);
+ ttype = text_to_cstring(type);
+ if (strcmp(ttype, "prefetch") == 0)
+ ptype = PREWARM_PREFETCH;
+ else if (strcmp(ttype, "read") == 0)
+ ptype = PREWARM_READ;
+ else if (strcmp(ttype, "buffer") == 0)
+ ptype = PREWARM_BUFFER;
+ else
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid prewarm type"),
+ errhint("Valid prewarm types are \"prefetch\", \"read\", and \"buffer\".")));
+ PG_RETURN_INT64(0); /* Placate compiler. */
+ }
+ if (PG_ARGISNULL(2))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("relation fork cannot be null")));
+ forkName = PG_GETARG_TEXT_PP(2);
+ forkString = text_to_cstring(forkName);
+ forkNumber = forkname_to_number(forkString);
+
+ /* Open relation and check privileges. */
+ rel = relation_open(relOid, AccessShareLock);
+ aclresult = pg_class_aclcheck(relOid, GetUserId(), ACL_SELECT);
+ if (aclresult != ACLCHECK_OK)
+ aclcheck_error(aclresult, get_relkind_objtype(rel->rd_rel->relkind), get_rel_name(relOid));
+
+ /* Check that the fork exists. */
+ if (!smgrexists(RelationGetSmgr(rel), forkNumber))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("fork \"%s\" does not exist for this relation",
+ forkString)));
+
+ /* Validate block numbers, or handle nulls. */
+ nblocks = RelationGetNumberOfBlocksInFork(rel, forkNumber);
+ if (PG_ARGISNULL(3))
+ first_block = 0;
+ else
+ {
+ first_block = PG_GETARG_INT64(3);
+ if (first_block < 0 || first_block >= nblocks)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("starting block number must be between 0 and %lld",
+ (long long) (nblocks - 1))));
+ }
+ if (PG_ARGISNULL(4))
+ last_block = nblocks - 1;
+ else
+ {
+ last_block = PG_GETARG_INT64(4);
+ if (last_block < 0 || last_block >= nblocks)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("ending block number must be between 0 and %lld",
+ (long long) (nblocks - 1))));
+ }
+
+ /* Now we're ready to do the real work. */
+ if (ptype == PREWARM_PREFETCH)
+ {
+#ifdef USE_PREFETCH
+
+ /*
+ * In prefetch mode, we just hint the OS to read the blocks, but we
+ * don't know whether it really does it, and we don't wait for it to
+ * finish.
+ *
+ * It would probably be better to pass our prefetch requests in chunks
+ * of a megabyte or maybe even a whole segment at a time, but there's
+ * no practical way to do that at present without a gross modularity
+ * violation, so we just do this.
+ */
+ for (block = first_block; block <= last_block; ++block)
+ {
+ CHECK_FOR_INTERRUPTS();
+ PrefetchBuffer(rel, forkNumber, block);
+ ++blocks_done;
+ }
+#else
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("prefetch is not supported by this build")));
+#endif
+ }
+ else if (ptype == PREWARM_READ)
+ {
+ /*
+ * In read mode, we actually read the blocks, but not into shared
+ * buffers. This is more portable than prefetch mode (it works
+ * everywhere) and is synchronous.
+ */
+ for (block = first_block; block <= last_block; ++block)
+ {
+ CHECK_FOR_INTERRUPTS();
+ smgrread(RelationGetSmgr(rel), forkNumber, block, blockbuffer.data);
+ ++blocks_done;
+ }
+ }
+ else if (ptype == PREWARM_BUFFER)
+ {
+ /*
+ * In buffer mode, we actually pull the data into shared_buffers.
+ */
+ for (block = first_block; block <= last_block; ++block)
+ {
+ Buffer buf;
+
+ CHECK_FOR_INTERRUPTS();
+ buf = ReadBufferExtended(rel, forkNumber, block, RBM_NORMAL, NULL);
+ ReleaseBuffer(buf);
+ ++blocks_done;
+ }
+ }
+
+ /* Close relation, release lock. */
+ relation_close(rel, AccessShareLock);
+
+ PG_RETURN_INT64(blocks_done);
+}
diff --git a/contrib/pg_prewarm/pg_prewarm.control b/contrib/pg_prewarm/pg_prewarm.control
new file mode 100644
index 0000000..40e3add
--- /dev/null
+++ b/contrib/pg_prewarm/pg_prewarm.control
@@ -0,0 +1,5 @@
+# pg_prewarm extension
+comment = 'prewarm relation data'
+default_version = '1.2'
+module_pathname = '$libdir/pg_prewarm'
+relocatable = true